In [1]:
import pandas as pd
import torch.nn as nn
import torch
from torch import optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable

C:\Users\ASUS\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\ASUS\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
def load_data(path):
    Data = pd.read_csv(path, nrows=50000)
    Data = pd.pivot_table(Data, index='userId', columns='movieId', values='rating')
    return Data
UserItemMatrix = load_data(r'G:\github项目\Recomendation_system\Data\movie\ratings.csv')

In [3]:
UserItemMatrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [4]:
def divide(Data, k):
    """
    将用户评分矩阵分解成用户向量和物品向量, k为隐特征的个数
    返回所有值初始化的用户和物品隐矩阵
    """
    
    # 这里先用随机化用户和物品向量代替Embedding过程，重点是完成模型的建立，之后进行到第三部分Embedding再补充
    users = Data.index
    items = Data.columns
    features = ['feature' + str(i + 1) for i in range(k)]
    Users = pd.DataFrame(np.random.rand(len(users), k), index=users, columns=features)
    Items = pd.DataFrame(np.random.rand(k, len(items)), index=features, columns=items)
    return Users, Items
Users, Items = divide(UserItemMatrix, 20)

In [5]:
def get_data(UserItemMatrix, Users, Items):
    """根据用户-物品评分矩阵中确切已知的值，分别得到用户隐向量和物品隐向量，分别构建数据"""
    hold_user_item = [] # 记录对应的用户id和物品id
    users_data = [] 
    items_data = [] # 分别记录下未缺失值的用户和物品隐向量
    score = [] # 记录对应的评分，用作训练的标签
    
    for uid in UserItemMatrix.index:
        for iid in UserItemMatrix.columns:
            p = UserItemMatrix.loc[uid].loc[iid]
            if pd.isnull(p) == False:
                hold_user_item.append([uid, iid])
                users_data.append(Users.loc[uid].values)
                items_data.append(Items.loc[:, iid].values)
                score.append(p)
    
    # 将数据转换为tensor格式，以便于之后训练
    users_data = np.array(users_data)
    items_data = np.array(items_data)
    score = np.array(score)
    users_data = torch.from_numpy(users_data)
    items_data = torch.from_numpy(items_data)
    score = torch.from_numpy(score)
    return hold_user_item, users_data, items_data, score

In [6]:
def split_data(UserItemMatrix, ratio=0.8):
    
    # 得到用户和物品的隐向量，以及预测的评分
    hold_user_item, users_data, items_data, score = get_data(UserItemMatrix, Users, Items)
    
    # 得到样本数
    sample_number = len(users_data)
    
    # 划分训练数据和验证数据
    Train_users_data = users_data[:int(sample_number*ratio), :]
    Train_items_data = items_data[:int(sample_number*ratio), :]
    Train_target = score[:int(sample_number*ratio)]
    
    Val_users_data = users_data[int(sample_number*ratio):, :]
    Val_items_data = items_data[int(sample_number*ratio):, :]
    Val_target = score[int(sample_number*ratio):]
    
    return Train_users_data, Train_items_data, Train_target, Val_users_data, Val_items_data, Val_target
Train_users_data, Train_items_data, Train_target, Val_users_data, Val_items_data, Val_target = split_data(UserItemMatrix)

In [7]:
class MyDataset(Dataset):
    # 构建数据集
    def __init__(self, User_Data, Item_Data, Target):
        self.User_Data = User_Data
        self.Item_Data = Item_Data
        self.Target = Target
        
    def __getitem__(self, index):
        sample = {'user': self.User_Data[index], 'item': self.Item_Data[index], 'target': self.Target[index]}
        return sample
        
    def __len__(self):
        return len(self.User_Data)

In [8]:
Train_Dataset = MyDataset(Train_users_data, Train_items_data, Train_target)
Val_Dataset = MyDataset(Val_users_data, Val_items_data, Val_target)

In [9]:
class NeuralCF(nn.Module):
    def __init__(self, hidden_units, fea_number):
        super(NeuralCF, self).__init__()
        self.hidden_units = hidden_units
        self.fea_number = fea_number
        
        # MLP部分
        self.L1 = nn.Linear(2 * self.fea_number, hidden_units[0])
        self.relu1 = nn.ReLU()
        self.L2 = nn.Linear(hidden_units[0], hidden_units[1])
        self.relu2 = nn.ReLU()
        self.L3 = nn.Linear(hidden_units[1], hidden_units[2])
        self.relu3 = nn.ReLU()
        self.L4 = nn.Linear(hidden_units[2], hidden_units[3])
        self.relu4 = nn.ReLU()
        self.L5 = nn.Linear(hidden_units[3], hidden_units[4])
        self.relu5 = nn.ReLU()
    
        # NeuralCF层
        self.NeuralCF = nn.Linear(hidden_units[4], 1)
        
    def forward(self, user, item):
        # 进入GMF部分
        x1 = torch.multiply(user, item)
        
        # 进入MLP部分
        x2 = torch.cat((user, item), 1)  # 按列合并
        x2 = self.relu1(self.L1(x2))
        x2 = self.relu2(self.L2(x2))
        x2 = self.relu3(self.L3(x2))
        x2 = self.relu4(self.L4(x2))
        x2 = self.relu5(self.L5(x2))
        
        # 共同进入NeuralCF层
        x3 = x1 + x2
        output = self.NeuralCF(x3)
        
        return output

In [10]:
# 定义优化器、损失函数、学习率、batchsize
hidden_units = [40, 35, 30, 25, 20]
model = NeuralCF(hidden_units, len(Train_items_data[0]))

loss_fn = nn.MSELoss()

batchsize = 64
Train_Dataloader = DataLoader(Train_Dataset, batch_size=batchsize)
Val_Dataloader = DataLoader(Val_Dataset, batch_size=batchsize)

lr = 0.005
optimizer = optim.SGD(model.parameters(), lr)

In [11]:
model

NeuralCF(
  (L1): Linear(in_features=40, out_features=40, bias=True)
  (relu1): ReLU()
  (L2): Linear(in_features=40, out_features=35, bias=True)
  (relu2): ReLU()
  (L3): Linear(in_features=35, out_features=30, bias=True)
  (relu3): ReLU()
  (L4): Linear(in_features=30, out_features=25, bias=True)
  (relu4): ReLU()
  (L5): Linear(in_features=25, out_features=20, bias=True)
  (relu5): ReLU()
  (NeuralCF): Linear(in_features=20, out_features=1, bias=True)
)

In [12]:
def train(model, Train_Loader, Val_loader, loss_fn, optim, epoches=100):
    model.train()
    for epoch in range(epoches):
        print('第{}轮训练开始, 共{}轮'.format(epoch+1, epoches))
        train_loss = 0
        for i, sample in enumerate(Train_Loader):
            user, item = Variable(sample['user'].type(torch.FloatTensor)), Variable(sample['item'].type(torch.FloatTensor))
            target = Variable(sample['target'])
            output = model(user, item)
            loss = loss_fn(output, target.float())
            
            # 优化器优化模型
            optim.zero_grad()
            loss.backward()
            optim.step()
            
            train_loss += loss
        
        model.eval()
        eval_loss = 0
        for i, sample in enumerate(Val_loader):
            user, item = Variable(sample['user'].type(torch.FloatTensor)), Variable(sample['item'].type(torch.FloatTensor))
            target = Variable(sample['target'])
            output = model(user, item)
            loss = loss_fn(output, target.float())
            
            eval_loss += loss
        
        print("第{}轮训练完成, 训练损失为{}, 验证损失为{}".format(epoch+1, train_loss, eval_loss))
    return model

In [13]:
trained_model = train(model, Train_Dataloader, Val_Dataloader, loss_fn, optimizer)

第1轮训练开始, 共100轮


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


第1轮训练完成, 训练损失为871.9234008789062, 验证损失为226.93972778320312
第2轮训练开始, 共100轮
第2轮训练完成, 训练损失为672.9725341796875, 验证损失为222.42926025390625
第3轮训练开始, 共100轮
第3轮训练完成, 训练损失为664.5673217773438, 验证损失为219.36814880371094
第4轮训练开始, 共100轮
第4轮训练完成, 训练损失为658.5121459960938, 验证损失为217.1940460205078
第5轮训练开始, 共100轮
第5轮训练完成, 训练损失为653.91943359375, 验证损失为215.6185302734375
第6轮训练开始, 共100轮
第6轮训练完成, 训练损失为650.2606201171875, 验证损失为214.46090698242188
第7轮训练开始, 共100轮
第7轮训练完成, 训练损失为647.2781982421875, 验证损失为213.5484161376953
第8轮训练开始, 共100轮
第8轮训练完成, 训练损失为644.833984375, 验证损失为212.78189086914062
第9轮训练开始, 共100轮
第9轮训练完成, 训练损失为642.7901000976562, 验证损失为212.13436889648438
第10轮训练开始, 共100轮
第10轮训练完成, 训练损失为641.037353515625, 验证损失为211.57933044433594
第11轮训练开始, 共100轮
第11轮训练完成, 训练损失为639.50830078125, 验证损失为211.08323669433594
第12轮训练开始, 共100轮
第12轮训练完成, 训练损失为638.1583862304688, 验证损失为210.61866760253906
第13轮训练开始, 共100轮
第13轮训练完成, 训练损失为636.964599609375, 验证损失为210.17982482910156
第14轮训练开始, 共100轮
第14轮训练完成, 训练损失为635.899658203125, 验证损失为209.75161743164062
第15轮训练开始, 共