In [6]:
import pandas as pd
import torch.nn as nn
import torch
from torch import optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
import random

In [7]:
def load_data(path):
    Data = pd.read_csv(path, nrows=10000)
    Data = pd.pivot_table(Data, index='userId', columns='movieId', values='rating')
    return Data
UserItemMatrix = load_data(r'G:\github项目\Recomendation_system\Data\movie\ratings.csv')

In [8]:
UserItemMatrix.head()

movieId,1,2,3,4,5,6,7,8,10,11,...,182715,183611,184471,185031,185135,187541,187593,187595,188301,190183
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [9]:
class LFM(object):
    """构建隐语义模型类"""
    def __init__(self, Rating_Data, F, lr=0.0001, lmbd=0.000001, max_iteration=25):
        """
        Rating_Data: 用户评分矩阵
        F: 隐特征数量
        lr: 学习率
        lmbd: 正则化系数
        max_iteration: 最大迭代次数
        """
        self.F = F
        self.Rating_Data = Rating_Data
        self.lr = lr
        self.lmbd = lmbd
        self.max_iteration = max_iteration
        
        self.features = []
        for i in range(self.F):
            self.features.append("feature{}".format(i+1))
        user_id = Rating_Data.index
        item_id = Rating_Data.columns
        
        # 随机初始化用户和物品隐语义矩阵
        self.P = pd.DataFrame(np.random.rand(len(user_id), len(self.features)), index=user_id, columns=self.features)
        self.Q = pd.DataFrame(np.random.rand(len(self.features), len(item_id)), index=self.features, columns=item_id)
        
    def train(self):
        """使用随机梯度下降法训练用户和物品的隐矩阵"""
        for step in range(self.max_iteration):
            total_error = 0
            for uid in self.Rating_Data.index:
                random_iids = random.sample(list(self.Rating_Data.loc[uid].dropna().index), 1) # 因为不是所有的用户都评价了多个物品，这里只取一个物品进行随机梯度下降
                for random_iid in random_iids:
                    predict_point = self.predict(uid, random_iid)
                    true_point = self.Rating_Data.loc[uid].loc[random_iid]
                    user_error = true_point - predict_point
                    total_error += abs(user_error)
                    for f in range(self.F):
                        self.P.loc[uid].iloc[f] += self.lr * (user_error * self.Q.iloc[f].loc[random_iid] - 
                                                             self.lmbd * self.P.loc[uid].iloc[f])
                        self.Q.iloc[f].loc[random_iid] += self.lr * (user_error * self.P.loc[uid].iloc[f] - 
                                                            self.lmbd * self.Q.iloc[f].loc[random_iid])
            for iid in self.Rating_Data.columns:
                random_uids = random.sample(list(self.Rating_Data.loc[:, iid].dropna().index), 1)
                for random_uid in random_uids:
                    predict_point = self.predict(random_uid, iid)
                    true_point = self.Rating_Data.loc[random_uid].loc[iid]
                    item_error = true_point - predict_point
                    total_error += abs(item_error)
                    for f in range(self.F):
                        self.P.loc[random_uid].iloc[f] += self.lr * (item_error * self.Q.iloc[f].loc[iid] - 
                                                            self.lmbd * self.P.loc[random_uid].iloc[f])
                        self.Q.iloc[f].loc[iid] += self.lr * (item_error * self.P.loc[random_uid].iloc[f] - 
                                                            self.lmbd * self.Q.iloc[f].loc[iid])
            print("{}次迭代------------error:{}".format(step+1, total_error))
        return self.P, self.Q
                    
    def predict(self, uid, iid):
        """返回指定用户对指定物品的预测评分"""
        return sum([self.P.loc[uid].iloc[f] * self.Q.iloc[f].loc[iid] for f in range(self.F)])

In [10]:
lfm = LFM(UserItemMatrix, 10)
Users, Items = lfm.train()

1次迭代------------error:4179.493052216622
2次迭代------------error:4110.287704562116
3次迭代------------error:4086.1584148886786
4次迭代------------error:4033.2997999398794
5次迭代------------error:4040.9869831615724
6次迭代------------error:3972.813826699831
7次迭代------------error:3945.1253356245315
8次迭代------------error:3895.6905789553516
9次迭代------------error:3922.559395717543
10次迭代------------error:3820.056306102683
11次迭代------------error:3893.1710868637524
12次迭代------------error:3802.9778062100986
13次迭代------------error:3808.8608630329777
14次迭代------------error:3797.3000738865717
15次迭代------------error:3769.879414078378
16次迭代------------error:3760.239263566485
17次迭代------------error:3773.5288569705035
18次迭代------------error:3659.7085273210487
19次迭代------------error:3648.4247132040723
20次迭代------------error:3642.732793573847
21次迭代------------error:3648.0419979866238
22次迭代------------error:3640.527150449196
23次迭代------------error:3614.3879172325796
24次迭代------------error:3616.007152690906
25次迭代------

In [4]:
# def divide(Data, k):
#     """
#     将用户评分矩阵分解成用户向量和物品向量, k为隐特征的个数
#     返回所有值初始化的用户和物品隐矩阵
#     """
    
#     # 这里先用随机化用户和物品向量代替Embedding过程，重点是完成模型的建立，之后进行到第三部分Embedding再补充
#     users = Data.index
#     items = Data.columns
#     features = ['feature' + str(i + 1) for i in range(k)]
#     Users = pd.DataFrame(np.random.rand(len(users), k), index=users, columns=features)
#     Items = pd.DataFrame(np.random.rand(k, len(items)), index=features, columns=items)
#     return Users, Items
# Users, Items = divide(UserItemMatrix, 20)

In [11]:
def get_data(UserItemMatrix, Users, Items):
    """根据用户-物品评分矩阵中确切已知的值，分别得到用户隐向量和物品隐向量，分别构建数据"""
    hold_user_item = [] # 记录对应的用户id和物品id
    users_data = [] 
    items_data = [] # 分别记录下未缺失值的用户和物品隐向量
    score = [] # 记录对应的评分，用作训练的标签
    
    for uid in UserItemMatrix.index:
        for iid in UserItemMatrix.columns:
            p = UserItemMatrix.loc[uid].loc[iid]
            if pd.isnull(p) == False:
                hold_user_item.append([uid, iid])
                users_data.append(Users.loc[uid].values)
                items_data.append(Items.loc[:, iid].values)
                score.append(p)
    
    # 将数据转换为tensor格式，以便于之后训练
    users_data = np.array(users_data)
    items_data = np.array(items_data)
    score = np.array(score)
    users_data = torch.from_numpy(users_data)
    items_data = torch.from_numpy(items_data)
    score = torch.from_numpy(score)
    return hold_user_item, users_data, items_data, score

In [12]:
def split_data(UserItemMatrix, ratio=0.8):
    
    # 得到用户和物品的隐向量，以及预测的评分
    hold_user_item, users_data, items_data, score = get_data(UserItemMatrix, Users, Items)
    
    # 得到样本数
    sample_number = len(users_data)
    
    # 划分训练数据和验证数据
    Train_users_data = users_data[:int(sample_number*ratio), :]
    Train_items_data = items_data[:int(sample_number*ratio), :]
    Train_target = score[:int(sample_number*ratio)]
    
    Val_users_data = users_data[int(sample_number*ratio):, :]
    Val_items_data = items_data[int(sample_number*ratio):, :]
    Val_target = score[int(sample_number*ratio):]
    
    return Train_users_data, Train_items_data, Train_target, Val_users_data, Val_items_data, Val_target
Train_users_data, Train_items_data, Train_target, Val_users_data, Val_items_data, Val_target = split_data(UserItemMatrix)

In [13]:
class MyDataset(Dataset):
    # 构建数据集
    def __init__(self, User_Data, Item_Data, Target):
        self.User_Data = User_Data
        self.Item_Data = Item_Data
        self.Target = Target
        
    def __getitem__(self, index):
        sample = {'user': self.User_Data[index], 'item': self.Item_Data[index], 'target': self.Target[index]}
        return sample
        
    def __len__(self):
        return len(self.User_Data)

In [15]:
Train_Dataset = MyDataset(Train_users_data, Train_items_data, Train_target)
Val_Dataset = MyDataset(Val_users_data, Val_items_data, Val_target)

In [16]:
class NeuralCF(nn.Module):
    def __init__(self, hidden_units, fea_number):
        super(NeuralCF, self).__init__()
        self.hidden_units = hidden_units
        self.fea_number = fea_number
        
        # MLP部分
        self.L1 = nn.Linear(2 * self.fea_number, hidden_units[0])
        self.relu1 = nn.ReLU()
        self.L2 = nn.Linear(hidden_units[0], hidden_units[1])
        self.relu2 = nn.ReLU()
        self.L3 = nn.Linear(hidden_units[1], hidden_units[2])
        self.relu3 = nn.ReLU()
        self.L4 = nn.Linear(hidden_units[2], hidden_units[3])
        self.relu4 = nn.ReLU()
        self.L5 = nn.Linear(hidden_units[3], hidden_units[4])
        self.relu5 = nn.ReLU()
    
        # NeuralCF层
        self.NeuralCF = nn.Linear(hidden_units[4], 1)
        
    def forward(self, user, item):
        # 进入GMF部分
        x1 = torch.multiply(user, item)
        
        # 进入MLP部分
        x2 = torch.cat((user, item), 1)  # 按列合并
        x2 = self.relu1(self.L1(x2))
        x2 = self.relu2(self.L2(x2))
        x2 = self.relu3(self.L3(x2))
        x2 = self.relu4(self.L4(x2))
        x2 = self.relu5(self.L5(x2))
        
        # 共同进入NeuralCF层
        x3 = x1 + x2
        output = self.NeuralCF(x3)
        
        return output

In [21]:
# 定义优化器、损失函数、学习率、batchsize
hidden_units = [20, 18, 15, 12, 10]
model = NeuralCF(hidden_units, len(Train_items_data[0]))

loss_fn = nn.MSELoss()

batchsize = 64
Train_Dataloader = DataLoader(Train_Dataset, batch_size=batchsize)
Val_Dataloader = DataLoader(Val_Dataset, batch_size=batchsize)

lr = 0.005
optimizer = optim.SGD(model.parameters(), lr)

In [22]:
model

NeuralCF(
  (L1): Linear(in_features=20, out_features=20, bias=True)
  (relu1): ReLU()
  (L2): Linear(in_features=20, out_features=18, bias=True)
  (relu2): ReLU()
  (L3): Linear(in_features=18, out_features=15, bias=True)
  (relu3): ReLU()
  (L4): Linear(in_features=15, out_features=12, bias=True)
  (relu4): ReLU()
  (L5): Linear(in_features=12, out_features=10, bias=True)
  (relu5): ReLU()
  (NeuralCF): Linear(in_features=10, out_features=1, bias=True)
)

In [23]:
def train(model, Train_Loader, Val_loader, loss_fn, optim, epoches=100):
    model.train()
    for epoch in range(epoches):
        print('第{}轮训练开始, 共{}轮'.format(epoch+1, epoches))
        train_loss = 0
        for i, sample in enumerate(Train_Loader):
            user, item = Variable(sample['user'].type(torch.FloatTensor)), Variable(sample['item'].type(torch.FloatTensor))
            target = Variable(sample['target'])
            output = model(user, item)
            loss = loss_fn(output, target.float())
            
            # 优化器优化模型
            optim.zero_grad()
            loss.backward()
            optim.step()
            
            train_loss += loss
        
        model.eval()
        eval_loss = 0
        for i, sample in enumerate(Val_loader):
            user, item = Variable(sample['user'].type(torch.FloatTensor)), Variable(sample['item'].type(torch.FloatTensor))
            target = Variable(sample['target'])
            output = model(user, item)
            loss = loss_fn(output, target.float())
            
            eval_loss += loss
        
        print("第{}轮训练完成, 训练损失为{}, 验证损失为{}".format(epoch+1, train_loss, eval_loss))
    return model

In [24]:
trained_model = train(model, Train_Dataloader, Val_Dataloader, loss_fn, optimizer)

第1轮训练开始, 共100轮


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


第1轮训练完成, 训练损失为342.8271484375, 验证损失为32.112274169921875
第2轮训练开始, 共100轮
第2轮训练完成, 训练损失为155.79225158691406, 验证损失为31.953536987304688
第3轮训练开始, 共100轮
第3轮训练完成, 训练损失为154.95779418945312, 验证损失为31.816207885742188
第4轮训练开始, 共100轮
第4轮训练完成, 训练损失为154.16648864746094, 验证损失为31.699312210083008
第5轮训练开始, 共100轮
第5轮训练完成, 训练损失为153.432861328125, 验证损失为31.598737716674805
第6轮训练开始, 共100轮
第6轮训练完成, 训练损失为152.75807189941406, 验证损失为31.512771606445312
第7轮训练开始, 共100轮
第7轮训练完成, 训练损失为152.1376495361328, 验证损失为31.438249588012695
第8轮训练开始, 共100轮
第8轮训练完成, 训练损失为151.56451416015625, 验证损失为31.374378204345703
第9轮训练开始, 共100轮
第9轮训练完成, 训练损失为151.034912109375, 验证损失为31.32046890258789
第10轮训练开始, 共100轮
第10轮训练完成, 训练损失为150.5392303466797, 验证损失为31.274784088134766
第11轮训练开始, 共100轮
第11轮训练完成, 训练损失为150.07379150390625, 验证损失为31.23748779296875
第12轮训练开始, 共100轮
第12轮训练完成, 训练损失为149.63525390625, 验证损失为31.208515167236328
第13轮训练开始, 共100轮
第13轮训练完成, 训练损失为149.21937561035156, 验证损失为31.18693733215332
第14轮训练开始, 共100轮
第14轮训练完成, 训练损失为148.81747436523438, 验证损失为31.17281150817871
