## Description:
这个Jupyter Notebook用PyTorch实现GMF模型，并完成该模型的预训练过程。

## 导入包

In [1]:
import datetime
import numpy as np
import pandas as pd
from collections import Counter
import heapq

import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchkeras import summary

import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# which can hide deprecation or numerical warnings raised by the cells below.
warnings.filterwarnings('ignore')

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Hyperparameter settings
topK = 10           # length of the ranked list used when computing HR / NDCG
num_factors = 8     # latent (embedding) dimension passed to the GMF model
num_negatives = 4   # negative instances sampled per positive training instance
batch_size = 64     # mini-batch size of the training DataLoader
lr = 0.001          # learning rate passed to the Adam optimizer

## 导入数据

In [3]:
# The preprocessed data lives in the ProcessedData/ directory.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so
# train.npy must only ever come from a trusted source.
# `train` is used below through .shape, .keys() and `(u, i) in train`;
# presumably a scipy DOK sparse matrix -- TODO confirm.
train = np.load('ProcessedData/train.npy', allow_pickle=True).tolist()
# Each test rating row is read as [user, ground-truth item] by eval_one_rating.
testRatings = np.load('ProcessedData/testRatings.npy').tolist()
# testNegatives[idx] holds the negative item ids ranked against test rating idx.
testNegatives = np.load('ProcessedData/testNegatives.npy').tolist()

In [4]:
# Rows of the interaction matrix index users, columns index items.
num_users, num_items = train.shape

In [9]:
# Build the training data: every observed (user, item) pair is a positive
# sample; negatives are drawn at random from items the user has no entry for.
def get_train_instances(train, num_negatives):
    """Create positive and sampled negative training instances.

    Args:
        train: interaction container exposing ``shape`` (second entry is the
            number of items), ``keys()`` yielding ``(user, item)`` pairs and
            ``(user, item) in train`` membership tests.
        num_negatives: number of negative items to sample per positive pair.

    Returns:
        Three parallel lists ``(user_input, item_input, labels)``. Each
        positive pair (label 1) is immediately followed by its
        ``num_negatives`` sampled negatives (label 0).
    """
    n_items = train.shape[1]

    def _sample_unseen(user):
        # Rejection sampling: redraw until the pair is not an observed interaction.
        while True:
            candidate = np.random.randint(n_items)
            if (user, candidate) not in train:
                return candidate

    users, items, targets = [], [], []
    for user, pos_item in train.keys():
        negatives = [_sample_unseen(user) for _ in range(num_negatives)]
        users.extend([user] * (1 + len(negatives)))
        items.extend([pos_item] + negatives)
        targets.extend([1] + [0] * len(negatives))
    return users, items, targets

In [10]:
user_input, item_input, labels = get_train_instances(train, num_negatives)

In [11]:
# One row per training instance: column 0 is the user id, column 1 the item id.
train_x = np.column_stack((user_input, item_input))
labels = np.asarray(labels)

In [12]:
# Wrap the arrays in a Dataset and a shuffling DataLoader
train_dataset = TensorDataset(
    torch.tensor(train_x),
    torch.tensor(labels, dtype=torch.float32),
)
dl_train = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [13]:
# Quick sanity check: print the first mini-batch only
for x, y in dl_train:
    print(x, y)
    break

tensor([[4185, 3539],
        [1447, 1935],
        [2204, 3042],
        [3271, 3290],
        [4597, 1610],
        [5549, 3139],
        [2985, 2961],
        [5479, 2677],
        [1544, 1395],
        [ 238, 3352],
        [2856,  480],
        [3031, 3544],
        [4472,  585],
        [ 283, 2517],
        [1802,  183],
        [1578,   53],
        [ 523, 1020],
        [ 857,  432],
        [2416, 3636],
        [1736, 3470],
        [1192, 2078],
        [3258, 2478],
        [2041, 2051],
        [2040, 2637],
        [1907,  121],
        [6005, 3545],
        [ 296,  833],
        [ 549, 2785],
        [5219, 1803],
        [2637, 2670],
        [3689, 1164],
        [3383,  669],
        [1328, 1299],
        [4823,  233],
        [3258,   17],
        [2531, 2753],
        [1861, 2212],
        [1180, 3295],
        [2179, 2268],
        [3401,  567],
        [2637, 1712],
        [4025, 3466],
        [3625, 2279],
        [5774, 1301],
        [5130, 1070],
        [1

## GMF模型
这里建立GMF模型。模型的输入是用户和物品的ID：先通过各自的Embedding层得到用户隐向量和物品隐向量，再对两个隐向量做逐元素相乘，最后把乘积向量加权(过一个全连接层)并经过Sigmoid得到最后的输出。<br>

![](img/3.png)

In [48]:
class GMF(nn.Module):
    """GMF model: embed user and item ids, multiply the two embeddings
    element-wise, then map the product through a linear layer and a sigmoid.
    """

    def __init__(self, num_users, num_items, latent_dim, regs=[0, 0]):
        """Create the embedding tables and the output layer.

        Args:
            num_users: number of rows in the user embedding table.
            num_items: number of rows in the item embedding table.
            latent_dim: size of each embedding vector.
            regs: accepted for interface compatibility; not used here.
        """
        super().__init__()
        self.MF_Embedding_User = nn.Embedding(num_users, latent_dim)
        self.MF_Embedding_Item = nn.Embedding(num_items, latent_dim)
        self.linear = nn.Linear(latent_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        """Score a batch of (user, item) pairs.

        Args:
            inputs: 2-D tensor for a whole batch; column 0 holds user ids and
                column 1 holds item ids. It is cast to long before lookup.

        Returns:
            Tensor of sigmoid scores with the trailing size-1 dimension removed.
        """
        ids = inputs.long()

        # Index by column (ids[:, 0]), not by row (ids[0]): the input is a
        # batch, so each column carries one field for every sample.
        user_vec = self.MF_Embedding_User(ids[:, 0])
        item_vec = self.MF_Embedding_Item(ids[:, 1])

        # Element-wise product of the two latent vectors
        interaction = user_vec * item_vec

        # Linear layer + sigmoid, then drop the last dimension
        return self.sigmoid(self.linear(interaction)).squeeze(-1)

In [49]:
# Inspect the network structure using a tiny throw-away instance
# (1 user, 1 item, latent dimension 10).
model = GMF(1, 1, 10)
summary(model, input_shape=(2,))

--------------------------------------------------------------------------
Layer (type)                            Output Shape              Param #
Embedding-1                                 [-1, 10]                   10
Embedding-2                                 [-1, 10]                   10
Linear-3                                     [-1, 1]                   11
Sigmoid-4                                    [-1, 1]                    0
Total params: 31
Trainable params: 31
Non-trainable params: 0
--------------------------------------------------------------------------
Input size (MB): 0.000008
Forward/backward pass size (MB): 0.000168
Params size (MB): 0.000118
Estimated Total Size (MB): 0.000294
--------------------------------------------------------------------------




## 建立模型 

In [50]:
## Build the real model from the dataset sizes and move it to the selected device
model = GMF(num_users, num_items, num_factors)
model.to(device)

GMF(
  (MF_Embedding_User): Embedding(6040, 8)
  (MF_Embedding_Item): Embedding(3706, 8)
  (linear): Linear(in_features=8, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [51]:
# Display the number of items, i.e. the size of the item embedding table.
num_items

3706

In [52]:
# Quick smoke test of the model on a single mini-batch
for (x, y) in iter(dl_train):
    # The model was moved to `device`, so the batch has to live there too;
    # feeding a CPU batch to a CUDA model raises a device-mismatch error.
    print(model(x.to(device)))
    break

tensor([0.3570, 0.6983, 0.4466, 0.4896, 0.7143, 0.5156, 0.6362, 0.3560, 0.4343,
        0.4789, 0.4140, 0.6207, 0.3772, 0.4780, 0.5658, 0.4496, 0.2109, 0.4025,
        0.5719, 0.5585, 0.6264, 0.5230, 0.6047, 0.7486, 0.5111, 0.4424, 0.3227,
        0.6975, 0.5880, 0.2927, 0.5204, 0.4563, 0.2791, 0.2496, 0.3337, 0.4371,
        0.3836, 0.3396, 0.2935, 0.5373, 0.3725, 0.7447, 0.3686, 0.5609, 0.3476,
        0.5131, 0.4727, 0.6708, 0.4959, 0.6723, 0.3579, 0.3320, 0.5321, 0.4030,
        0.3399, 0.3496, 0.4444, 0.7119, 0.3895, 0.4645, 0.4726, 0.3690, 0.3440,
        0.2827], grad_fn=<SqueezeBackward1>)


## 模型的训练与评估

### 模型评估函数

In [56]:
# Module-level state shared by the evaluation helpers: evaluate_model()
# assigns these and eval_one_rating() reads them.
_model = None           # model being evaluated
_testRatings = None     # test ratings; each row is read as [user, ground-truth item]
_testNegatives = None   # per test rating: list of negative item ids
_K = None               # length of the top-K rank list

# Hit ratio
def getHitRatio(ranklist, gtItem):
    """Return 1 if the ground-truth item appears in ``ranklist``, else 0."""
    return int(any(candidate == gtItem for candidate in ranklist))

# NDCG
def getNDCG(ranklist, gtItem):
    """Return log(2) / log(rank + 2) for the ground-truth item, where ``rank``
    is its 0-based position in ``ranklist``; return 0 if it is absent.
    """
    rank = next(
        (pos for pos, candidate in enumerate(ranklist) if candidate == gtItem),
        None,
    )
    if rank is None:
        return 0
    return np.log(2) / np.log(rank + 2)

def eval_one_rating(idx):   # evaluate a single test rating
    """Rank the ground-truth item of test rating ``idx`` against its negatives.

    Reads the module-level ``_model``, ``_testRatings``, ``_testNegatives`` and
    ``_K`` set by ``evaluate_model``, plus the module-level ``device``.

    Args:
        idx: index into ``_testRatings`` / ``_testNegatives``.

    Returns:
        Tuple ``(hr, ndcg)`` computed on the top-``_K`` items by predicted score.
    """
    rating = _testRatings[idx]
    u = rating[0]
    gtItem = rating[1]

    # Work on a copy so the shared negatives list is never mutated, not even
    # temporarily (an exception mid-way used to leave gtItem appended).
    items = list(_testNegatives[idx])
    items.append(gtItem)

    # Get prediction scores for (u, item) over all candidate items
    users = np.full(len(items), u, dtype='int32')
    test_data = torch.tensor(np.vstack([users, np.array(items)]).T).to(device)

    # Inference only: skip autograd bookkeeping, and copy all scores to the
    # host in one transfer instead of one transfer per item.
    with torch.no_grad():
        scores = _model(test_data).detach().cpu().numpy()
    map_item_score = dict(zip(items, scores))

    # Evaluate top rank list (heapq picks the K highest-scoring items)
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return hr, ndcg

def evaluate_model(model, testRatings, testNegatives, K):
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation.

    Stores the arguments in the module-level ``_model``, ``_testRatings``,
    ``_testNegatives`` and ``_K`` variables, then calls ``eval_one_rating``
    once per test rating.

    Return: two lists ``(hits, ndcgs)`` with one score per test rating.
    """
    global _model, _testRatings, _testNegatives, _K
    _model, _testRatings, _testNegatives, _K = model, testRatings, testNegatives, K

    per_rating = [eval_one_rating(idx) for idx in range(len(_testRatings))]
    hits = [hit for hit, _ in per_rating]
    ndcgs = [gain for _, gain in per_rating]
    return hits, ndcgs

### 模型的训练

In [57]:
# Training setup: binary cross-entropy loss, optimised with Adam
loss_func = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [58]:
# Evaluate the untrained model to obtain baseline metrics
(hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK)

In [59]:
# Average the per-rating scores into the baseline HR / NDCG
hr = np.mean(hits)
ndcg = np.mean(ndcgs)
print('Init: HR=%.4f, NDCG=%.4f' %(hr, ndcg))

Init: HR=0.1028, NDCG=0.0467


In [None]:
# Model training: one pass over dl_train per epoch, followed by an HR/NDCG
# evaluation; the weights are saved whenever HR improves on the best so far.
best_hr, best_ndcg, best_iter = hr, ndcg, -1

epochs = 20
log_step_freq = 10000

for epoch in range(epochs):

    # Training phase
    model.train()
    loss_sum = 0.0
    # NOTE: the loop variable `labels` rebinds the module-level `labels` array
    # built earlier; re-run the data-preparation cells before rebuilding the
    # Dataset after training.
    for step, (features, labels) in enumerate(dl_train, 1):

        # Move the batch to the device the model lives on. Without this the
        # forward pass fails with a device mismatch whenever CUDA is available.
        features, labels = features.to(device), labels.to(device)

        # Reset accumulated gradients
        optimizer.zero_grad()

        # Forward pass
        predictions = model(features)
        loss = loss_func(predictions, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Batch-level logging of the running mean loss
        loss_sum += loss.item()
        if step % log_step_freq == 0:
            print(("[step = %d] loss: %.3f") %
                  (step, loss_sum/step))

    # Validation phase
    model.eval()
    (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK)
    hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    if hr > best_hr:
        # Keep only the checkpoint with the best hit ratio seen so far
        best_hr, best_ndcg, best_iter = hr, ndcg, epoch
        torch.save(model.state_dict(), 'Pre_train/m1-1m_GMF.pkl')

    info = (epoch, loss_sum/step, hr, ndcg)
    print(("\nEPOCH = %d, loss = %.3f, hr = %.3f, ndcg = %.3f") %info)
print('Finished Training...')

[step = 10000] loss: 0.508
[step = 20000] loss: 0.504
[step = 30000] loss: 0.503
[step = 40000] loss: 0.502
[step = 50000] loss: 0.502
[step = 60000] loss: 0.502
[step = 70000] loss: 0.502

EPOCH = 0, loss = 0.502, hr = 0.097, ndcg = 0.043
[step = 10000] loss: 0.500
[step = 20000] loss: 0.500
[step = 30000] loss: 0.501
[step = 40000] loss: 0.500
[step = 50000] loss: 0.500
[step = 60000] loss: 0.498
[step = 70000] loss: 0.491

EPOCH = 1, loss = 0.482, hr = 0.346, ndcg = 0.185
[step = 10000] loss: 0.382
[step = 20000] loss: 0.377
[step = 30000] loss: 0.373
[step = 40000] loss: 0.370
[step = 50000] loss: 0.367
[step = 60000] loss: 0.366
[step = 70000] loss: 0.364

EPOCH = 2, loss = 0.363, hr = 0.445, ndcg = 0.248
[step = 10000] loss: 0.353
[step = 20000] loss: 0.353
[step = 30000] loss: 0.352
[step = 40000] loss: 0.352
[step = 50000] loss: 0.351
[step = 60000] loss: 0.350
[step = 70000] loss: 0.348
