In [None]:
import argparse
import csv
import os
import random
import sys
import time

import numpy as np
import pandas as pd
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn

from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
# Report the interpreter and PyTorch versions used by this kernel.
print(f"Python 版本: {sys.version}")
print(f"PyTorch 版本: {torch.__version__}")

输出：   
Python 版本: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]           
PyTorch 版本: 2.3.0a0+6ddf5cf85e.nv24.04

In [None]:
!pip install torch torchvision torchaudio --upgrade

In [None]:
# Print the versions again after the upgrade cell.
# NOTE(review): torch was already imported at the top of the notebook, so this presumably
# still reports the pre-upgrade version until the kernel is restarted — confirm.
print("Python 版本:", sys.version)
print("PyTorch 版本:", torch.__version__)

### 开始处理数据

In [None]:
# Load the three MovieLens-1M tables.
# '::' is a multi-character separator, which only pandas' Python parsing engine supports;
# request that engine explicitly instead of relying on the automatic fallback and its ParserWarning.
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, engine='python',
                      names=["UserID", "MovieID", "Rating", "Timestamp"])
users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, engine='python',
                    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, engine='python',
                     names=["MovieID", "Title", "Genres"], encoding='latin1')

In [None]:
# Show the first three rows of each loaded table as a quick sanity check.
print("ratings:\n", ratings.head(3), "\nusers:\n", users.head(3), "\nmovies:\n", movies.head(3))

### 下面的chunk是为了在创建稀疏矩阵的时候避免第0行第0列全部为0，浪费资源    
### 把UserID和MovieID的索引都变成从0开始
### 注意：只能运行一次

In [None]:
# Shift UserID / MovieID down by one so the sparse matrix has no unused row 0 / column 0.
# The flag stored in ratings.attrs makes this cell safe to re-run: a second run is a no-op,
# while re-reading `ratings` from disk yields a fresh frame without the flag.
if not ratings.attrs.get("ids_zero_based", False):
    ratings["UserID"] = ratings["UserID"] - 1
    ratings["MovieID"] = ratings["MovieID"] - 1
    ratings.attrs["ids_zero_based"] = True
ratings.head(5), ratings.shape

### 测试集是每个UserID中最新的一个交互，也就是Timestamp最大的

In [None]:
# Group by UserID and find the index label of the row with the largest Timestamp
idx_test = ratings.groupby('UserID')['Timestamp'].idxmax()

# Use those index labels to fetch the corresponding rows (one test positive per user)
df_testing_ps = ratings.loc[idx_test]

df_testing_ps.head(3)

### 把原始数据集ratings去掉测试集的数据，再分为训练集和验证集
### 验证集对于每个UserID，随机一个即可

In [None]:
# Ratings that remain after removing the test rows
ratings_left = ratings.drop(idx_test)

# Validation set: one randomly sampled remaining rating per UserID
# NOTE(review): no random_state is passed to sample(), so the split changes between runs — confirm this is intended.
df_validation_ps = ratings_left.groupby('UserID').apply(lambda x: x.sample(1))
# Level 1 of the resulting index is used by the next cell as row labels of ratings_left
df_validation_ps.index.get_level_values(1)

In [None]:
# Training positives: what is left of ratings_left after dropping the sampled validation rows
df_training_ps = ratings_left.drop(df_validation_ps.index.get_level_values(1))
df_training_ps.shape

### 现在已经获得了训练，验证，测试集的正样本。只选取前两列UserID和MovieID

In [None]:
# Keep only the first two columns (UserID, MovieID) of each positive-sample frame
df_testing_ps = df_testing_ps.iloc[:, 0:2]
df_validation_ps = df_validation_ps.iloc[:, 0:2]
df_training_ps = df_training_ps.iloc[:, 0:2]

# Display the three frames
df_testing_ps, df_validation_ps, df_training_ps

### 开始创建稀疏矩阵

In [None]:
# Training positives as a list of [UserID, MovieID] pairs
list_training_ps = df_training_ps.values.tolist()
len(list_training_ps)

In [None]:
# Largest (already shifted) UserID / MovieID in the full ratings table.
# Note these are maxima, not counts: later cells use UserID_num+1 / MovieID_num+1 as sizes.
UserID_num = ratings["UserID"].max()
MovieID_num = ratings["MovieID"].max()
UserID_num, MovieID_num

In [None]:
# Create the sparse user x item matrix
training_mat = sp.dok_matrix((UserID_num+1, MovieID_num+1), dtype=np.float32)
# Mark every training positive with 1.0
for UserID,MovieID in list_training_ps:
    training_mat[UserID, MovieID] = 1.0

In [None]:
# print(type(training_mat.keys()), type(training_mat.values()))
# for UserID,MovieID in training_mat.keys():
#     print(UserID, MovieID)
#     break

### 先要找出每个UserID对应的所有MovieID，并储存在字典中

In [None]:
# For the given data set, collect every MovieID that appears with each UserID
def obtain_dict_UserIDMovieID(data, dictionary):
    """Group the MovieIDs in ``data`` by UserID.

    Args:
        data: DataFrame whose first column holds the UserID and whose second
            column holds the MovieID (columns are read by position, not by name).
        dictionary: dict that is filled in place; existing entries are kept and
            new MovieIDs are appended to them.

    Returns:
        The same ``dictionary`` object, mapping UserID -> list of MovieIDs in
        row order.
    """
    # Read the two columns by position once. This avoids building a Series per row
    # (iterrows) and the positional `row[0]` lookup on a labelled Series.
    user_ids = data.iloc[:, 0].tolist()
    movie_ids = data.iloc[:, 1].tolist()
    for UserID, MovieID in tqdm(zip(user_ids, movie_ids), total=data.shape[0]):
        dictionary.setdefault(UserID, []).append(MovieID)
    return dictionary

In [None]:
# Start from an empty dict
dict_total_UserIDMovieID = {}

# Map every UserID to all MovieIDs it has in the full ratings table (train + validation + test)
dict_total_UserIDMovieID = obtain_dict_UserIDMovieID(ratings.iloc[:, 0:2], dict_total_UserIDMovieID)

In [None]:
# Inspect the result: number of MovieIDs recorded for users 0 and 1
len(dict_total_UserIDMovieID[0]), len(dict_total_UserIDMovieID[1])

### 创建正样本的字典

In [None]:
# Empty containers that obtain_dict_UserIDMovieID fills in place
dict_training = {}
dict_validation = {}
dict_testing = {}

# The function returns the dict it was given, so each *_ps name refers to the same
# object as the container created above.
dict_training_ps = obtain_dict_UserIDMovieID(df_training_ps, dict_training)
dict_validation_ps = obtain_dict_UserIDMovieID(df_validation_ps, dict_validation)
dict_testing_ps = obtain_dict_UserIDMovieID(df_testing_ps, dict_testing)

### 开始创建负样本

### 训练集每个UserID需要4*正样本数个（来自neural_collaborative_filtering）      
### 验证集每个UserID需要4个
### 测试集每个UserID需要100个

In [None]:
# Count how many training positives each UserID has (groupby sorts the groups by UserID)
user_counts = (
    df_training_ps
    .groupby('UserID')
    .size()
    .reset_index(name='counts')
)

# Plain Python list of the per-user counts
count_list = user_counts["counts"].tolist()

count_list[0], count_list[1], count_list[2]

In [None]:
# Training: per-user list of positive counts (passed to ng_sample_generation_bylist below)
num_training_ng = count_list
# Validation: negatives generated per user
num_validation_ng = 4
# Testing: negatives generated per user
num_testing_ng = 100

### **注意：**我们需要保证每次随机生成的MovieID既不在原始数据集中，又不在前面随机生成过的MovieID中

In [None]:
# Scratch check of the built-in round(), which MovieID_generation uses to turn
# count * magnification into an integer number of samples.
round(1.2)

In [None]:
# Generate the required number of negative MovieIDs for one UserID
def MovieID_generation(UserID, magnification, dict_ng_sample, dict_UserIDMovieID_generated): # magnification: negatives per training positive
    """Draw random negative MovieIDs for ``UserID``.

    ``round(count_list[UserID] * magnification)`` MovieIDs are drawn uniformly
    from ``0..MovieID_num``. A draw is rejected when the user has interacted with
    that MovieID (``dict_total_UserIDMovieID``) or when it was already generated
    for the user (``dict_UserIDMovieID_generated``).

    Args:
        UserID: user to generate negatives for.
        magnification: multiplier applied to the user's training-positive count.
        dict_ng_sample: dict whose ``[UserID]`` list receives the new negatives.
        dict_UserIDMovieID_generated: dict whose ``[UserID]`` list records every
            negative generated so far; it is extended as well.

    Raises:
        ValueError: if fewer unused MovieIDs remain than are requested (the
            rejection loop could never terminate in that case).
    """
    num_to_generate = round(count_list[UserID] * magnification)

    # A set gives O(1) membership tests; the original scanned two lists on every draw.
    excluded = set(dict_total_UserIDMovieID[UserID])
    excluded.update(dict_UserIDMovieID_generated[UserID])

    available = (MovieID_num + 1) - sum(1 for movie in excluded if 0 <= movie <= MovieID_num)
    if num_to_generate > available:
        raise ValueError(
            f"UserID {UserID}: {num_to_generate} negatives requested but only {available} unused MovieIDs remain"
        )

    for _ in range(num_to_generate):
        random_MovieID = random.randint(0, MovieID_num)
        while random_MovieID in excluded:
            random_MovieID = random.randint(0, MovieID_num)
        excluded.add(random_MovieID)
        dict_ng_sample[UserID].append(random_MovieID)
        dict_UserIDMovieID_generated[UserID].append(random_MovieID)

In [None]:
# Dict recording, per UserID, every MovieID generated by the sampling functions
dict_UserIDMovieID_generated = {}
# Each UserID first needs its own empty list
for UserID in range(UserID_num+1):
    dict_UserIDMovieID_generated[UserID] = []

def ng_sample_generation_bylist(count_list):
    '''
    Generate the negatives for every UserID from a per-user count list.

    count_list[UserID] is that user's count; it selects the sampling ratio that is
    handed to MovieID_generation:
      * count <= 0.19 * (MovieID_num + 1)          -> ratio 4
      * count <= 0.49 * (MovieID_num + 1)          -> ratio 1
      * count <  0.66 * (MovieID_num + 1)          -> ratio 0.5
      * otherwise the UserID is printed and gets no negatives.

    Returns a dict mapping UserID -> list of generated MovieIDs.
    '''
    num_items = MovieID_num + 1
    dict_ng_sample = {}

    for UserID in tqdm(range(UserID_num+1)):
        dict_ng_sample[UserID] = []
        num_positives = count_list[UserID]

        # Each branch is only reached when the previous bound was exceeded
        if num_positives <= 0.19 * num_items:
            magnification = 4
        elif num_positives <= 0.49 * num_items:
            magnification = 1
        elif num_positives < 0.66 * num_items:
            magnification = 0.5
        else:
            print(UserID)
            continue

        MovieID_generation(UserID, magnification, dict_ng_sample, dict_UserIDMovieID_generated)

    return dict_ng_sample

In [None]:
def ng_sample_generation_bynum(count):
    '''
    Generate exactly `count` negative MovieIDs for every UserID.

    A MovieID is accepted only if the user never interacted with it
    (dict_total_UserIDMovieID) and it was not generated for this user before
    (dict_UserIDMovieID_generated, which is extended in place).

    Returns a dict mapping UserID -> list of the `count` generated MovieIDs.
    Raises ValueError when a user has fewer than `count` unused MovieIDs left,
    because the rejection loop could never finish in that case.
    '''

    dict_ng_sample = {} # same structure as the UserID -> MovieID dicts

    for UserID in tqdm(range(UserID_num+1)):
        generated = dict_UserIDMovieID_generated[UserID]

        # Set for O(1) membership tests instead of scanning two lists per draw
        excluded = set(dict_total_UserIDMovieID[UserID])
        excluded.update(generated)

        available = (MovieID_num + 1) - sum(1 for movie in excluded if 0 <= movie <= MovieID_num)
        if count > available:
            raise ValueError(
                f"UserID {UserID}: {count} negatives requested but only {available} unused MovieIDs remain"
            )

        negatives = []
        dict_ng_sample[UserID] = negatives
        for _ in range(count): # draw `count` MovieIDs for this user
            random_MovieID = random.randint(0, MovieID_num)
            while random_MovieID in excluded:
                random_MovieID = random.randint(0, MovieID_num)
            excluded.add(random_MovieID)
            negatives.append(random_MovieID)
            generated.append(random_MovieID)

    return dict_ng_sample

### 生成用于训练，验证和测试的负样本字典

In [None]:
# Negatives for the three splits. All three calls share dict_UserIDMovieID_generated,
# so a MovieID drawn for one split is not drawn again for the same user in a later split.
dict_training_ng = ng_sample_generation_bylist(num_training_ng)
dict_validation_ng = ng_sample_generation_bynum(num_validation_ng)
dict_testing_ng = ng_sample_generation_bynum(num_testing_ng)

In [None]:
# Inspect a few of the generated negative lists
dict_training_ng[1], dict_validation_ng[0], dict_testing_ng[0]

In [None]:
# Spot check: positive count and the three negative-list lengths.
# The break ends the loop after the first user, so only user 0 is printed.
for i in tqdm(range(UserID_num+1)):
    print(count_list[i], len(dict_training_ng[i]), len(dict_validation_ng[i]), len(dict_testing_ng[i]))
    break

### 开始合并正样本和负样本
现在有：      
dict_training_ps, dict_validation_ps, dict_testing_ps            
dict_training_ng, dict_validation_ng, dict_testing_ng         

In [None]:
def dict_conbine_ps_ng(dict_ps, dict_ng):
    """Merge positive and negative samples per user and build the label list.

    Args:
        dict_ps: dict mapping UserID -> list of positive MovieIDs.
        dict_ng: dict mapping UserID -> list of negative MovieIDs.
            Both must contain every UserID in ``range(UserID_num + 1)``.

    Returns:
        (dict_total, labels_total) where ``dict_total[UserID]`` is the user's
        positives followed by the negatives, and ``labels_total`` is one flat list
        holding, user by user, a 1 for each positive followed by a 0 for each
        negative.
    """
    dict_total = {}
    labels_total = []
    for UserID in tqdm(range(UserID_num+1)):
        positives = dict_ps[UserID]
        negatives = dict_ng[UserID]

        dict_total[UserID] = positives + negatives

        # extend() appends in place; rebuilding the list with `+` for every user
        # copied all labels accumulated so far on each iteration.
        labels_total.extend([1] * len(positives))
        labels_total.extend([0] * len(negatives))
    return dict_total, labels_total

In [None]:
# Merge positives and negatives for each split; labels_* is the matching flat 1/0 label list
dict_training_all, labels_training_all = dict_conbine_ps_ng(dict_training_ps, dict_training_ng)
dict_validation_all, labels_validation_all = dict_conbine_ps_ng(dict_validation_ps, dict_validation_ng)
dict_testing_all, labels_testing_all = dict_conbine_ps_ng(dict_testing_ps, dict_testing_ng)

In [None]:
# len(dict_training_all[0]), labels_training_all[0:256]

### 将字典转化为二维list

In [None]:
# Flatten each dict into a 2-D list of [UserID, MovieID] pairs, in dict insertion order
list_training_all = [[key, value] for key, values in dict_training_all.items() for value in values]
list_validation_all = [[key, value] for key, values in dict_validation_all.items() for value in values]
list_testing_all = [[key, value] for key, values in dict_testing_all.items() for value in values]

In [None]:
# Total number of training pairs and the first five of them
len(list_training_all), list_training_all[0:5]

### 现在有训练集全部数据list_training_all, 训练集全部标签labels_training_all

### 使用d2l的load_array()，但没有shuffle

In [None]:
# def load_array(data_arrays, batch_size, is_train=True):  
#     dataset = data.TensorDataset(*data_arrays)
#     return data.DataLoader(dataset, batch_size, shuffle=is_train)

In [None]:
# train_features = torch.tensor(list_training_all, dtype=torch.float32)
# train_labels = torch.tensor(labels_training_all, dtype=torch.float32)

In [None]:
# batch_size = 16
# train_loader = load_array((train_features, train_labels), batch_size)

In [None]:
# for a,b in train_loader:
#     print(a, b)
#     break

### 使用DataLoader，有shuffle

### 将二维list和一维的list转化成可以使用Pytorch训练的格式

In [None]:
class CustomDataset(Dataset):
    """Dataset over parallel sequences of features and labels.

    Indexing returns ``(features[idx], labels[idx])`` converted to tensors:
    the feature entry as ``torch.float`` and the label as ``torch.long``.
    """

    def __init__(self, features, labels):
        # Both sequences are stored as given; conversion happens per item
        self.features, self.labels = features, labels

    def __len__(self):
        # The dataset length is the number of feature entries
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        feature_tensor = torch.tensor(self.features[idx], dtype=torch.float)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature_tensor, label_tensor

In [None]:
# list_*_all are the 2-D [UserID, MovieID] lists built above;
# labels_*_all are the matching flat label lists.
train_dataset = CustomDataset(list_training_all, labels_training_all)
validation_dataset = CustomDataset(list_validation_all, labels_validation_all)
test_dataset = CustomDataset(list_testing_all, labels_testing_all)

### 数据处理完成

In [None]:
# Batch sizes to try — presumably candidates for the training DataLoader; confirm against the sweep cell
batch_size_list = [128, 256, 512, 1024]

In [None]:
# `batch_size` is not defined anywhere before this cell; start from the first candidate.
batch_size = batch_size_list[0]
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=True, num_workers=1)
# evaluate() treats index 0 of every 101-sample batch as the user's positive item,
# so the test set must be read in its original order (1 positive + 100 negatives per user).
test_loader = DataLoader(test_dataset, batch_size=101, shuffle=False, num_workers=1)

# 我们选择在初始化模型时定义优化器和损失函数，而不是在训练过程中指定

# GMF

In [None]:
class GMF(nn.Module):
    """Generalized matrix factorisation scorer.

    user_num: number of users;
    item_num: number of items;
    embedding_dim: number of embedding dimensions.

    forward(user, item) multiplies the user and item embeddings element-wise and
    maps the product through a linear layer to one raw score (logit) per pair.
    """

    def __init__(self, user_num, item_num, embedding_dim):
        super().__init__()

        self.embed_user = nn.Embedding(user_num, embedding_dim)
        self.embed_item = nn.Embedding(item_num, embedding_dim)

        self.predict_layer = nn.Linear(embedding_dim, 1)

        # Small-variance normal initialisation for both embedding tables
        for table in (self.embed_user, self.embed_item):
            nn.init.normal_(table.weight, std=0.01)

        nn.init.kaiming_uniform_(self.predict_layer.weight, a=1, nonlinearity='sigmoid')

        # Kaiming/Xavier initialisation does not cover the bias term; zero it explicitly
        bias = self.predict_layer.bias
        if bias is not None:
            bias.data.zero_()

    def forward(self, user, item):
        interaction = self.embed_user(user) * self.embed_item(item)
        return self.predict_layer(interaction).view(-1)

In [None]:
embedding_dim = 8
# Table sizes are max ID + 1 because the IDs are zero-based
GMF_model = GMF(UserID_num+1, MovieID_num+1, embedding_dim)

# Define the optimizer (training_model reads the global `optimizer`)
lr = 0.0001
optimizer = optim.Adam(GMF_model.parameters(), lr=lr)

# Define the loss function (training_model reads the global `loss_function`)
loss_function = nn.BCEWithLogitsLoss()

In [None]:
# training_model also takes batch_size, lr and factor (used for the CSV log);
# pass the loader's batch size and the hyper-parameters set in the previous cell.
training_model(GMF_model, train_loader, test_loader, 10, 10, train_loader.batch_size, lr, embedding_dim)

# MLP

In [None]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    '''
    Multi-layer perceptron scorer over concatenated user / item embeddings.

    user_num: number of users
    item_num: number of items
    layers: List containing the output size of each layer, e.g., [64, 32, 16, 8]
    reg_layers: List containing the L2 regularization strength for each layer.
        Only its length is checked here; the strengths are meant to be applied
        as weight decay when the optimizer is set up.
    dropout: List containing the dropout rate for each layer (0 disables dropout)

    forward(user, item) returns one raw score (logit) per (user, item) pair.
    '''
    def __init__(self, user_num, item_num, layers, reg_layers, dropout):
        super(MLP, self).__init__()

        # Ensure input lists have correct dimensions.
        # `and` is required here: `&` binds tighter than `==`, so the previous
        # expression compared against a bitwise AND of the lengths.
        assert len(layers) == len(reg_layers) and len(layers) == len(dropout), "Lengths of layers, reg_layers and dropout must be equal"

        # Embedding dimensions should be half of the first layer size
        embedding_dim = layers[0] // 2

        self.embed_user = nn.Embedding(user_num, embedding_dim)
        self.embed_item = nn.Embedding(item_num, embedding_dim)

        # Initializing the MLP layers
        self.layers = nn.ModuleList()
        input_dim = embedding_dim * 2  # User and item embeddings are concatenated

        # Pair each layer size with its own dropout rate (not with reg_layers)
        for layer_size, dropout_rate in zip(layers, dropout):
            # L2 regularization is applied later through the optimizer's weight decay
            self.layers.append(nn.Linear(input_dim, layer_size))
            self.layers.append(nn.ReLU())
            if dropout_rate > 0:
                self.layers.append(nn.Dropout(dropout_rate))
            input_dim = layer_size

        # Final output layer so that the network produces a single value per pair
        self.layers.append(nn.Linear(layers[-1], 1))

    def forward(self, user, item):
        embed_user = self.embed_user(user)  # shape: (batch_size, embedding_dim)
        embed_item = self.embed_item(item)  # shape: (batch_size, embedding_dim)

        # Concatenate user and item embeddings to form input to MLP layers
        vector = torch.cat([embed_user, embed_item], dim=1)  # shape: (batch_size, 2*embedding_dim)

        # Run through the MLP layers in order
        for layer in self.layers:
            vector = layer(vector)

        return vector.reshape(-1)

In [None]:
layers_test = []
n = 1 # number of hidden layers
factors = 8 # size of the last hidden layer
for i in range(n):
    layers_test = [factors] + layers_test
    factors = 2*factors
layers = layers_test

reg_layers = [0]
droupout = [0]

MLP_model = MLP(UserID_num+1, MovieID_num+1, layers, reg_layers, droupout)

# Collect the parameters per layer so that each layer can get its own weight decay
param_groups = [
    {'params': MLP_model.embed_user.parameters(), 'weight_decay': reg_layers[0]},
    {'params': MLP_model.embed_item.parameters(), 'weight_decay': reg_layers[0]}
]

# Add every Linear layer of the MLP. The k-th hidden Linear layer uses reg_layers[k].
# The final output layer has no entry in reg_layers and gets weight decay 0; it must still
# be handed to the optimizer, otherwise its weights are never updated.
linear_layers = [layer for layer in MLP_model.layers if isinstance(layer, nn.Linear)]
for layer_idx, layer in enumerate(linear_layers):
    weight_decay = reg_layers[layer_idx] if layer_idx < len(reg_layers) else 0
    param_groups.append({'params': layer.parameters(), 'weight_decay': weight_decay})

# Create the optimizer from the parameter groups with their individual weight decay
lr = 0.0001
optimizer = torch.optim.Adam(param_groups, lr=lr)

# Define the loss function
loss_function = nn.BCEWithLogitsLoss()

In [None]:
# Display the hidden-layer sizes built in the previous cell
layers

In [None]:
# training_model also takes batch_size, lr and factor (used for the CSV log);
# the factor of an MLP is the size of its last hidden layer.
training_model(MLP_model, train_loader, test_loader, 10, 10, train_loader.batch_size, lr, layers[-1])

# NeuMF

In [None]:
embedding_dim = 8
GMF_model = GMF(UserID_num+1, MovieID_num+1, embedding_dim)
layers = [128, 64, 32, 8]
reg_layers = [0, 0, 0, 0]
droupout = [0, 0, 0, 0]
MLP_model = MLP(UserID_num+1, MovieID_num+1, layers, reg_layers, droupout)

# Define the loss function
loss_function = nn.BCEWithLogitsLoss()

# Optimizer for GMF_model (training_model reads the global `optimizer`)
lr = 0.001
optimizer = optim.Adam(GMF_model.parameters(), lr=lr)
print("开始训练GMF_model\n")
# training_model also takes batch_size, lr and factor, which it writes to the CSV log
training_model(GMF_model, train_loader, test_loader, 10, 10, train_loader.batch_size, lr, embedding_dim)

# Optimizer for MLP_model
lr = 0.001
optimizer = optim.Adam(MLP_model.parameters(), lr=lr)
print("开始训练MLP_model\n")
# The factor of the MLP is the size of its last hidden layer
training_model(MLP_model, train_loader, test_loader, 10, 10, train_loader.batch_size, lr, layers[-1])

In [None]:
class NeuMF(nn.Module):
    '''
    Neural matrix factorisation: a GMF branch and an MLP branch whose outputs are
    scaled, concatenated and mapped to one raw score (logit) per (user, item) pair.

    user_num: number of users
    item_num: number of items
    gmf_embedding_dim: embedding size for GMF part
    mlp_layers: List containing the output size of each layer in MLP, e.g., [64, 32, 16, 8]
    reg_layers: List containing the L2 regularization strength for each layer in MLP
        (accepted for interface symmetry; not used inside this class)
    dropout: List containing dropout rate for each layer in MLP
    alpha: hyper-parameter determining the trade-off between GMF and MLP
    '''
    def __init__(self, user_num, item_num, gmf_embedding_dim, mlp_layers, reg_layers, dropout, alpha=0.5):
        super(NeuMF, self).__init__()

        self.alpha = alpha

        # GMF part
        self.embed_user_GMF = nn.Embedding(user_num, gmf_embedding_dim)
        self.embed_item_GMF = nn.Embedding(item_num, gmf_embedding_dim)

        nn.init.normal_(self.embed_user_GMF.weight, std=0.01)
        nn.init.normal_(self.embed_item_GMF.weight, std=0.01)

        # MLP part
        embedding_dim = mlp_layers[0] // 2
        self.embed_user_MLP = nn.Embedding(user_num, embedding_dim)
        self.embed_item_MLP = nn.Embedding(item_num, embedding_dim)

        nn.init.normal_(self.embed_user_MLP.weight, std=0.01)
        nn.init.normal_(self.embed_item_MLP.weight, std=0.01)

        # MLP layers
        self.mlp_layers = nn.ModuleList()
        input_dim = embedding_dim * 2

        for layer_size, dropout_rate in zip(mlp_layers, dropout):
            self.mlp_layers.append(nn.Linear(input_dim, layer_size))
            self.mlp_layers.append(nn.ReLU())
            if dropout_rate > 0:
                self.mlp_layers.append(nn.Dropout(dropout_rate))
            input_dim = layer_size

        # Final prediction layer
        predict_size = gmf_embedding_dim + mlp_layers[-1]
        self.predict_layer = nn.Linear(predict_size, 1)
        nn.init.kaiming_uniform_(self.predict_layer.weight, a=1, nonlinearity='sigmoid')

        if self.predict_layer.bias is not None:
            self.predict_layer.bias.data.zero_()

    def forward(self, user, item):
        # GMF part
        embed_user_GMF = self.embed_user_GMF(user)
        embed_item_GMF = self.embed_item_GMF(item)
        gmf_output = embed_user_GMF * embed_item_GMF # size: batch_size * gmf_embedding_dim

        # MLP part
        embed_user_MLP = self.embed_user_MLP(user)
        embed_item_MLP = self.embed_item_MLP(item)
        mlp_vector = torch.cat([embed_user_MLP, embed_item_MLP], dim=1)

        for layer in self.mlp_layers:
            mlp_vector = layer(mlp_vector) # ends at size: batch_size * last hidden layer size

        # Concatenate GMF and MLP parts
        # size: batch_size * (gmf_embedding_dim + last hidden layer size)
        final_vector = torch.cat([self.alpha * gmf_output, (1 - self.alpha) * mlp_vector], dim=1)

        # Final prediction
        prediction = self.predict_layer(final_vector)
        return prediction.view(-1)

    def load_pretrained_weights(self, gmf_model, mlp_model):
        """Copy pretrained embeddings and MLP hidden layers into this model.

        gmf_model must expose ``embed_user`` / ``embed_item``; mlp_model must
        expose ``embed_user`` / ``embed_item`` and ``layers`` (a module sequence
        laid out like ``self.mlp_layers``; extra trailing layers are ignored).
        The prediction layer is left as initialised.

        Raises ValueError when a Linear layer of this model lines up with a
        non-Linear layer of mlp_model.
        """
        with torch.no_grad():
            self.embed_user_GMF.weight.copy_(gmf_model.embed_user.weight)
            self.embed_item_GMF.weight.copy_(gmf_model.embed_item.weight)

            self.embed_user_MLP.weight.copy_(mlp_model.embed_user.weight)
            self.embed_item_MLP.weight.copy_(mlp_model.embed_item.weight)

            # Iterate the modules themselves. named_children() yields (name, module)
            # pairs, so testing the first element with isinstance() never matched and
            # no layer weights were copied.
            for position, (own_layer, pretrained_layer) in enumerate(zip(self.mlp_layers, mlp_model.layers)):
                if not isinstance(own_layer, nn.Linear):
                    continue
                if not isinstance(pretrained_layer, nn.Linear):
                    raise ValueError(
                        f"MLP layer {position} is Linear here but {type(pretrained_layer).__name__} in the pretrained model"
                    )
                own_layer.weight.copy_(pretrained_layer.weight)
                if own_layer.bias is not None:
                    own_layer.bias.copy_(pretrained_layer.bias)

In [None]:
# Reuse the hyper-parameters of the pretrained GMF / MLP models under the names NeuMF expects
gmf_embedding_dim = embedding_dim
mlp_layers = layers
reg_layers = reg_layers
dropout = droupout
# Create the NeuMF model and load the pretrained parameters
NeuMF_model = NeuMF(UserID_num+1, MovieID_num+1, gmf_embedding_dim, mlp_layers, reg_layers, dropout, alpha=0.5)
NeuMF_model.load_pretrained_weights(GMF_model, MLP_model)

# Train NeuMF with the SGD optimizer
lr = 0.001
optimizer = optim.SGD(NeuMF_model.parameters(), lr=lr)
loss_function = nn.BCEWithLogitsLoss()

In [None]:
# training_model also takes batch_size, lr and factor (used for the CSV log);
# pass the loader's batch size and the hyper-parameters set in the previous cell.
training_model(NeuMF_model, train_loader, test_loader, 10, 10, train_loader.batch_size, lr, gmf_embedding_dim)

# evaluate

In [None]:
def evaluate(model, test_loader, top_k):
    """Compute HR@top_k and NDCG@top_k over ``test_loader`` and print the accuracy.

    Every batch is treated as one ranking list whose positive item sits at index 0
    (the notebook uses unshuffled batches of 1 positive + 100 negatives), so the
    loader must not be shuffled.

    Args:
        model: callable ``model(user, item)`` returning one score per pair.
        test_loader: iterable with ``len()`` that yields ``(user_item, label)``
            batches; column 0 of ``user_item`` is the user, column 1 the item.
        top_k: length of the ranking cut-off. It is clamped to the batch size so
            a short batch cannot make ``torch.topk`` fail.

    Returns:
        ``(HR, NDCG)`` averaged over the number of batches.

    Tensors are moved to the module-level ``device``.
    """
    hits = 0
    NDCG = 0
    correct = 0 # number of correct binary predictions on the test set
    total = 0 # total number of samples

    # Evaluation needs no gradients; skipping autograd saves memory and time
    with torch.no_grad():
        for user_item, label in test_loader:
            user = user_item[:, 0].long().to(device)
            item = user_item[:, 1].long().to(device)
            label = label.float().to(device)  # keep the labels on the same device

            predictions = model(user, item)
            k = min(top_k, predictions.size(0))
            _, indices = torch.topk(predictions, k) # indices of the k largest scores
            indices = indices.tolist()

            # The positive is at index 0 of the batch: a hit if it is ranked in the top k
            if 0 in indices:
                hits += 1
                # rank is the 0-based position, hence +2 inside the log
                NDCG += 1/np.log2(indices.index(0)+2)

            # Binary accuracy: sigmoid to probabilities, then threshold at 0.5
            probabilities = torch.sigmoid(predictions)
            predicted_labels = (probabilities > 0.5).float()
            correct += (predicted_labels == label).sum().item()
            total += label.size(0)

    accuracy = 100 * correct / total
    print(f"测试集上的准确率为：{accuracy:.2f}%")

    # Average over the number of ranking lists (batches)
    HR = hits/len(test_loader)
    NDCG = NDCG/len(test_loader)

    return HR,NDCG

# 训练过程

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [111]:
def training_model(model, train_loader, test_loader, epochs, top_k, batch_size=None, lr=None, factor=None):
    """Train ``model`` for ``epochs`` epochs and evaluate it after each one.

    The optimizer, the loss and the device are taken from the module-level names
    ``optimizer``, ``loss_function`` and ``device``; they must be set up for
    ``model`` before calling.

    Args:
        model: network called as ``model(user, item)`` and returning logits.
        train_loader: yields ``(user_item, label)`` batches; column 0 of
            ``user_item`` is the user, column 1 the item.
        test_loader: loader handed to ``evaluate``.
        epochs: number of training epochs.
        top_k: cut-off used for HR / NDCG.
        batch_size, lr, factor: hyper-parameters that are only written to the
            result CSV. They are optional so the function can also be called
            with the first five arguments; the CSV row is written only when all
            three are given.

    Returns:
        None. Progress and the best HR / NDCG are printed.
    """
    # Move the model to the selected device
    model.to(device)
    # Best HR and NDCG over all epochs
    best_HR = 0
    best_NDCG = 0

    ########################### TRAINING #####################################
    for epoch in range(epochs):
        loss_everyepoch = 0.0 # summed batch losses of this epoch
        correct = 0
        total = 0
        # Training mode
        model.train()
        for user_item, label in train_loader:
            user = user_item[:, 0].long().to(device)
            item = user_item[:, 1].long().to(device)
            label = label.float().to(device)

            model.zero_grad()
            prediction = model(user, item)

            # Binary accuracy: sigmoid to probabilities, then threshold at 0.5
            predicted_labels = (torch.sigmoid(prediction) > 0.5).float()
            correct += (predicted_labels == label).sum().item()
            total += label.size(0)

            loss = loss_function(prediction, label)
            # .item() stores a plain float; adding the tensor itself would keep
            # every batch attached to the autograd history
            loss_everyepoch += loss.item()
            loss.backward()
            optimizer.step()

        accuracy = 100 * correct / total
        print(f"Epoch {epoch + 1}/{epochs}, Accuracy: {accuracy:.2f}%, Loss in this epoch: {loss_everyepoch}")

        # Evaluation mode
        model.eval()
        HR, NDCG = evaluate(model, test_loader, top_k)

        if HR > best_HR:
            best_HR = HR
        if NDCG > best_NDCG:
            best_NDCG = NDCG

        print("HR: {:.3f}\tNDCG: {:.3f}".format(HR, NDCG))

    print("本次训练最高的HR为：", best_HR, "最高的NDCG为：", best_NDCG)

    # Without the hyper-parameters there is nothing meaningful to log
    if batch_size is None or lr is None or factor is None:
        return

    # Pick the result file from the model's class name
    model_name = type(model).__name__
    if model_name == "GMF":
        append_to_csv(GMF_csv_filename, batch_size, lr, factor, best_HR, best_NDCG)
    elif model_name == "MLP":
        # Runs longer than 5 epochs go to the separate MLP_filename log
        if epochs > 5:
            append_to_csv(MLP_filename, batch_size, lr, factor, best_HR, best_NDCG)
        else:
            append_to_csv(MLP_csv_filename, batch_size, lr, factor, best_HR, best_NDCG)
    elif model_name == "NeuMF":
        append_to_csv(NeuMF_csv_filename, batch_size, lr, factor, best_HR, best_NDCG)

In [110]:
MLP_filename = "outcome/MLP_n.csv"

# open() does not create missing directories, so make sure the output folder exists
os.makedirs(os.path.dirname(MLP_filename), exist_ok=True)

# Mode 'w' starts a fresh file on every run of this cell
with open(MLP_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['batch_size', 'lr', 'factor', "HR", "NDCG"])

### GMF上，0.001的学习率，训练集上的acc快速上升并达到稳定
### MLP上，0.001的学习率也是快速收敛

# (1)第一问

### 将训练结果保存

In [107]:
import csv

# File names of the result CSVs, one per model
GMF_csv_filename = 'outcome/GMF.csv'
MLP_csv_filename = 'outcome/MLP.csv'
NeuMF_csv_filename = 'outcome/NeuMF.csv'

csv_filename_list = [GMF_csv_filename, MLP_csv_filename, NeuMF_csv_filename]

# Create (or reset) each CSV file and write its header row
for csv_filename in csv_filename_list:
    # open() does not create missing directories, so make sure the output folder exists
    os.makedirs(os.path.dirname(csv_filename), exist_ok=True)
    with open(csv_filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        # Write the header row
        writer.writerow(['batch_size', 'lr', 'factor', "HR", "NDCG"])

# Helper: append one result row to a CSV file
def append_to_csv(csv_filename, batch_size, lr, factor, HR, NDCG):
    """Append the row ``[batch_size, lr, factor, HR, NDCG]`` to ``csv_filename``.

    The file is opened in append mode, so existing rows are kept.
    """
    record = [batch_size, lr, factor, HR, NDCG]
    with open(csv_filename, mode='a', newline='') as handle:
        csv.writer(handle).writerow(record)

# 示例：逐行添加数据
# append_to_csv(GMF_csv_filename, 1, 11, 1, 101, 5)

# print(f"数据已成功添加到 {csv_filename} 文件中。")

### 同时训练三个模型

In [108]:
# Hyper-parameter grid for the sweep below: every combination of these values is trained.
factors = [8, 16, 32, 64]                # used as the embedding size ("factor")
lr_list = [1e-4, 5e-4, 1e-3, 5e-3]       # learning rates
batch_size_list = [128, 256, 512, 1024]  # training mini-batch sizes

In [None]:
# Hyper-parameter sweep: for every (batch_size, lr, factor) combination, build and train
# a GMF model and an MLP model, then build a NeuMF model, call load_pretrained_weights
# with those two models, and train it as well.
for batch_size in batch_size_list:
    # Only the training loader uses the swept batch size.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    # NOTE(review): validation_loader is created here but not referenced again in this cell.
    validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=True, num_workers=1)
    # NOTE(review): batch_size=101 with shuffle=False — presumably one batch holds the
    # candidate items of a single user; confirm against test_dataset / evaluate().
    test_loader = DataLoader(test_dataset, batch_size=101, shuffle=False, num_workers=1)
    
    for lr in lr_list:
        for factor in factors:
    
            embedding_dim = factor
            lr = lr  # no-op self-assignment
            # NOTE(review): loss_function (and optimizer below) are assigned but never
            # passed to training_model; presumably it reads them as notebook globals — confirm.
            loss_function = nn.BCEWithLogitsLoss() # define the loss function
        
            GMF_model = GMF(UserID_num+1, MovieID_num+1, embedding_dim)
        
            # Evaluates to [factor*8, factor*4, factor*2, factor].
            layers =  [factor*(2**i) for i in reversed(range(4))] # author's note: the number in range() sets the number of hidden layers; 4 means 3 layers
            reg_layers = [0] * 4
            droupout = [0] * 4
            MLP_model = MLP(UserID_num+1, MovieID_num+1, layers, reg_layers, droupout)
        
            print("本次训练batch_size为：", batch_size, "lr为：", lr, "factor为：", factor, "开始训练GMF_model")
            # The optimizer must be rebound before each training_model call (see note above).
            optimizer = optim.Adam(GMF_model.parameters(), lr=lr) # define the optimizer
            # Positional arguments 5 and 10: 5 is the number of epochs (the log prints
            # "Epoch k/5"); 10 is presumably top_k for evaluation — TODO confirm.
            training_model(GMF_model, train_loader, test_loader, 5, 10, batch_size, lr, factor) 
            
            print("本次训练batch_size为：", batch_size, "lr为：", lr, "factor为：", factor, "开始训练MLP_model")
            optimizer = optim.Adam(MLP_model.parameters(), lr=lr) # define the optimizer
            training_model(MLP_model, train_loader, test_loader, 5, 10, batch_size, lr, factor) 
        
            ############################## NeuMF_model ####################################
            gmf_embedding_dim = factor
            mlp_layers = layers
            reg_layers = reg_layers  # no-op self-assignment
            dropout = droupout
            # Create the NeuMF model and load the pre-trained parameters
            NeuMF_model = NeuMF(UserID_num+1, MovieID_num+1, gmf_embedding_dim, mlp_layers, reg_layers, dropout, alpha=0.5)
            NeuMF_model.load_pretrained_weights(GMF_model, MLP_model)
            # Train NeuMF with the SGD optimizer (same learning rate as the Adam runs above)
            lr = lr  # no-op self-assignment
            optimizer = optim.SGD(NeuMF_model.parameters(), lr=lr)
            loss_function = nn.BCEWithLogitsLoss()
        
            print("本次训练batch_size为：", batch_size, "lr为：", lr, "factor为：", factor, "开始训练NeuMF_model")
            training_model(NeuMF_model, train_loader, test_loader, 5, 10, batch_size, lr, factor) 

本次训练batch_size为： 128 lr为： 0.0001 factor为： 8 开始训练GMF_model
Epoch 1/5, Accuracy: 79.94%, Loss in this epoch: 16611.248046875
测试集上的准确率为：95.19%
HR: 0.457	NDCG: 0.257
Epoch 2/5, Accuracy: 82.77%, Loss in this epoch: 12996.66015625
测试集上的准确率为：93.34%
HR: 0.462	NDCG: 0.259
Epoch 3/5, Accuracy: 82.73%, Loss in this epoch: 12860.7421875
测试集上的准确率为：92.56%
HR: 0.464	NDCG: 0.260
Epoch 4/5, Accuracy: 82.65%, Loss in this epoch: 12776.8935546875
测试集上的准确率为：92.23%
HR: 0.463	NDCG: 0.260
Epoch 5/5, Accuracy: 82.74%, Loss in this epoch: 12646.30859375
测试集上的准确率为：91.95%
HR: 0.465	NDCG: 0.262
本次训练最高的HR为： 0.4649006622516556 最高的NDCG为： 0.2617680124619066
本次训练batch_size为： 128 lr为： 0.0001 factor为： 8 开始训练MLP_model
Epoch 1/5, Accuracy: 80.97%, Loss in this epoch: 15200.6533203125
测试集上的准确率为：92.12%
HR: 0.454	NDCG: 0.255
Epoch 2/5, Accuracy: 83.07%, Loss in this epoch: 12949.5908203125
测试集上的准确率为：91.46%
HR: 0.462	NDCG: 0.258
Epoch 3/5, Accuracy: 83.45%, Loss in this epoch: 12657.41796875
测试集上的准确率为：92.79%
HR: 0.463	NDCG: 

# (2)第二问

根据MLP的代码，如果layers的长度为1，则嵌入层之后的隐藏层是一个a*a的方阵；但文章中说MLP-0是嵌入层之后直接输出预测。所以我们直接取n=0（此时layers只含一个元素）来训练。

In [None]:
# Second experiment: train MLP models for each factor with a fixed batch size and
# learning rate, for 20 epochs each.
# NOTE(review): the recorded output of this cell shows repeated
# "AssertionError: can only test a child process" messages raised while DataLoader
# workers shut down; they are reported as "Exception ignored" and training continues.
# Consider num_workers=0 or persistent_workers=True — TODO confirm this removes them.
batch_size = 1024
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
# NOTE(review): validation_loader is created here but not referenced again in this cell.
validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=True, num_workers=1)
test_loader = DataLoader(test_dataset, batch_size=101, shuffle=False, num_workers=1)
    
# range(0, 1) yields only n = 0, so `layers` below has a single entry, [factor].
for n in range(0, 1): # author's note: n is the number of hidden layers; 4 means 3 hidden layers
    for factor in factors:

        lr = 0.005 # to be chosen based on the results above
        
        # NOTE(review): loss_function and optimizer are assigned but never passed to
        # training_model; presumably it reads them as notebook globals — confirm.
        loss_function = nn.BCEWithLogitsLoss() # define the loss function
        
        # n + 1 layer sizes, halving down to `factor`.
        layers =  [factor*(2**i) for i in reversed(range(n+1))] # author's note: the number in range() sets the number of hidden layers; 4 means 3 layers
        reg_layers = [0] * (n+1)
        droupout = [0] * (n+1)
        
        MLP_model = MLP(UserID_num+1, MovieID_num+1, layers, reg_layers, droupout)
        
        print("本次训练batch_size为：", batch_size, "lr为：", lr, "factor为：", factor, "开始训练MLP_model")
        optimizer = optim.Adam(MLP_model.parameters(), lr=lr) # define the optimizer
        # With more than 5 epochs, training_model appends MLP results to MLP_filename
        # instead of MLP_csv_filename.
        training_model(MLP_model, train_loader, test_loader, 20, 10, batch_size, lr, factor) # train for 20 epochs

本次训练batch_size为： 1024 lr为： 0.005 factor为： 8 开始训练MLP_model
Epoch 1/20, Accuracy: 82.86%, Loss in this epoch: 1648.0091552734375
测试集上的准确率为：92.50%
HR: 0.462	NDCG: 0.255


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

Epoch 2/20, Accuracy: 83.53%, Loss in this epoch: 1570.5020751953125
测试集上的准确率为：92.69%
HR: 0.463	NDCG: 0.260


Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0><function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
        if w.is_alive():if w.is_alive():

  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a

Epoch 3/20, Accuracy: 83.58%, Loss in this epoch: 1563.822265625
测试集上的准确率为：92.65%
HR: 0.464	NDCG: 0.260


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
      File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

Epoch 4/20, Accuracy: 83.66%, Loss in this epoch: 1555.035888671875
测试集上的准确率为：93.09%
HR: 0.467	NDCG: 0.263


Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0><function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
        if w.is_alive():if w.is_alive():

  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a

Epoch 5/20, Accuracy: 83.78%, Loss in this epoch: 1545.3404541015625
测试集上的准确率为：92.27%
HR: 0.466	NDCG: 0.261


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
Exception ignored in:   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>Exception ignored in:     self._shutdown_workers()
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>Traceback (most recent call last):

  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__

  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    Traceback (most recent call last):
    if w.is_alive():  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__

      File "/usr/local/lib/python3.10/dist-pa

Epoch 6/20, Accuracy: 83.96%, Loss in this epoch: 1533.962646484375
测试集上的准确率为：93.35%
HR: 0.477	NDCG: 0.266


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

Epoch 7/20, Accuracy: 84.16%, Loss in this epoch: 1521.4627685546875
测试集上的准确率为：93.01%
HR: 0.481	NDCG: 0.269


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: Exception ignored in: can only test a child process<function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

Epoch 8/20, Accuracy: 84.33%, Loss in this epoch: 1510.976806640625
测试集上的准确率为：93.58%
HR: 0.490	NDCG: 0.275


Exception ignored in: 
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

Epoch 9/20, Accuracy: 84.46%, Loss in this epoch: 1501.626708984375
测试集上的准确率为：93.04%
HR: 0.502	NDCG: 0.280


Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0><function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
        if w.is_alive():if w.is_alive():

  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
        assert self._parent_pid == os.getpid(), 'can only te

Epoch 10/20, Accuracy: 84.59%, Loss in this epoch: 1492.7564697265625
测试集上的准确率为：93.15%
HR: 0.500	NDCG: 0.279


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdaf819dbd0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

Epoch 11/20, Accuracy: 84.69%, Loss in this epoch: 1484.5880126953125
测试集上的准确率为：93.42%
HR: 0.505	NDCG: 0.282


# 节省内存的小技巧

**ncf-pytorch中的读写数据，使用usecols**
```python
ratings_test = pd.read_csv('ml-1m/ratings.dat', sep='::', header = None, names=["UserID", "MovieID"], usecols=[0, 1], dtype={0: np.int32, 1: np.int32})
# usecols=[0, 1]：指定了仅使用数据文件中的第 0 列和第 1 列，即 'UserID' 和 'MovieID' 列。这样做可以节省内存，加快数据加载速度。
ratings_test.head(5)
```

In [None]:
# Load only the first two columns of the ratings file (named 'UserID' and 'item' here)
# as int32; skipping the remaining columns saves memory and speeds up loading.
# engine='python' is requested explicitly: the two-character separator '::' is not
# supported by the default C parser, so pandas would otherwise emit a ParserWarning
# and fall back to the Python engine anyway.
ratings_test = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=["UserID", "item"],
    usecols=[0, 1],
    dtype={0: np.int32, 1: np.int32},
    engine='python',
)
ratings_test.head(5)

In [None]:
# Show the interaction row(s), if any, for one specific (user, item) pair.
a, b = 1, 25
same_user = ratings_test['UserID'] == a
same_item = ratings_test['item'] == b
ratings_test[same_user & same_item]

In [None]:
import scipy.sparse as sp

import torch.utils.data as data

# Size each axis as (largest id + 1) so the raw ids can be used directly as indices.
user_num = ratings_test['UserID'].max() + 1
item_num = ratings_test['item'].max() + 1

# Plain Python list of [user, item] pairs, one per row of ratings_test.
ratings_test_list = ratings_test.values.tolist()

# Interaction matrix in dictionary-of-keys format; observed pairs are marked with 1.0.
train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)

for user_id, item_id in ratings_test_list:
    train_mat[user_id, item_id] = 1.0