In [1]:
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [2]:
# Load the saved ratings CSV. The relative path uses a forward slash so that it
# resolves on Windows, Linux and macOS alike (a literal backslash separator is
# only treated as a path separator on Windows).
loaded_data = pd.read_csv('data/movie_score.csv')

# Keep only the rows whose rating is non-zero, then show the filtered table.
loaded_data = loaded_data[loaded_data['Rate'] != 0]
print(loaded_data)

           User    Movie  Rate                       Time       Tag
1       1386692  1986338     3  2011-02-25T08:29:20+08:00   美国,2011
2       1386692  4268598     5  2011-02-14T10:05:42+08:00   日本,2011
3       1386692  1851857     4  2011-02-11T16:06:22+08:00   美国,2011
4       1386692  4023638     4  2011-02-10T19:25:12+08:00   英国,2011
5       1386692  1305903     3  2011-02-10T19:24:06+08:00  加拿大,2011
...         ...      ...   ...                        ...       ...
714934  1379646  1309004     5  2007-01-16T20:58:29+08:00       NaN
714967  1379646  1783772     5  2007-01-14T01:46:16+08:00       NaN
714984  1379646  1291859     5  2007-01-14T01:30:09+08:00       NaN
714993  1379646  1484091     5  2007-01-14T01:23:55+08:00       NaN
715018  1379646  1291836     4  2007-01-12T21:42:48+08:00       NaN

[523648 rows x 5 columns]


In [3]:
# Process the loaded data to obtain the user and item (movie) vocabularies.
# Distinct raw IDs, deduplicated from the ratings table.
user_ids = loaded_data["User"].unique()
item_ids = loaded_data["Movie"].unique()
# Map every raw ID to its position inside the corresponding unique-ID array.
user_to_row = dict(zip(user_ids, range(len(user_ids))))
item_to_row = dict(zip(item_ids, range(len(item_ids))))

In [4]:

class RatingDataset(Dataset):
    """Map-style dataset yielding ``(user_index, movie_index, rating)`` triples.

    Args:
        data: DataFrame with ``'User'``, ``'Movie'`` and ``'Rate'`` columns.
        user_to_row: Mapping from raw user ID to an integer row index.
        item_to_row: Mapping from raw movie ID to an integer row index.

    Raises:
        KeyError: At construction time, if ``data`` contains a user or movie ID
            that is missing from the corresponding mapping.
    """

    def __init__(self,data,user_to_row,item_to_row):
        self.data=data
        self.user_to_row=user_to_row
        self.item_to_row=item_to_row
        # Resolve every row once, up front. Fetching ``data.iloc[idx]`` inside
        # ``__getitem__`` builds a fresh Series for every single sample, which
        # dominates the cost of iterating a DataLoader over a large frame.
        n_rows = len(data)
        self._users = np.fromiter(
            (user_to_row[u] for u in data['User']), dtype=np.int64, count=n_rows
        )
        self._movies = np.fromiter(
            (item_to_row[m] for m in data['Movie']), dtype=np.int64, count=n_rows
        )
        self._ratings = data['Rate'].to_numpy(dtype=np.float32)

    def __len__(self):
        """Return the number of rating rows."""
        return len(self._ratings)

    def __getitem__(self,idx):
        """Return ``(user_index, movie_index, rating)`` for the row at position ``idx``.

        The indices are plain Python ints and the rating is a NumPy float32
        scalar.
        """
        user = int(self._users[idx])
        movie = int(self._movies[idx])
        rating = self._ratings[idx]
        return user, movie, rating
        
        
        

In [5]:
# embedding_dim is a hyperparameter chosen by the user.
class MatrixFactorization(nn.Module):
    """Matrix-factorization rating model built from two embedding tables."""

    def __init__(self, num_users, num_movies, embedding_dim):
        super().__init__()
        # One learnable vector of size embedding_dim per user and per movie.
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.movie_embeddings = nn.Embedding(num_movies, embedding_dim)

    def forward(self, user, movie):
        """Score each (user, movie) pair as the dot product of their embeddings.

        The element-wise product of the two looked-up embeddings is summed
        over dimension 1.
        """
        user_vecs = self.user_embeddings(user)
        movie_vecs = self.movie_embeddings(movie)
        interaction = user_vecs * movie_vecs
        return interaction.sum(dim=1)



In [6]:
# Build the train/test dataset objects and their data loaders.
embedding_dim = 32

# Hold out 20% of the ratings for testing, with a fixed seed for the split.
train_data, test_data = train_test_split(loaded_data, test_size=0.2, random_state=42)

train_dataset = RatingDataset(data=train_data, user_to_row=user_to_row, item_to_row=item_to_row)
test_dataset = RatingDataset(data=test_data, user_to_row=user_to_row, item_to_row=item_to_row)

# Both loaders use drop_last=True, so a final incomplete batch is skipped on
# every pass over the data; only the training loader shuffles.
# NOTE(review): for the test loader this means the tail of the test split is
# never evaluated - confirm that this is intended.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4096, drop_last=True, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=4096, drop_last=True, shuffle=False)

In [7]:
# Number of distinct users and movies; printed one per line.
num_users, num_movies = len(user_ids), len(item_ids)
print(num_users, num_movies, sep="\n")

1014
1200


In [8]:
# Instantiate the factorization model on the selected device, together with a
# mean-squared-error criterion and an Adam optimizer (learning rate 0.01).
model = MatrixFactorization(
    num_users=num_users,
    num_movies=num_movies,
    embedding_dim=embedding_dim,
).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

In [9]:
# Train the model: MSE on the predicted ratings plus an L2-norm penalty on each
# embedding table.
model.train()
num_epochs = 10
lambda_b=0.001  # weight of the movie-embedding norm penalty
lambda_u=0.001  # weight of the user-embedding norm penalty
for epoch in range(num_epochs):
    for user, movie, rating in train_dataloader:
        # The model lives on `device`; move each batch there as well so the
        # loop does not fail with a device mismatch when CUDA is selected.
        user = user.to(device)
        movie = movie.to(device)
        rating = rating.to(device)
        optimizer.zero_grad()
        output = model(user, movie)
        loss = criterion(output, rating) + lambda_u * model.user_embeddings.weight.norm(2) + lambda_b * model.movie_embeddings.weight.norm(2)
        loss.backward()
        optimizer.step()
    # Monitor progress: this is the regularized loss of the epoch's last batch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")



Epoch 1/10, Loss: 27.623929977416992
Epoch 2/10, Loss: 15.97860050201416
Epoch 3/10, Loss: 4.323372840881348
Epoch 4/10, Loss: 2.2262110710144043
Epoch 5/10, Loss: 1.648421049118042
Epoch 6/10, Loss: 1.2159852981567383
Epoch 7/10, Loss: 1.0376439094543457
Epoch 8/10, Loss: 0.9023718237876892
Epoch 9/10, Loss: 0.852106511592865
Epoch 10/10, Loss: 0.7923122644424438


In [10]:

# Evaluate on the held-out split and report the mean of the per-batch MSE values.
model.eval()  # switch the model to evaluation mode
test_loss = 0.0
with torch.no_grad():  # gradients are not needed for evaluation
    for user, movie, rating in test_dataloader:
        # The model lives on `device`; move each batch there as well so the
        # loop does not fail with a device mismatch when CUDA is selected.
        user = user.to(device)
        movie = movie.to(device)
        rating = rating.to(device)
        output = model(user, movie)
        loss = criterion(output, rating)
        test_loss += loss.item()

# Average the accumulated batch losses over the number of batches.
average_test_loss = test_loss / len(test_dataloader)
print(f"Average Test Loss: {average_test_loss}")

Average Test Loss: 0.7214122080802917


In [13]:
# Ranking evaluation: compare, per user, the predicted and the actual scores
# using NDCG and the cosine similarity of the two argsort orderings.
# (`ndcg_score` is already imported in the notebook's import cell.)
model.eval()  # switch the model to evaluation mode
num_users=int(num_users)
num_movies=int(num_movies)
# Dense (user, movie) score tables; pairs not produced by the test loader stay 0.
real_score_array=np.zeros((num_users,num_movies))
pred_score_array=np.zeros((num_users,num_movies))

with torch.no_grad():  # gradients are not needed for evaluation
    for user, movie, rating in test_dataloader:
        # Run the model on its own device, then bring predictions back to the CPU.
        pred = model(user.to(device), movie.to(device)).cpu()
        # Walk the batch by its actual length instead of a hard-coded 4096, so
        # a different batch size or a shorter final batch cannot raise IndexError.
        for u, m, r, p in zip(user.tolist(), movie.tolist(), rating.tolist(), pred.tolist()):
            real_score_array[u, m] = int(r)
            pred_score_array[u, m] = p

# Score the first 100 users, or all of them when there are fewer than 100.
num_eval_users = min(100, num_users)
cos_sum=0
ndcg_sum=0
for i in range(num_eval_users):
    # NDCG of this user's predicted scores against the actual ratings.
    ndcg=ndcg_score(real_score_array[i].reshape(1,-1),pred_score_array[i].reshape(1,-1))
    ndcg_sum=ndcg_sum+ndcg
    # Cosine similarity between the two argsort index vectors.
    # NOTE(review): any two permutations of the same indices already have a
    # high cosine similarity, so this number discriminates weakly between good
    # and bad rankings - consider a rank correlation instead.
    vec1=np.argsort(real_score_array[i])
    vec2=np.argsort(pred_score_array[i])
    cos_sim = vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    cos_sum=cos_sum+cos_sim
print(f"cos similarity={cos_sum/num_eval_users}\nNDCG_score={ndcg_sum/num_eval_users}")


        

cos similarity=0.9762505820468013
NDCG_score=0.9457839910554704
