In [2]:
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
# 读loaded_data取保存的 CSV 文件
loaded_data = pd.read_csv('data\\book_score.csv')

# 显示加载的数据
#print(loaded_data)
loaded_data=loaded_data[loaded_data['Rate'] != 0]
print(loaded_data)

           User     Book  Rate                       Time          Tag
45      1398478  2348372     4  2009-11-10T18:42:00+08:00          NaN
164     1779492  1851385     3  2011-03-13T12:37:12+08:00  奥尔罕·帕慕克,土耳其
165     1779492  3266345     3  2010-10-20T19:31:20+08:00      葛瑞格·摩顿森
166     1779492  1001885     3  2010-10-20T19:29:16+08:00     林达,法国,旅行
168     1779492  1424741     3  2010-10-04T01:24:33+08:00      卡森·麦卡勒斯
...         ...      ...   ...                        ...          ...
637249  4507957  1125186     4  2009-07-04T08:02:13+08:00   张爱玲,半生缘,爱情
637250  4507957  1002299     5  2009-07-04T08:01:28+08:00   金庸,武侠,笑傲江湖
637251  4507957  1001136     4  2009-07-04T07:55:17+08:00      彼得・潘,童话
637252  4507957  1021615     5  2009-07-04T07:53:54+08:00    小王子,童话,经典
637253  4507957  1962929     5  2009-06-29T22:13:37+08:00           爱情

[403807 rows x 5 columns]


In [4]:
#TODO 处理加载的数据，得到item、user和star信息。
#合并相同的userids和itemsids
user_ids=loaded_data["User"].unique()
item_ids=loaded_data["Book"].unique()
#创建user to row的字典
user_to_row ={user_id : idx for idx, user_id in enumerate(user_ids)}
item_to_row ={item_id : idx for idx, item_id in enumerate(item_ids)}

In [5]:

class RatingDataset(Dataset):
    def __init__(self,data,user_to_row,item_to_row):
        self.data=data
        #self.user_item_matrix=user_item_matrix
        self.user_to_row=user_to_row
        self.item_to_row=item_to_row

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        row = self.data.iloc[idx]
        user = self.user_to_row[row['User']]
        book = self.item_to_row[row['Book']]
        rating = row['Rate'].astype('float32')
        return user, book, rating
        
        
        

In [6]:
# embedding_dim为超参数，由用户定义
class MatrixFactorization(nn.Module):
    def __init__(self,num_users,num_books,embedding_dim):
        super(MatrixFactorization,self).__init__()
        #词嵌入技术，将user和item分别嵌入为向量
        self.user_embeddings=nn.Embedding(num_users,embedding_dim)
        self.book_embeddings=nn.Embedding(num_books,embedding_dim)

    def forward(self, user,book):
        # 输出即为user矩阵和book矩阵相乘得到的结果
        user_embedding=self.user_embeddings(user)
        book_embedding=self.book_embeddings(book)
        return (user_embedding*book_embedding).sum(dim=1)



In [7]:
#TODO 创建训练集和测试集的数据集对象和数据加载器
train_data, test_data = train_test_split(loaded_data, test_size=0.2, random_state=42)

train_dataset = RatingDataset(train_data, user_to_row, item_to_row)
test_dataset = RatingDataset(test_data, user_to_row, item_to_row)

train_dataloader = DataLoader(train_dataset, batch_size=4096, shuffle=True, drop_last = True)
test_dataloader = DataLoader(test_dataset, batch_size=4096, shuffle=False, drop_last = True)

embedding_dim=32

In [8]:
num_users=len(user_ids)
num_books=len(item_ids)
print(num_users)
print(num_books)

4312
1200


In [9]:
model=MatrixFactorization(num_users,num_books,embedding_dim).to(device)
criterion=nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [10]:
model.train()
num_epochs = 10
lambda_b=0.001
lambda_u=0.001
for epoch in range(num_epochs):
    for user, book, rating in train_dataloader:
        optimizer.zero_grad()
        output = model(user, book)
        loss = criterion(output, rating) + lambda_u * model.user_embeddings.weight.norm(2) + lambda_b * model.book_embeddings.weight.norm(2)
        loss.backward()
        optimizer.step()
    # 监控损失或其他性能指标
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")



Epoch 1/10, Loss: 35.65945816040039
Epoch 2/10, Loss: 25.75108528137207
Epoch 3/10, Loss: 19.289918899536133
Epoch 4/10, Loss: 12.165305137634277
Epoch 5/10, Loss: 6.186763763427734
Epoch 6/10, Loss: 3.435378313064575
Epoch 7/10, Loss: 2.5138890743255615
Epoch 8/10, Loss: 1.9156748056411743
Epoch 9/10, Loss: 1.6768184900283813
Epoch 10/10, Loss: 1.4415652751922607


In [11]:

model.eval()  # 将模型设置为评估模式，不进行梯度更新
test_loss = 0.0
with torch.no_grad():  # 不计算梯度
    for user, book, rating in test_dataloader:
        output = model(user, book)
        loss = criterion(output, rating)
        #print(f"Test Loss: {loss}")
        test_loss += loss.item()

# 计算测试性能指标，例如均方误差或其他指标
average_test_loss = test_loss / len(test_dataloader)
print(f"Average Test Loss: {average_test_loss}")

Average Test Loss: 2.126078348410757


In [32]:
from sklearn.metrics import ndcg_score
#使用余弦相似度计算预测顺序和实际数据的差距
model.eval()  # 将模型设置为评估模式，不进行梯度更新
num_users=int(num_users)
num_books=int(num_books)
real_score_array=np.zeros((num_users,num_books))
pred_score_array=np.zeros((num_users,num_books))

with torch.no_grad():  # 不计算梯度
    for user, book, rating in test_dataloader:
        pred = model(user, book)
        for i in range(4096):
            real_score_array[int(user[i]),int(book[i])]=int(rating[i])
            pred_score_array[int(user[i]),int(book[i])]=float(pred[i])
cos_sum=0
ndcg_sum=0
#抽取前100组user
for i in range(100):
    #cal ndcg
    ndcg=ndcg_score(real_score_array[i].reshape(1,-1),pred_score_array[i].reshape(1,-1))
    ndcg_sum=ndcg_sum+ndcg
    vec1=np.argsort(real_score_array[i])
    vec2=np.argsort(pred_score_array[i])
    cos_sim = vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    cos_sum=cos_sum+cos_sim
print(f"cos similarity={cossum/100}\nNDCG_score={ndcg_sum/100}")


        

cos similarity=0.9962113396045966
NDCG_score=0.8770867967084897


In [33]:
from sklearn.metrics import ndcg_score
#print(real_score_array[0])
ndcg=ndcg_score(real_score_array[0].reshape(1,-1),pred_score_array[0].reshape(1,-1))
print(ndcg)

0.0
