In [24]:
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### 生成文本表示

In [25]:
from transformers import BertTokenizer, BertModel
import torch
PATH= "../Chinese_Bert"
tokenizer = BertTokenizer.from_pretrained(PATH)

model = BertModel.from_pretrained(PATH).cpu()


In [27]:
# 读loaded_data取保存的 CSV 文件
loaded_data = pd.read_csv('data\selected_movie_top_1200_data_tag.csv')

tag_embedding_dict = {}

with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows()):
        # 将标签列表转换为字符串
        tags_str = " ".join(rows.Tags)
        # 使用BERT中文模型对标签进行编码
        inputs = tokenizer(tags_str, truncation=True,max_length=512,return_tensors='pt')
        outputs = model(inputs.input_ids.cpu(), inputs.token_type_ids.cpu(), inputs.attention_mask.cpu())
        # 使用最后一层的平均隐藏状态作为标签的向量表示
        tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
        tag_embedding_dict[rows.Movie] = tag_embedding


1200it [29:09,  1.46s/it]


In [15]:
import pickle

# 将映射表存储为二进制文件
with open('data/tag_embedding_dict_movie.pkl', 'wb') as f:
    pickle.dump(tag_embedding_dict, f)


In [16]:
# 从二进制文件中读取映射表
with open('data/tag_embedding_dict_movie.pkl', 'rb') as f:
    tag_embedding_dict = pickle.load(f)

In [17]:
# 读loaded_data取保存的 CSV 文件
loaded_data = pd.read_csv('data\\book_score.csv')

# 显示加载的数据
print(loaded_data)

           User     Book  Rate                       Time         Tag
0       1398478  1467022     0  2011-03-29T12:48:35+08:00         NaN
1       1398478  1777823     0  2011-02-02T21:58:55+08:00         NaN
2       1398478  1902628     0  2011-01-31T15:57:58+08:00         NaN
3       1398478  1878708     0  2011-01-26T11:27:59+08:00         NaN
4       1398478  4238362     0  2011-01-21T13:04:15+08:00         NaN
...         ...      ...   ...                        ...         ...
637249  4507957  1125186     4  2009-07-04T08:02:13+08:00  张爱玲,半生缘,爱情
637250  4507957  1002299     5  2009-07-04T08:01:28+08:00  金庸,武侠,笑傲江湖
637251  4507957  1001136     4  2009-07-04T07:55:17+08:00     彼得・潘,童话
637252  4507957  1021615     5  2009-07-04T07:53:54+08:00   小王子,童话,经典
637253  4507957  1962929     5  2009-06-29T22:13:37+08:00          爱情

[637254 rows x 5 columns]


In [23]:
class BookRatingDataset(Dataset):
    def __init__(self, data, user_to_idx, book_to_idx, tag_embedding_dict):
        self.data = data
        self.user_to_idx = user_to_idx
        self.book_to_idx = book_to_idx
        self.tag_embedding_dict = tag_embedding_dict

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        user = self.user_to_idx[row['User']]
        book = self.book_to_idx[row['Book']]
        rating = row['Rate'].astype('float32')
        text_embedding = self.tag_embedding_dict.get(row['Book'])
        return user, book, rating, text_embedding

class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_books, embedding_dim, hidden_state):
        super(MatrixFactorization, self).__init__()
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.book_embeddings = nn.Embedding(num_books, embedding_dim)
        self.linear_embedding = nn.Linear(hidden_state, embedding_dim)
        self.output = nn.Linear(embedding_dim, 6)

    def forward(self, user, book, tag_embedding):
        user_embedding = self.user_embeddings(user)
        book_embedding = self.book_embeddings(book)
        tag_embedding_proj = self.linear_embedding(tag_embedding)
        book_intergrate = book_embedding + tag_embedding_proj
        return (user_embedding * book_intergrate).sum(dim = 1)
        
def create_id_mapping(id_list):
    # 从ID列表中删除重复项并创建一个排序的列表
    unique_ids = sorted(set(id_list))
    
    # 创建将原始ID映射到连续索引的字典
    id_to_idx = {id: idx for idx, id in enumerate(unique_ids)}
    
    # 创建将连续索引映射回原始ID的字典
    idx_to_id = {idx: id for id, idx in id_to_idx.items()}
    
    return id_to_idx, idx_to_id

# 按用户分组计算NDCG
def compute_ndcg(group):
    true_ratings = group['true'].tolist()
    pred_ratings = group['pred'].tolist()
    return ndcg_score([true_ratings], [pred_ratings], k = 50)

In [21]:
user_ids = loaded_data['User'].unique()
book_ids = loaded_data['Book'].unique()

user_to_idx, idx_to_user = create_id_mapping(user_ids)
book_to_idx, idx_to_book = create_id_mapping(book_ids)

# 划分训练集和测试集
train_data, test_data = train_test_split(loaded_data, test_size=0.5, random_state=42)

# 创建训练集和测试集的数据集对象
train_dataset = BookRatingDataset(train_data, user_to_idx, book_to_idx, tag_embedding_dict)
test_dataset = BookRatingDataset(test_data, user_to_idx, book_to_idx, tag_embedding_dict)

# 创建训练集和测试集的数据加载器
train_dataloader = DataLoader(train_dataset, batch_size=4096, shuffle=True, drop_last = True)
test_dataloader = DataLoader(test_dataset, batch_size=4096, shuffle=False, drop_last = True)

num_users = loaded_data['User'].nunique()  
num_books = loaded_data['Book'].nunique() 
embedding_dim, hidden_state = 32, 768

model = MatrixFactorization(num_users, num_books, embedding_dim, hidden_state).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

### 训练

In [22]:

num_epochs = 20
lambda_u, lambda_b = 0.001, 0.001

for epoch in range(num_epochs):
    model.train()
    total_loss_train, total_loss_test = 0.0, 0.0

    for idx, (user_ids, book_ids, ratings, tag_embedding) in tqdm(enumerate(train_dataloader)):
        # 使用user_ids, book_ids, ratings进行训练

        optimizer.zero_grad()
        
        predictions = model(user_ids.to(device), book_ids.to(device), tag_embedding.squeeze(1).to(device))
        loss = criterion(predictions, ratings.to(device)) + lambda_u * model.user_embeddings.weight.norm(2) + lambda_b * model.book_embeddings.weight.norm(2)
        
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        
        # if idx % 100 == 0:
        #     print(f'Step {idx}, Loss: {loss.item()}')

    output_loss_train = total_loss_train / (idx + 1) 

    results = []
    model.eval()

    with torch.no_grad():
        for idx, (user_ids, item_ids, true_ratings, tag_embedding) in enumerate(test_dataloader):
            pred_ratings = model(user_ids.to(device), item_ids.to(device), tag_embedding.squeeze(1).to(device))
                    # 使用 PyTorch 的均方误差函数计算 MSE

            loss = criterion(pred_ratings, ratings.to(device))
            total_loss_test += loss.item()

            # 将结果转换为 numpy arrays
            user_ids_np = user_ids.long().cpu().numpy().reshape(-1, 1)
            pred_ratings_np = pred_ratings.cpu().numpy().reshape(-1, 1)
            true_ratings_np = true_ratings.numpy().reshape(-1, 1)

            # 将这三个 arrays 合并成一个 2D array
            batch_results = np.column_stack((user_ids_np, pred_ratings_np, true_ratings_np))

            # 将这个 2D array 添加到 results
            results.append(batch_results)

        # 将结果的 list 转换为一个大的 numpy array
        results = np.vstack(results)


        # 将结果转换为DataFrame
        results_df = pd.DataFrame(results, columns=['user', 'pred', 'true'])
        results_df['user'] = results_df['user'].astype(int)

        ndcg_scores = results_df.groupby('user').apply(compute_ndcg)

        # 计算平均NDCG
        avg_ndcg = ndcg_scores.mean()
        print(f'Epoch {epoch}, Train loss: {output_loss_train}, Test loss:, {total_loss_test / (idx + 1)}, Average NDCG: {avg_ndcg}')

0it [00:00, ?it/s]

77it [00:29,  2.63it/s]


Epoch 0, Train loss: 37.896948009342346, Test loss:, 29.453981771097556, Average NDCG: 0.669455357503654


77it [00:32,  2.35it/s]


Epoch 1, Train loss: 23.381995684140687, Test loss:, 22.706378416581586, Average NDCG: 0.6693531359614187


77it [00:32,  2.35it/s]


Epoch 2, Train loss: 17.140176141416873, Test loss:, 18.81792095729283, Average NDCG: 0.6698120868948291


77it [00:32,  2.39it/s]


Epoch 3, Train loss: 13.557036969568822, Test loss:, 16.065745105991116, Average NDCG: 0.670790723460328


77it [00:30,  2.49it/s]


Epoch 4, Train loss: 9.927426065717425, Test loss:, 12.76462558647255, Average NDCG: 0.67484416983538


77it [00:32,  2.40it/s]


Epoch 5, Train loss: 5.900552923029119, Test loss:, 10.395055708947119, Average NDCG: 0.6765636899134558


77it [00:31,  2.47it/s]


Epoch 6, Train loss: 4.137545040675572, Test loss:, 9.264651657698991, Average NDCG: 0.6776403328667293


77it [00:27,  2.77it/s]


Epoch 7, Train loss: 3.4910978434921858, Test loss:, 8.790166607150784, Average NDCG: 0.679150087776319


77it [00:31,  2.47it/s]


Epoch 8, Train loss: 3.1195257638956044, Test loss:, 8.451067986426416, Average NDCG: 0.6805751176358041


77it [00:32,  2.34it/s]


Epoch 9, Train loss: 2.8442448176346815, Test loss:, 8.281070653494302, Average NDCG: 0.6811917990018123


77it [00:32,  2.37it/s]


Epoch 10, Train loss: 2.6953093262461874, Test loss:, 7.895974091121128, Average NDCG: 0.6816112464860326


77it [00:28,  2.68it/s]


Epoch 11, Train loss: 2.5714518033064806, Test loss:, 7.780641623905727, Average NDCG: 0.6824371097666528


77it [00:29,  2.65it/s]


Epoch 12, Train loss: 2.5038686232133345, Test loss:, 7.604694224023199, Average NDCG: 0.6838124438840457


77it [00:32,  2.38it/s]


Epoch 13, Train loss: 2.366533644787677, Test loss:, 7.500694033387419, Average NDCG: 0.6841136976724457


77it [00:32,  2.35it/s]


Epoch 14, Train loss: 2.2749087903406715, Test loss:, 7.810565583117596, Average NDCG: 0.6874353402935433


77it [00:32,  2.37it/s]


Epoch 15, Train loss: 2.2129329644240343, Test loss:, 7.511787024411288, Average NDCG: 0.6873139816566091


77it [00:33,  2.28it/s]


Epoch 16, Train loss: 2.1924588370632816, Test loss:, 7.701900284011643, Average NDCG: 0.6870089953476574


77it [00:32,  2.40it/s]


Epoch 17, Train loss: 2.115624068619369, Test loss:, 7.424032706718941, Average NDCG: 0.6891919188231611


77it [00:28,  2.75it/s]


Epoch 18, Train loss: 2.0820447082643385, Test loss:, 7.651313162469244, Average NDCG: 0.690736215952758


77it [00:28,  2.72it/s]


Epoch 19, Train loss: 2.0151553370735864, Test loss:, 7.349646159580776, Average NDCG: 0.691922204649647
