In [70]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split

In [71]:
# Load the rating records; every row is split on "::" and parsed as integers.
data = np.genfromtxt("L6/影片评分数据.dat", delimiter='::', dtype=int)

# The first three columns are used as user ID, movie ID and rating.
user_ids = data[:, 0]
movie_ids = data[:, 1]
ratings = data[:, 2]

# IDs are used directly as row/column positions, so each axis needs
# max(ID) + 1 slots for the largest ID to be a valid index.
num_users = np.max(user_ids) + 1
num_movies = np.max(movie_ids) + 1

# Dense user x movie matrix; a value of 0 means "no rating recorded".
ratings_matrix = np.zeros((num_users, num_movies))
# Scatter all ratings in one vectorised indexed assignment instead of a
# Python-level loop over every record.
# NOTE(review): assumes each (user, movie) pair occurs at most once in the
# file; NumPy does not guarantee which value survives for repeated indices.
ratings_matrix[user_ids, movie_ids] = ratings

print(ratings_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 3. 0. ... 0. 0. 0.]]


In [72]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [73]:
# Locate every observed rating (0 means "not rated") and gather the values once
nonzero_indices = np.nonzero(ratings_matrix)
rated_rows, rated_cols = nonzero_indices
observed_ratings = ratings_matrix[nonzero_indices]

# Split the positions of the observed ratings 80/20 into training and validation sets
train_indices, val_indices = train_test_split(
    np.arange(len(rated_rows)),
    test_size=0.2,
    random_state=1,
)

train_user_ids, val_user_ids = rated_rows[train_indices], rated_rows[val_indices]
train_movie_ids, val_movie_ids = rated_cols[train_indices], rated_cols[val_indices]
train_ratings, val_ratings = observed_ratings[train_indices], observed_ratings[val_indices]

# Convert to PyTorch tensors on the selected device:
# int64 for the embedding indices, float32 for the rating targets
train_user_ids_tensor = torch.tensor(train_user_ids, dtype=torch.long, device=device)
train_movie_ids_tensor = torch.tensor(train_movie_ids, dtype=torch.long, device=device)
train_ratings_tensor = torch.tensor(train_ratings, dtype=torch.float32, device=device)

val_user_ids_tensor = torch.tensor(val_user_ids, dtype=torch.long, device=device)
val_movie_ids_tensor = torch.tensor(val_movie_ids, dtype=torch.long, device=device)
val_ratings_tensor = torch.tensor(val_ratings, dtype=torch.float32, device=device)


In [74]:
# Matrix-factorisation (MF) rating model
class MatrixFactorization(nn.Module):
    """Biased matrix-factorisation model for rating prediction.

    The predicted score for a (user, movie) pair is
    ``dot(user_factors[u], movie_factors[m]) + user_bias[u] + movie_bias[m]``.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_movies: Number of rows in the movie embedding tables.
        num_factors: Size of each latent factor vector.
    """

    def __init__(self, num_users, num_movies, num_factors):
        super().__init__()
        self.user_factors = nn.Embedding(num_users, num_factors)
        self.movie_factors = nn.Embedding(num_movies, num_factors)
        self.user_biases = nn.Embedding(num_users, 1)
        self.movie_biases = nn.Embedding(num_movies, 1)

        # Parameter initialisation: small positive factors, zero biases
        nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        nn.init.uniform_(self.movie_factors.weight, 0, 0.05)
        nn.init.zeros_(self.user_biases.weight)
        nn.init.zeros_(self.movie_biases.weight)

    def forward(self, user_ids, movie_ids):
        """Predict a score for each (user, movie) pair.

        Args:
            user_ids: Integer tensor of user indices.
            movie_ids: Integer tensor of movie indices, same shape as ``user_ids``.

        Returns:
            Tensor of predicted scores with the same shape as the id tensors.
        """
        user_embedding = self.user_factors(user_ids)
        movie_embedding = self.movie_factors(movie_ids)
        # Drop only the trailing size-1 bias axis; a bare squeeze() would also
        # remove any size-1 batch axis.
        user_bias = self.user_biases(user_ids).squeeze(-1)
        movie_bias = self.movie_biases(movie_ids).squeeze(-1)
        # Reduce over the factor axis (always the last one) so that scalar and
        # multi-dimensional id tensors are handled as well as 1-D batches.
        dot_product = torch.sum(user_embedding * movie_embedding, dim=-1)
        return dot_product + user_bias + movie_bias


# Size of the latent factor vectors learned for every user and movie
num_factors = 10
model = MatrixFactorization(num_users, num_movies, num_factors)
model = model.to(device)

class NMSELoss(nn.Module):
    """Normalised mean squared error: ``MSE(predictions, targets) / Var(targets)``.

    Args:
        eps: Lower bound applied to the target variance before dividing, so
            that constant targets (zero variance) give a finite loss instead
            of inf/NaN. Defaults to 1e-8.
    """

    def __init__(self, eps=1e-8):
        super().__init__()
        self.eps = eps

    def forward(self, predictions, targets):
        """Return the MSE between the two tensors divided by the variance of ``targets``."""
        # Functional form avoids building a new nn.MSELoss module on every call.
        mse = nn.functional.mse_loss(predictions, targets)
        # Floor the variance so a constant target tensor cannot cause a
        # division by zero.
        variance = torch.var(targets).clamp_min(self.eps)
        return mse / variance

# Loss function (normalised MSE) and optimiser (SGD with momentum)
criterion = NMSELoss()
optimizer = optim.SGD(
    params=model.parameters(),
    lr=10,
    momentum=0.9,
)

In [75]:
# Train the model: each epoch is one full-batch gradient step over all training ratings
num_epochs = 1000
for epoch in range(num_epochs):
    optimizer.zero_grad()
    # The training tensors were already moved to `device` when they were created
    predicted_ratings = model(train_user_ids_tensor, train_movie_ids_tensor)
    loss = criterion(predicted_ratings, train_ratings_tensor)
    loss.backward()
    optimizer.step()
    # Report the training loss on every 10th epoch (1-based numbering)
    if epoch % 10 == 9:
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Evaluate on the held-out ratings without tracking gradients
model.eval()
with torch.no_grad():
    predicted_val_ratings = model(val_user_ids_tensor, val_movie_ids_tensor)
    val_loss = criterion(predicted_val_ratings, val_ratings_tensor)
    print(f"Validation Loss: {val_loss.item()}")

    # A prediction counts as correct when it is within `tolerance` of the true rating
    tolerance = 0.5
    absolute_errors = (predicted_val_ratings - val_ratings_tensor).abs()
    correct_predictions = absolute_errors.le(tolerance).float()

    accuracy = correct_predictions.mean().item() * 100
    print(f"Validation Accuracy: {accuracy:.2f}%")

Epoch 10, Loss: 3.295128583908081
Epoch 20, Loss: 2.0645792484283447
Epoch 30, Loss: 1.2941704988479614
Epoch 40, Loss: 1.006013035774231
Epoch 50, Loss: 0.8886839747428894
Epoch 60, Loss: 0.8191966414451599
Epoch 70, Loss: 0.7801663875579834
Epoch 80, Loss: 0.753822386264801
Epoch 90, Loss: 0.7348647713661194
Epoch 100, Loss: 0.7204339504241943
Epoch 110, Loss: 0.7089837193489075
Epoch 120, Loss: 0.6996718645095825
Epoch 130, Loss: 0.6919538378715515
Epoch 140, Loss: 0.6854596138000488
Epoch 150, Loss: 0.6799272298812866
Epoch 160, Loss: 0.6751582622528076
Epoch 170, Loss: 0.6710003018379211
Epoch 180, Loss: 0.6673315167427063
Epoch 190, Loss: 0.6640540361404419
Epoch 200, Loss: 0.661086916923523
Epoch 210, Loss: 0.6583627462387085
Epoch 220, Loss: 0.6558241844177246
Epoch 230, Loss: 0.6534220576286316
Epoch 240, Loss: 0.6511128544807434
Epoch 250, Loss: 0.6488574743270874
Epoch 260, Loss: 0.6466203927993774
Epoch 270, Loss: 0.6443679928779602
Epoch 280, Loss: 0.6420686841011047
Epoch

In [76]:
# Predict ratings for the entries whose value is still 0 (unrated)
def predict_all_ratings(model, ratings_matrix, batch_size=None):
    """Predict a rating for every zero entry of ``ratings_matrix``.

    Args:
        model: Callable module taking ``(user_ids, movie_ids)`` index tensors
            and returning one score per pair.
        ratings_matrix: 2-D NumPy array in which 0 marks an unrated entry.
        batch_size: Optional number of (user, movie) pairs scored per forward
            pass. ``None`` (default) scores all pairs in a single pass.

    Returns:
        1-D NumPy array of predictions clamped to [1, 5], ordered like
        ``np.argwhere(ratings_matrix == 0)`` (row-major).

    Raises:
        ValueError: If ``batch_size`` is given but not positive.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError("batch_size must be a positive integer or None")

    # Use the device the model's parameters live on instead of relying on a
    # notebook-level global; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    zero_indices = np.argwhere(ratings_matrix == 0)
    step = batch_size if batch_size is not None else max(len(zero_indices), 1)

    chunks = []
    with torch.no_grad():
        for start in range(0, len(zero_indices), step):
            batch = torch.tensor(zero_indices[start:start + step], device=target_device)
            scores = model(batch[:, 0], batch[:, 1])
            # Restrict the predicted ratings to the range 1..5
            chunks.append(torch.clamp(scores, 1, 5).reshape(-1).cpu())

    if not chunks:
        # No unrated entries: nothing to predict.
        return np.empty(0, dtype=np.float32)
    return torch.cat(chunks).numpy()

# Work on a copy so the original matrix (0 = unrated) stays untouched
ratings_matrix_predicted = ratings_matrix.copy()
# (row, column) positions of the unrated entries, in row-major order
zero_indices = np.argwhere(ratings_matrix == 0)
predicted_all_ratings = predict_all_ratings(model, ratings_matrix)
# Write each prediction into its unrated slot
zero_rows, zero_cols = zero_indices.T
ratings_matrix_predicted[zero_rows, zero_cols] = predicted_all_ratings

In [77]:
def average_ratings(ratings_matrix):
    """Return the mean of the non-zero entries in each column.

    Args:
        ratings_matrix: 2-D array with one row per user and one column per
            movie, where 0 marks an entry that is not counted.

    Returns:
        1-D float array with one average per column. Columns without any
        non-zero entry get 0.0.
    """
    # Per-movie sum of ratings and number of non-zero ratings
    movie_sums = np.sum(ratings_matrix, axis=0)
    num_ratings = np.sum(ratings_matrix != 0, axis=0)
    # `where=` without `out=` leaves the skipped positions uninitialised, so
    # divide into a zero-filled buffer to make those entries a defined 0.0.
    averages = np.zeros(np.shape(movie_sums), dtype=float)
    np.divide(movie_sums, num_ratings, out=averages, where=num_ratings != 0)
    return averages

# Average score of every movie, computed on the prediction-completed matrix.
# NOTE(review): this rebinds `average_ratings` from the function to its result
# array; the name is kept so the notebook namespace stays as it was.
average_ratings = average_ratings(ratings_matrix_predicted)

# Indices of the (up to) 10 highest averages, ordered from lowest to highest
top_indices = np.argsort(average_ratings)[-10:]
top_movie_ids = top_indices
top_average_ratings = average_ratings[top_indices]

print("TOP Movies with Highest Average Ratings:")
# Iterate over what was actually selected rather than a fixed range(10), so a
# matrix with fewer than 10 movies does not raise IndexError.
for top_id, top_average in zip(top_movie_ids, top_average_ratings):
    print(f"Movie {top_id} has average rating of {top_average}")


TOP Movies with Highest Average Ratings:
Movie 1198 has average rating of 4.403781304985379
Movie 858 has average rating of 4.423511101934732
Movie 904 has average rating of 4.42363148072247
Movie 1148 has average rating of 4.4410004619335695
Movie 2019 has average rating of 4.449543039010586
Movie 745 has average rating of 4.465214004679994
Movie 50 has average rating of 4.488059602535989
Movie 527 has average rating of 4.495255713114733
Movie 318 has average rating of 4.542595365706155
Movie 2905 has average rating of 4.669391165583206


In [78]:
# For every user, pick the ten movies with the highest (actual or predicted) rating
top_n = 10
# Maps user index -> movie indices sorted from highest to lowest rating;
# ascending argsort, keep the last top_n entries, then reverse
user_top_movies = {
    user_id: np.argsort(ratings_matrix_predicted[user_id, :])[-top_n:][::-1]
    for user_id in range(num_users)
}

output_file = "user_top_movies.txt"
with open(output_file, "w") as f:
    for user_id, top_movies in user_top_movies.items():
        # One header line per user, one line per movie, then a blank separator line
        f.write(f"User {user_id}:\n")
        f.writelines(
            f"Movie {movie_id}, Rating: {ratings_matrix_predicted[user_id, movie_id]:.2f}\n"
            for movie_id in top_movies
        )
        f.write("\n")

print(f"User top movies have been written to {output_file}")

User top movies have been written to user_top_movies.txt
