In [1]:
import time
from collections.abc import Iterator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Wall-clock start; the last cell prints the time elapsed since this point.
start_time = time.time()

In [2]:
learning_rate = 0.001   # learning rate
dimension = 40  # dimensionality of each node's position vector
epoch = 100    # number of training iterations

In [3]:
def data_preprocess(data_path:str)->pd.DataFrame:
    """
    Load a ratings CSV and prepare it for the experiments.

    :param data_path: path of the CSV file to read
    :return: the first three columns of the file with the rows shuffled
             (fixed seed 42, original index labels kept)
    """
    ratings = pd.read_csv(data_path)
    # Only the first three columns are kept; in ratings.csv this drops the timestamp.
    first_three = ratings.iloc[:, :3]
    # frac=1 returns every row, in a reproducible random order.
    return first_three.sample(frac=1, random_state=42)

In [4]:
# Load and shuffle the ratings, then preview the first rows.
data_raw = data_preprocess("ratings.csv")
data_raw.head()

Unnamed: 0,userId,movieId,rating
67037,432,77866,4.5
42175,288,474,3.0
93850,599,4351,3.0
6187,42,2987,4.0
12229,75,1610,4.0


In [5]:
# Split the data into k folds
def k_fold_split(data:pd.DataFrame, k:int)->Iterator[tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Lazily yield (train, validation) pairs for k-fold cross-validation.

    Fold i uses the contiguous row slice [i*n//k, (i+1)*n//k) as its validation
    set and all remaining rows as its training set. Rows are taken in their
    current order; no shuffling happens here.

    :param data: full data set
    :param k: number of folds, must be at least 1
    :return: generator producing one (train_set, valid_set) tuple per fold
    :raises ValueError: if k is smaller than 1 (raised when iteration starts,
                        because this is a generator function)
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    data_len = len(data)
    for i in range(k):
        # Integer arithmetic keeps the boundaries exact for any data size.
        start = i * data_len // k
        end = (i + 1) * data_len // k
        # Training rows are everything outside [start, end); validation rows are the slice itself.
        yield pd.concat([data.iloc[:start], data.iloc[end:]]), data.iloc[start:end]

In [6]:
# Only the first of the 10 folds is used for the step-by-step walkthrough below.
_train_set, _valid_set = next(k_fold_split(data_raw, 10))
# Work on copies so later column additions do not touch data_raw.
train_set, valid_set = _train_set.copy(), _valid_set.copy()

len(train_set), len(valid_set)

(90753, 10083)

In [7]:
class Node:
    """A point in space whose coordinates are stored in ``position``."""
    def __init__(self, dimension:int=40):
        """
        :param dimension: number of coordinates of the node
        """
        # Coordinates start uniformly at random in [0, 1), drawn from numpy's global RNG.
        self.position = np.random.random_sample(size=dimension)

In [8]:
def get_nodes(data:pd.DataFrame, dimension:int=40)->tuple[dict, dict]:
    """
    Create one randomly initialised node per distinct user and per distinct item.

    Pass the complete data set (not just a training fold): every user and item
    that can show up at prediction time needs a node, otherwise the lookup fails.

    :param data: ratings with "userId" and "movieId" columns
    :param dimension: dimensionality of each node
    :return: (user nodes keyed by userId, item nodes keyed by movieId)
    """
    # Distinct ids are read straight from the columns; iterating a groupby just to
    # get its keys would build a sub-frame per id only to throw it away.
    # Ascending order matches groupby's default key order, so nodes are created in
    # the same sequence (users first, then items) and a seeded RNG gives the same draws.
    user_ids = sorted(data["userId"].dropna().unique().tolist())
    item_ids = sorted(data["movieId"].dropna().unique().tolist())
    user_nodes = {user: Node(dimension) for user in user_ids}
    item_nodes = {item: Node(dimension) for item in item_ids}
    return user_nodes, item_nodes

In [9]:
# Build nodes from the full data set so every user and movie has a position.
_user_nodes, _item_nodes = get_nodes(data_raw, dimension)

In [10]:
# Inspect the attributes of user 1's freshly initialised node.
_user_nodes[1].__dict__

{'position': array([0.72577678, 0.22965042, 0.31013765, 0.5937562 , 0.12065466,
        0.52380847, 0.01232994, 0.71750174, 0.90211535, 0.53645526,
        0.822347  , 0.4912728 , 0.59801397, 0.83483123, 0.71038348,
        0.8344525 , 0.88302012, 0.63372847, 0.14496075, 0.93047271,
        0.38475083, 0.99503636, 0.27188269, 0.11973636, 0.39022265,
        0.91569622, 0.96210586, 0.01352019, 0.97708882, 0.41319958,
        0.46023534, 0.43124339, 0.70969465, 0.31295105, 0.22984566,
        0.33606102, 0.92653158, 0.14398125, 0.29085527, 0.87927988])}

In [11]:
def update(train_set:pd.DataFrame, user_nodes:dict, item_nodes:dict, learning_rate:float=0.001, maxR:float=5., minR:float=0.5)->tuple[dict, dict]:
    """
    Run one pass over the training ratings, moving node positions in place.

    Every rating is turned into a target distance
    ``100 * (maxR - rating) / (maxR - minR)`` (rating maxR -> 0, rating minR -> 100).
    The user node and the item node are then moved by equal and opposite amounts
    along their difference vector, scaled by the learning rate and by how far the
    current Euclidean distance is from the target.

    :param train_set: training ratings with "userId", "movieId" and "rating" columns
    :param user_nodes: user nodes keyed by userId
    :param item_nodes: item nodes keyed by movieId
    :param learning_rate: step size of each move
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :return: the same (user_nodes, item_nodes) dicts, with positions updated
    """
    # Iterating the three columns directly avoids iterrows(), which builds a Series
    # per row and upcasts the integer ids to float.
    rows = zip(train_set["userId"], train_set["movieId"], train_set["rating"])
    for user, item, rating in rows:
        user_node, item_node = user_nodes[int(user)], item_nodes[int(item)]

        # Vector from the item to the user; reused for both the distance and the move.
        direction = user_node.position - item_node.position
        pred_distance = np.linalg.norm(direction)   # current Euclidean distance
        real_distance = 100 * (maxR - rating) / (maxR - minR)   # distance implied by the rating
        error = real_distance - pred_distance
        # Positive error pushes the two nodes apart, negative error pulls them together.
        to_move = learning_rate * error * direction
        user_node.position += to_move
        item_node.position -= to_move

    return user_nodes, item_nodes

In [12]:
def predict(valid_set:pd.DataFrame, user_nodes:dict, item_nodes:dict, maxR:float=5., minR:float=.5)->pd.DataFrame:
    """
    Predict a rating for every row from the distance between its user and item nodes.

    A distance d is mapped back to a rating with ``maxR - (maxR - minR) * d / 100``
    and the result is limited to the range [minR, maxR].

    :param valid_set: rows to predict, with "userId" and "movieId" columns
    :param user_nodes: user nodes keyed by userId
    :param item_nodes: item nodes keyed by movieId
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :return: the same ``valid_set`` object, with a float "predict" column added
             (or overwritten) in place
    """
    # One Euclidean distance per row, in row order.
    distances = np.fromiter(
        (
            np.linalg.norm(user_nodes[int(user)].position - item_nodes[int(item)].position)
            for user, item in zip(valid_set["userId"], valid_set["movieId"])
        ),
        dtype=float,
        count=len(valid_set),
    )
    pred_rating = maxR - (maxR - minR) * distances / 100
    # The column is written once, by position and as floats. This avoids filling an
    # int column with floats row by row, and stays correct when index labels repeat.
    valid_set["predict"] = np.clip(pred_rating, minR, maxR)
    return valid_set

In [13]:
def train(train_set:pd.DataFrame, user_nodes:dict, item_nodes:dict, epoch:int=20, learning_rate:float=0.001, maxR:float=5., minR:float=0.5, log:bool=False, eval_every:int=10, min_delta:float=0.03):
    """
    Train node positions with early stopping on the training MAE.

    After every ``eval_every``-th pass (starting with the first one) the training
    set is scored; training stops as soon as the MAE improved by less than
    ``min_delta`` since the previous check.

    :param train_set: training ratings
    :param user_nodes: user nodes keyed by userId (positions are updated in place)
    :param item_nodes: item nodes keyed by movieId (positions are updated in place)
    :param epoch: maximum number of passes over the training set
    :param learning_rate: step size passed to ``update``
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :param log: whether to print progress
    :param eval_every: number of passes between two MAE checks (default 10)
    :param min_delta: minimum MAE improvement required to keep training (default 0.03)
    :return: trained (user_nodes, item_nodes)
    :raises ValueError: if eval_every is smaller than 1
    """
    if eval_every < 1:
        raise ValueError(f"eval_every must be a positive integer, got {eval_every}")

    prev_mae = None     # MAE measured at the previous check, None before the first check
    for i in range(epoch):
        if log:
            print(f"epoch: {i+1}")
        user_nodes, item_nodes = update(train_set, user_nodes, item_nodes, learning_rate, maxR, minR)
        if i % eval_every != 0:
            continue

        # Score a copy: predict() writes a "predict" column into the frame it is
        # given, and the caller's training set should not be altered by training.
        predictions_train = predict(train_set.copy(), user_nodes, item_nodes, maxR, minR)
        mae = np.mean(np.abs(predictions_train["predict"] - predictions_train["rating"]))
        if log:
            print(f"MAE_train: {mae}")
        # The first check only records a baseline; later checks may stop training.
        if prev_mae is not None and prev_mae - mae < min_delta:
            break
        prev_mae = mae
    return user_nodes, item_nodes

In [14]:
from copy import deepcopy
# Train on deep copies so the untrained _user_nodes/_item_nodes are left as they are.
_user_nodes_trained, _item_nodes_trained = train(train_set, deepcopy(_user_nodes), deepcopy(_item_nodes), epoch=200, learning_rate=learning_rate, maxR=5., minR=0.5, log=True)

epoch: 1
MAE_train: 0.7519877363925526
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
MAE_train: 0.5546463433826863
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
epoch: 20
epoch: 21
MAE_train: 0.5255251908382255


In [15]:
# Predict the held-out fold; predict() returns valid_set itself with a "predict" column added.
_predictions = predict(valid_set, _user_nodes_trained, _item_nodes_trained, maxR=5., minR=0.5)

In [16]:
# Display the validation rows together with their predicted ratings.
_predictions

Unnamed: 0,userId,movieId,rating,predict
67037,432,77866,4.5,3.408221
42175,288,474,3.0,3.131054
93850,599,4351,3.0,2.771411
6187,42,2987,4.0,3.573726
12229,75,1610,4.0,3.379172
...,...,...,...,...
84545,543,1387,5.0,3.548884
52065,339,1580,2.0,3.574282
92269,597,1090,5.0,4.329528
18346,116,30749,4.5,3.233753


In [17]:
# Validation error of the uncorrected model.
_residual = _predictions["predict"] - _predictions["rating"]
mae = np.mean(np.abs(_residual))
rmse = np.sqrt(np.mean(np.square(_residual)))
print(f"MAE: {mae}, RMSE: {rmse}")

MAE: 0.687293833387285, RMSE: 0.8780510251626551


In [18]:
def correction_user(train_set:pd.DataFrame, user_node:Node, item_nodes:dict[int, Node], maxR:float=5.0, minR:float=0.5)->float:
    """
    Compute the rating correction for a single user.

    The correction is the weighted mean of the training errors
    (real rating - predicted rating) over the given rows. Each row is weighted by
    the inverse of the user-item distance, normalised so the weights sum to 1.

    :param train_set: training rows of this one user ("movieId" and "rating" columns)
    :param user_node: node of the user
    :param item_nodes: all item nodes keyed by movieId
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :return: correction for the user, 0.0 when there are no rows
    """
    if len(train_set) == 0:
        return 0.0

    ratings = train_set["rating"].to_numpy(dtype=float)
    # One distance per row, computed once and reused for weights and predictions.
    distances = np.array([
        np.linalg.norm(user_node.position - item_nodes[int(movie_id)].position)
        for movie_id in train_set["movieId"]
    ])

    # Inverse-distance weights. A zero distance is floored at machine epsilon so the
    # weight stays finite; otherwise it would become inf and the result NaN.
    weights = 1.0 / np.maximum(distances, np.finfo(float).eps)
    # Normalising per row keeps the weights summing to 1 even if a movie appears twice.
    weights /= weights.sum()

    pred_ratings = maxR - (maxR - minR) * distances / 100
    errors = ratings - pred_ratings
    return float(np.dot(errors, weights))

In [19]:
def correction_item(train_set:pd.DataFrame, item_node:Node, user_nodes:dict[int, Node], maxR:float=5.0, minR:float=0.5)->float:
    """
    Compute the rating correction for a single item.

    The correction is the weighted mean of the training errors
    (real rating - predicted rating) over the given rows. Each row is weighted by
    the inverse of the user-item distance, normalised so the weights sum to 1.

    :param train_set: training rows of this one item ("userId" and "rating" columns)
    :param item_node: node of the item
    :param user_nodes: all user nodes keyed by userId
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :return: correction for the item, 0.0 when there are no rows
    """
    if len(train_set) == 0:
        return 0.0

    ratings = train_set["rating"].to_numpy(dtype=float)
    # One distance per row, computed once and reused for weights and predictions.
    distances = np.array([
        np.linalg.norm(user_nodes[int(user_id)].position - item_node.position)
        for user_id in train_set["userId"]
    ])

    # Inverse-distance weights. A zero distance is floored at machine epsilon so the
    # weight stays finite; otherwise it would become inf and the result NaN.
    weights = 1.0 / np.maximum(distances, np.finfo(float).eps)
    # Normalising per row keeps the weights summing to 1 even if a user appears twice.
    weights /= weights.sum()

    pred_ratings = maxR - (maxR - minR) * distances / 100
    errors = ratings - pred_ratings
    return float(np.dot(errors, weights))

In [20]:
def get_correction(train_set:pd.DataFrame, user_nodes:dict[int, Node], item_nodes:dict[int, Node], maxR:float=5.0, minR:float=0.5)->tuple[dict, dict]:
    """
    Compute the correction value of every user and every item in the training set.

    :param train_set: training ratings
    :param user_nodes: user nodes keyed by userId
    :param item_nodes: item nodes keyed by movieId
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :return: (corrections keyed by userId, corrections keyed by movieId)
    """
    # Each user is corrected from that user's own training rows ...
    per_user = {
        user_id: correction_user(rows, user_nodes[user_id], item_nodes, maxR, minR)
        for user_id, rows in train_set.groupby("userId")
    }
    # ... and each item from the rows that rate that item.
    per_item = {
        movie_id: correction_item(rows, item_nodes[movie_id], user_nodes, maxR, minR)
        for movie_id, rows in train_set.groupby("movieId")
    }
    return per_user, per_item

In [21]:
# Derive per-user and per-item corrections from the training fold and the trained nodes.
_user_correction, _item_correction = get_correction(train_set, _user_nodes_trained, _item_nodes_trained, maxR=5., minR=0.5)

In [22]:
# Inspect the per-user corrections.
# NOTE(review): this echoes the whole dict; consider showing only a few entries.
_user_correction

{1: 0.39311606310169706,
 2: 0.17935325463148716,
 3: 0.7072872815774133,
 4: 0.44367697939243483,
 5: 0.21152041121918286,
 6: 0.2275641888355766,
 7: 0.29943849868101313,
 8: 0.16919347881446292,
 9: 0.36039343661686085,
 10: 0.40189424703211957,
 11: 0.24337687494224608,
 12: 0.7421396736481135,
 13: 0.29527444377157386,
 14: 0.19013131626212396,
 15: 0.27521566620516225,
 16: 0.07331101430761663,
 17: 0.06577547730932637,
 18: 0.15302165413045363,
 19: -0.017396171756738275,
 20: 0.32955154317155794,
 21: 0.18042376359412288,
 22: 0.0398777586517953,
 23: 0.10230024329789589,
 24: 0.051276561067992686,
 25: 0.49342981044747136,
 26: 0.05987009071447853,
 27: 0.28528290947761975,
 28: 0.04216067622299459,
 29: 0.12983963264255635,
 30: 0.3886178412963113,
 31: 0.27419101618405417,
 32: 0.14501179941697664,
 33: 0.26770571345966926,
 34: 0.350939747630496,
 35: 0.3869966139770021,
 36: 0.052516475880969166,
 37: 0.37400387284988956,
 38: 0.21013725828144914,
 39: 0.24142624924749814,

In [23]:
# Inspect the per-item corrections.
# NOTE(review): this echoes the whole dict; consider showing only a few entries.
_item_correction

{1: 0.19944646644379752,
 2: 0.16784243820733796,
 3: 0.22306617729200695,
 4: 0.131860186398285,
 5: 0.16462544279344057,
 6: 0.190604560308143,
 7: 0.15571962076067813,
 8: 0.2485960657682029,
 9: 0.19271311050891113,
 10: 0.16664955978993629,
 11: 0.15615039369796888,
 12: 0.2024476849157346,
 13: 0.01920374806847938,
 14: 0.1673776197173874,
 15: 0.24254409833522467,
 16: 0.24012431438226356,
 17: 0.31807357643696726,
 18: 0.21015379031679862,
 19: 0.08177807822399805,
 20: 0.13649101772850766,
 21: 0.1442120484061436,
 22: 0.1223634140060546,
 23: 0.11880947179699669,
 24: 0.2025796411290512,
 25: 0.2733180005234449,
 26: 0.02201671879173106,
 27: 0.32539555861946173,
 28: 0.4570860312203992,
 29: 0.2655538943004318,
 30: 0.37596458036451175,
 31: 0.18505899995635114,
 32: 0.2115089169371167,
 34: 0.2609124835465159,
 36: 0.15748897505878698,
 38: -0.037757627946679556,
 39: 0.1620599183594287,
 40: 0.7077344789719909,
 41: 0.1571415595217415,
 42: 0.07988460083142379,
 43: 0.0025

In [24]:
def DTEC(predictions:pd.DataFrame, user_corrections:dict, item_corrections:dict)->pd.DataFrame:
    """
    Dual Training Error based Correction.

    Adds the mean of the user's and the item's correction to each predicted
    rating. A row is left unchanged unless corrections exist for both its user
    and its item.

    :param predictions: frame with "userId", "movieId" and "predict" columns
    :param user_corrections: corrections keyed by userId
    :param item_corrections: corrections keyed by movieId
    :return: the same ``predictions`` object, with "predict" adjusted in place
    """
    adjustments = []
    for user, item in zip(predictions["userId"], predictions["movieId"]):
        user, item = int(user), int(item)
        if user in user_corrections and item in item_corrections:
            adjustments.append((user_corrections[user] + item_corrections[item]) / 2)
        else:
            adjustments.append(0.0)     # no correction without both values

    # Applied by position in one assignment, so repeated index labels cannot make a
    # correction hit the wrong row or be added more than once.
    predictions["predict"] = predictions["predict"].to_numpy(dtype=float) + np.asarray(adjustments, dtype=float)
    return predictions

In [25]:
# Correct a deep copy so the uncorrected _predictions stay available for comparison.
_predictions_corrected = DTEC(deepcopy(_predictions), _user_correction, _item_correction)

In [26]:
# Show the corrected and the uncorrected predictions together.
_predictions_corrected, _predictions

(       userId  movieId  rating   predict
 67037     432    77866     4.5  3.453385
 42175     288      474     3.0  3.315516
 93850     599     4351     3.0  2.849962
 6187       42     2987     4.0  3.745268
 12229      75     1610     4.0  3.550738
 ...       ...      ...     ...       ...
 84545     543     1387     5.0  4.103573
 52065     339     1580     2.0  3.727486
 92269     597     1090     5.0  4.600403
 18346     116    30749     4.5  3.369318
 17425     111     5025     3.0  2.955428
 
 [10083 rows x 4 columns],
        userId  movieId  rating   predict
 67037     432    77866     4.5  3.408221
 42175     288      474     3.0  3.131054
 93850     599     4351     3.0  2.771411
 6187       42     2987     4.0  3.573726
 12229      75     1610     4.0  3.379172
 ...       ...      ...     ...       ...
 84545     543     1387     5.0  3.548884
 52065     339     1580     2.0  3.574282
 92269     597     1090     5.0  4.329528
 18346     116    30749     4.5  3.233753
 1742

In [27]:
print("After DTEC:")
# Validation error of the corrected predictions.
_residual_corrected = _predictions_corrected["predict"] - _predictions_corrected["rating"]
mae = np.abs(_residual_corrected).mean()
rmse = np.sqrt((_residual_corrected ** 2).mean())
print("MAE:", mae, "RMSE:", rmse)

After DTEC:
MAE: 0.6569355756912727 RMSE: 0.8617082972997369


# SCoR

In [28]:
def SCoR(rating_path:str, fold:int=10, learning_rate:float=0.001, dimension:int=40, epoch:int=20):
    """
    Evaluate SCoR with k-fold cross-validation.

    Prints the MAE and RMSE of every fold, followed by their means over all folds.

    :param rating_path: path of the ratings CSV
    :param fold: number of cross-validation folds
    :param learning_rate: learning rate
    :param dimension: dimensionality of the nodes
    :param epoch: maximum number of training iterations per fold
    """
    print("SCoR")
    ratings = data_preprocess(rating_path)
    # The rating scale is read from the data itself.
    rating_column = ratings["rating"]
    maxR = rating_column.max()
    minR = rating_column.min()

    MAEs, RMSEs = [], []
    for fold_no, (train_set, valid_set) in enumerate(k_fold_split(ratings, fold), start=1):
        print("fold:", fold_no)
        # Fresh random nodes for every fold, built from the whole data set.
        user_nodes, item_nodes = get_nodes(ratings, dimension=dimension)
        user_nodes_trained, item_nodes_trained = train(
            train_set, user_nodes, item_nodes,
            epoch=epoch, learning_rate=learning_rate, maxR=maxR, minR=minR,
        )
        # Predict on a copy of the validation fold.
        predictions = predict(valid_set.copy(), user_nodes_trained, item_nodes_trained, maxR, minR)

        residual = predictions["predict"] - predictions["rating"]
        fold_mae = np.abs(residual).mean()
        fold_rmse = np.sqrt(np.mean(np.square(residual)))
        MAEs.append(fold_mae)
        RMSEs.append(fold_rmse)
        print("MAE:", fold_mae, "RMSE:", fold_rmse)
    print()
    print("MAE:", np.mean(MAEs), "RMSE:", np.mean(RMSEs))

In [29]:
# 10-fold evaluation of plain SCoR with the hyper-parameters defined at the top.
SCoR("ratings.csv", fold=10, learning_rate=learning_rate, dimension=dimension, epoch=epoch)

SCoR
fold: 1
MAE: 0.684886347669925 RMSE: 0.8748901725348186
fold: 2
MAE: 0.6791876866958317 RMSE: 0.8669003909167764
fold: 3
MAE: 0.6707264265182281 RMSE: 0.8567295696569738
fold: 4
MAE: 0.6758933553781776 RMSE: 0.8652616088966502
fold: 5
MAE: 0.6825408801745466 RMSE: 0.8669724197501856
fold: 6
MAE: 0.6618290424124481 RMSE: 0.8489410387093318
fold: 7
MAE: 0.6809241318028194 RMSE: 0.870372503798219
fold: 8
MAE: 0.6743565831809151 RMSE: 0.8577357747220082
fold: 9
MAE: 0.677370647820086 RMSE: 0.8646639929438499
fold: 10
MAE: 0.6833212706910403 RMSE: 0.8752092389320032

MAE: 0.6771036372344018 RMSE: 0.8647676710860817


# DTEC-SCoR

In [30]:
def DTEC_SCoR(rating_path:str, fold:int=10, learning_rate:float=0.001, dimension:int=40, epoch:int=20):
    """
    Evaluate SCoR followed by the DTEC correction with k-fold cross-validation.

    Prints the MAE and RMSE of every fold, followed by their means over all folds.

    :param rating_path: path of the ratings CSV
    :param fold: number of cross-validation folds
    :param learning_rate: learning rate
    :param dimension: dimensionality of the nodes
    :param epoch: maximum number of training iterations per fold
    """
    print("DTEC-SCoR")
    ratings = data_preprocess(rating_path)
    # The rating scale is read from the data itself.
    rating_column = ratings["rating"]
    maxR = rating_column.max()
    minR = rating_column.min()

    MAEs, RMSEs = [], []
    for fold_no, (train_set, valid_set) in enumerate(k_fold_split(ratings, fold), start=1):
        print("fold:", fold_no)
        # Fresh random nodes for every fold, built from the whole data set.
        user_nodes, item_nodes = get_nodes(ratings, dimension=dimension)
        user_nodes_trained, item_nodes_trained = train(
            train_set, user_nodes, item_nodes,
            epoch=epoch, learning_rate=learning_rate, maxR=maxR, minR=minR,
        )
        # Predict on a copy of the validation fold.
        predictions = predict(valid_set.copy(), user_nodes_trained, item_nodes_trained, maxR, minR)
        # Corrections come from the training fold only.
        user_correction, item_correction = get_correction(train_set, user_nodes_trained, item_nodes_trained, maxR, minR)
        predictions_corrected = DTEC(predictions, user_correction, item_correction)

        residual = predictions_corrected["predict"] - predictions_corrected["rating"]
        fold_mae = np.abs(residual).mean()
        fold_rmse = np.sqrt(np.mean(np.square(residual)))
        MAEs.append(fold_mae)
        RMSEs.append(fold_rmse)
        print("MAE:", fold_mae, "RMSE:", fold_rmse)
    print()
    print("MAE:", np.mean(MAEs), "RMSE:", np.mean(RMSEs))

In [31]:
# 10-fold evaluation of SCoR with the DTEC correction, using the same hyper-parameters.
DTEC_SCoR("ratings.csv", fold=10, learning_rate=learning_rate, dimension=dimension, epoch=epoch)

DTEC-SCoR
fold: 1
MAE: 0.6561091112522638 RMSE: 0.8593247281302276
fold: 2
MAE: 0.6460959709036869 RMSE: 0.8459473964457924
fold: 3
MAE: 0.6402665463745283 RMSE: 0.8372040320271203
fold: 4
MAE: 0.6452944067571341 RMSE: 0.8454180900216479
fold: 5
MAE: 0.6483738087206062 RMSE: 0.8436732513455588
fold: 6
MAE: 0.6322705159313388 RMSE: 0.8298306607044306
fold: 7
MAE: 0.6510552284451218 RMSE: 0.8520861543723344
fold: 8
MAE: 0.6420960887441406 RMSE: 0.835368497005186
fold: 9
MAE: 0.6475348570407073 RMSE: 0.8439198625617543
fold: 10
MAE: 0.6536459249482498 RMSE: 0.8562608477101622

MAE: 0.6462742459117777 RMSE: 0.8449033520324214


In [32]:
# Seconds elapsed since start_time was recorded in the first cell.
print("time:", time.time() - start_time)

time: 2279.0144531726837
