In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import itertools

from User_based_CF import *
from Item_based_CF import *

import torch
import torch.nn as nn

from sklearn.metrics import mean_squared_error
import math

In [4]:
# Create a user-item matrix
def create_user_item_matrix(data):
    """
    data: (user, item, rating, timestamp)
    """
    user_list = rating_data["User_id"].values
    item_list = rating_data["Item_id"].values
    rating_list = rating_data["Rating"].values
    user_item_matrix_data = pd.crosstab(index=user_list, columns=item_list, values=rating_list, aggfunc=np.mean,\
        rownames=["User_id"], colnames=["Item_id"])
    return user_item_matrix_data

In [5]:
# Identify whether the value exists or not.
def identify_value_exist(user_item_matrix_data):
    """
    user_item_matrix_data: DataFrame
    """
    return (user_item_matrix_data.isna() == False).astype("int")

In [6]:
# Load data to dataframe
with open("ratings.data", "r") as f:
    rating_data = [i.replace("\n", "").split("\t") for i in f.readlines()]
rating_data = pd.DataFrame(np.array(rating_data), columns=["User_id", "Item_id", "Rating", "timestamp"]).astype("int")

# transform timestamp into datetime
rating_data["timestamp"] = [datetime.utcfromtimestamp(i) for i in rating_data["timestamp"]]
rating_data.head()

Unnamed: 0,User_id,Item_id,Rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [7]:
# split data
traindata, testdata = train_test_split(rating_data, test_size=0.25, random_state=12345)

In [8]:
# transform train data into user-item matrix
user_item_matrix_data = create_user_item_matrix(traindata)

In [11]:
similarity_method = ["pearson", "cosine"]
K_list = [3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
result_dict = dict()

for one_similarity_method in similarity_method:
    # User-based Collaborative Filtering
    user_cf = User_based_CF(traindata, user_item_matrix_data)
    user_user_correlation_data = user_cf.compute_correlation(corr_methods=one_similarity_method)

    for K in K_list:
        # 針對test data做預測以及模型評估（注意，每次計算是針對一筆資料）
        pred_user_data = list(map(lambda x: user_cf.predict_without_time(testdata.iloc[x, 0], testdata.iloc[x, 1], num_user=K), tqdm([i for i in range(testdata.shape[0])])))
        pred_user_data = [i if i > 0 else 0 for i in pred_user_data]
        result_dict[f"user-based_{one_similarity_method}_{K}"] = math.sqrt(mean_squared_error(y_true=testdata["Rating"].values, y_pred=np.array(pred_user_data)))

    # Item-based Collaborative Filtering
    item_cf = Item_based_CF(traindata, user_item_matrix_data)
    item_item_correlation_data = item_cf.compute_correlation(corr_methods=one_similarity_method)

    for K in K_list:
        # 針對test data做預測以及模型評估（注意，每次計算是針對一筆資料）
        pred_user_data = list(map(lambda x: user_cf.predict_without_time(testdata.iloc[x, 0], testdata.iloc[x, 1], num_user=K), tqdm([i for i in range(testdata.shape[0])])))
        pred_user_data = [i if i > 0 else 0 for i in pred_user_data]
        result_dict[f"item-based_{one_similarity_method}_{K}"] = math.sqrt(mean_squared_error(y_true=testdata["Rating"].values, y_pred=np.array(pred_user_data)))

100%|██████████| 889249/889249 [04:55<00:00, 3007.95it/s]
100%|██████████| 25000/25000 [13:15<00:00, 31.42it/s]
100%|██████████| 25000/25000 [14:03<00:00, 29.63it/s]
100%|██████████| 25000/25000 [14:18<00:00, 29.12it/s]
100%|██████████| 25000/25000 [14:46<00:00, 28.21it/s]
100%|██████████| 25000/25000 [15:16<00:00, 27.27it/s]
100%|██████████| 25000/25000 [15:45<00:00, 26.44it/s]
100%|██████████| 25000/25000 [16:14<00:00, 25.65it/s]
100%|██████████| 25000/25000 [16:52<00:00, 24.69it/s]
100%|██████████| 25000/25000 [21:33<00:00, 19.33it/s]
100%|██████████| 25000/25000 [26:23<00:00, 15.78it/s]
100%|██████████| 25000/25000 [31:19<00:00, 13.30it/s]
100%|██████████| 25000/25000 [35:54<00:00, 11.61it/s]
100%|██████████| 25000/25000 [42:45<00:00,  9.75it/s]
100%|██████████| 25000/25000 [47:17<00:00,  8.81it/s]
100%|██████████| 25000/25000 [52:10<00:00,  7.99it/s]
 94%|█████████▍| 23455/25000 [54:56<03:35,  7.17it/s]

100%|██████████| 2829124/2829124 [05:13<00:00, 9027.61it/s]


In [27]:
# Matrix Factorization

# 1. 建立模型→使用pytorch class
class matrix_factorization(nn.Module):
    def __init__(self, num_user_id, num_item_id, num_features):
        super(matrix_factorization, self).__init__()
        self.num_user_id = num_user_id
        self.num_item_id = num_item_id
        self.num_features = num_features
        self.p_matrix = torch.randn(size=(num_user_id, num_features), requires_grad=True)
        self.q_matrix = nn.Linear(num_features, num_item_id)
        return

    def forward(self):
        X = self.q_matrix(self.p_matrix)
        return X

In [None]:
# 2. 建立Loss function
class mf_loss_function_nobias(nn.Module):
    def __init__(self):
        super(mf_loss_function_nobias, self).__init__()
        return

    def forward(self, pred_user_item_matrix, true_user_item_matrix):
        true_user_item_matrix_value_or_not = (true_user_item_matrix > 0).long()
        return

In [77]:
# 2. 建立Loss function→不使用pytorch class
def loss_func(true_user_item_matrix, pred_user_item_matrix, p_matrix, q_matrix, _lambda_=1e-3):
    """
    true_user_item_matrix: 有遺失值的data.frame
    pred_user_item_matrix: 由模型計算出來的ndarray
    """
    # 2.1 先辨識出該值是否為有值
    identify_true_value_matrix = (true_user_item_matrix > 0).astype("int")

    # 2.2 把真實值遺失值的部分補零，並且把它轉成ndarray
    true_user_item_matrix = true_user_item_matrix.fillna(0).values

    # 2.3 計算y_true-y_pred
    true_minus_pred = true_user_item_matrix-pred_user_item_matrix

    # 2.4 計算loss
    loss = (np.sum( np.power(true_minus_pred, 2) )/2 + _lambda_ * (np.power(p_matrix, 2).sum()+np.power(q_matrix, 2).sum()))/2
    return true_minus_pred, loss

In [54]:
# 3. 計算gradients
def compute_gradients(true_minus_pred, p_matrix, q_matrix, _lambda_=1e-3):
    gradient_p_matrix = 2 * np.sum( np.dot(true_minus_pred, q_matrix.T) ) + _lambda_ * math.sqrt(np.power(p_matrix, 2).sum())
    gradient_q_matrix = 2 * np.sum( np.dot(true_minus_pred.T, p_matrix)  ) + _lambda_ * math.sqrt(np.power(q_matrix, 2).sum())
    return gradient_p_matrix, gradient_q_matrix

In [68]:
# 4. Weight Updated
def weight_updated(p_matrix, q_matrix, gradient_p_matrix, gradient_q_matrix, learning_rate):
    p_matrix += learning_rate * gradient_p_matrix
    q_matrix += learning_rate * gradient_q_matrix
    return p_matrix, q_matrix

In [80]:
# 5. 建立整體訓練流程
num_user_id = 943
num_item_id = 1682
num_features = 20
_lambda_ = 0.1

# 1. 建立模型→不使用pytorch class
p_matrix = np.random.random(size=(num_user_id, num_features))
q_matrix = np.random.random(size=(num_features, num_item_id))
epochs = 10
learning_rate = 1e-5

for epoch in range(epochs):
    print(f"=== Epoch: {epoch} ===")
    pred_user_item_matrix = np.dot(p_matrix, q_matrix)
    true_minus_pred, loss = loss_func(true_user_item_matrix=user_item_matrix_data, pred_user_item_matrix=pred_user_item_matrix, p_matrix=p_matrix, q_matrix=q_matrix)
    gradient_p_matrix, gradient_q_matrix = compute_gradients(true_minus_pred=true_minus_pred, p_matrix=p_matrix, q_matrix=q_matrix, _lambda_=_lambda_)
    p_matrix, q_matrix = weight_updated(p_matrix, q_matrix, gradient_p_matrix, gradient_q_matrix, learning_rate)
    print(gradient_p_matrix, gradient_q_matrix)
    print(f"Loss is {loss}")

=== Epoch: 0 ===
-155168066.82713935 -154659572.49637195
Loss is 9818006.68358793
=== Epoch: 1 ===
4.705025592135713e+18 4.720503105928497e+18
Loss is 9.12292925184716e+20
=== Epoch: 2 ===
-1.3303509884192438e+50 -1.3259890538305355e+50
Loss is 7.824169308536507e+62
=== Epoch: 3 ===
2.9680675835167162e+144 2.977831251336412e+144
Loss is 4.935714554283228e+188
=== Epoch: 4 ===
-inf -inf
Loss is inf
=== Epoch: 5 ===
inf inf
Loss is inf
=== Epoch: 6 ===
nan nan
Loss is nan
=== Epoch: 7 ===
nan nan
Loss is nan
=== Epoch: 8 ===
nan nan
Loss is nan
=== Epoch: 9 ===
nan nan
Loss is nan
