In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from User_based_CF import *
from Item_based_CF import *

import torch
import torch.nn as nn

In [2]:
# Create a user-item matrix
def create_user_item_matrix(data):
    """
    Pivot a ratings table into a user x item rating matrix.

    data: DataFrame with at least the columns "User_id", "Item_id" and
        "Rating" (a "timestamp" column may be present but is not used).

    Returns a DataFrame indexed by "User_id" with one column per "Item_id".
    Each cell holds the mean rating for that (user, item) pair; pairs that
    never occur in `data` are NaN.
    """
    # Build the matrix from the `data` argument, NOT from the notebook-level
    # `rating_data`: reading the global would silently include held-out test
    # ratings whenever a training subset is passed in.
    user_item_matrix_data = pd.crosstab(
        index=data["User_id"].values,
        columns=data["Item_id"].values,
        values=data["Rating"].values,
        aggfunc="mean",
        rownames=["User_id"],
        colnames=["Item_id"],
    )
    return user_item_matrix_data

In [3]:
# Identify whether the value exists or not.
def identify_value_exist(user_item_matrix_data):
    """
    Build a 0/1 indicator of which cells hold a value.

    user_item_matrix_data: DataFrame

    Returns a DataFrame of the same shape and labels with 1 where the input
    cell is not NaN and 0 where it is.
    """
    observed_mask = user_item_matrix_data.notna()
    return observed_mask.astype("int")

In [4]:
# Load the tab-separated ratings file into a DataFrame.
# The file has no header row, so the four column names are supplied here.
rating_data = pd.read_csv(
    "ratings.data",
    sep="\t",
    header=None,
    names=["User_id", "Item_id", "Rating", "timestamp"],
    dtype="int64",
)

# Convert the epoch-second timestamps into (naive, UTC-based) datetimes.
# Vectorized replacement for a per-row datetime.utcfromtimestamp call, which
# is deprecated in recent Python versions.
rating_data["timestamp"] = pd.to_datetime(rating_data["timestamp"], unit="s")
rating_data.head()

Unnamed: 0,User_id,Item_id,Rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [5]:
# Hold out 20% of the ratings as test data; the fixed random_state keeps
# the split reproducible across runs.
traindata, testdata = train_test_split(
    rating_data,
    test_size=0.2,
    random_state=12345,
)

In [7]:
# Pivot the training ratings into a user x item matrix
user_item_matrix_data = create_user_item_matrix(data=traindata)

In [7]:
# User-based Collaborative Filtering
user_cf = User_based_CF(traindata, user_item_matrix_data)
user_user_correlation_data = user_cf.compute_correlation(corr_methods="pearson")

# Predict on the test data for model evaluation
# (note: each call scores a single test row).
# Columns 0 and 1 of testdata are User_id and Item_id respectively.
pred_user_data = [
    user_cf.predict_without_time(
        testdata.iloc[row, 0],
        testdata.iloc[row, 1],
        num_user=10,
    )
    for row in tqdm(range(testdata.shape[0]))
]
# Replace every prediction that is not strictly positive with 0.
pred_user_data = [pred if pred > 0 else 0 for pred in pred_user_data]

100%|██████████| 889249/889249 [05:15<00:00, 2819.02it/s]


In [8]:
# Item-based Collaborative Filtering
item_cf = Item_based_CF(traindata, user_item_matrix_data)
item_item_correlation_data = item_cf.compute_correlation(corr_methods="pearson")

# Predict on the test data for model evaluation
# (note: each call scores a single test row).
# The item id (column 1) is passed first, then the user id (column 0).
pred_item_data = [
    item_cf.predict_without_time(
        testdata.iloc[row, 1],
        testdata.iloc[row, 0],
        num_item=10,
    )
    for row in tqdm(range(testdata.shape[0]))
]
# Replace every prediction that is not strictly positive with 0.
pred_item_data = [pred if pred > 0 else 0 for pred in pred_item_data]

100%|██████████| 2829124/2829124 [05:13<00:00, 9027.61it/s]


In [22]:
# Matrix Factorization

# 1. Build the model
class matrix_factorization(nn.Module):
    """
    Matrix-factorization model that reconstructs a user x item matrix.

    The user factors live in `p_matrix` (num_user_id x num_features) and the
    item factors in the weight of `q_matrix`, an nn.Linear layer mapping
    num_features -> num_item_id (which also adds a per-item bias).
    """

    def __init__(self, num_user_id, num_item_id, num_features):
        """
        num_user_id: number of rows (users) in the reconstructed matrix
        num_item_id: number of columns (items) in the reconstructed matrix
        num_features: size of the latent factor dimension
        """
        super(matrix_factorization, self).__init__()
        self.num_user_id = num_user_id
        self.num_item_id = num_item_id
        self.num_features = num_features
        # Wrap the user factors in nn.Parameter so they are registered with
        # the module: a plain tensor would be skipped by optimizers,
        # .to(device) and state_dict(), i.e. it would never be trained.
        self.p_matrix = nn.Parameter(torch.randn(size=(num_user_id, num_features)))
        self.q_matrix = nn.Linear(num_features, num_item_id)

    def forward(self):
        """Return the (num_user_id, num_item_id) reconstructed matrix."""
        return self.q_matrix(self.p_matrix)

In [None]:
# 2. Build the loss function
class mf_loss_function(nn.Module):
    """
    Placeholder loss module for the matrix-factorization model.

    NOTE: `forward` is not implemented yet and returns None.
    """

    def __init__(self):
        # Pass `self` to super(): the one-argument form super(cls) is unbound,
        # so nn.Module.__init__ never ran and calling the instance failed.
        super(mf_loss_function, self).__init__()

    def forward(self):
        """Not implemented yet; returns None."""
        return

In [23]:
# NOTE(review): the user/item counts are hard-coded here — confirm they match
# the dimensions of the training user-item matrix.
mf_model = matrix_factorization(
    num_user_id=983,
    num_item_id=1682,
    num_features=5,
)

In [24]:
# Call the module itself rather than .forward() so that nn.Module.__call__
# runs (including any registered forward hooks) before delegating to forward.
X = mf_model()

In [26]:
# Display X, the tensor returned by the model's forward pass.
X

tensor([[-0.3092,  0.1653, -0.0395,  ...,  0.0210, -0.4245, -0.3916],
        [-1.6193,  0.5300, -0.9046,  ..., -0.7070, -1.1013,  1.1863],
        [-1.4282,  0.9486,  0.3203,  ..., -0.6128, -0.5700,  0.6090],
        ...,
        [-0.9301, -1.2219, -0.6867,  ..., -0.9112,  0.1688, -0.2855],
        [ 1.1720, -0.3517, -0.7473,  ...,  0.7841, -0.6798, -0.8881],
        [-1.0077,  0.5090, -1.8312,  ..., -0.9504, -1.7188,  0.7250]],
       grad_fn=<AddmmBackward>)