In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from User_based_CF import *

In [2]:
# Create a user-item matrix
def create_user_item_matrix(data):
    """
    data: (user, item, rating, timestamp)
    """
    user_list = rating_data["User_id"].values
    item_list = rating_data["Item_id"].values
    rating_list = rating_data["Rating"].values
    user_item_matrix_data = pd.crosstab(index=user_list, columns=item_list, values=rating_list, aggfunc=np.mean,\
        rownames=["User_id"], colnames=["Item_id"])
    return user_item_matrix_data

In [3]:
# Identify whether the value exists or not.
def identify_value_exist(user_item_matrix_data):
    """
    user_item_matrix_data: DataFrame
    """
    return (user_item_matrix_data.isna() == False).astype("int")

In [4]:
# Load data to dataframe
with open("ratings.data", "r") as f:
    rating_data = [i.replace("\n", "").split("\t") for i in f.readlines()]
rating_data = pd.DataFrame(np.array(rating_data), columns=["User_id", "Item_id", "Rating", "timestamp"]).astype("int")

# transform timestamp into datetime
rating_data["timestamp"] = [datetime.utcfromtimestamp(i) for i in rating_data["timestamp"]]
rating_data.head()

Unnamed: 0,User_id,Item_id,Rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [5]:
# split data
traindata, testdata = train_test_split(rating_data, test_size=0.2, random_state=12345)

In [6]:
testdata.head()

Unnamed: 0,User_id,Item_id,Rating,timestamp
71751,788,726,4,1997-11-30 06:25:28
80493,728,319,3,1997-11-13 17:36:52
2655,58,284,4,1998-01-09 00:08:39
53233,763,11,4,1997-11-07 15:58:53
91141,890,194,5,1997-12-17 23:52:54


In [6]:
# transform train data into user-item matrix
user_item_matrix_data = create_user_item_matrix(traindata)

In [7]:
# User-based Collaborative Filtering
user_cf = User_based_CF(traindata, user_item_matrix_data)
user_user_correlation_data = user_cf.compute_correlation(corr_methods="pearson")

100%|██████████| 889249/889249 [05:15<00:00, 2819.02it/s]


In [12]:
# 針對test data做預測以及模型評估
pred_data = list(map(lambda x: user_cf.predict_without_time(testdata.iloc[x, 0], testdata.iloc[x, 1], num_user=10), tqdm([i for i in range(testdata.shape[0])])))

 44%|████▎     | 8710/20000 [07:00<09:34, 19.65it/s]

[3.0000000000000004,
 2.0,
 3.66972170069632,
 4.0,
 3.8104925876271816,
 3.8771173419678995,
 4.0,
 4.189249651799858,
 4.727284549655083,
 4.0]

2.0