In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import itertools

from User_based_CF import *
from Item_based_CF import *
from Matrix_Factorization import *


import torch
import torch.nn as nn

from sklearn.metrics import mean_squared_error
import math


import json

In [4]:
# Create a user-item matrix
def create_user_item_matrix(data, user_column_name, item_column_name, result_name):
    """
    data: (user_column_name, item_column_name, result_name, timestamp)
    """
    user_list = rating_data.iloc[:, 0].values
    item_list = rating_data[item_column_name].iloc[:, 0].values
    rating_list = rating_data[result_name].values
    user_item_matrix_data = pd.crosstab(index=user_list, columns=item_list, values=rating_list, aggfunc=np.mean,\
        rownames=[list(data.index)], colnames=[list(data.columns)])
    return user_item_matrix_data

In [5]:
# Identify whether the value exists or not.
def identify_value_exist(user_item_matrix_data):
    """
    user_item_matrix_data: DataFrame
    """
    return (user_item_matrix_data.isna() == False).astype("int")

In [6]:
# Create a user-item matrix
def create_user_item_matrix_for_matrix_factorization(data, unique_user_id, unique_item_id):
    """
    data: (user, item, rating, timestamp)
    """
    user_item_matrix_data = pd.DataFrame(np.array([np.nan] * (len(unique_user_id) * len(unique_item_id))).reshape(len(unique_user_id), len(unique_item_id)),\
        index=unique_user_id, columns=unique_item_id)
    
    for one_index in data.index:
        user_item_matrix_data.loc[data.loc[one_index, "User_id"], data.loc[one_index, "Item_id"]] = \
            data.loc[one_index, "Rating"]
    return user_item_matrix_data

In [7]:
# 要建構四種資料，分別為User的特徵、Item的特徵、User-Item matrix與User對應Item的紀錄
def split_four_data(user_data, item_data, user_item_interaction_data, user_column_name, item_column_name, result_name):
    """
    user_data：使用者相關資料（user_id一定要放第一個column）
    item_data：物品相關資料（item_id一定要放第二個column）
    """
    all_data = list()
    if user_data != None:
        user_feature_data = user_data.iloc[:, 1:]
        all_data.append(user_feature_data)
    
    if item_data != None:
        item_feature_data = item_data.iloc[:, 1:]
        all_data.append(item_feature_data)
    
    if user_item_interaction_data != None:
        # transform train data into user-item matrix
        user_item_matrix_data = create_user_item_matrix(user_item_interaction_data, user_column_name, item_column_name, result_name)
        all_data.append(user_item_interaction_data)
        all_data.append(user_item_matrix_data)
    return all_data

# 將每種不同資料前處理

## Movielens

In [20]:
with open(r"data\Movielens\movie_genre.dat", "r") as f:
    movie_genre = [i.replace("\n", "").split("\t") for i in f.readlines()]
movie_genre = pd.DataFrame(np.array(movie_genre), columns=["movie_id", "genre"])

with open(r"data\Movielens\movie_movie(knn).dat", "r") as f:
    movie_movie = [i.replace("\n", "").split("\t") for i in f.readlines()]
movie_movie = pd.DataFrame(np.array(movie_movie), columns=["movie1", "movie2", "similarity"])

with open(r"data\Movielens\user_age.dat", "r") as f:
    user_age = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_age = pd.DataFrame(np.array(user_age), columns=["user_id", "age"])

with open(r"data\Movielens\user_occupation.dat", "r") as f:
    user_occupation = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_occupation = pd.DataFrame(np.array(user_occupation), columns=["user_id", "occupation"])

with open(r"data\Movielens\user_user(knn).dat", "r") as f:
    user_user = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_user = pd.DataFrame(np.array(user_user), columns=["user1", "user2", "similarity"])

with open(r"data\Movielens\user_movie.dat", "r") as f:
    user_movie = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_movie = pd.DataFrame(np.array(user_movie), columns=["user_id", "movie_id", "rating", "timestamp"])

In [23]:
merge_data = pd.merge(user_movie, user_age, how="left", on="user_id")
merge_data = pd.merge(merge_data, user_occupation, how="left", on="user_id")
merge_data = pd.merge(merge_data, movie_genre, how="left", on="movie_id")

In [26]:
user_feature_data = merge_data[["age", "occupation"]]
movie_feature_data = merge_data[["genre"]]

In [None]:
# 產生四種資料
user_feature_data, movie_feature_data, user_item_interaction_data, user_item_matrix_data =\
     split_four_data(user_feature_data, movie_feature_data, user_movie)

## Douban_Book

## Yelp

In [8]:
# Load data to dataframe
with open("ratings.data", "r") as f:
    rating_data = [i.replace("\n", "").split("\t") for i in f.readlines()]
rating_data = pd.DataFrame(np.array(rating_data), columns=["User_id", "Item_id", "Rating", "timestamp"]).astype("int")

# transform timestamp into datetime
rating_data["timestamp"] = [datetime.utcfromtimestamp(i) for i in rating_data["timestamp"]]
rating_data.head()

Unnamed: 0,User_id,Item_id,Rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [9]:
# split data
traindata, testdata = train_test_split(rating_data, test_size=0.25, random_state=12345)

In [None]:
# 產生四種資料
user_feature_data, movie_feature_data, user_item_interaction_data, user_item_matrix_data = split_four_data(user_data, item_data, user_item_interaction_data)

# Collaborative Filtering

In [7]:
similarity_method = ["pearson", "cosine"]
K_list = [3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
CF_result = dict()

for one_similarity_method in similarity_method:
    # User-based Collaborative Filtering
    user_cf = User_based_CF(traindata, user_item_matrix_data)
    user_user_correlation_data = user_cf.compute_correlation(corr_methods=one_similarity_method)

    for K in K_list:
        # 針對test data做預測以及模型評估（注意，每次計算是針對一筆資料）
        pred_user_data = list(map(lambda x: user_cf.predict_without_time(testdata.iloc[x, 0], testdata.iloc[x, 1], num_user=K), tqdm([i for i in range(testdata.shape[0])])))
        pred_user_data = [i if i > 0 else 0 for i in pred_user_data]
        CF_result[f"user-based_{one_similarity_method}_{K}"] = math.sqrt(mean_squared_error(y_true=testdata["Rating"].values, y_pred=np.array(pred_user_data)))

    # Item-based Collaborative Filtering
    item_cf = Item_based_CF(traindata, user_item_matrix_data)
    item_item_correlation_data = item_cf.compute_correlation(corr_methods=one_similarity_method)

    for K in K_list:
        # 針對test data做預測以及模型評估（注意，每次計算是針對一筆資料）
        pred_user_data = list(map(lambda x: user_cf.predict_without_time(testdata.iloc[x, 0], testdata.iloc[x, 1], num_user=K), tqdm([i for i in range(testdata.shape[0])])))
        pred_user_data = [i if i > 0 else 0 for i in pred_user_data]
        CF_result[f"item-based_{one_similarity_method}_{K}"] = math.sqrt(mean_squared_error(y_true=testdata["Rating"].values, y_pred=np.array(pred_user_data)))

100%|██████████| 889249/889249 [04:04<00:00, 3631.08it/s]
100%|██████████| 25000/25000 [13:46<00:00, 30.25it/s]
100%|██████████| 25000/25000 [13:57<00:00, 29.85it/s]
100%|██████████| 25000/25000 [14:30<00:00, 28.72it/s]
100%|██████████| 25000/25000 [15:21<00:00, 27.12it/s]
100%|██████████| 25000/25000 [15:51<00:00, 26.28it/s]
100%|██████████| 25000/25000 [16:22<00:00, 25.45it/s]
100%|██████████| 25000/25000 [16:56<00:00, 24.59it/s]
100%|██████████| 25000/25000 [17:57<00:00, 23.21it/s]
100%|██████████| 25000/25000 [22:41<00:00, 18.37it/s]
100%|██████████| 25000/25000 [27:04<00:00, 15.39it/s]
100%|██████████| 25000/25000 [32:40<00:00, 12.75it/s]
100%|██████████| 25000/25000 [37:48<00:00, 11.02it/s]
100%|██████████| 25000/25000 [44:33<00:00,  9.35it/s]
100%|██████████| 25000/25000 [49:31<00:00,  8.41it/s]
100%|██████████| 25000/25000 [53:32<00:00,  7.78it/s]
100%|██████████| 25000/25000 [1:00:04<00:00,  6.94it/s]
100%|██████████| 25000/25000 [1:07:35<00:00,  6.16it/s]
100%|██████████| 282

KeyboardInterrupt: 

# Matrix Factorization

In [99]:
class matrix_factorization():
    def __init__(self, true_user_item_matrix, num_features):
        """
        true_user_item_matrix：user與item的matrix，沒有補過值
        """
        self.user_id_list = list(true_user_item_matrix.index)
        self.item_id_list = list(true_user_item_matrix.columns)

        # 辨識該值是否真的有值
        self.identify_value_exist = torch.from_numpy( user_item_matrix_data.isna().astype("float").values )
        self.true_user_item_matrix = self.preprocessing_user_item_matrix(true_user_item_matrix)

        self.p_matrix = torch.randn(size=(len(self.user_id_list), num_features), requires_grad=True)
        self.q_matrix = torch.randn(size=(num_features, len(self.item_id_list)), requires_grad=True)

        # 計算global mean
        self.global_mean = torch.mean( self.true_user_item_matrix.flatten()[self.true_user_item_matrix.flatten().nonzero()] )

        # 計算bias of user and bias of item
        self.bu = torch.Tensor(list(map(lambda x: torch.mean(x[x.nonzero()]), self.true_user_item_matrix ))).reshape(shape=(-1, 1)) - self.global_mean
        self.bi = torch.Tensor(list(map(lambda x: torch.mean(x[x.nonzero()]), torch.transpose(self.true_user_item_matrix, 0, 1) ))).reshape(shape=(1, -1)) - self.global_mean
        return

    def preprocessing_user_item_matrix(self, true_user_item_matrix):
        # 把NaN全部補零
        fill_user_item_matrix_data = true_user_item_matrix.fillna(0)
        return torch.from_numpy(fill_user_item_matrix_data.values)
    
    def fit(self, epochs, learning_rate, regularization_rate, bias_or_not):
        # 建立空的儲存以存取Loss
        self.train_loss = list()

        # 定義loss function
        loss_func = nn.MSELoss()

        # 定義optimizer
        optimizer = torch.optim.SGD([self.p_matrix, self.q_matrix], lr=learning_rate, weight_decay=regularization_rate)

        for epoch in range(epochs):
            if bias_or_not == False:
                yhat = torch.tensordot(self.p_matrix, self.q_matrix, dims=([1], [0]))
            else:
                yhat = torch.tensordot(self.p_matrix, self.q_matrix, dims=([1], [0]))+self.bu+self.bi+self.global_mean
                
            yhat = yhat * identify_value_exist

            loss = loss_func(yhat, self.true_user_item_matrix)
            self.train_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            print(f"=== Train Loss: {loss.item()}")
        return

    def predict(self, user_id, item_id):
        yhat = torch.tensordot(self.p_matrix, self.q_matrix, dims=([1], [0]))
        yhat_dataframe = pd.DataFrame(yhat.detach().numpy(), index=self.user_id_list, columns=self.item_id_list)
        return yhat_dataframe.loc[user_id, item_id]

    def evaluate(self, testdata):
        """
        testdata：data.frame，<user_id, item_id, rating, (timestamp)>
        """
        testdata["yhat"] = list(map(lambda user, item: self.predict(user, item), testdata.iloc[:, 0], testdata.iloc[:, 1]))
        print(f"MSE: {mean_squared_error(y_true=testdata.iloc[:, 2], y_pred=testdata['yhat'])}\nr2_score: {r2_score(y_true=testdata.iloc[:, 2], y_pred=testdata['yhat'])}")
        return

    def save_model(self):
        return

In [8]:
epochs = 100
learning_rate = 1e-2
num_user_id = user_item_matrix_data.shape[0]
num_item_id = user_item_matrix_data.shape[1]
num_features = 10

model = matrix_factorization(true_user_item_matrix=user_item_matrix_data, num_features=num_features)

In [9]:
model.fit(epochs=epochs, learning_rate=learning_rate, regularization_rate=1e-2, bias_or_not=True)

=== Epoch: 0 Train Loss: 20.347803903557356
=== Epoch: 1 Train Loss: 20.343315536402333
=== Epoch: 2 Train Loss: 20.33882939400865
=== Epoch: 3 Train Loss: 20.33434546393661
=== Epoch: 4 Train Loss: 20.329863742936592
=== Epoch: 5 Train Loss: 20.325384238194857
=== Epoch: 6 Train Loss: 20.32090698039349
=== Epoch: 7 Train Loss: 20.31643194274621
=== Epoch: 8 Train Loss: 20.311959118716747
=== Epoch: 9 Train Loss: 20.307488476059742
=== Epoch: 10 Train Loss: 20.30302005638681
=== Epoch: 11 Train Loss: 20.29855385718001
=== Epoch: 12 Train Loss: 20.29408990454399
=== Epoch: 13 Train Loss: 20.289628141663844
=== Epoch: 14 Train Loss: 20.285168583568062
=== Epoch: 15 Train Loss: 20.280711219793393
=== Epoch: 16 Train Loss: 20.276256088627882
=== Epoch: 17 Train Loss: 20.271803161762897
=== Epoch: 18 Train Loss: 20.267352438306993
=== Epoch: 19 Train Loss: 20.262903901035056
=== Epoch: 20 Train Loss: 20.258457561614453
=== Epoch: 21 Train Loss: 20.254013430075798
=== Epoch: 22 Train Loss: 2

In [10]:
model.evaluate(testdata=testdata)

100%|██████████| 25000/25000 [00:53<00:00, 463.02it/s]

MSE: 24.362353662799624
r2_score: -18.258546776614086





# Factorization Machine

In [None]:
# 輸入資料的結構：<user_id, item_id, result, timestamp>

In [None]:
# 把所有東西都變成Label Encoding→Movielens
def movielens_onehotencoding(user_feature_data, movie_feature_data):
    user_age_onehotcoding = OneHotEncoder().fit(user_feature_data["user_age"])
    user_occupation_onehotencoding = OneHotEncoder().fit(user_feature_data["user_occupation"])
    movie_genre_onehotencoding = OneHotEncoder().fit(movie_feature_data["movie_genre"])
    return user_age_onehotcoding, user_occupation_onehotencoding, movie_genre_onehotencoding

In [None]:
# 建構模型→Movielens
class fm_model(nn.Module):
    def __init__(self, num_user_age, num_user_occupation, num_movie_genre, num_features):
        super(fm_model, self).__init__()
        self.user_age = nn.Linear(num_user_age, num_features)
        self.user_occupation = nn.Linear(num_user_occupation, num_features)
        self.movie_genre = nn.Linear(num_movie_genre, num_features)
        self.user_age_weight_linear = nn.Linear(num_user_age, 1)
        self.user_occupation_weight_linear = nn.Linear(num_user_occupation, 1)
        self.movie_genre_weight_linear = nn.Linear(num_movie_genre, 1)
        self.decoder = nn.Linear(6, 1)
        return

    def forward(self, user_age_feature, user_occupation_feature, movie_genre_feature):
        # Embedding Learning
        self.user_age_embedding = self.user_age(user_age_feature) # shape=(batch_size, num_features)
        self.user_occupation_embedding = self.user_occupation(user_occupation_feature) # shape=(batch_size, num_features)
        self.movie_genre_embedding = self.movie_genre(movie_genre_feature) # shape=(batch_size, num_features)
        self.user_age_weight = self.user_age_weight_linear(user_age_feature) # shape = (batch_size, 1)
        self.user_occupation_weight = self.user_occupation_weight_linear(user_occupation_feature) # shape = (batch_size, 1)
        self.movie_genre_weight = self.movie_genre_weight_linear(movie_genre_feature) # shape = (batch_size, 1)

        # Inner product
        self.user_age_user_occupation = self.user_age_embedding * self.user_occupation_embedding # shape=(batch_size, num_features)
        self.user_age_movie_genre = self.user_age_embedding * self.movie_genre_embedding # shape=(batch_size, num_features)
        self.user_occupation_movie_genre = self.user_occupation_embedding * self.movie_genre_embedding # shape=(batch_size, num_features)

        # Concatenate
        self.all = torch.cat((self.user_age_user_occupation, self.user_age_movie_genre, self.user_occupation_movie_genre,\
                              self.user_age_weight, self.user_occupation_weight, self.movie_genre_weight), dim=-1) 

        # Decoder
        X = self.decoder(self.all)
        return X

# GDBT+LR

# IPNN

# OPNN