In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import itertools

from User_based_CF import *
from Item_based_CF import *
from Matrix_Factorization import *
from Factorization_Machine import *
from IPNN_model import *


import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import mean_squared_error
import math


import json

In [2]:
# Create a user-item matrix
def create_user_item_matrix(data, user_column_name, item_column_name, result_name):
    """
    data: (user_column_name, item_column_name, result_name, timestamp)
    """
    user_list = data.iloc[:, 0]
    item_list = data.iloc[:, 1]
    rating_list = data[result_name].values
    user_item_matrix_data = pd.crosstab(index=user_list, columns=item_list, values=rating_list, aggfunc=np.mean)
    return user_item_matrix_data

In [3]:
# Identify whether the value exists or not.
def identify_value_exist(user_item_matrix_data):
    """
    user_item_matrix_data: DataFrame
    """
    return (user_item_matrix_data.isna() == False).astype("int")

In [4]:
# Create a user-item matrix
def create_user_item_matrix_for_matrix_factorization(data, unique_user_id, unique_item_id):
    """
    data: (user, item, rating, timestamp)
    """
    user_item_matrix_data = pd.DataFrame(np.array([np.nan] * (len(unique_user_id) * len(unique_item_id))).reshape(len(unique_user_id), len(unique_item_id)),\
        index=unique_user_id, columns=unique_item_id)
    
    for one_index in data.index:
        user_item_matrix_data.loc[data.loc[one_index, "User_id"], data.loc[one_index, "Item_id"]] = \
            data.loc[one_index, "Rating"]
    return user_item_matrix_data

In [5]:
# 要建構四種資料，分別為User的特徵、Item的特徵、User-Item matrix與User對應Item的紀錄
def split_four_data(user_data, item_data, user_item_interaction_data, user_column_name, item_column_name, result_name):
    """
    user_data：使用者相關資料（user_id一定要放第一個column）
    item_data：物品相關資料（item_id一定要放第二個column）
    """
    all_data = list()
    if isinstance(user_data, pd.DataFrame):
        # user_feature_data = user_data.iloc[:, 1:]
        all_data.append(user_data)
    
    if isinstance(item_data, pd.DataFrame):
        # item_feature_data = item_data.iloc[:, 1:]
        all_data.append(item_data)
    
    if isinstance(user_item_interaction_data, pd.DataFrame):
        # transform train data into user-item matrix
        user_item_matrix_data = create_user_item_matrix(user_item_interaction_data, user_column_name, item_column_name, result_name)
        all_data.append(user_item_interaction_data)
        all_data.append(user_item_matrix_data)
    return all_data

# 將每種不同資料前處理

## Movielens

In [6]:
# 定義OneHotEncoding的內容→Movielens
def movielens_onehotencoding(user_feature_data, movie_feature_data):
    user_age_onehotcoding = OneHotEncoder(sparse=False).fit(user_feature_data["age"].values.reshape((-1, 1)))
    user_occupation_onehotencoding = OneHotEncoder(sparse=False).fit(user_feature_data["occupation"].values.reshape((-1, 1)))
    return user_age_onehotcoding, user_occupation_onehotencoding

In [7]:
with open(r"data\Movielens\movie_genre.dat", "r") as f:
    movie_genre = [i.replace("\n", "").split("\t") for i in f.readlines()]
movie_genre = pd.DataFrame(np.array(movie_genre), columns=["movie_id", "genre"])
movie_genre["genre"] = movie_genre["genre"].astype("str")

with open(r"data\Movielens\movie_movie(knn).dat", "r") as f:
    movie_movie = [i.replace("\n", "").split("\t") for i in f.readlines()]
movie_movie = pd.DataFrame(np.array(movie_movie), columns=["movie1", "movie2", "similarity"])

with open(r"data\Movielens\user_age.dat", "r") as f:
    user_age = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_age = pd.DataFrame(np.array(user_age), columns=["user_id", "age"])

with open(r"data\Movielens\user_occupation.dat", "r") as f:
    user_occupation = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_occupation = pd.DataFrame(np.array(user_occupation), columns=["user_id", "occupation"])

with open(r"data\Movielens\user_user(knn).dat", "r") as f:
    user_user = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_user = pd.DataFrame(np.array(user_user), columns=["user1", "user2", "similarity"])

with open(r"data\Movielens\user_movie.dat", "r") as f:
    user_movie = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_movie = pd.DataFrame(np.array(user_movie), columns=["user_id", "movie_id", "rating", "timestamp"])
user_movie["rating"] = user_movie["rating"].astype("int")

In [8]:
# 針對電影種類前處理：由於一部電影可能有多種種類，因此將每個種類用OneHotEncoding表示
movie_genre["index"] = 1
movie_genre = movie_genre.pivot_table(index="movie_id", columns="genre", values="index", fill_value=0)
movie_genre = movie_genre.reset_index()

In [9]:
merge_data = pd.merge(user_movie, user_age, how="inner", on="user_id")
print(merge_data.shape)
merge_data = pd.merge(merge_data, user_occupation, how="left", on="user_id")
print(merge_data.shape)
merge_data = pd.merge(merge_data, movie_genre, how="left", on="movie_id").fillna(0)
print(merge_data.shape)

(100000, 5)
(100000, 6)
(100000, 24)


In [10]:
user_feature_data = merge_data[["age", "occupation"]]
movie_feature_data = merge_data[movie_genre.columns[1:]]

In [11]:
# 產生四種資料
user_feature_data, movie_feature_data, user_item_interaction_data, user_item_matrix_data =\
     split_four_data(user_feature_data, movie_feature_data, user_movie, user_column_name="user_id", item_column_name="movie_id", result_name="rating")

In [12]:
# 把四種資料作訓練與測試資料的切割
train_user_feature_data, test_user_feature_data, train_movie_feature_data, test_movie_feature_data = \
    train_test_split(user_feature_data, movie_feature_data, random_state=12345, test_size=0.25)

train_result_data, test_result_data =\
    train_test_split(user_item_interaction_data["rating"].values, test_size=0.25, random_state=12345)

In [13]:
# 1. 定義OneHotEncoding的內容→Movielens
user_age_onehotencoding, user_occupation_onehotencoding =\
    movielens_onehotencoding(user_feature_data, movie_feature_data)

# 2. 把所有訓練資料以及測試資料都轉成OneHotEncoding
train_user_age_onehotencoding, test_user_age_onehotencoding =\
    list(map(lambda x: user_age_onehotencoding.transform(x), [train_user_feature_data["age"].values.reshape((-1, 1)), test_user_feature_data["age"].values.reshape((-1, 1))]))
train_user_occupation_onehotencoding, test_user_occupation_onehotencoding =\
    list(map(lambda x: user_occupation_onehotencoding.transform(x), [train_user_feature_data["occupation"].values.reshape((-1, 1)), test_user_feature_data["occupation"].values.reshape((-1, 1))]))

## Douban_Book

## Yelp

# Collaborative Filtering

In [7]:
similarity_method = ["pearson", "cosine"]
K_list = [3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
CF_result = dict()

for one_similarity_method in similarity_method:
    # User-based Collaborative Filtering
    user_cf = User_based_CF(traindata, user_item_matrix_data)
    user_user_correlation_data = user_cf.compute_correlation(corr_methods=one_similarity_method)

    for K in K_list:
        # 針對test data做預測以及模型評估（注意，每次計算是針對一筆資料）
        pred_user_data = list(map(lambda x: user_cf.predict_without_time(testdata.iloc[x, 0], testdata.iloc[x, 1], num_user=K), tqdm([i for i in range(testdata.shape[0])])))
        pred_user_data = [i if i > 0 else 0 for i in pred_user_data]
        CF_result[f"user-based_{one_similarity_method}_{K}"] = math.sqrt(mean_squared_error(y_true=testdata["Rating"].values, y_pred=np.array(pred_user_data)))

    # Item-based Collaborative Filtering
    item_cf = Item_based_CF(traindata, user_item_matrix_data)
    item_item_correlation_data = item_cf.compute_correlation(corr_methods=one_similarity_method)

    for K in K_list:
        # 針對test data做預測以及模型評估（注意，每次計算是針對一筆資料）
        pred_user_data = list(map(lambda x: user_cf.predict_without_time(testdata.iloc[x, 0], testdata.iloc[x, 1], num_user=K), tqdm([i for i in range(testdata.shape[0])])))
        pred_user_data = [i if i > 0 else 0 for i in pred_user_data]
        CF_result[f"item-based_{one_similarity_method}_{K}"] = math.sqrt(mean_squared_error(y_true=testdata["Rating"].values, y_pred=np.array(pred_user_data)))

100%|██████████| 889249/889249 [04:04<00:00, 3631.08it/s]
100%|██████████| 25000/25000 [13:46<00:00, 30.25it/s]
100%|██████████| 25000/25000 [13:57<00:00, 29.85it/s]
100%|██████████| 25000/25000 [14:30<00:00, 28.72it/s]
100%|██████████| 25000/25000 [15:21<00:00, 27.12it/s]
100%|██████████| 25000/25000 [15:51<00:00, 26.28it/s]
100%|██████████| 25000/25000 [16:22<00:00, 25.45it/s]
100%|██████████| 25000/25000 [16:56<00:00, 24.59it/s]
100%|██████████| 25000/25000 [17:57<00:00, 23.21it/s]
100%|██████████| 25000/25000 [22:41<00:00, 18.37it/s]
100%|██████████| 25000/25000 [27:04<00:00, 15.39it/s]
100%|██████████| 25000/25000 [32:40<00:00, 12.75it/s]
100%|██████████| 25000/25000 [37:48<00:00, 11.02it/s]
100%|██████████| 25000/25000 [44:33<00:00,  9.35it/s]
100%|██████████| 25000/25000 [49:31<00:00,  8.41it/s]
100%|██████████| 25000/25000 [53:32<00:00,  7.78it/s]
100%|██████████| 25000/25000 [1:00:04<00:00,  6.94it/s]
100%|██████████| 25000/25000 [1:07:35<00:00,  6.16it/s]
100%|██████████| 282

KeyboardInterrupt: 

# Matrix Factorization

In [99]:
class matrix_factorization():
    def __init__(self, true_user_item_matrix, num_features):
        """
        true_user_item_matrix：user與item的matrix，沒有補過值
        """
        self.user_id_list = list(true_user_item_matrix.index)
        self.item_id_list = list(true_user_item_matrix.columns)

        # 辨識該值是否真的有值
        self.identify_value_exist = torch.from_numpy( user_item_matrix_data.isna().astype("float").values )
        self.true_user_item_matrix = self.preprocessing_user_item_matrix(true_user_item_matrix)

        self.p_matrix = torch.randn(size=(len(self.user_id_list), num_features), requires_grad=True)
        self.q_matrix = torch.randn(size=(num_features, len(self.item_id_list)), requires_grad=True)

        # 計算global mean
        self.global_mean = torch.mean( self.true_user_item_matrix.flatten()[self.true_user_item_matrix.flatten().nonzero()] )

        # 計算bias of user and bias of item
        self.bu = torch.Tensor(list(map(lambda x: torch.mean(x[x.nonzero()]), self.true_user_item_matrix ))).reshape(shape=(-1, 1)) - self.global_mean
        self.bi = torch.Tensor(list(map(lambda x: torch.mean(x[x.nonzero()]), torch.transpose(self.true_user_item_matrix, 0, 1) ))).reshape(shape=(1, -1)) - self.global_mean
        return

    def preprocessing_user_item_matrix(self, true_user_item_matrix):
        # 把NaN全部補零
        fill_user_item_matrix_data = true_user_item_matrix.fillna(0)
        return torch.from_numpy(fill_user_item_matrix_data.values)
    
    def fit(self, epochs, learning_rate, regularization_rate, bias_or_not):
        # 建立空的儲存以存取Loss
        self.train_loss = list()

        # 定義loss function
        loss_func = nn.MSELoss()

        # 定義optimizer
        optimizer = torch.optim.SGD([self.p_matrix, self.q_matrix], lr=learning_rate, weight_decay=regularization_rate)

        for epoch in range(epochs):
            if bias_or_not == False:
                yhat = torch.tensordot(self.p_matrix, self.q_matrix, dims=([1], [0]))
            else:
                yhat = torch.tensordot(self.p_matrix, self.q_matrix, dims=([1], [0]))+self.bu+self.bi+self.global_mean
                
            yhat = yhat * identify_value_exist

            loss = loss_func(yhat, self.true_user_item_matrix)
            self.train_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            print(f"=== Train Loss: {loss.item()}")
        return

    def predict(self, user_id, item_id):
        yhat = torch.tensordot(self.p_matrix, self.q_matrix, dims=([1], [0]))
        yhat_dataframe = pd.DataFrame(yhat.detach().numpy(), index=self.user_id_list, columns=self.item_id_list)
        return yhat_dataframe.loc[user_id, item_id]

    def evaluate(self, testdata):
        """
        testdata：data.frame，<user_id, item_id, rating, (timestamp)>
        """
        testdata["yhat"] = list(map(lambda user, item: self.predict(user, item), testdata.iloc[:, 0], testdata.iloc[:, 1]))
        print(f"MSE: {mean_squared_error(y_true=testdata.iloc[:, 2], y_pred=testdata['yhat'])}\nr2_score: {r2_score(y_true=testdata.iloc[:, 2], y_pred=testdata['yhat'])}")
        return

    def save_model(self):
        return

In [8]:
epochs = 100
learning_rate = 1e-2
num_user_id = user_item_matrix_data.shape[0]
num_item_id = user_item_matrix_data.shape[1]
num_features = 10

model = matrix_factorization(true_user_item_matrix=user_item_matrix_data, num_features=num_features)

In [9]:
model.fit(epochs=epochs, learning_rate=learning_rate, regularization_rate=1e-2, bias_or_not=True)

=== Epoch: 0 Train Loss: 20.347803903557356
=== Epoch: 1 Train Loss: 20.343315536402333
=== Epoch: 2 Train Loss: 20.33882939400865
=== Epoch: 3 Train Loss: 20.33434546393661
=== Epoch: 4 Train Loss: 20.329863742936592
=== Epoch: 5 Train Loss: 20.325384238194857
=== Epoch: 6 Train Loss: 20.32090698039349
=== Epoch: 7 Train Loss: 20.31643194274621
=== Epoch: 8 Train Loss: 20.311959118716747
=== Epoch: 9 Train Loss: 20.307488476059742
=== Epoch: 10 Train Loss: 20.30302005638681
=== Epoch: 11 Train Loss: 20.29855385718001
=== Epoch: 12 Train Loss: 20.29408990454399
=== Epoch: 13 Train Loss: 20.289628141663844
=== Epoch: 14 Train Loss: 20.285168583568062
=== Epoch: 15 Train Loss: 20.280711219793393
=== Epoch: 16 Train Loss: 20.276256088627882
=== Epoch: 17 Train Loss: 20.271803161762897
=== Epoch: 18 Train Loss: 20.267352438306993
=== Epoch: 19 Train Loss: 20.262903901035056
=== Epoch: 20 Train Loss: 20.258457561614453
=== Epoch: 21 Train Loss: 20.254013430075798
=== Epoch: 22 Train Loss: 2

In [10]:
model.evaluate(testdata=testdata)

100%|██████████| 25000/25000 [00:53<00:00, 463.02it/s]

MSE: 24.362353662799624
r2_score: -18.258546776614086





# Factorization Machine

In [None]:
# user_item_interaction的結構：<user_id, item_id, result, timestamp>

In [24]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(train_user_age_onehotencoding),
                               torch.FloatTensor(train_user_occupation_onehotencoding),
                               torch.FloatTensor(train_movie_feature_data.values),
                               torch.FloatTensor(train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(test_user_age_onehotencoding),
                               torch.FloatTensor(test_user_occupation_onehotencoding),
                               torch.FloatTensor(test_movie_feature_data.values),
                               torch.FloatTensor(test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)


# 2. 呼叫模型與設定Loss function
model = fm_model(num_user_age=train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=train_movie_feature_data.values.shape[1],
                 num_decoder=num_features*3+3,
                 num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [25]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 2425.652796149254
=== Epoch: 1, Train Loss: 758.0403942465782
=== Epoch: 2, Train Loss: 752.8756191730499


# FNN

In [14]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(train_user_age_onehotencoding),
                               torch.FloatTensor(train_user_occupation_onehotencoding),
                               torch.FloatTensor(train_movie_feature_data.values),
                               torch.FloatTensor(train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(test_user_age_onehotencoding),
                               torch.FloatTensor(test_user_occupation_onehotencoding),
                               torch.FloatTensor(test_movie_feature_data.values),
                               torch.FloatTensor(test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)


# 2. 呼叫模型與設定Loss function
model = ipnn_model(num_user_age=train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=train_movie_feature_data.values.shape[1],
                 num_decoder=num_features*3+3,
                 num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [15]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 2269.737111568451
=== Epoch: 1, Train Loss: 744.1373466849327
=== Epoch: 2, Train Loss: 743.6776596307755


# GDBT+LR

# IPNN

# OPNN