In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import itertools

from User_based_CF import *
from Item_based_CF import *
from Matrix_Factorization import *
from Factorization_Machine import *
from IPNN_model import *
from FNN_model import *
from OPNN_model import *
from PIN_model import *
from GMF_NeuCF_model import *
from MLP_NeuCF_model import *


import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import mean_squared_error
import math


import json

In [2]:
# Create a user-item matrix
def create_user_item_matrix(data, user_column_name, item_column_name, result_name):
    """
    data: (user_column_name, item_column_name, result_name, timestamp)
    """
    user_list = data.iloc[:, 0]
    item_list = data.iloc[:, 1]
    rating_list = data[result_name].values
    user_item_matrix_data = pd.crosstab(index=user_list, columns=item_list, values=rating_list, aggfunc=np.mean)
    return user_item_matrix_data

In [3]:
# Identify whether the value exists or not.
def identify_value_exist(user_item_matrix_data):
    """
    user_item_matrix_data: DataFrame
    """
    return (user_item_matrix_data.isna() == False).astype("int")

In [4]:
# Create a user-item matrix
def create_user_item_matrix_for_matrix_factorization(data, unique_user_id, unique_item_id):
    """
    data: (user, item, rating, timestamp)
    """
    user_item_matrix_data = pd.DataFrame(np.array([np.nan] * (len(unique_user_id) * len(unique_item_id))).reshape(len(unique_user_id), len(unique_item_id)),\
        index=unique_user_id, columns=unique_item_id)
    
    for one_index in data.index:
        user_item_matrix_data.loc[data.loc[one_index, "User_id"], data.loc[one_index, "Item_id"]] = \
            data.loc[one_index, "Rating"]
    return user_item_matrix_data

In [5]:
# 要建構四種資料，分別為User的特徵、Item的特徵、User-Item matrix與User對應Item的紀錄
def split_four_data(user_data, item_data, user_item_interaction_data, user_column_name, item_column_name, result_name):
    """
    user_data：使用者相關資料（user_id一定要放第一個column）
    item_data：物品相關資料（item_id一定要放第二個column）
    """
    all_data = list()
    if isinstance(user_data, pd.DataFrame):
        # user_feature_data = user_data.iloc[:, 1:]
        all_data.append(user_data)
    
    if isinstance(item_data, pd.DataFrame):
        # item_feature_data = item_data.iloc[:, 1:]
        all_data.append(item_data)
    
    if isinstance(user_item_interaction_data, pd.DataFrame):
        # transform train data into user-item matrix
        user_item_matrix_data = create_user_item_matrix(user_item_interaction_data, user_column_name, item_column_name, result_name)
        all_data.append(user_item_interaction_data)
        all_data.append(user_item_matrix_data)
    return all_data

In [6]:
# 定義OneHotEncoding的內容→Movielens
def movielens_onehotencoding(merge_data, user_feature_data, movie_feature_data):
    user_id_onehotencoding = OneHotEncoder(sparse=False).fit(merge_data["user_id"].values.reshape((-1, 1)))
    movie_id_onehotencoding = OneHotEncoder(sparse=False).fit(merge_data["movie_id"].values.reshape((-1, 1)))
    user_age_onehotencoding = OneHotEncoder(sparse=False).fit(user_feature_data["age"].values.reshape((-1, 1)))
    user_occupation_onehotencoding = OneHotEncoder(sparse=False).fit(user_feature_data["occupation"].values.reshape((-1, 1)))
    return user_id_onehotencoding, movie_id_onehotencoding, user_age_onehotencoding, user_occupation_onehotencoding

# 將每種不同資料前處理

## Movielens

### Load data

In [7]:
with open(r"data\Movielens\movie_genre.dat", "r") as f:
    movie_genre = [i.replace("\n", "").split("\t") for i in f.readlines()]
movie_genre = pd.DataFrame(np.array(movie_genre), columns=["movie_id", "genre"])
movie_genre["genre"] = movie_genre["genre"].astype("str")

with open(r"data\Movielens\movie_movie(knn).dat", "r") as f:
    movie_movie = [i.replace("\n", "").split("\t") for i in f.readlines()]
movie_movie = pd.DataFrame(np.array(movie_movie), columns=["movie1", "movie2", "similarity"])

with open(r"data\Movielens\user_age.dat", "r") as f:
    user_age = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_age = pd.DataFrame(np.array(user_age), columns=["user_id", "age"])

with open(r"data\Movielens\user_occupation.dat", "r") as f:
    user_occupation = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_occupation = pd.DataFrame(np.array(user_occupation), columns=["user_id", "occupation"])

with open(r"data\Movielens\user_user(knn).dat", "r") as f:
    user_user = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_user = pd.DataFrame(np.array(user_user), columns=["user1", "user2", "similarity"])

with open(r"data\Movielens\user_movie.dat", "r") as f:
    user_movie = [i.replace("\n", "").split("\t") for i in f.readlines()]
user_movie = pd.DataFrame(np.array(user_movie), columns=["user_id", "movie_id", "rating", "timestamp"])
user_movie["rating"] = user_movie["rating"].astype("int")

### Build User-Item matrix

In [8]:
user_item_matrix_data = create_user_item_matrix(user_movie, user_column_name="user_id", item_column_name="item_id", result_name="rating")

### Preprocessing of Rating data

In [9]:
# 針對電影種類前處理：由於一部電影可能有多種種類，因此將每個種類用OneHotEncoding表示
movie_genre["index"] = 1
movie_genre = movie_genre.pivot_table(index="movie_id", columns="genre", values="index", fill_value=0)
movie_genre = movie_genre.reset_index()

In [10]:
rating_merge_data = pd.merge(user_movie, user_age, how="inner", on="user_id")
print(rating_merge_data.shape)
rating_merge_data = pd.merge(rating_merge_data, user_occupation, how="left", on="user_id")
print(rating_merge_data.shape)
rating_merge_data = pd.merge(rating_merge_data, movie_genre, how="left", on="movie_id").fillna(0)
print(rating_merge_data.shape)

(100000, 5)
(100000, 6)
(100000, 24)


In [11]:
# 最終想生成哪些資料
# 1. 訓練資料、測試資料
# 2. 每個資料的單純User-id、Item-id, User-id所有特徵、Item-id所有特徵、User-Item-interaction、User-Item Matrix
    # 把資料分割成訓練資料與測試資料
rating_train_merge_data, rating_test_merge_data = train_test_split(rating_merge_data, test_size=0.2, random_state=12345)

rating_train_user_id_data, rating_test_user_id_data = rating_train_merge_data["user_id"], rating_test_merge_data["user_id"] # 目標
rating_train_movie_id_data, rating_test_movie_id_data = rating_train_merge_data["movie_id"], rating_test_merge_data["movie_id"] # 目標
rating_train_user_feature_data, rating_test_user_feature_data = rating_train_merge_data[["age", "occupation"]], rating_test_merge_data[["age", "occupation"]]
rating_train_movie_feature_data, rating_test_movie_feature_data = rating_train_merge_data[movie_genre.columns[1:]], rating_test_merge_data[movie_genre.columns[1:]]
rating_train_result_data, rating_test_result_data = rating_train_merge_data["rating"].values, rating_test_merge_data["rating"].values

rating_train_user_feature_data, rating_train_movie_feature_data, rating_train_user_item_interaction_data, rating_train_user_item_matrix_data =\
    split_four_data(rating_train_user_feature_data, rating_train_movie_feature_data, rating_train_merge_data[["user_id", "movie_id", "rating", "timestamp"]], user_column_name="user_id", item_column_name="movie_id", result_name="rating")

rating_test_user_feature_data, rating_test_movie_feature_data, rating_test_user_item_interaction_data, rating_test_user_item_matrix_data =\
    split_four_data(rating_test_user_feature_data, rating_test_movie_feature_data, rating_test_merge_data[["user_id", "movie_id", "rating", "timestamp"]], user_column_name="user_id", item_column_name="movie_id", result_name="rating")

del rating_test_user_item_matrix_data


In [12]:
# 1. 定義OneHotEncoding的內容→Movielens
rating_user_id_onehotencoding, rating_movie_id_onehotencoding, rating_user_age_onehotencoding, rating_user_occupation_onehotencoding =\
    movielens_onehotencoding(rating_merge_data, rating_train_user_feature_data, rating_train_movie_feature_data)

# 2. 把所有訓練資料以及測試資料都轉成OneHotEncoding
rating_train_user_age_onehotencoding, rating_test_user_age_onehotencoding =\
    list(map(lambda x: rating_user_age_onehotencoding.transform(x), [rating_train_user_feature_data["age"].values.reshape((-1, 1)), rating_test_user_feature_data["age"].values.reshape((-1, 1))]))
rating_train_user_occupation_onehotencoding, rating_test_user_occupation_onehotencoding =\
    list(map(lambda x: rating_user_occupation_onehotencoding.transform(x), [rating_train_user_feature_data["occupation"].values.reshape((-1, 1)), rating_test_user_feature_data["occupation"].values.reshape((-1, 1))]))
rating_train_user_id_onehotencoding, rating_test_user_id_onehotencoding =\
    list(map(lambda x: rating_user_id_onehotencoding.transform(x), [rating_train_user_id_data.values.reshape((-1, 1)), rating_test_user_id_data.values.reshape((-1, 1))]))
rating_train_movie_id_onehotencoding, rating_test_movie_id_onehotencoding =\
    list(map(lambda x: rating_movie_id_onehotencoding.transform(x), [rating_train_movie_id_data.values.reshape((-1, 1)), rating_test_movie_id_data.values.reshape((-1, 1))]))

In [13]:
del rating_merge_data, rating_train_merge_data, rating_test_merge_data

綜合上述前處理過程，只需要保留以下幾個變數：
1. rating_train_user_id_onehoteocoding, rating_test_user_id_onehotencoding
2. rating_train_movie_id_onehotencoding, rating_test_movie_id_onehotencoding
3. rating_train_user_age_onehotencoding, rating_test_user_age_onehotencoding
4. rating_train_user_occupation_onehotencoding, rating_test_user_occupation_onehotencoding
5. rating_train_movie_feature_data, rating_test_movie_feature_data
6. rating_train_user_item_interaction_data, rating_test_user_item_interaction_data: pd.DataFrame
7. rating_train_user_item_matrix_data: pd.DataFrame

### Preprocessing of binary data

In [14]:
# 產生binary data→定義某個user是否會對某個item做評論
# 1. 產生某個值是否為真實值
identify_value_exist_binary = (user_item_matrix_data.isna() == False).astype("int")

# 2. 產生index名稱為column
identify_value_exist_binary["user_id"] = identify_value_exist_binary.index

# 3. pandas melt
binary_result = pd.melt(identify_value_exist_binary, id_vars=["user_id"])

In [15]:
binary_merge_data = pd.merge(binary_result, user_age, how="inner", on="user_id")
print(binary_merge_data.shape)
binary_merge_data = pd.merge(binary_merge_data, user_occupation, how="left", on="user_id")
print(binary_merge_data.shape)
binary_merge_data = pd.merge(binary_merge_data, movie_genre, how="left", on="movie_id").fillna(0)
print(binary_merge_data.shape)

(1586126, 4)
(1586126, 5)
(1586126, 23)


In [16]:
# 最終想生成哪些資料
# 1. 訓練資料、測試資料
# 2. 每個資料的單純User-id、Item-id, User-id所有特徵、Item-id所有特徵、User-Item-interaction、User-Item Matrix
    # 把資料分割成訓練資料與測試資料
binary_train_merge_data, binary_test_merge_data = train_test_split(binary_merge_data, test_size=0.2, random_state=12345)

binary_train_user_id_data, binary_test_user_id_data = binary_train_merge_data["user_id"], binary_test_merge_data["user_id"] # 目標
binary_train_movie_id_data, binary_test_movie_id_data = binary_train_merge_data["movie_id"], binary_test_merge_data["movie_id"] # 目標
binary_train_user_feature_data, binary_test_user_feature_data = binary_train_merge_data[["age", "occupation"]], binary_test_merge_data[["age", "occupation"]]
binary_train_movie_feature_data, binary_test_movie_feature_data = binary_train_merge_data[movie_genre.columns[1:]], binary_test_merge_data[movie_genre.columns[1:]]
binary_train_result_data, binary_test_result_data = binary_train_merge_data["value"].values, binary_test_merge_data["value"].values

binary_train_user_feature_data, binary_train_movie_feature_data, binary_train_user_item_interaction_data, binary_train_user_item_matrix_data =\
    split_four_data(binary_train_user_feature_data, binary_train_movie_feature_data, binary_train_merge_data[["user_id", "movie_id", "value"]], user_column_name="user_id", item_column_name="movie_id", result_name="value")

binary_test_user_feature_data, binary_test_movie_feature_data, binary_test_user_item_interaction_data, binary_test_user_item_matrix_data =\
    split_four_data(binary_test_user_feature_data, binary_test_movie_feature_data, binary_test_merge_data[["user_id", "movie_id", "value"]], user_column_name="user_id", item_column_name="movie_id", result_name="value")

del binary_train_user_item_matrix_data


In [17]:
# 1. 定義OneHotEncoding的內容→Movielens
binary_user_id_onehotencoding, binary_movie_id_onehotencoding, binary_user_age_onehotencoding, binary_user_occupation_onehotencoding =\
    movielens_onehotencoding(binary_merge_data, binary_train_user_feature_data, binary_train_movie_feature_data)

# 2. 把所有訓練資料以及測試資料都轉成OneHotEncoding
binary_train_user_age_onehotencoding, binary_test_user_age_onehotencoding =\
    list(map(lambda x: binary_user_age_onehotencoding.transform(x), [binary_train_user_feature_data["age"].values.reshape((-1, 1)), binary_test_user_feature_data["age"].values.reshape((-1, 1))]))
binary_train_user_occupation_onehotencoding, binary_test_user_occupation_onehotencoding =\
    list(map(lambda x: binary_user_occupation_onehotencoding.transform(x), [binary_train_user_feature_data["occupation"].values.reshape((-1, 1)), binary_test_user_feature_data["occupation"].values.reshape((-1, 1))]))
binary_train_user_id_onehotencoding, binary_test_user_id_onehotencoding =\
    list(map(lambda x: binary_user_id_onehotencoding.transform(x), [binary_train_user_id_data.values.reshape((-1, 1)), binary_test_user_id_data.values.reshape((-1, 1))]))
binary_train_movie_id_onehotencoding, binary_test_movie_id_onehotencoding =\
    list(map(lambda x: binary_movie_id_onehotencoding.transform(x), [binary_train_movie_id_data.values.reshape((-1, 1)), binary_test_movie_id_data.values.reshape((-1, 1))]))

In [18]:
binary_train_user_age_onehotencoding.shape, binary_train_movie_feature_data.shape

((1268900, 8), (1268900, 18))

## Douban_Book

## Yelp

# Collaborative Filtering

In [40]:
similarity_method = ["pearson", "cosine"]
K_list = [3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
CF_result = dict()

for one_similarity_method in similarity_method:
    # User-based Collaborative Filtering
    user_cf = User_based_CF(rating_train_user_item_interaction_data, user_item_matrix_data)
    user_user_correlation_data = user_cf.compute_correlation(corr_methods=one_similarity_method)

    for K in K_list:
        # 針對test data做預測以及模型評估（注意，每次計算是針對一筆資料）
        pred_user_data = list(map(lambda x: user_cf.predict_without_time(rating_test_user_item_interaction_data.iloc[x, 0], rating_test_user_item_interaction_data.iloc[x, 1], user_column_name="user_id", item_column_name="movie_id", num_user=K), tqdm([i for i in range(rating_test_user_item_interaction_data.shape[0])])))
        pred_user_data = [i if i > 0 else 0 for i in pred_user_data]
        CF_result[f"user-based_{one_similarity_method}_{K}"] = math.sqrt(mean_squared_error(y_true=rating_test_user_item_interaction_data["rating"].values, y_pred=np.array(pred_user_data)))
        break

    # Item-based Collaborative Filtering
    item_cf = Item_based_CF(rating_train_user_item_interaction_data, user_item_matrix_data)
    item_item_correlation_data = item_cf.compute_correlation(corr_methods=one_similarity_method)

    for K in K_list:
        # 針對test data做預測以及模型評估（注意，每次計算是針對一筆資料）
        pred_user_data = list(map(lambda x: item_cf.predict_without_time(rating_test_user_item_interaction_data.iloc[x, 0], rating_test_user_item_interaction_data.iloc[x, 1], user_column_name="user_id", item_column_name="movie_id", num_item=K), tqdm([i for i in range(rating_test_user_item_interaction_data.shape[0])])))
        pred_user_data = [i if i > 0 else 0 for i in pred_user_data]
        CF_result[f"item-based_{one_similarity_method}_{K}"] = math.sqrt(mean_squared_error(y_true=rating_test_user_item_interaction_data["rating"].values, y_pred=np.array(pred_user_data)))
        break

100%|██████████| 889249/889249 [05:53<00:00, 2516.07it/s]
  5%|▍         | 934/20000 [01:19<26:59, 11.77it/s]


KeyboardInterrupt: 

# Matrix Factorization

In [37]:
epochs = 100
learning_rate = 1e-2
num_user_id = user_item_matrix_data.shape[0]
num_item_id = user_item_matrix_data.shape[1]
num_features = 10

model = matrix_factorization(true_user_item_matrix=rating_train_user_item_matrix_data, num_features=num_features)
model.fit(epochs=epochs, learning_rate=learning_rate, regularization_rate=1e-2, bias_or_not=True)

=== Epoch: 0 Train Loss: 20.599211405299222
=== Epoch: 1 Train Loss: 20.59466254555195
=== Epoch: 2 Train Loss: 20.590115948601476
=== Epoch: 3 Train Loss: 20.585571626313254
=== Epoch: 4 Train Loss: 20.581029547581313
=== Epoch: 5 Train Loss: 20.57648974614625
=== Epoch: 6 Train Loss: 20.57195217663575
=== Epoch: 7 Train Loss: 20.567416873053997
=== Epoch: 8 Train Loss: 20.56288380328726
=== Epoch: 9 Train Loss: 20.55835299136292
=== Epoch: 10 Train Loss: 20.553824444222204
=== Epoch: 11 Train Loss: 20.549298140749443
=== Epoch: 12 Train Loss: 20.544774087423903
=== Epoch: 13 Train Loss: 20.54025230256187
=== Epoch: 14 Train Loss: 20.535732735034546
=== Epoch: 15 Train Loss: 20.531215408604808
=== Epoch: 16 Train Loss: 20.526700349304868
=== Epoch: 17 Train Loss: 20.522187512591717
=== Epoch: 18 Train Loss: 20.51767692373556
=== Epoch: 19 Train Loss: 20.513168575551603
=== Epoch: 20 Train Loss: 20.50866248156334
=== Epoch: 21 Train Loss: 20.50415860576128
=== Epoch: 22 Train Loss: 20.

In [39]:
rating_test_user_item_interaction_data

Unnamed: 0,user_id,movie_id,rating,timestamp
71751,655,52,3,891585279
80493,748,208,4,879454522
2655,303,388,2,879544365
53233,463,111,2,877385414
91141,864,588,3,888887289
...,...,...,...,...
29759,274,71,4,878946612
88142,833,234,3,875122884
36759,338,168,3,879438225
90574,845,286,5,885409719


In [38]:
model.evaluate(testdata=rating_test_user_item_interaction_data)

  0%|          | 91/20000 [00:00<00:45, 437.18it/s]


KeyError: '1581'

# Factorization Machine

In [None]:
# user_item_interaction的結構：<user_id, item_id, result, timestamp>

In [41]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(rating_train_user_age_onehotencoding),
                               torch.FloatTensor(rating_train_user_occupation_onehotencoding),
                               torch.FloatTensor(rating_train_movie_feature_data.values),
                               torch.FloatTensor(rating_train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(rating_test_user_age_onehotencoding),
                               torch.FloatTensor(rating_test_user_occupation_onehotencoding),
                               torch.FloatTensor(rating_test_movie_feature_data.values),
                               torch.FloatTensor(rating_test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)


# 2. 呼叫模型與設定Loss function
model = fm_model(num_user_age=binary_train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=binary_train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=binary_train_movie_feature_data.values.shape[1],
                 num_features=num_features,
                 methods = "regression")
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [42]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 2831.227592587471
=== Epoch: 1, Train Loss: 811.0931701660156
=== Epoch: 2, Train Loss: 805.9960036277771


In [20]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(binary_train_user_age_onehotencoding),
                               torch.FloatTensor(binary_train_user_occupation_onehotencoding),
                               torch.FloatTensor(binary_train_movie_feature_data.values),
                               torch.FloatTensor(binary_train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(binary_test_user_age_onehotencoding),
                               torch.FloatTensor(binary_test_user_occupation_onehotencoding),
                               torch.FloatTensor(binary_test_movie_feature_data.values),
                               torch.FloatTensor(binary_test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)


# 2. 呼叫模型與設定Loss function
model = fm_model(num_user_age=binary_train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=binary_train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=binary_train_movie_feature_data.values.shape[1],
                 num_features=num_features,
                 methods = "classification")
loss_func = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [27]:
# Model Training
epochs = 30

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result.reshape((-1, 1)))
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 2336.783412784338
=== Epoch: 1, Train Loss: 2331.496040932834
=== Epoch: 2, Train Loss: 2326.598101451993
=== Epoch: 3, Train Loss: 2321.9458227828145
=== Epoch: 4, Train Loss: 2317.4644750207663
=== Epoch: 5, Train Loss: 2313.1144154295325
=== Epoch: 6, Train Loss: 2308.8806760013103
=== Epoch: 7, Train Loss: 2304.766260832548
=== Epoch: 8, Train Loss: 2300.786325581372
=== Epoch: 9, Train Loss: 2296.961825057864
=== Epoch: 10, Train Loss: 2293.3146854266524
=== Epoch: 11, Train Loss: 2289.863220587373
=== Epoch: 12, Train Loss: 2286.6195099279284
=== Epoch: 13, Train Loss: 2283.588393494487
=== Epoch: 14, Train Loss: 2280.768659673631
=== Epoch: 15, Train Loss: 2278.153487429023
=== Epoch: 16, Train Loss: 2275.7329700142145
=== Epoch: 17, Train Loss: 2273.495465449989
=== Epoch: 18, Train Loss: 2271.428537324071


KeyboardInterrupt: 

In [25]:
# Predict
all_yhat = list()
test_loss = 0.0

for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in test_dataloader:
    yhat = model(user_age_feature=torch_user_age, 
                    user_occupation_feature=torch_user_occupation,
                    movie_genre_feature=torch_movie_genre)

    loss = loss_func(yhat, torch_result.reshape((-1, 1)))
    test_loss += loss.item()
    all_yhat.extend(yhat[:, 0].detach().numpy().tolist())

In [56]:
def evaluate_topk_recall(testdata,
                         one_user,
                         user_column, 
                         item_column, 
                         true_result_column, 
                         topk=10):
    """
    testdata: data.frame，同時有user_id, item_id, rating, yhat
    """
    one_user_data = binary_test_user_item_interaction_data[binary_test_user_item_interaction_data[user_column] == one_user]

    # 2. 針對某個User，挑出所有真實有評分的Item，以及挑出預測前十名的Item
    true_item = list(one_user_data[one_user_data[true_result_column] == 1][item_column])
    predict_item = list(one_user_data.sort_values("yhat", ascending=True)[item_column][:10])

    # 3. 計算所有有真實評分的Item的數量
    length_true_item = len(true_item)

    # 4. 計算預測與真實有相對應的資料
    length_true_predict_match = sum([1 if i in predict_item else 0 for i in true_item])

    # 5. 計算Recall
    try:
        topk_recall = length_true_predict_match / length_true_item
    except:
        topk_recall = 0
    return topk_recall

In [58]:
# Evaluate
# Recall@10

# 0. 把預測結果鑲嵌至testdata
binary_test_user_item_interaction_data["yhat"] = np.array(all_yhat)

# 1. 建立User清單
binary_test_user_id = list(binary_test_user_item_interaction_data["user_id"].unique())

one_user = [0]
all_recall_list = list(map(lambda x: evaluate_topk_recall(testdata=binary_test_user_item_interaction_data, 
                                                          one_user=x,
                                                          user_column="user_id",
                                                          item_column="movie_id",
                                                          true_result_column="value"), binary_test_user_id))

# 6. 將所有Recall平均
final_recall = np.array(all_recall_list).mean()


# IPNN

In [44]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(rating_train_user_age_onehotencoding),
                               torch.FloatTensor(rating_train_user_occupation_onehotencoding),
                               torch.FloatTensor(rating_train_movie_feature_data.values),
                               torch.FloatTensor(rating_train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(rating_test_user_age_onehotencoding),
                               torch.FloatTensor(rating_test_user_occupation_onehotencoding),
                               torch.FloatTensor(rating_test_movie_feature_data.values),
                               torch.FloatTensor(rating_test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)



# 2. 呼叫模型與設定Loss function
model = ipnn_model(num_user_age=rating_train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=rating_train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=rating_train_movie_feature_data.values.shape[1],
                 num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [45]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 2003.3519181013107
=== Epoch: 1, Train Loss: 793.9028970599174
=== Epoch: 2, Train Loss: 793.8328919410706


# GDBT+LR

# FNN

In [14]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(train_user_age_onehotencoding),
                               torch.FloatTensor(train_user_occupation_onehotencoding),
                               torch.FloatTensor(train_movie_feature_data.values),
                               torch.FloatTensor(train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(test_user_age_onehotencoding),
                               torch.FloatTensor(test_user_occupation_onehotencoding),
                               torch.FloatTensor(test_movie_feature_data.values),
                               torch.FloatTensor(test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)


# 2. 呼叫模型與設定Loss function
model = fnn_model(num_user_age=train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=train_movie_feature_data.values.shape[1],
                 num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [15]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 1832.3123235106468
=== Epoch: 1, Train Loss: 745.8843268752098
=== Epoch: 2, Train Loss: 745.6503906846046


# OPNN

In [160]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(train_user_age_onehotencoding),
                               torch.FloatTensor(train_user_occupation_onehotencoding),
                               torch.FloatTensor(train_movie_feature_data.values),
                               torch.FloatTensor(train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(test_user_age_onehotencoding),
                               torch.FloatTensor(test_user_occupation_onehotencoding),
                               torch.FloatTensor(test_movie_feature_data.values),
                               torch.FloatTensor(test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)


# 2. 呼叫模型與設定Loss function
model = opnn_model(num_user_age=train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=train_movie_feature_data.values.shape[1],
                 num_decoder=3*(num_features**2)+3,
                 num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [161]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 816.1337397694588
=== Epoch: 1, Train Loss: 744.1182813048363
=== Epoch: 2, Train Loss: 743.7272185087204


# PIN

In [14]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(train_user_age_onehotencoding),
                               torch.FloatTensor(train_user_occupation_onehotencoding),
                               torch.FloatTensor(train_movie_feature_data.values),
                               torch.FloatTensor(train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(test_user_age_onehotencoding),
                               torch.FloatTensor(test_user_occupation_onehotencoding),
                               torch.FloatTensor(test_movie_feature_data.values),
                               torch.FloatTensor(test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)


# 2. 呼叫模型與設定Loss function
model = pin_model(num_user_age=train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=train_movie_feature_data.values.shape[1],
                 num_decoder=3 * num_features,
                 num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [16]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        print(torch_user_age.size())
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([128, 8])
torch.Size([1

# CCPM

In [14]:
batch_size = 128
num_features = 10

# 1. 把所有東西包成dataloader
train_dataset = TensorDataset( torch.FloatTensor(train_user_age_onehotencoding),
                               torch.FloatTensor(train_user_occupation_onehotencoding),
                               torch.FloatTensor(train_movie_feature_data.values),
                               torch.FloatTensor(train_result_data) )
test_dataset = TensorDataset(  torch.FloatTensor(test_user_age_onehotencoding),
                               torch.FloatTensor(test_user_occupation_onehotencoding),
                               torch.FloatTensor(test_movie_feature_data.values),
                               torch.FloatTensor(test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)


# 2. 呼叫模型與設定Loss function
model = pin_model(num_user_age=train_user_age_onehotencoding.shape[1], 
                 num_user_occupation=train_user_occupation_onehotencoding.shape[1], 
                 num_movie_genre=train_movie_feature_data.values.shape[1],
                 num_decoder=3 * num_features,
                 num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [16]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user_age, torch_user_occupation, torch_movie_genre, torch_result in train_dataloader:
        yhat = model(user_age_feature=torch_user_age, 
                     user_occupation_feature=torch_user_occupation,
                     movie_genre_feature=torch_movie_genre)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 743.2036992311478
=== Epoch: 1, Train Loss: 743.2006698846817
=== Epoch: 2, Train Loss: 743.1977047920227


# GMF in NeuCF

In [14]:
batch_size = 128
num_features = 20

train_dataset = TensorDataset( torch.FloatTensor(rating_train_user_id_onehotencoding),
                               torch.FloatTensor(rating_train_movie_id_onehotencoding),
                               torch.FloatTensor(rating_train_result_data) )
test_dataset = TensorDataset( torch.FloatTensor(rating_test_user_id_onehotencoding),
                              torch.FloatTensor(rating_test_movie_id_onehotencoding),
                              torch.FloatTensor(rating_test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

model = gmf_neucf_model( num_user=rating_train_user_id_onehotencoding.shape[1],
                         num_item=rating_train_movie_id_onehotencoding.shape[1],
                         num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [15]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user, torch_movie, torch_result in train_dataloader:
        yhat = model(user=torch_user,
                     item=torch_movie)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 3663.477033853531
=== Epoch: 1, Train Loss: 1028.2818360328674
=== Epoch: 2, Train Loss: 812.5593981742859


# MLP in NeuCF

In [14]:
batch_size = 128
num_features = 20

train_dataset = TensorDataset( torch.FloatTensor(rating_train_user_id_onehotencoding),
                               torch.FloatTensor(rating_train_movie_id_onehotencoding),
                               torch.FloatTensor(rating_train_result_data) )
test_dataset = TensorDataset( torch.FloatTensor(rating_test_user_id_onehotencoding),
                              torch.FloatTensor(rating_test_movie_id_onehotencoding),
                              torch.FloatTensor(rating_test_result_data))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

model = mlp_neucf_model( num_user=rating_train_user_id_onehotencoding.shape[1],
                         num_item=rating_train_movie_id_onehotencoding.shape[1],
                         num_features=num_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [15]:
epochs = 3

train_each_iteration_loss = list()
train_loss_list = list()

for epoch in range(epochs):
    train_loss = 0.0
    for torch_user, torch_movie, torch_result in train_dataloader:
        yhat = model(user=torch_user,
                     item=torch_movie)

        loss = loss_func(yhat, torch_result)
        train_loss += loss.item()
        train_each_iteration_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss_list.append(train_loss)
    print(f"=== Epoch: {epoch}, Train Loss: {train_loss}")

=== Epoch: 0, Train Loss: 2382.4753637313843
=== Epoch: 1, Train Loss: 793.3175737261772
=== Epoch: 2, Train Loss: 793.259542286396


# DeepFM