In [71]:
# import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [72]:
# load in the data
# the anime dataset
anime = pd.read_csv("anime.csv")

# the users rating dataset
user_ratings = pd.read_csv("rating.csv")

In [73]:
# replaing the -1's in user_rating.rating with np.nan
user_ratings.loc[user_ratings.rating == -1, "rating"] = np.nan

In [74]:
# number of mising values in user_rating
user_ratings.isnull().mean() # ~18.9%

user_id     0.000000
anime_id    0.000000
rating      0.188962
dtype: float64

In [75]:
#number of missing values in anime
anime.isnull().mean()

anime_id    0.000000
name        0.000000
genre       0.005043
type        0.002034
episodes    0.000000
rating      0.018708
members     0.000000
dtype: float64

In [76]:
# merging anime and user_ratings
user_ratings = pd.merge(user_ratings, anime, on = "anime_id")

In [77]:
user_ratings.columns

Index(['user_id', 'anime_id', 'rating_x', 'name', 'genre', 'type', 'episodes',
       'rating_y', 'members'],
      dtype='object')

In [78]:
user_ratings.fillna(0, inplace = True)

In [79]:
user_ratings.isnull().mean()

user_id     0.0
anime_id    0.0
rating_x    0.0
name        0.0
genre       0.0
type        0.0
episodes    0.0
rating_y    0.0
members     0.0
dtype: float64

In [80]:
# dropping the unnecessary columns
user_ratings.drop(["genre", "type", "episodes", "rating_y", "members"], axis = 1, inplace = True)
#rename
user_ratings.rename(columns = {"rating_x": "rating"}, inplace = True)
user_ratings.rename(columns = {"name": "anime_name"}, inplace = True)

In [81]:
user_ratings.columns #check

Index(['user_id', 'anime_id', 'rating', 'anime_name'], dtype='object')

In [82]:
len(user_ratings)

7813727

In [83]:
user_ratings.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
0,1,20,0.0,Naruto
1,3,20,8.0,Naruto
2,5,20,6.0,Naruto
3,6,20,0.0,Naruto
4,10,20,0.0,Naruto


In [84]:
user_ratings = user_ratings[user_ratings.user_id <= 1000]

In [85]:
len(user_ratings)

96479

In [86]:
user_ratings = user_ratings.sort_values(by=['user_id', 'anime_id'])
user_ratings.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
0,1,20,0.0,Naruto
25925,1,24,0.0,School Rumble
33078,1,79,0.0,Shuffle!
40190,1,226,0.0,Elfen Lied
67696,1,241,0.0,Girls Bravo: First Season


In [87]:
user_ratings.shape[0]

96479

In [88]:
user_ratings.describe()

Unnamed: 0,user_id,anime_id,rating
count,96479.0,96479.0,96479.0
mean,511.330372,10864.815794,6.343515
std,279.701931,9117.207645,3.39145
min,1.0,1.0,0.0
25%,285.0,2472.0,6.0
50%,510.0,9656.0,8.0
75%,754.0,16762.0,9.0
max,1000.0,34240.0,10.0


In [89]:
# setting to raise exceptions
np.seterr(all = "raise")

{'divide': 'raise', 'over': 'raise', 'under': 'raise', 'invalid': 'raise'}

In [90]:
#split 70-30 for training - testing set
train_df, valid_df = train_test_split(user_ratings, test_size=0.3)

#resetting indices to avoid indexing errors in the future
train_df = train_df.reset_index()[['user_id', 'anime_id', 'rating', 'anime_name']]
valid_df = valid_df.reset_index()[['user_id', 'anime_id', 'rating', 'anime_name']]

In [91]:
train_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
0,183,30276,8.0,One Punch Man
1,127,813,9.0,Dragon Ball Z
2,455,1681,8.0,Brave Story
3,562,457,10.0,Mushishi
4,470,523,0.0,Tonari no Totoro


In [92]:
train_df = train_df.sort_values(by=['user_id', 'anime_id'])
train_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
33719,1,20,0.0,Naruto
33146,1,79,0.0,Shuffle!
44275,1,226,0.0,Elfen Lied
22115,1,241,0.0,Girls Bravo: First Season
40234,1,356,0.0,Fate/stay night


In [93]:
train_df.describe()

Unnamed: 0,user_id,anime_id,rating
count,67535.0,67535.0,67535.0
mean,510.765559,10836.002576,6.356793
std,279.906755,9094.935141,3.383208
min,1.0,1.0,0.0
25%,282.0,2472.0,6.0
50%,508.0,9591.0,8.0
75%,753.0,16742.0,9.0
max,1000.0,34240.0,10.0


In [94]:
train_df.columns

Index(['user_id', 'anime_id', 'rating', 'anime_name'], dtype='object')

In [95]:
valid_df.columns

Index(['user_id', 'anime_id', 'rating', 'anime_name'], dtype='object')

In [96]:
valid_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
0,656,9919,7.0,Ao no Exorcist
1,207,14345,0.0,Btooom!
2,382,2889,5.0,Bleach Movie 2: The DiamondDust Rebellion - Mo...
3,807,1030,10.0,Heisei Tanuki Gassen Ponpoko
4,54,30549,0.0,Soukyuu no Fafner: Dead Aggressor - Exodus 2nd...


In [97]:
valid_df = valid_df.sort_values(by=['user_id', 'anime_id'])
valid_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
5974,1,24,0.0,School Rumble
21411,1,355,0.0,Shakugan no Shana
16840,1,442,0.0,Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shin...
14553,1,2025,0.0,Darker than Black: Kuro no Keiyakusha
26726,1,2144,0.0,Naruto Movie 3: Dai Koufun! Mikazuki Jima no A...


In [98]:
len(train_df), len(valid_df)

(67535, 28944)

In [99]:
# indices start from 0
train_df.iloc[:, :2] -= 1
valid_df.iloc[:, :2] -= 1

In [100]:
train_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
33719,0,19,0.0,Naruto
33146,0,78,0.0,Shuffle!
44275,0,225,0.0,Elfen Lied
22115,0,240,0.0,Girls Bravo: First Season
40234,0,355,0.0,Fate/stay night


In [101]:
valid_df.head()

Unnamed: 0,user_id,anime_id,rating,anime_name
5974,0,23,0.0,School Rumble
21411,0,354,0.0,Shakugan no Shana
16840,0,441,0.0,Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shin...
14553,0,2024,0.0,Darker than Black: Kuro no Keiyakusha
26726,0,2143,0.0,Naruto Movie 3: Dai Koufun! Mikazuki Jima no A...


In [102]:
# converting the dataframe into a numpy array
train_matrix = train_df.to_numpy()
train_matrix

array([[0, 19, 0.0, 'Naruto'],
       [0, 78, 0.0, 'Shuffle!'],
       [0, 225, 0.0, 'Elfen Lied'],
       ...,
       [999, 11756, 9.0, 'Sword Art Online'],
       [999, 12444, 10.0, 'Tasogare Otome x Amnesia'],
       [999, 14288, 10.0, 'Sukitte Ii na yo.']], dtype=object)

In [103]:
test_matrix = valid_df.to_numpy()
test_matrix

array([[0, 23, 0.0, 'School Rumble'],
       [0, 354, 0.0, 'Shakugan no Shana'],
       [0, 441, 0.0,
        'Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!'],
       ...,
       [999, 8073, 9.0, 'Highschool of the Dead'],
       [999, 13758, 7.0, 'Sakurasou no Pet na Kanojo'],
       [999, 14226, 10.0, 'Tonari no Kaibutsu-kun']], dtype=object)

In [104]:
class MF(object):
    def __init__(self, Y, K, X=None, W=None, lambda_=0.1, alpha=0.2, epochs=200):
        """
        constructor
        :param Y: the utility matrix
        :param K: the X columns and W rows
        :param X: the items matrix
        :param W: the users matrix
        :param lambda_: regularization param of cost function to solve overFitting, default 0.1
        :param alpha: learning rate for gradient descent, default 0.2
        :param epochs: number of training loops, default 100
        """
        self.__Y_raw_data = Y
        # normalized data, update later in normalized_Y function
        self.__Y = self.__Y_raw_data.copy()
        self.__K = K

        self.__lambda_ = lambda_
        self.__alpha = alpha
        self.__epochs = epochs
        # number of users, items, and ratings
        self.__users_count = int(np.max(Y[:, 0])) + 1
        self.__items_count = int(np.max(Y[:, 1])) + 1
        self.__ratings_count = Y.shape[0]
        self.__mu = np.zeros(self.__users_count)

        if X is None:
            self.__X = np.random.randn(self.__items_count, K)
        else:
            self.__X = X
        if W is None:
            self.__W = np.random.randn(K, self.__users_count)
        else:
            self.__W = W

    def __normalized(self):
        """
        this method is used to normalized ratings
        :return:
        """
        mu = np.zeros(self.__users_count)

        for i in range(self.__users_count):
            indices_user_i = np.where(self.__Y_raw_data[:, 0] == i)[0].astype(np.int32)
            ratings = []
            for j in indices_user_i:
                if float(self.__Y[j, 2]) != float(0):
                    ratings.append(self.__Y[j, 2])
            if len(ratings):
                _mean = np.mean(ratings)
            else:
                _mean = 0
            mu[i] = _mean
            # normalized
            if _mean != 0:
                for j in indices_user_i:
                    if float(self.__Y[j, 2]) != float(0):
                        self.__Y[j, 2] -= mu[i]
        self.__mu = mu

    def __cost_function(self):
        """
        this method is used to calculate the cost function
        :return: cost function J
        """
        J = 0
        for i in range(self.__ratings_count):
            user = int(self.__Y[i, 0])
            item = int(self.__Y[i, 1])
            rate = self.__Y[i, 2]
            J += (1 / (2 * self.__ratings_count)) * np.square(rate - self.__X[item, :].dot(self.__W[:, user]))
        # regularized
        J += (self.__lambda_ / 2) * (
                np.linalg.norm(self.__X, ord="fro") + np.linalg.norm(self.__W, ord="fro"))  # Frobenius Norm
        return J

    def __get_items_rated_by_user(self, user_id):
        """
        get all items which are rated by user user_id and get the corresponding rates
        :param user_id: id of target user
        :return: array of item ids and ratings
        """
        indices_user = np.where(self.__Y[:, 0] == user_id)[0].astype(np.int32)
        item_ids = self.__Y[indices_user, 1].astype(np.int32)
        ratings = self.__Y[indices_user, 2].astype(np.float32)
        return item_ids, ratings

    def __get_users_rating_item(self, item_id):
        """
        get all users who rated item item_id and get the corresponding rates
        :param item_id: id of item that need to find users who rated it
        :return: array of user ids and ratings
        """
        indices_item = np.where(self.__Y[:, 1] == item_id)[0].astype(np.int32)
        user_ids = self.__Y[indices_item, 0].astype(np.int32)
        ratings = self.__Y[indices_item, 2].astype(np.float32)
        return user_ids, ratings

    def __update_x(self):
        """
        update rows of X matrix
        :return:
        """
        for i in range(self.__items_count):
            user_ids, ratings = self.__get_users_rating_item(i)
            Wi = self.__W[:, user_ids]
            self.__X[i, :] = self.__X[i, :] - self.__alpha * (
                        -(1 / self.__ratings_count) * ((ratings - np.dot(self.__X[i, :], Wi)).dot(Wi.T))
                        + (self.__lambda_ * self.__X[i, :])).reshape((-1, self.__K))

    def __update_w(self):
        """
        update columns of W matrix
        :return:
        """
        for i in range(self.__users_count):
            item_ids, ratings = self.__get_items_rated_by_user(i)
            Xi = self.__X[item_ids, :]
            self.__W[:, i] = self.__W[:, i] - self.__alpha * (
                        -(1 / self.__ratings_count) * Xi.T.dot(ratings - Xi.dot(self.__W[:, i]))
                        + self.__lambda_ * self.__W[:, i]).reshape((self.__K, ))

    def matrix_factorization(self):
        """
        implementation of matrix factorization algo
        :return:
        """
        self.__normalized()
        for i in range(self.__epochs):
            self.__update_x()
            self.__update_w()
            mse_train = self.mse_evaluate(self.__Y_raw_data)
            print("epoch:", i + 1, "cost:", self.__cost_function(), "mse:", mse_train)

    def __predict(self, user_id, item_id):
        """
        this method is used to make prediction about rating for item item_id of user user_id
        :param user_id: id of user target
        :param item_id: id of item target
        :return: prediction
        """
        pred_result = self.__X[item_id, :].dot(self.__W[:, user_id]) + self.__mu[user_id]
        # truncate if results are out of range [0, 10]
        if pred_result < 0:
            return 0
        elif pred_result > 10:
            return 10
        return pred_result

    def mse_evaluate(self, testing_set):
        """
        this method is used to evaluate the accuracy of our model using MSE
        :param testing_set: our dataset for testing
        :return: MSE
        """
        number_of_test = testing_set.shape[0]
        square_error = 0
        for i in range(number_of_test):
            prediction = self.__predict(testing_set[i, 0], testing_set[i, 1])
            square_error += np.square(prediction - testing_set[i, 2])
        mean_square_error = square_error / number_of_test
        return mean_square_error

    def __recommend(self, user_id):
        """
        Determine all items should be recommended for user u
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.
        """
        indices_of_user = np.where(self.__Y[:, 0] == user_id)[0]
        items_rated_by_user = self.__Y[indices_of_user, 1].tolist()
        recommended_items = []
        for i in range(self.__items_count):
            if i not in items_rated_by_user:
                rating = self.__predict(user_id, i)
                if rating > 7:
                    recommended_items.append(i)
        return recommended_items

    def print_recommendation(self, user_id):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        recommended_items = self.__recommend(user_id)
        print('Recommend item(s):', recommended_items, 'to user', user_id)



In [105]:
result_train = MF(train_matrix, K = 10)
result_train.matrix_factorization()

test_evaluate = result_train.mse_evaluate(test_matrix)
print("Test MSE:", test_evaluate)

epoch: 1 cost: 38.73005362812637 mse: 14.2306641786887
epoch: 2 cost: 37.703981773676674 mse: 13.957793630056438
epoch: 3 cost: 36.71965179194599 mse: 13.69950309252473
epoch: 4 cost: 35.77451016546333 mse: 13.455273852297898
epoch: 5 cost: 34.866196357004895 mse: 13.225955953128702


KeyboardInterrupt: 