In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
mnames = ['movie_id', 'title', 'genre']
movies_df = pd.read_table('ml-1m/movies.dat', names = mnames, sep = "::", engine = 'python', encoding='ISO-8859-1')
# Loading the cleaned datasets
rnames = ['user_id','movie_id','rating','timestamp']
ratings_df = pd.read_table("ml-1m/ratings.dat", header =None, sep='::',names=rnames, engine= 'python')
uname = ['user_id','gender','age','occupation','zip']
users_df = pd.read_table("ml-1m/users.dat", sep='::', header = None, names=uname, engine='python')

KeyboardInterrupt: 

In [None]:
# Splitting the ratings dataset into the feature set (X) and target labels (y)
X = ratings_df.drop(columns='rating')
y = ratings_df["rating"].values  # The movie ratings are the target variables we want to predict

# Preparing train, validation and test datasets.
# I have chosen a split ratio of 80%, 10%, 10%, because I want a somewhat large training set at the cost of a
# smaller validation and test set. I do not think that a smaller validation (or test) dataset will negatively
# impact the generalization ability of the chosen models, because I am only using rather simple ML models
# with few hyperparamaters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Creating a complete training dataset with X_train and y_train
train_df = X_train.copy()
train_df["rating"] = y_train

test_df = X_test.copy()
test_df["rating"] = y_test

In [None]:
train_set = np.array(train_df)
train_set = train_set.astype("int")
test_set = np.array(train_df)
test_set = test_set.astype("int")

In [None]:
#take max users id in train and test data
nb_users = int(max(max(train_set[:, 0]), max(test_set[:, 0])))
nb_movies =  int(max(max(train_set[:, 1]), max(test_set[:, 1])))
print(nb_users, nb_movies)

6040 3952


In [None]:
def convert(data):
    rating_data = np.zeros((nb_users, nb_movies))
    timestamp_data = np.zeros((nb_users, nb_movies))
    for id_users in range(1, nb_users + 1):
        ##id of movies that is rated by current users
        id_movies = data[:,1][data[:,0] == id_users]
        
        ##rate of movies that is given by current user
        id_ratings = data[:,3][data[:,0] == id_users]
        id_timestamps = data[:, 2][data[:, 0] == id_users]
        
        #inialize ratings for all movies
        #set 0 for movies that are not rated by current users
        ratings = np.zeros(nb_movies)
        #movie id starts from 1, 1st movie will be 1st element in rating with index as 0
        ratings[id_movies - 1] = id_ratings
        rating_data[id_users - 1] = ratings

        timestamps = np.zeros(nb_movies)
        timestamps[id_movies - 1] = id_timestamps
        timestamp_data[id_users - 1] = timestamps
    return rating_data, timestamp_data

In [None]:
train_rate, train_timestamp = convert(train_set)
test_rate, test_timestamp = convert(test_set)

In [None]:
films_df = pd.read_csv("./films.csv")
nb_genres = len(films_df.columns) - 2
films_df = films_df.set_index('movie_id')
films_df.drop(columns=["title"], inplace=True)
new_index = np.array(range(1, 3953))
films_df = films_df.reindex(new_index, fill_value=0)
films_matrix = np.array(films_df)
films_matrix = films_matrix.astype('int')

In [None]:
from tqdm import tqdm 

users_prefer_matrix = np.zeros((nb_users, nb_genres))
users_rating_matrix = np.zeros((nb_users, nb_genres))
for i in tqdm(range(nb_users)):
    films = films_matrix[train_rate[i] > 0]
    ratings = train_rate[i][train_rate[i] > 0].reshape(films.shape[0])
    timestamps = train_timestamp[i][train_rate[i] > 0]
    
    rate = 0.1
    timestamps = (timestamps - np.min(timestamps)) / (np.max(timestamps) - np.min(timestamps)) * (-rate)
    timestamps = np.exp(timestamps)
    A = np.diag(np.ones(films.shape[0])) * timestamps * ratings 
    avg = np.ones((1, films.shape[0])) / films.shape[0]
    P = np.dot(avg, np.dot(A, films)).reshape(-1)
    users_prefer_matrix[i] = P


100%|██████████| 6040/6040 [00:02<00:00, 2636.18it/s]


In [260]:
k = 9
F = 60
lr = 0.015
lam = 0.03
epochs = 20
N = 30

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=k, random_state=101).fit(users_prefer_matrix)



In [None]:

item_ids = set(ratings_df['movie_id'].values)


In [None]:
from math import exp
import random
import pickle

class Corpus:
    @classmethod
    def pre_process(cls, frame, index):
        cls.frame = frame
        cls.items_dict_path = 'lfm/lfm_items_{}.dict'.format(index)
        cls.user_ids = set(cls.frame['user_id'].values)
        cls.item_ids = item_ids
        cls.items_dict = {user_id: cls._get_pos_neg_item(user_id) for user_id in list(cls.user_ids)}
        cls.save(index)

    @classmethod
    def _get_pos_neg_item(cls, user_id):
        """
        Define the pos and neg item for user.
        pos_item mean items that user have rating, and neg_item can be items
        that user never see before.
        Simple down sample method to solve unbalance sample.
        """
        pos_item_ids = set(cls.frame[cls.frame['user_id'] == user_id]['movie_id'])
        neg_item_ids = cls.item_ids ^ pos_item_ids
        neg_item_ids = list(neg_item_ids)[:len(pos_item_ids)]
        item_dict = {}
        sub_frame = cls.frame[cls.frame['user_id'] == user_id]
        for item in pos_item_ids: item_dict[item] = int(sub_frame[sub_frame['movie_id'] == item]['rating'])
        for item in neg_item_ids: item_dict[item] = 0
        return item_dict

    @classmethod
    def save(cls, index):
        f = open('lfm/lfm_items_{}.dict'.format(index), 'wb')
        pickle.dump(cls.items_dict, f)
        f.close()

    @classmethod
    def load(cls, index):
        f = open('lfm/lfm_items_{}.dict'.format(index), 'rb')
        items_dict = pickle.load(f)
        f.close()
        return items_dict

In [None]:
for index in range(9):
    frame = train_df[kmeans.labels_[train_df['user_id']-1] == index]
    print(index, frame.size)
    Corpus.pre_process(frame, index)

0 84344
1 481080
2 174028
3 45380
4 659208
5 168660
6 1269484
7 272924
8 45560


In [121]:
user_neighbors_N = np.zeros((nb_users, N))
user_neighbors_index_N = np.zeros((nb_users, N))
user_neighbors_sim_N = np.zeros((nb_users, N))
user_neighbors_sim_sum = np.zeros((nb_users))
cluster_prefix_matrix = []
cluster_index = []
for i in range(k):
    cluster_prefix_matrix.append(users_prefer_matrix[kmeans.labels_ == i])
    cluster_index.append(np.argwhere(kmeans.labels_ == i))

In [125]:
sum_user = 0
for i in range(k):
    group_index = cluster_index[i]
    sum_user += len(group_index)
    group_prefer_matrix = cluster_prefix_matrix[i]
    norm = np.linalg.norm(group_prefer_matrix, axis=1, keepdims=True)
    group_sim_matrix = (np.dot(group_prefer_matrix, group_prefer_matrix.T)) / (norm * norm.T)
    for i in range(len(group_index)):
        group_sim_matrix[i, i] = 0

    for index, user in tqdm(enumerate(group_index)):
        neighbors = np.argpartition(group_sim_matrix[index], -N)[-N:]
        user_neighbors_index_N[user] = group_index[neighbors].reshape(1, -1).astype(int)
        user_neighbors_N[user] = (group_index[neighbors].reshape(1, -1) + 1).astype(np.int64)
        user_neighbors_sim_N[user] = group_sim_matrix[index, neighbors]
        user_neighbors_sim_sum[user] = np.sum(user_neighbors_sim_N[user])


303it [00:00, 37865.33it/s]
945it [00:00, 32578.39it/s]
517it [00:00, 36919.93it/s]
119it [00:00, 39656.93it/s]
996it [00:00, 34337.44it/s]
491it [00:00, 37762.96it/s]
1680it [00:00, 28958.81it/s]
829it [00:00, 30696.71it/s]
160it [00:00, 39993.36it/s]


In [257]:
class LFM:
    def __init__(self, F, lr, lam, epochs, frame, index):
        self.class_count = F
        self.iter_count = epochs
        self.lr = lr
        self.lam = lam
        self._init_model(frame, index)

    def _init_model(self, frame, index):
        """
        Get corpus and initialize model params.
        """
        self.frame = frame
        self.user_ids = set(self.frame['user_id'].values)
        self.item_ids = item_ids
        self.items_dict = Corpus.load(index)
        self.index = index

        array_p = np.random.randn(len(self.user_ids), self.class_count)
        array_q = np.random.randn(len(self.item_ids), self.class_count)
        self.p = pd.DataFrame(array_p, columns=range(0, self.class_count), index=list(self.user_ids))
        self.q = pd.DataFrame(array_q, columns=range(0, self.class_count), index=list(self.item_ids))
        
    def _predict(self, user_id, item_id):
        """
        Calculate interest between user_id and item_id.
        p is the look-up-table for user's interest of each class.
        q means the probability of each item being classified as each class.
        """
        p = np.mat(self.p.loc[user_id].values)
        q = np.mat(self.q.loc[item_id].values).T
        r = (p * q).sum()
        
        # p = self.p.loc[user_neighbors_N[user_id - 1]].values # N * F
        # rate = train_rate[user_neighbors_index_N[user_id - 1].astype(int), item_id - 1].reshape(-1, 1)
        # neigh = (np.dot(p, q) - rate) * user_neighbors_sim_N[user_id - 1] / user_neighbors_sim_sum[user_id - 1]
        # print(neigh)
        return r

    def _loss(self, user_id, item_id, y, step):
        """
        Loss Function define as MSE, the code write here not that formula you think.
        """
        e = y - self._predict(user_id, item_id)
        return e

    def _optimize(self, user_id, item_id, e):
        """
        Use SGD as optimizer, with L2 p, q square regular.
        e.g: E = 1/2 * (y - predict)^2, predict = matrix_p * matrix_q
             derivation(E, p) = -matrix_q*(y - predict), derivation(E, q) = -matrix_p*(y - predict),
             derivation（l2_square，p) = lam * p, derivation（l2_square, q) = lam * q
             delta_p = lr * (derivation(E, p) + derivation（l2_square，p))
             delta_q = lr * (derivation(E, q) + derivation（l2_square, q))
        """
        gradient_p = -e * self.q.loc[item_id].values
        l2_p = self.lam * self.p.loc[user_id].values
        delta_p = self.lr * (gradient_p + l2_p)

        gradient_q = -e * self.p.loc[user_id].values
        l2_q = self.lam * self.q.loc[item_id].values
        delta_q = self.lr * (gradient_q + l2_q)

        self.p.loc[user_id] -= delta_p
        self.q.loc[item_id] -= delta_q

    def train(self):
        """
        Train model.
        """
        for step in tqdm(range(0, self.iter_count)):
            for user_id, item_dict in self.items_dict.items():
                item_ids = list(item_dict.keys())
                random.shuffle(item_ids)
                for item_id in item_ids:
                    e = self._loss(user_id, item_id, item_dict[item_id] / 5, step)
                    self._optimize(user_id, item_id, e)
            self.lr *= 0.9
        self.save()

    def predict(self, user_id, top_n=10):
        """
        Calculate all item user have not meet before and return the top n interest items.
        """
        user_item_ids = set(self.frame[self.frame['user_id'] == user_id]['movie_id'])
        other_item_ids = self.item_ids ^ user_item_ids
        interest_list = [self._predict(user_id, item_id) for item_id in other_item_ids]
        candidates = sorted(zip(list(other_item_ids), interest_list), key=lambda x: x[1], reverse=True)
        return candidates[:top_n]
    
    def predictRate(self, user_id, item_id):
        return self._predict(user_id, item_id)

    def save(self):
        """
        Save model params.
        """
        p = self.p.reset_index(drop=False)
        q = self.q.reset_index(drop=False)

        p.to_csv("lfm/lfm_p_{}.csv".format(self.index), index=False)
        q.to_csv("lfm/lfm_q_{}.csv".format(self.index), index=False)
        # f = open('lfm/lfm_{}.model'.format(self.index), 'wb')
        # pickle.dump((self.p, self.q), f)
        # f.close()

    def load(self):
        """
        Load model params.
        """
        self.p = pd.read_csv("lfm/lfm_p_{}.csv".format(self.index))
        self.p.set_index('index',drop=True, append=False, inplace=True, verify_integrity=False)
        self.q = pd.read_csv("lfm/lfm_q_{}.csv".format(self.index))
        self.q.set_index('index',drop=True, append=False, inplace=True, verify_integrity=False)

        # f = open('lfm/lfm_{}.model'.format(self.index), 'rb')
        # self.p, self.q = pickle.load(f)
        # print(self.p, self.q)
        # f.close()

In [262]:
for index in range(1, k):
    frame = train_df[kmeans.labels_[train_df['user_id']-1] == index]
    lfm = LFM(F, lr, lam, epochs, frame, index)
    lfm.train()

 15%|█▌        | 3/20 [10:57<1:02:05, 219.15s/it]

In [None]:
import math

test_loss = 0
s = float(np.sum(test_set > 0))
for i in range(k):
    frame = train_df[kmeans.labels_[train_df['user_id']-1] == i]
    lfm = LFM(F, lr, lam, epochs, frame, i)
    lfm.load()
    for user in cluster_index[i]:
        items = test_rate[user][0]
        for index, item in enumerate(items):
            if item > 0:
                pred = lfm.predictRate(user[0] + 1, index + 1)
                test_loss += (pred * 5 - item) ** 2

test_loss = math.sqrt(test_loss / s)     
print(test_loss)   

8.22578462168399
