# Implemented Matrix Factorization Approach
### Paper: https://datajobs.com/data-science-repo/Recommender-Systems-%5BNetflix%5D.pdf

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import scipy
import time
import math
from scipy.sparse import coo_matrix

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.metrics import mean_squared_error, mean_absolute_error

import tensorflow as tf
from tensorflow.python.framework import ops
RANDOM_STATE = 17

  from ._conv import register_converters as _register_converters


### Main class

In [9]:
from cf_utils import *  # model utils
class CollaborativeFiltering():
    """
        Matrix-factorization collaborative filtering trained with TensorFlow.

        Attributes:
        data -- user x item rating matrix given to the constructor; 0 means "no rating"
        train, test -- matrices built by _train_test_split (held-out ratings are zeroed
                       in train and are the only non-zero entries of test)
        indexes_of_test -- list with one integer array per user: the held-out item columns
        parameters -- dictionary with the learned factor matrices (keys 'p_u' and 'q_i')
        user_predicted, item_predicted -- learned user / item embeddings
        objective -- cost recorded after every training epoch
        dictionary -- result of the last predict_parameters call
        n_factors -- number of latent factors used by the last fit call
    """
    # Class-level defaults so the attributes exist before fit() is called.
    # __init__ gives every instance its own containers and the methods always
    # rebind (never mutate) them, so no state is shared between instances.
    parameters = {}
    objective = []
    user_predicted = None  # embeddings
    item_predicted = None  # embeddings
    dictionary = {}
    train, test = None, None
    data = None
    indexes_of_test = []
    n_factors = None

    def __init__(self, data):
        """
            data -- format specified in paper (2-D user x item array, 0 = not rated)
        """
        self.data = data
        self.parameters = {}
        self.objective = []
        self.dictionary = {}
        self.indexes_of_test = []

    def _train_test_split(self, k=1, test_fraction=0.3):
        """
            Constructing the test set by moving some non-zero ratings of every user
            out of the train matrix.

            k -- users with at most 2*k ratings get min(k, number of ratings) held out
            test_fraction -- share of ratings held out for users with more than 2*k ratings

            Items are drawn without replacement through np.random (seed it for
            reproducibility). Every user gets an entry in indexes_of_test, possibly
            empty, so position i always refers to row i of the matrices.
        """
        ratings = np.asarray(self.data)
        train = np.copy(ratings)
        test = np.zeros_like(ratings)
        held_out_per_user = []

        for user in range(ratings.shape[0]):
            rated = np.flatnonzero(ratings[user])
            num = rated.size
            if num > k * 2:
                size = min(num, int(num * test_fraction))
            else:
                size = min(k, num)

            if size > 0:
                held_out = np.random.choice(rated, size=size, replace=False)
            else:
                # nothing to hold out (e.g. a user without ratings)
                held_out = rated[:0]

            held_out_per_user.append(held_out)
            train[user, held_out] = 0
            test[user, held_out] = ratings[user, held_out]

        self.train, self.test = train, test
        # rebinding (instead of appending) resets the split on every call
        self.indexes_of_test = held_out_per_user

    def _getEmbeddings(self):
        """
            Returns random user (m x n_factors) and item (n_factors x n) matrices
            for the current train matrix. Requires a previous fit() call.
        """
        if self.train is None or self.n_factors is None:
            raise RuntimeError("_getEmbeddings needs train data and n_factors; call fit() first")
        m, n = self.train.shape[0], self.train.shape[1]
        u_emb, i_emb = np.random.rand(m, self.n_factors), np.random.rand(self.n_factors, n)
        return u_emb, i_emb

    def _toDataFrame(self, pred):
        """
            Wraps pred in a DataFrame, reusing the row/column labels of data when it
            has them (pandas input) and default integer labels otherwise.
        """
        index = getattr(self.data, 'index', None)
        columns = getattr(self.data, 'columns', None)
        df = pd.DataFrame(pred, index=index, columns=columns)
        return df

    def fit(self, n_factors, learning_rate=0.005, num_epochs=10, verbose=0, minibatch_size=2,
            lmbda=.02):
        """
            Learning user and item embeddings from train data

            n_factors -- number of latent factors
            learning_rate -- learning rate of the Adam optimizer
            num_epochs -- number of optimization steps (one full-batch step per epoch)
            verbose -- 1 prints the cost every 5th epoch
            minibatch_size -- kept for backward compatibility; currently unused
            lmbda -- weight of the L2 regularization term

            A new train/test split (k=2) is drawn on every call. Results are stored
            in objective, user_predicted, item_predicted and parameters.
        """
        # initialization
        ops.reset_default_graph()  # to be able to rerun the model without overwriting tf variables
        tf.set_random_seed(RANDOM_STATE)
        self.n_factors = n_factors
        self._train_test_split(2)
        n_rows, n_cols = self.train.shape[0], self.train.shape[1]
        costs = []

        # regularization parameter
        beta = tf.constant(lmbda, dtype=tf.float64)

        # tf variables for the factor matrices, collected in a dictionary
        # (helper comes from cf_utils; expected keys are 'p_u' and 'q_i')
        parameters = initialize_parameters_with_random(n_rows, n_cols, n_factors)

        u_emb, i_emb = parameters['p_u'], parameters['q_i']
        # forward pass (dot product)
        Y_pred = forward_pass(parameters)

        # computing loss
        loss = compute_cost(Y_pred, self.train)
        regularization = tf.add(tf.nn.l2_loss(u_emb), tf.nn.l2_loss(i_emb))
        cost = tf.reduce_mean(tf.add(tf.cast(loss, dtype=tf.float64), tf.cast(beta * regularization, dtype=tf.float64)))

        # optimization with Adam(improved SGD)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

        init = tf.global_variables_initializer()

        # make it all work for num_epochs times
        with tf.Session() as sess:
            sess.run(init)
            print('Start')
            for epoch in range(num_epochs):
                _, epoch_cost = sess.run([optimizer, cost])

                if verbose == 1 and epoch % 5 == 0:
                    print('After epoch %i cost: %.4f' % (epoch, epoch_cost))
                costs.append(epoch_cost)

            # evaluate the variables to plain arrays before the session closes
            parameters = sess.run(parameters)
        self.objective = costs
        self.user_predicted, self.item_predicted = parameters['p_u'], parameters['q_i']
        self.parameters = parameters

    def predict(self, err='mae'):
        """
            Prediction on the test set: compares the reconstructed ratings with the
            held-out ones.

            err -- 'mae' (mean absolute error) or 'mse' (mean squared error)

            Returns the error as a float.
            Raises ValueError for an unknown err or when nothing was held out,
            RuntimeError when the model has not been fitted.
        """
        if err not in ('mae', 'mse'):
            raise ValueError("err must be 'mae' or 'mse', got %r" % (err,))
        if self.user_predicted is None or self.item_predicted is None:
            raise RuntimeError("fit() must be called before predict()")

        prediction = np.dot(self.user_predicted, self.item_predicted)

        # per-user index arrays have different lengths, so they are gathered
        # row by row instead of being packed into one (ragged) ndarray
        values_test = [self.test[user, held_out]
                       for user, held_out in enumerate(self.indexes_of_test)]
        values_pred = [prediction[user, held_out]
                       for user, held_out in enumerate(self.indexes_of_test)]
        if not values_test:
            raise ValueError("no held-out ratings to evaluate")

        n = np.concatenate(values_test).astype(float)
        p = np.concatenate(values_pred).astype(float)
        if n.size == 0:
            raise ValueError("no held-out ratings to evaluate")

        diff = p - n
        if err == 'mae':
            return float(np.mean(np.abs(diff)))
        return float(np.mean(diff ** 2))

    def predict_parameters(self, users, items, kbest=10):
        """
            users -- row labels, one per user of the fitted matrix
            items -- column labels, one per item of the fitted matrix
            kbest -- number of top-scored items to keep per user

            Returns:
            pred_dictionary -- dictionary mapping every user label to a pandas Series
                               with its kbest highest predicted scores (ascending order,
                               indexed by item label); also stored in dictionary
        """
        if self.user_predicted is None or self.item_predicted is None:
            raise RuntimeError("fit() must be called before predict_parameters()")

        pred = np.dot(self.user_predicted, self.item_predicted)
        df = pd.DataFrame(pred, index=users, columns=items)
        # explicit start offset: a plain [-kbest:] slice would return everything for kbest=0
        start = max(pred.shape[1] - kbest, 0)
        pred_dictionary = {}
        for i, user in enumerate(df.index):
            pred_dictionary[user] = df.iloc[i, np.argsort(pred[i])[start:]]
        self.dictionary = pred_dictionary
        return pred_dictionary

### Testing the class

In [11]:
# Create a smaller working set for experiments: keep the first 10,000 rows of UI.
# NOTE(review): UI is defined outside this excerpt; .toarray() suggests a scipy
# sparse matrix — confirm. toarray() runs on the whole of UI before the slice is taken.
batch = UI.toarray()[:10000]

In [12]:
## test set
# Hold out up to two rated items per user: they are zeroed in `train` and are the
# only non-zero entries of `test`. `indexes_of_test[user]` lists the held-out columns.
train = np.copy(batch)
test = np.zeros_like(batch)
indexes_of_test = []
for user in range(batch.shape[0]):
    rated = np.flatnonzero(batch[user])
    n_test = min(2, rated.size)
    if n_test > 0:
        # without replacement, so the same item is never drawn twice
        test_rating = np.random.choice(rated, size=n_test, replace=False)
    else:
        # user without ratings: nothing to hold out
        test_rating = rated[:0]
    indexes_of_test.append(test_rating)
    train[user, test_rating] = 0
    test[user, test_rating] = batch[user, test_rating]

In [15]:
# Build the model on the first 10,000 rows of predict_matrix.
# NOTE(review): predict_matrix is defined outside this excerpt; .values suggests
# a pandas object — confirm.
cf = CollaborativeFiltering(predict_matrix.values[:10000])

In [17]:
%%time
# Train with 50 latent factors for 25 epochs; verbose=1 prints the cost every 5th epoch.
cf.fit(n_factors=50, verbose=1, num_epochs=25)

Start
After epoch 0 cost: 2.1064
After epoch 5 cost: 0.2649
After epoch 10 cost: 0.3672
After epoch 15 cost: 0.2327
After epoch 20 cost: 0.1824
CPU times: user 4min 29s, sys: 1min 1s, total: 5min 30s
Wall time: 2min 4s
