# Recommendation Systems Assignment

### MIE451/1513 UofT

## Imports

In [1]:
# import required libraries
import os
import os.path
import numpy as np
import pandas as pd
from math import sqrt
from heapq import nlargest
from tqdm import trange
from tqdm import tqdm
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error

## Support functions and variables

In [2]:
#!unzip ml-100k.zip -d .

In [3]:
MOVIELENS_DIR = "ml-100k"

In [4]:
!ls {MOVIELENS_DIR}

README
allbut.pl
mku.sh
u.data
u.genre
u.info
u.item
u.occupation
u.user
u1.base
u1.test
u2.base
u2.test
u3.base
u3.test
u4.base
u4.test
u5.base
u5.test
ua.base
ua.test
ub.base
ub.test


In [5]:
def getData(folder_path, file_name):
    fields = ['userID', 'itemID', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join(folder_path, file_name), sep='\t', names=fields)
    return data 

In [6]:
rating_df = getData(MOVIELENS_DIR, 'u.data')

In [99]:
num_users = len(rating_df.userID.unique())
num_items = len(rating_df.itemID.unique())
print("Number of users:", num_users)
print("Number of items:", num_items)

Number of users: 943
Number of items: 1682


## Q1

### (a)

In [8]:
def dataPreprocessor(rating_df, num_users, num_items):
    """
        INPUT: 
            data: pandas DataFrame. columns=['userID', 'itemID', 'rating' ...]
            num_row: int. number of users
            num_col: int. number of items
            
        OUTPUT:
            matrix: 2D numpy array. row IDs are (userID-1), columns IDs are (itemID-1),
            and the rating for (userID,itemID,rating) is the value at this row and column.  
            Any observed ratings are zero.
            
        NOTE 1: see where something very similar is done in the lab in function 'buildUserItemMatrix'    
            
        NOTE 2: data can have more columns, but your function should ignore 
              additional columns.
              
    """
    matrix = np.zeros((num_users, num_items))
    ########### your code goes here ###########
    
    rating_df = rating_df[["userID","itemID","rating","timestamp"]]
    
    for (index, userID, itemID, rating, timestamp) in rating_df.itertuples():
        matrix[userID-1, itemID-1] = rating
    
    
    ###########         end         ###########
    return matrix

In [13]:
dataPreprocessor(rating_df, num_users, num_items)

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

### (b)

In [14]:
class BaseLineRecSys(object):
    def __init__(self, method, processor=dataPreprocessor):
        """
            method: string. From ['popularity','useraverage']
            processor: function name. dataPreprocessor by default
        """
        self.method_name = method
        self.method = self._getMethod(self.method_name)
        self.processor = processor
        self.pred_column_name = self.method_name
        
    def _getMethod(self, method_name):
        """
            Don't change this
        """
        switcher = {
            'popularity': self.popularity,
            'useraverage': self.useraverage,
        }
        
        return switcher[method_name]
    
    @staticmethod
    def useraverage(train_matrix, num_users, num_items):
        """
            INPUT:
                train_matrix: 2D numpy array.
                num_users: int. Number of Users.
                num_items: int. Number of Items.
            OUTPUT:
                predictionMatrix: 2D numpy array. this is the same dimensions and 
                row/column IDs as train_matrix, but anywhere there is a 0 in train_matrix, 
                there should be a predicted value in predictedMatrix.
                
            NOTE: see where something very similar is done in the lab in function 'predictByUserAverage'  
            
        """
        
        predictionMatrix = np.zeros((num_users, num_items))
        ########### your code goes here ###########
        
        for (user,item), rating in np.ndenumerate(train_matrix):
        # Predict rating for every item that wasn't ranked by the user (rating == 0)
        #if rating == 0:
        # Extract the items the user already rated
            userVector = train_matrix[user, :]
            ratedItems = userVector[userVector.nonzero()]

            # If not empty, calculate average and set as rating for the current item
            if ratedItems.size == 0:
                itemAvg = 0
            else:
                itemAvg = ratedItems.mean()
            predictionMatrix[user, item] = itemAvg
            
        # report progress every 100 users
        if (user % 100 == 0 and item == 1):
            print ("calculated %d users" % (user,))
        
        

        ###########         end         ###########
        return predictionMatrix
    
    @staticmethod
    def popularity(train_matrix, num_users, num_items):
        """
            INPUT:
                train_matrix: 2D numpy array.
                num_users: int. Number of Users.
                num_items: int. Number of Items.
            OUTPUT:
                predictionMatrix: 2D numpy array. this is the same dimensions and 
                row/column IDs as train_matrix, but anywhere there is a 0 in train_matrix, 
                there should be a predicted value in predictedMatrix.
                
            NOTE: see where something very similar is done in the lab in function 'predictByPopularity'    
        """
        
        predictionMatrix = np.zeros((num_users, num_items))
        ########### your code goes here ###########

        # Define function for converting 1-5 rating to 0/1 (like / don't like)
        vf = np.vectorize(lambda x: 1 if x >= 4 else 0)

        # For every item calculate the number of people liked (4-5) divided by the number of people that rated
        itemPopularity = np.zeros((num_items))
        for item in range(num_items):
            numOfUsersRated = len(train_matrix[:, item].nonzero()[0])
            numOfUsersLiked = len(vf(train_matrix[:, item]).nonzero()[0])
            if numOfUsersRated == 0:
                itemPopularity[item] = 0
            else:
                itemPopularity[item] = numOfUsersLiked/numOfUsersRated

        for (user,item), rating in np.ndenumerate(train_matrix):
            # Predict rating for every item that wasn't ranked by the user (rating == 0)
            #if rating == 0:
            predictionMatrix[user, item] = itemPopularity[item]

            # report progress every 100 users
            if (user % 100 == 0 and item == 1):
                print ("calculated %d users" % (user,))
        
        
                
        ###########         end         ###########
        return predictionMatrix    
    
    def predict_all(self, train_df, num_users, num_items):
        
        train_matrix = self.processor(train_df, num_users, num_items)
        self.__model = self.method(train_matrix, num_users, num_items)
        
    def evaluate_test(self, test_df, copy=False):
        
        if copy:
            prediction = test_df.copy()
        else:
            prediction = test_df
            
        prediction[self.pred_column_name] = np.nan
        
        for (index, 
             userID, 
             itemID) in tqdm(prediction[['userID','itemID']].itertuples()):
            prediction.ix[index, self.pred_column_name] = self.__model[userID-1, itemID-1]

        return prediction
        
    def getModel(self):
        """
            return predicted user-item matrix
        """
        return self.__model
    
    def getPredColName(self):
        """
            return prediction column name
        """
        return self.pred_column_name
    
    def reset(self):
        """
            reuse the instance of the class by removing model
        """
        try:
            self.model = None
        except:
            print("You don not have model..")
            

In [15]:
popularity_recsys = BaseLineRecSys('popularity')

In [16]:
popularity_recsys.predict_all(rating_df, num_users, num_items)

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


In [17]:
popularity_recsys.getModel()

array([[ 0.71017699,  0.38931298,  0.37777778, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.71017699,  0.38931298,  0.37777778, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.71017699,  0.38931298,  0.37777778, ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.71017699,  0.38931298,  0.37777778, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.71017699,  0.38931298,  0.37777778, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.71017699,  0.38931298,  0.37777778, ...,  0.        ,
         0.        ,  0.        ]])

In [18]:
rating_df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [19]:
popularity_recsys.evaluate_test(rating_df,copy=True).head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
100000it [01:05, 1516.85it/s]


Unnamed: 0,userID,itemID,rating,timestamp,popularity
0,196,242,3,881250949,0.760684
1,186,302,3,891717742,0.804714
2,22,377,1,878887116,0.076923
3,244,51,2,880606923,0.555556
4,166,346,1,886397596,0.611111


In [20]:
average_user_rating_recsys = BaseLineRecSys('useraverage')

In [21]:
average_user_rating_recsys.predict_all(rating_df, num_users, num_items)

In [22]:
average_user_rating_recsys.getModel()

array([[ 3.61029412,  3.61029412,  3.61029412, ...,  3.61029412,
         3.61029412,  3.61029412],
       [ 3.70967742,  3.70967742,  3.70967742, ...,  3.70967742,
         3.70967742,  3.70967742],
       [ 2.7962963 ,  2.7962963 ,  2.7962963 , ...,  2.7962963 ,
         2.7962963 ,  2.7962963 ],
       ..., 
       [ 4.04545455,  4.04545455,  4.04545455, ...,  4.04545455,
         4.04545455,  4.04545455],
       [ 4.26582278,  4.26582278,  4.26582278, ...,  4.26582278,
         4.26582278,  4.26582278],
       [ 3.41071429,  3.41071429,  3.41071429, ...,  3.41071429,
         3.41071429,  3.41071429]])

In [23]:
average_user_rating_recsys.evaluate_test(rating_df,copy=True).head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
100000it [01:05, 1524.65it/s]


Unnamed: 0,userID,itemID,rating,timestamp,useraverage
0,196,242,3,881250949,3.615385
1,186,302,3,891717742,3.413043
2,22,377,1,878887116,3.351562
3,244,51,2,880606923,3.651261
4,166,346,1,886397596,3.55


## Q2

### (a) The cosine metric works better when the data is sparse as in the users don't have rating values for most movies. While the Euclidean metric is better when the data is dense, almost all user-item ratings are non-zero and the magnitude of the rating matters. Since the data is mostly sparse and with the assumption that 2 users watch specific similar movies the ratings (magnitude) may not matter if they both had attempted to watch the same movies, therefore the cosine metric may work better.

In [70]:
class SimBasedRecSys(object):

    def __init__(self, base, method, processor=dataPreprocessor):
        """
            base: string. From ['user', 'item']. User-based Similarity or Item-based
            method: string. From ['cosine', 'euclidean', 'somethingelse']
            processor: function name. dataPreprocessor by default
        """
        self.base = base
        self.method_name = method
        self.method = self._getMethod(self.method_name)
        self.processor = processor
        self.pred_column_name = self.base+'-'+self.method_name
    
    def _getMethod(self, method_name):
        """
            Don't change this
        """
        switcher = {
            'cosine': self.cosine,
            'euclidean': self.euclidean,
            'somethingelse': self.somethingelse,
        }
        
        return switcher[method_name]
    
    @staticmethod
    def cosine(matrix):
        """
            cosine similarity
        """
        similarity_matrix = 1 - pairwise_distances(matrix, metric='cosine')
        return similarity_matrix
    
    @staticmethod
    def euclidean(matrix):
        """
            euclidean similarity
        
        INPUT
            matrix: same as the rating matrix generated by dataPreprocessor 
            with R rows and C columns.  Outputs an R x R similarity_matrix S 
            where each S_ij should be the euclidean similarity between row i and 
            row j of matrix.
        """
        ########### your code goes here ###########

        similarity_matrix = 1 / (1 + pairwise_distances(matrix, metric='euclidean'))
    
        ###########         end         ###########    
        
        return similarity_matrix
    
    @staticmethod
    def somethingelse(matrix):
        """
            manhattan? or super-natural intuition similarity
            
        INPUT
            matrix: same as the rating matrix generated by dataPreprocessor 
            with R rows and C columns.  Outputs an R x R similarity_matrix S 
            where each S_ij should be the somethingelse similarity between row i and 
            row j of matrix.
        """
        ########### your code goes here ###########
    
        similarity_matrix = 1 / (1 + pairwise_distances(matrix, metric='manhattan'))
    
        ###########         end         ###########        
        return similarity_matrix
        
    def predict_all(self, train_df, num_users, num_items):
        """
            INPUT: 
                data: pandas DataFrame. columns=['userID', 'itemID', 'rating'...]
                num_row: scalar. number of users
                num_col: scalar. number of items
            OUTPUT:
                no return... this method assigns the result to self.__model
                
                self.__model: this is the same dimensions and row/column IDs as train_matrix, 
                but anywhere there is a 0 in train_matrix, there should be a predicted value 
                in self.__model.
            
            NOTES:
                self.__model should contain predictions for *all* user and items
                (don't worry about predicting for observed (user,item) pairs,
                 since we won't be using these predictions in the evaluation)
                (see 'vectorizedUserSimRecSys' code in for an efficient vectorized example)
                
        """
        train_matrix = self.processor(train_df, num_users, num_items)
        
        if self.base == 'user':
            ########### your code goes here ###########

            # Initialize the predicted rating matrix with zeros
            temp_matrix = np.zeros(train_matrix.shape)
            temp_matrix[train_matrix.nonzero()] = 1
            uu_similarity = 1 - pairwise_distances(train_matrix, metric='cosine')# self.method(train_matrix)
            normalizer = np.matmul(uu_similarity, temp_matrix)
            normalizer[normalizer == 0] = 1e-5
            predictionMatrix = np.matmul(uu_similarity, train_matrix)/normalizer
            
            # Cold Start
            useraverage = np.sum(train_matrix, axis=1)/np.sum(temp_matrix, axis=1)
            columns = np.sum(predictionMatrix, axis=0)
            predictionMatrix[:, columns==0] = predictionMatrix[:, columns==0] + np.expand_dims(useraverage, axis=1)
            
            self.__model = predictionMatrix
            
            ###########         end         ###########
            
        elif self.base == 'item':
            ########### your code goes here ###########
            
            train_matrix = train_matrix.transpose()
            
            # Initialize the predicted rating matrix with zeros
            temp_matrix = np.zeros(train_matrix.shape)
            temp_matrix[train_matrix.nonzero()] = 1
            uu_similarity = 1 - pairwise_distances(train_matrix, metric='cosine')# self.method(train_matrix)
            normalizer = np.matmul(uu_similarity, temp_matrix)
            normalizer[normalizer == 0] = 1e-5
            predictionMatrix = np.matmul(uu_similarity, train_matrix)/normalizer
            
            # Cold Start
            useraverage = np.sum(train_matrix, axis=1)/np.sum(temp_matrix, axis=1)
            columns = np.sum(predictionMatrix, axis=0)
            predictionMatrix[:, columns==0] = predictionMatrix[:, columns==0] + np.expand_dims(useraverage, axis=1)
            
            self.__model = predictionMatrix.transpose()
            
            ###########         end         ###########
        else:
            print('No other option available')
        
    def evaluate_test(self, test_df, copy=False):
        """
            INPUT:
                data: pandas DataFrame. columns=['userID', 'itemID', 'rating'...]
            OUTPUT:
                predictions:  pandas DataFrame. 
                              columns=['userID', 'itemID', 'rating', 'base-method'...]
                              
            NOTE: 1. data can have more columns, but your function should ignore 
                  additional columns.
                  2. 'base-method' depends on your 'base' and 'method'. For example,
                  if base == 'user' and method == 'cosine', 
                  then base-method == 'user-cosine'
                  3. your predictions go to 'base-method' column
        """
        if copy:
            prediction = test_df.copy()
        else:
            prediction = test_df
        prediction[self.pred_column_name] = np.nan
        
        for (index, 
             userID, 
             itemID) in tqdm(prediction[['userID','itemID']].itertuples()):
            prediction.ix[index, self.pred_column_name] = self.__model[userID-1, itemID-1]
    
        return prediction
    
    def getModel(self):
        """
            return predicted user-item matrix
        """
        return self.__model
    
    def getPredColName(self):
        """
            return prediction column name
        """
        return self.pred_column_name
    
    def reset(self):
        """
            reuse the instance of the class by removing model
        """
        try:
            self.model = None
        except:
            print("You do not have model..")

In [64]:
# Examples of how to call similarity functions.
I = np.eye(3)
SimBasedRecSys.cosine(I)

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [65]:
SimBasedRecSys.euclidean(I)

array([[ 1.        ,  0.41421356,  0.41421356],
       [ 0.41421356,  1.        ,  0.41421356],
       [ 0.41421356,  0.41421356,  1.        ]])

In [66]:
SimBasedRecSys.somethingelse(I)

array([[ 1.        ,  0.33333333,  0.33333333],
       [ 0.33333333,  1.        ,  0.33333333],
       [ 0.33333333,  0.33333333,  1.        ]])

### (b)

For the somethingelse function, I chose to do the manhattan distance, in computation efficiency it is easy to compute the sum of distances for user or item comparison when the number of attributes in the comparison is very large vs squaring the distances for euclidean distance. If the time of the year when a movie is watched, year of the movie, age of the individual watching or based on social media likes or follows of a particular page is also used when recommending a movie.

## Q3

### (a)

In [75]:
user_cosine_recsys = SimBasedRecSys('user','cosine')

In [76]:
user_cosine_recsys.predict_all(rating_df, num_users, num_items)

In [77]:
user_cosine_recsys.getModel()

array([[ 3.89911175,  3.19022667,  3.0261129 , ...,  2.        ,
         3.        ,  3.        ],
       [ 3.84034456,  3.17139889,  2.92626717, ...,  2.        ,
         3.        ,  3.        ],
       [ 3.87104065,  3.12823798,  3.03250708, ...,  2.        ,
         3.        ,  3.        ],
       ..., 
       [ 3.90754645,  3.20227238,  3.05776201, ...,  2.        ,
         3.        ,  3.        ],
       [ 3.91100649,  3.21591021,  2.98854017, ...,  2.        ,
         3.        ,  3.        ],
       [ 3.91593122,  3.24268207,  3.08255897, ...,  0.        ,
         3.        ,  3.        ]])

In [78]:
rating_df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [79]:
user_cosine_recsys.evaluate_test(rating_df,copy=True).head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
100000it [01:07, 1472.51it/s]


Unnamed: 0,userID,itemID,rating,timestamp,user-cosine
0,196,242,3,881250949,4.025213
1,186,302,3,891717742,4.142828
2,22,377,1,878887116,1.92208
3,244,51,2,880606923,3.431884
4,166,346,1,886397596,3.424963


### (b)

In [80]:
class CrossValidation(object):
    def __init__(self, metric, data_path=MOVIELENS_DIR):
        """
            INPUT:
                metric: string. from['RMSE','P@K','R@K']
        """
        self.folds = self._getData(MOVIELENS_DIR)
        self.metric_name = metric
        self.metric = self._getMetric(self.metric_name)
        
    def _getMetric(self, metric_name):
        """
            Don't change this
        """
        switcher = {
            'RMSE': self.rmse,
            'P@K': self.patk,
            'R@K': self.ratk,
        }
        
        return switcher[metric_name]
    
    @staticmethod
    def rmse(data, k, num_users, num_items, pred, true='rating'):
        """
            data: pandas DataFrame. 
            pred: string. Column name that corresponding to the prediction
            true: string. Column name that corresponding to the true rating
        """
        return sqrt(mean_squared_error(data[pred], data[true]))
    
    # Precision at k
    def patk(self, data, k, num_users, num_items, pred, true='rating'):
        """
            data: pandas DataFrame. 
            k: top-k items retrived
            pred: string. Column name that corresponding to the prediction
            true: string. Column name that corresponding to the true rating
        """
        prediction = self.getMatrix(data, num_users, num_items, pred)
        testSet =  self.getMatrix(data, num_users, num_items, true)
    
        # Initialize sum and count vars for average calculation
        sumPrecisions = 0
        countPrecisions = 0

        # Define function for converting 1-5 rating to 0/1 (like / don't like)
        vf = np.vectorize(lambda x: 1 if x >= 4 else 0)

        for userID in range(num_users):
            # Pick top K based on predicted rating
            userVector = prediction[userID,:]
            topK = nlargest(k, range(len(userVector)), userVector.take)

            # Convert test set ratings to like / don't like
            userTestVector = vf(testSet[userID,:]).nonzero()[0]

            # Calculate precision
            precision = len([item for item in topK if item in userTestVector])/len(topK)

            # Update sum and count
            sumPrecisions += precision
            countPrecisions += 1

        # Return average P@k
        return sumPrecisions/countPrecisions
    
    # Recall at k
    def ratk(self, data, k, num_users, num_items, pred, true='rating'):
        """
            data: pandas DataFrame. 
            k: top-k items relevant
            pred: string. Column name that corresponding to the prediction
            true: string. Column name that corresponding to the true rating
        """
        prediction = self.getMatrix(data, num_users, num_items, pred)
        testSet =  self.getMatrix(data, num_users, num_items, true)
        # Initialize sum and count vars for average calculation
        sumRecalls = 0
        countRecalls = 0

        # Define function for converting 1-5 rating to 0/1 (like / don't like)
        vf = np.vectorize(lambda x: 1 if x >= 4 else 0)

        for userID in range(num_users):
            # Pick top K based on predicted rating
            userVector = prediction[userID,:]
            topK = nlargest(k, range(len(userVector)), userVector.take)

            # Convert test set ratings to like / don't like
            userTestVector = vf(testSet[userID,:]).nonzero()[0]

            # Ignore user if has no ratings in the test set
            if (len(userTestVector) == 0):
                continue

            # Calculate recall
            recall = len([item for item in topK if item in userTestVector])/len(userTestVector)

            # Update sum and count
            sumRecalls += recall
            countRecalls += 1

        # Return average R@k
        return sumRecalls/countRecalls
    
    @staticmethod
    def getMatrix(rating_df, num_users, num_items, column_name):
        matrix = np.zeros((num_users, num_items))
    
        for (index, userID, itemID, value) in rating_df[['userID','itemID', column_name]].itertuples():
            matrix[userID-1, itemID-1] = value
            
        return matrix
    
    @staticmethod
    def _getData(data_path):
        """
            Don't change this function
        """
        folds = []
        data_types = ['u{0}.base','u{0}.test']
        for i in range(1,6):
            train_set = getData(data_path, data_types[0].format(i))
            test_set = getData(data_path, data_types[1].format(i))
            folds.append([train_set, test_set])
        return folds
    
    def run(self, algorithms, num_users, num_items, k=1):
        """
            5-fold cross-validation
            algorithms: list. a list of algorithms. 
                        eg: [user_cosine_recsys, item_euclidean_recsys]
        """
        
        scores = {}
        for algorithm in algorithms:
            print('Processing algorithm {0}'.format(algorithm.getPredColName()))
            fold_scores = []
            for fold in self.folds:
                algorithm.reset()
                algorithm.predict_all(fold[0], num_users, num_items)
                prediction = algorithm.evaluate_test(fold[1])
                pred_col = algorithm.getPredColName()
                fold_scores.append(self.metric(prediction, k, num_users, num_items, pred_col))
            scores[algorithm.getPredColName()] = fold_scores
            
        results = scores    
    
        return results
            

In [34]:
# How to use CrossValidation Class?

In [35]:
# 1. gather your algorithms in previous steps.
algorithm_instances = [popularity_recsys, 
                       average_user_rating_recsys, 
                       user_cosine_recsys]

In [36]:
# 2. Instantiate a CrossValidation instance and assign the measurement that you want to use
# RMSE, P@K, R@K
# Precision at K in this example
cv_patk = CrossValidation('P@K')

In [37]:
# 3. Run CV by giving:
#    1> algorithms just gathered
#    2> number of users in the full dataset
#    3> number of items in the full dataset
#    4> precision or recall at K need a K value, so k=5 means precision at 5 in this example
cv_patk.run(algorithm_instances, num_users, num_items,k=5)

Processing algorithm popularity
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
20000it [00:09, 2097.75it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2067.26it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2075.12it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2148.69it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2146.15it/s]


Processing algorithm useraverage


20000it [00:09, 2068.49it/s]
20000it [00:09, 2128.34it/s]
20000it [00:09, 2068.25it/s]
20000it [00:09, 2106.14it/s]
20000it [00:09, 2065.69it/s]


Processing algorithm user-cosine


20000it [00:09, 2099.91it/s]
20000it [00:09, 2162.63it/s]
20000it [00:09, 2105.48it/s]
20000it [00:09, 2100.69it/s]
20000it [00:09, 2154.24it/s]


{'popularity': [0.36924708377518656,
  0.4965005302226948,
  0.6152704135737019,
  0.6426299045599162,
  0.6292682926829279],
 'user-cosine': [0.37179215270413657,
  0.503923647932133,
  0.621633085896077,
  0.6483563096500541,
  0.6335100742311777],
 'useraverage': [0.30604453870625714,
  0.4305408271474029,
  0.5321314952279973,
  0.5520678685047737,
  0.5474019088016986]}

In [81]:
item_cosine_recsys = SimBasedRecSys('item','cosine')
print(item_cosine_recsys.predict_all(rating_df, num_users, num_items))
print(item_cosine_recsys.getModel())
print(item_cosine_recsys.evaluate_test(rating_df,copy=True).head())

cv_patk2 = CrossValidation('RMSE')
algorithm_instances2 = [user_cosine_recsys,item_cosine_recsys]
rmse = []
rmse = cv_patk2.run(algorithm_instances2, num_users, num_items,k=5)

None
[[ 3.75429099  3.66419957  3.73222997 ...,  3.60248287  3.79662696
   3.90232044]
 [ 3.83658867  3.80424519  3.77473905 ...,  3.72798332  3.9109779
   3.79775927]
 [ 2.84492718  2.89389328  2.84327324 ...,  2.99504451  3.16444153
   2.9858119 ]
 ..., 
 [ 4.11427954  4.0558267   4.00963139 ...,  4.          3.87872799
   4.14814803]
 [ 4.37096823  4.39679254  4.33543016 ...,  3.955358    4.41891089
   4.57995134]
 [ 3.52030345  3.46948821  3.52393064 ...,  0.          3.6110641
   3.59656861]]


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
100000it [01:07, 1480.67it/s]


   userID  itemID  rating  timestamp  item-cosine
0     196     242       3  881250949     3.591314
1     186     302       3  891717742     3.344077
2      22     377       1  878887116     2.965365
3     244      51       2  880606923     3.637332
4     166     346       1  886397596     3.333013
Processing algorithm user-cosine


20000it [00:09, 2089.62it/s]
20000it [00:10, 1991.84it/s]
20000it [00:09, 2145.21it/s]
20000it [00:09, 2119.09it/s]
20000it [00:09, 2127.65it/s]


Processing algorithm item-cosine


20000it [00:09, 2076.41it/s]
20000it [00:09, 2094.01it/s]
20000it [00:09, 2042.48it/s]
20000it [00:09, 2083.98it/s]
20000it [00:09, 2072.74it/s]


In [82]:
print (rmse)

{'item-cosine': [1.0377631264364244, 1.0207280585350078, 1.0101820660011798, 1.0136832839209695, 1.0180579656376574], 'user-cosine': [1.026449013124381, 1.0214387664779507, 1.0132940326457187, 1.0094003999022947, 1.0161883961525586]}


In [83]:
#Confidence Interval Function
import scipy.stats
from math import sqrt
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0*np.array(data)
    n = len(a)
    mu,sd = np.mean(a),np.std(a)
    z = scipy.stats.t.ppf(confidence, n)
    h=z*sd/sqrt(n)
    return mu, h

In [94]:
rmse_user_mean, rmse_user_ci = mean_confidence_interval(rmse['user-cosine'])
print("RMSE-user\
    \nResult of trials:{0} \
    \nAverage Accuracy: {1} \
    \nConfidence Interval: {2}\n".format(rmse['user-cosine'], rmse_user_mean, rmse_user_ci)
     )
rmse_item_mean, rmse_item_ci = mean_confidence_interval(rmse['item-cosine'])
print("RMSE-item\
    \nResult of trials:{0} \
    \nAverage Accuracy: {1} \
    \nConfidence Interval: {2}\n".format(rmse['item-cosine'], rmse_item_mean, rmse_item_ci)
     )

print("Average no. ratings per user: ",  round(100000/num_users,2))
print("Average no. ratings per item: ", round(100000/num_items,2))

print ("\nThe user-user method of collaborative filtering seems as it performed better, because on average there are almost twice as many ratings per user than there are for items. The user-user matrix is more dense than the item-item matrix which is more sparse.")

RMSE-user    
Result of trials:[1.026449013124381, 1.0214387664779507, 1.0132940326457187, 1.0094003999022947, 1.0161883961525586]     
Average Accuracy: 1.0173541216605808     
Confidence Interval: 0.0054145411296886384

RMSE-item    
Result of trials:[1.0377631264364244, 1.0207280585350078, 1.0101820660011798, 1.0136832839209695, 1.0180579656376574]     
Average Accuracy: 1.020082900106248     
Confidence Interval: 0.00860676763717415

Average no. ratings per user:  106.04
Average no. ratings per item:  59.45

The user-user method of collaborative filtering seems as it performed better, because on average there are almost twice as many ratings per user than there are for items. The user-user matrix is more dense than the item-item matrix which is more sparse.


## Q4

### (a)

In [96]:
algorithm_instances_q4 = [user_cosine_recsys,item_cosine_recsys,popularity_recsys,average_user_rating_recsys]
tests = ["RMSE","R@K","P@K"]

for test in tests:
    cv_patk_q4 = CrossValidation(test)
    run_q4 = []
    run_q4 = cv_patk_q4.run(algorithm_instances_q4, num_users, num_items,k=5)

    rec_user_mean, rec_user_ci = mean_confidence_interval(run_q4['user-cosine'])
    print(test + "-user\
        \nResult of trials:{0} \
        \nAverage Accuracy: {1} \
        \nConfidence Interval: {2}\n".format(run_q4['user-cosine'], rec_user_mean, rec_user_ci)
         )
    rec_item_mean, rec_item_ci = mean_confidence_interval(run_q4['item-cosine'])
    print(test + "-item\
        \nResult of trials:{0} \
        \nAverage Accuracy: {1} \
        \nConfidence Interval: {2}\n".format(run_q4['item-cosine'], rec_item_mean, rec_item_ci)
         )
    popularity_mean, popularity_ci = mean_confidence_interval(run_q4['popularity'])
    print(test + "-popularity\
        \nResult of trials:{0} \
        \nAverage Accuracy: {1} \
        \nConfidence Interval: {2}\n".format(run_q4['popularity'], popularity_mean, popularity_ci)
         )
    user_avg_mean, user_avg_ci = mean_confidence_interval(run_q4['useraverage'])
    print(test + "-user average\
        \nResult of trials:{0} \
        \nAverage Accuracy: {1} \
        \nConfidence Interval: {2}\n".format(run_q4['useraverage'], user_avg_mean, user_avg_ci)
         )

Processing algorithm user-cosine


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
20000it [00:10, 1990.25it/s]
20000it [00:09, 2072.96it/s]
20000it [00:10, 1992.23it/s]
20000it [00:09, 2047.50it/s]
20000it [00:09, 2071.46it/s]


Processing algorithm item-cosine


20000it [00:09, 2051.28it/s]
20000it [00:09, 2038.74it/s]
20000it [00:09, 2011.25it/s]
20000it [00:10, 1976.87it/s]
20000it [00:10, 1981.96it/s]


Processing algorithm popularity
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2005.62it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2087.90it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2100.18it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2026.70it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2068.78it/s]


Processing algorithm useraverage


20000it [00:10, 1997.59it/s]
20000it [00:09, 2069.32it/s]
20000it [00:09, 2070.83it/s]
20000it [00:09, 2055.07it/s]
20000it [00:09, 2081.71it/s]


RMSE-user        
Result of trials:[1.026449013124381, 1.0214387664779507, 1.0132940326457187, 1.0094003999022947, 1.0161883961525586]         
Average Accuracy: 1.0173541216605808         
Confidence Interval: 0.0054145411296886384

RMSE-item        
Result of trials:[1.0377631264364244, 1.0207280585350078, 1.0101820660011798, 1.0136832839209695, 1.0180579656376574]         
Average Accuracy: 1.020082900106248         
Confidence Interval: 0.00860676763717415

RMSE-popularity        
Result of trials:[3.177941281084362, 3.1750480150769977, 3.147474655005899, 3.146164503024159, 3.1488360007536382]         
Average Accuracy: 3.1590928909890112         
Confidence Interval: 0.012853154473572641

RMSE-user average        
Result of trials:[1.0629951276561334, 1.0467467492319966, 1.0328964562995389, 1.0366575971298078, 1.0392923504800367]         
Average Accuracy: 1.0437176561595025         
Confidence Interval: 0.009599098624283816

Processing algorithm user-cosine


20000it [00:09, 2070.60it/s]
20000it [00:09, 2049.81it/s]
20000it [00:09, 2079.00it/s]
20000it [00:09, 2078.35it/s]
20000it [00:09, 2017.02it/s]


Processing algorithm item-cosine


20000it [00:09, 2011.81it/s]
20000it [00:09, 2030.04it/s]
20000it [00:09, 2041.23it/s]
20000it [00:10, 1980.98it/s]
20000it [00:09, 2095.12it/s]


Processing algorithm popularity
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2078.52it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2075.98it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2088.35it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2016.74it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2046.24it/s]


Processing algorithm useraverage


20000it [00:09, 2027.58it/s]
20000it [00:09, 2085.50it/s]
20000it [00:09, 2052.97it/s]
20000it [00:09, 2027.84it/s]
20000it [00:09, 2075.55it/s]


R@K-user        
Result of trials:[0.34778041993806913, 0.4314035774468209, 0.5293633772333985, 0.5553818201403046, 0.5674144230096255]         
Average Accuracy: 0.4862687235536437         
Confidence Interval: 0.07583394433544757

R@K-item        
Result of trials:[0.3277711938444533, 0.4237782250680911, 0.5191391022223312, 0.5448659224612776, 0.5593011306991799]         
Average Accuracy: 0.4749711148590666         
Confidence Interval: 0.0788048043395744

R@K-popularity        
Result of trials:[0.3466588624187514, 0.4274468698270901, 0.5269205125667804, 0.5518738761026849, 0.5674793185065369]         
Average Accuracy: 0.4840758878843688         
Confidence Interval: 0.07590999969463498

R@K-user average        
Result of trials:[0.30505841002027845, 0.39554692074366876, 0.48030412192442223, 0.5045885853815734, 0.5211179870422066]         
Average Accuracy: 0.44132320502242983         
Confidence Interval: 0.07271258429815301

Processing algorithm user-cosine


20000it [00:09, 2056.28it/s]
20000it [00:09, 2062.92it/s]
20000it [00:09, 2020.20it/s]
20000it [00:09, 2015.11it/s]
20000it [00:10, 1987.56it/s]


Processing algorithm item-cosine


20000it [00:09, 2056.72it/s]
20000it [00:09, 2023.27it/s]
20000it [00:09, 2001.40it/s]
20000it [00:10, 1965.60it/s]
20000it [00:10, 1983.34it/s]


Processing algorithm popularity
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2047.92it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2031.29it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:10, 1930.88it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2003.61it/s]


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


20000it [00:09, 2009.89it/s]


Processing algorithm useraverage


20000it [00:09, 2089.43it/s]
20000it [00:09, 2068.68it/s]
20000it [00:10, 1993.61it/s]
20000it [00:09, 2082.90it/s]
20000it [00:09, 2049.40it/s]


P@K-user        
Result of trials:[0.37179215270413657, 0.503923647932133, 0.621633085896077, 0.6483563096500541, 0.6335100742311777]         
Average Accuracy: 0.5558430540827157         
Confidence Interval: 0.09493385045164594

P@K-item        
Result of trials:[0.34316012725344736, 0.483563096500532, 0.6021208907741271, 0.6248144220572649, 0.6074231177094392]         
Average Accuracy: 0.5322163308589621         
Confidence Interval: 0.0964082200644799

P@K-popularity        
Result of trials:[0.36924708377518656, 0.4965005302226948, 0.6152704135737019, 0.6426299045599162, 0.6292682926829279]         
Average Accuracy: 0.5505832449628855         
Confidence Interval: 0.09421819530259093

P@K-user average        
Result of trials:[0.30604453870625714, 0.4305408271474029, 0.5321314952279973, 0.5520678685047737, 0.5474019088016986]         
Average Accuracy: 0.4736373276776259         
Confidence Interval: 0.08545210012578403



### (b)

The popularity baseline cannot be (properly) evaluated with the RMSE metric, because the value for average popularity has to be between [0,1], but has a value of 3.16, which identifies high error. This is due to outliers, popularity is based only on ratings of 4-5 and divided by the total ratings given to the movie, and since the item-item matrix is sparse the results are scewed. (items > users and average no. user rating > average no. of item ratings). 

### (c)

The best algorithm across all metrics is the user-user cosine algorithm. This is significantly due to the fact that on average there are more ratings given by a user than the average number of item ratings.

### (d)

Good performance on RMSE may not imply good performance on ranking metrics. For example from the results, we can see that the algorithms with small average rmse (good) had P@K and R@K average accuracy around 0.5, which is not significantly good (half of the recomendations are actually good recomendations). This is mainly due to the fact that a user may like one movie and not like other movies that a user that is classified as similar also likes.

## Q5

### (a)

In [159]:
def itemTopK(prediction, moviesDataset, itemID, k):
    # Pick top K based on predicted rating
    itemVector = prediction[itemID,:]
    topK = nlargest(k, range(len(itemVector)), itemVector.take)
    namesTopK = list(map(lambda x: moviesDataset[moviesDataset.movieID == x+1]["movieTitle"].values[0], topK))
    return namesTopK

In [163]:
fieldsMovies = ['movieID', 'movieTitle', 'releaseDate', 'videoReleaseDate', 'IMDbURL', 'unknown', 'action', 'adventure',
          'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'filmNoir', 'horror',
          'musical', 'mystery', 'romance','sciFi', 'thriller', 'war', 'western']
moviesDF = pd.read_csv(os.path.join(MOVIELENS_DIR, 'u.item'), sep='|', names=fieldsMovies, encoding='latin-1')

myMovies = [293,248,26]

train_matrix = dataPreprocessor(rating_df, num_users, num_items).transpose()
            
# Initialize the predicted rating matrix with zeros
temp_matrix = np.zeros(train_matrix.shape)
temp_matrix[train_matrix.nonzero()] = 1
item_item_similarity = 1 - pairwise_distances(train_matrix, metric='cosine')# self.method(train_matrix)

for movie in myMovies:
    i = 1
    topMovies = itemTopK(item_item_similarity,moviesDF,movie,6)
    print("Not-so-popular Movie: " + topMovies[0])
    for similar in topMovies[1:]:
        print ("Rank " + str(i) + ": " + similar)
        i += 1
        

Not-so-popular Movie: Liar Liar (1997)
Rank 1: Air Force One (1997)
Rank 2: Scream (1996)
Rank 3: Contact (1997)
Rank 4: Saint, The (1997)
Rank 5: Star Wars (1977)
Not-so-popular Movie: Austin Powers: International Man of Mystery (1997)
Rank 1: Men in Black (1997)
Rank 2: Fifth Element, The (1997)
Rank 3: Grosse Pointe Blank (1997)
Rank 4: Face/Off (1997)
Rank 5: Con Air (1997)
Not-so-popular Movie: Bad Boys (1995)
Rank 1: Cliffhanger (1993)
Rank 2: Crow, The (1994)
Rank 3: Demolition Man (1993)
Rank 4: From Dusk Till Dawn (1996)
Rank 5: Batman Returns (1992)


### (b)

Yes, I can somewhat justify these similarities, because I know that if I rated a movie like Liar Liar highly the similar movies generated are based on other users who also voted it highly and then displays movies they also rated highly. Given the ranked similar movies, they are some of which I would also like to watch or haved watched.

## Q6 [GRAD ONLY]

### (a)

# Validation

In [140]:
# Constants for validation only
ROW_NUM = 943
COL_NUM = 1682
RATING_COL = 'rating'

### dataPreprocessor

In [141]:
def testDataPreprocessor(path=MOVIELENS_DIR, getData=getData, getMatrix=CrossValidation.getMatrix):
    validation_df = getData(MOVIELENS_DIR, 'u1.test')
    try:
        matrix = getMatrix(validation_df, ROW_NUM, COL_NUM, RATING_COL)
    except:
        print('dataPreprocessor function has error')
        return
    try:
        assert(matrix.shape == (ROW_NUM,COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape (943,1682)".format(matrix.shape)
    except Exception as e:
        print(e)
    return validation_df

In [142]:
validation_df = testDataPreprocessor()

## Baseline Recommendation Systems

### Popularity Based Recommendation

In [143]:
def testPopularityRecSys(validation_df=validation_df, BaseLineRecSys = BaseLineRecSys):
    popularity_recsys = BaseLineRecSys('popularity')
    try:
        popularity_recsys.predict_all(validation_df, ROW_NUM, COL_NUM)
    except Exception as e:        
        print('popularity function has error')
        print(e)
        return
    try:
        predictionMatrix = popularity_recsys.getModel()
        assert(predictionMatrix.shape == (ROW_NUM, COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(predictionMatrix.shape,ROW_NUM, COL_NUM)
    except Exception as e:
        print(e)

In [144]:
testPopularityRecSys()

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


### User Average Based Recommendation

In [145]:
def testUserAverRecSys(validation_df=validation_df, BaseLineRecSys = BaseLineRecSys):
    useraverage_recsys = BaseLineRecSys('average_user_rating')
    try:
        useraverage_recsys.predict_all(validation_df, ROW_NUM, COL_NUM)
    except:
        print('useraverage function has error')
        return
    try:
        predictionMatrix = useraverage_recsys.getModel()
        assert(predictionMatrix.shape == (ROW_NUM, COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(predictionMatrix.shape,ROW_NUM, COL_NUM)
    except Exception as e:
        print(e)

In [146]:
testPopularityRecSys()

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


## Similary Based Recommendation Systems

### Euclidean Similarity Function

In [147]:
def testEuclidean(validation_df=validation_df, getMatrix=CrossValidation.getMatrix):
    matrix = getMatrix(validation_df, ROW_NUM, COL_NUM, RATING_COL)
    try:
        sim_matrix = SimBasedRecSys.euclidean(matrix)
        assert(sim_matrix.shape == (ROW_NUM, ROW_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(sim_matrix.shape,ROW_NUM,ROW_NUM)
        assert(np.any(sim_matrix <= 1)),\
               "Exist similarity value that is not less or equal to 1."
    except Exception as e:
        print(e)        

In [148]:
testEuclidean()

### Customized Similarity Function (test somethingelse function)

In [149]:
def testCustomizedSim(validation_df=validation_df, getMatrix=CrossValidation.getMatrix):
    matrix = getMatrix(validation_df, ROW_NUM, COL_NUM, RATING_COL)
    try:
        sim_matrix = SimBasedRecSys.somethingelse(matrix)
        assert(sim_matrix.shape == (ROW_NUM, ROW_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(sim_matrix.shape,ROW_NUM,ROW_NUM)
        assert(np.any(sim_matrix <= 1)),\
               "Exist similarity value that is not less or equal to 1."
    except Exception as e:
        print(e) 

In [150]:
testCustomizedSim()

### User-User Similarity Based Recommendation System

In [151]:
def testUUSimBasedRecSys(validation_df=validation_df, dataPreprocessor=dataPreprocessor):
    try:
        user_cosine_recsys = SimBasedRecSys('user','cosine', dataPreprocessor)
    except:
        print("Framework error, please contact TA if you see this.")
        return
    
    try:
        user_cosine_recsys.predict_all(validation_df, ROW_NUM, COL_NUM)
        predictionMatrix = user_cosine_recsys.getModel()
        assert(predictionMatrix.shape == (ROW_NUM, COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(predictionMatrix.shape,ROW_NUM, COL_NUM)
    except Exception as e:
        print(e)

In [152]:
testUUSimBasedRecSys()

### Item-Item Similarity Based Recommendation System

In [153]:
def testIISimBasedRecSys(validation_df=validation_df, dataPreprocessor=dataPreprocessor):
    try:
        user_cosine_recsys = SimBasedRecSys('item','cosine', dataPreprocessor)
    except:
        print("Framework error, please contact TA if you see this.")
        return
    
    try:
        user_cosine_recsys.predict_all(validation_df, ROW_NUM, COL_NUM)
        predictionMatrix = user_cosine_recsys.getModel()
        assert(predictionMatrix.shape == (ROW_NUM, COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(predictionMatrix.shape,ROW_NUM, COL_NUM)
    except Exception as e:
        print(e)

In [154]:
testIISimBasedRecSys()