# Collaborative Filtering Recommendation
Collaborative filtering is considered a more advanced recommendation technique because it relies on user interaction data (such as ratings) rather than on item metadata alone to make recommendations. It comes in two main variants: user-user collaborative filtering and item-item collaborative filtering. In the context of the MovieLens dataset, user-user collaborative filtering recommends movies to a user based on the preferences of other users with similar viewing habits, whereas item-item collaborative filtering recommends movies that are similar to the ones the user has already rated positively.

In [205]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate
from surprise import SVD
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import display, clear_output, HTML
#Build the CSS override first, then render it: the stylesheet sets the width
#of the notebook container, the menubar and the main toolbar.
_notebook_css = HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
""")
display(_notebook_css)

In [206]:
class CF_recomsys:
    """Collaborative-filtering experiments on a MovieLens-style dataset.

    The class loads the users, movies and ratings tables, splits the ratings
    into a train and a test set, and offers several rating predictors
    ("models") that all share the signature ``model(user_id, movie_id)``.
    Each predictor can be scored with RMSE on the test set through ``main``.

    Attributes created along the way:
        users, movies, ratings: the raw tables (movies reduced to id + title,
            ratings without the timestamp column).
        X_train, X_test, y_train, y_test: output of ``prep_train_data``.
        r_matrix: user x movie ratings matrix built from the training set.
        cosine_sim: user x user cosine similarity of ``r_matrix`` (NaN -> 0).
        merged_df, gender_mean, gen_occ_mean, users_demmograph: demographic
            aggregates built by ``get_demographics``.
    """

    #Rating returned by the filters whenever there is no information to go on
    DEFAULT_RATING = 3.0

    def __init__(self,user_path, movies_path, ratings_path):
        """Load the three data files into dataframes.

        Args:
            user_path: path of the '|'-separated users file.
            movies_path: path of the '|'-separated movies file.
            ratings_path: path of the tab-separated ratings file.
        """

        #Load the u.user file into a dataframe
        u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
        self.users = pd.read_csv(user_path, sep='|', names=u_cols, encoding='latin-1')

        #Load the u.item file into a dataframe
        i_cols = ['movie_id', 'title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
         'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
         'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
        self.movies = pd.read_csv(movies_path, sep='|', names=i_cols, encoding='latin-1')

        #Remove all information except Movie ID and title
        self.movies = self.movies[['movie_id', 'title']]

        #Load the u.data file into a dataframe
        r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
        self.ratings = pd.read_csv(ratings_path, sep='\t', names=r_cols, encoding='latin-1')

        #Drop the timestamp column
        self.ratings = self.ratings.drop('timestamp', axis=1)

    def prep_train_data(self):
        """Split the ratings into train (75%) and test (25%) sets.

        The split is stratified on user_id and uses a fixed random_state, so
        repeated calls produce the same partition.
        """

        #Assign X as the original ratings dataframe and y as the user_id column of ratings.
        self.X = self.ratings.copy()
        self.y = self.ratings['user_id']

        #Split into training and test datasets, stratified along user_id
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.25, stratify=self.y, random_state=42)

    def build_rmatrix(self):
        """Build the user x movie ratings matrix from the training set.

        Cells for (user, movie) pairs without a training rating are NaN.
        """

        #Build the ratings matrix using pivot_table function
        self.r_matrix = self.X_train.pivot_table(values='rating', index='user_id', columns='movie_id')

    def cosine_sim_cal(self):
        """Compute the user-user cosine similarity matrix of ``r_matrix``."""

        #Missing ratings are treated as 0; fillna already returns a new frame,
        #so r_matrix itself keeps its NaN values
        self.r_matrix_fillnan = self.r_matrix.fillna(0)

        #Compute the cosine similarity matrix using the dummy ratings matrix
        cosine_sim = cosine_similarity(self.r_matrix_fillnan, self.r_matrix_fillnan)

        #Convert into pandas dataframe labelled by user_id on both axes
        self.cosine_sim = pd.DataFrame(cosine_sim, index=self.r_matrix.index, columns=self.r_matrix.index)

    def get_demographics(self):
        """Build the demographic aggregates used by the gender/occupation filters."""

        #Merge the original users dataframe with the training set.
        #user_id is the only column the two frames share; naming it explicitly
        #keeps the join stable if either frame gains columns later.
        self.merged_df = pd.merge(self.X_train, self.users, on='user_id')

        #Compute the mean rating of every movie by gender
        self.gender_mean = self.merged_df[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()

        #Set the index of the users dataframe to the user_id
        self.users_demmograph = self.users.set_index('user_id')

        #Compute the mean rating by gender and occupation
        self.gen_occ_mean = self.merged_df[['sex', 'rating', 'movie_id', 'occupation']].pivot_table(
            values='rating', index='movie_id', columns=['occupation', 'sex'], aggfunc='mean')

    #########################  Prediction test #######################
    def pred_ratings(self, model):
        """Score a model on the test set with RMSE.

        Args:
            model (callable): called as ``model(user_id, movie_id)`` for every
                row of the test set; must return a numeric rating.

        Returns:
            float: root mean squared error between predicted and actual
            ratings (lower is better).
        """

        #Construct a list of user-movie tuples from the testing dataset
        id_pairs = zip(self.X_test['user_id'], self.X_test['movie_id'])

        #Predict the rating for every user-movie tuple
        y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])

        #Extract the actual ratings given by the users in the test data
        y_true = np.array(self.X_test['rating'])

        #Return the final RMSE score
        return np.sqrt(mean_squared_error(y_true, y_pred))

    #######################    Models #####################################
    def baseline(self, user_id, movie_id):
        """Baseline model: always predict a rating of 3, whatever the inputs."""

        rating_baseline = 3

        return rating_baseline

    def cf_user_mean(self, user_id, movie_id):
        """User-based collaborative filter using the plain mean rating.

        Returns the mean of all training ratings of the movie, or the default
        rating when the movie has no column in ``r_matrix``.
        """

        #Default rating in the absence of any information about the movie
        if movie_id not in self.r_matrix:
            return self.DEFAULT_RATING

        #Compute the mean of all the ratings given to the movie (NaN skipped)
        return self.r_matrix[movie_id].mean()

    def cf_user_wmean(self, user_id, movie_id):
        """User-based collaborative filter using a similarity-weighted mean.

        Every training rating of the movie is weighted by the cosine
        similarity between its author and ``user_id``. Falls back to the
        default rating when the movie or the user is unknown to the training
        matrices, or when the similarities sum to zero or less.
        """

        #No column for the movie, or no similarity column for the user:
        #nothing to weight, so use the default rating instead of raising
        if movie_id not in self.r_matrix or user_id not in self.cosine_sim:
            return self.DEFAULT_RATING

        #Keep only the users who actually rated the movie
        m_ratings = self.r_matrix[movie_id].dropna()

        #Similarity of the user in question with exactly those raters,
        #selected by label so both vectors are in the same order
        sim_scores = self.cosine_sim[user_id].loc[m_ratings.index]

        total_sim = sim_scores.sum()
        if total_sim <= 0:
            #Avoid dividing by zero when no rater is similar to the user
            return self.DEFAULT_RATING

        #Compute the final weighted mean
        return np.dot(sim_scores.to_numpy(), m_ratings.to_numpy()) / total_sim

    def cf_gender(self, user_id, movie_id):
        """Gender-based collaborative filter using mean ratings.

        Returns the mean training rating given to the movie by users of the
        same gender as ``user_id``, or the default rating when the movie is
        not in the training matrix or nobody of that gender rated it.
        """

        #Check if movie_id exists in r_matrix (or training set)
        if movie_id not in self.r_matrix:
            return self.DEFAULT_RATING

        #Identify the gender of the user
        gender = self.users_demmograph.loc[user_id]['sex']

        #Mean ratings of this movie, indexed by gender
        by_gender = self.gender_mean[movie_id]

        #Check if the gender has rated the movie
        if gender not in by_gender:
            return self.DEFAULT_RATING

        return by_gender[gender]

    def cf_gen_occ(self, user_id, movie_id):
        """Gender- and occupation-based collaborative filter using mean ratings.

        Returns the mean training rating given to the movie by users sharing
        both the occupation and the gender of ``user_id``. Every path without
        such a rating (unknown movie, occupation or gender without a column,
        or a NaN cell) returns the default rating, so the result is always
        numeric.
        """

        #Check if movie_id exists in gen_occ_mean
        if movie_id not in self.gen_occ_mean.index:
            return self.DEFAULT_RATING

        #Identify the user, then the gender and occupation
        user = self.users_demmograph.loc[user_id]
        gender = user['sex']
        occ = user['occupation']

        #Mean ratings of this movie, indexed by (occupation, sex)
        movie_means = self.gen_occ_mean.loc[movie_id]

        #Check if the occupation has rated any movie of the training set
        if occ not in movie_means:
            return self.DEFAULT_RATING

        #Check if the gender exists within that occupation
        occ_means = movie_means[occ]
        if gender not in occ_means:
            return self.DEFAULT_RATING

        #Extract the required rating; default if the cell is null
        rating = occ_means[gender]
        if np.isnan(rating):
            return self.DEFAULT_RATING

        return rating

    def _cross_validate_rmse(self, algo):
        """Cross-validate a Surprise algorithm on the full ratings table (RMSE)."""

        #The Reader object helps in parsing the file or dataframe containing ratings
        reader = Reader()

        #Create the dataset to be used for building the filter
        data = Dataset.load_from_df(self.ratings, reader)

        #Evaluate the performance in terms of RMSE
        return cross_validate(algo, data, measures=['RMSE'])

    def knn_model(self):
        """Return the cross-validation results (RMSE) of Surprise's KNNBasic."""

        return self._cross_validate_rmse(KNNBasic())

    def svd_model(self):
        """Return the cross-validation results (RMSE) of Surprise's SVD."""

        return self._cross_validate_rmse(SVD())

    def main(self, model):
        """Rebuild every training artefact, then print the RMSE of ``model``.

        Args:
            model (callable): one of the predictors of this class (or any
                callable taking ``(user_id, movie_id)``).
        """

        #The split is deterministic, so each call evaluates on the same test set
        self.prep_train_data()
        self.build_rmatrix()
        self.cosine_sim_cal()
        self.get_demographics()

        rmse = self.pred_ratings(model)
        print("RMSE:",rmse)

In [207]:
#Absolute, machine-specific locations of the three data files inside the
#ml-100k folder; adjust these paths before running on another machine.
user_path = r'C:\Users\MMIL\Panithan\Git_projects\MovieLensDataset\ml-100k\u.user'
movies_path = r'C:\Users\MMIL\Panithan\Git_projects\MovieLensDataset\ml-100k\u.item'
ratings_path = r'C:\Users\MMIL\Panithan\Git_projects\MovieLensDataset\ml-100k\u.data'
#Load users, movies and ratings once; every evaluation below reuses A1
A1=CF_recomsys(user_path, movies_path, ratings_path)

In [208]:
#Reference score: RMSE of the model that always predicts 3
A1.main(A1.baseline)

RMSE: 1.2488234462885457


In [209]:
#RMSE when predicting the mean training rating of each movie
A1.main(A1.cf_user_mean)

RMSE: 1.0300824802393536


In [210]:
#RMSE when weighting each movie rating by user-user cosine similarity
A1.main(A1.cf_user_wmean)

RMSE: 1.0237210431087944


In [211]:
#RMSE when predicting the movie's mean rating among users of the same gender
A1.main(A1.cf_gender)

RMSE: 1.0392906999935203


In [212]:
#RMSE when predicting the movie's mean rating among users sharing both
#gender and occupation
A1.main(A1.cf_gen_occ)

RMSE: 1.1419651376788005


In [213]:
#Cross-validated RMSE of Surprise's KNNBasic on the full ratings table
A1.knn_model()

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.98229459, 0.97325008, 0.98289412, 0.97735831, 0.97681482]),
 'fit_time': (0.22104978561401367,
  0.23005223274230957,
  0.23205208778381348,
  0.23005223274230957,
  0.23005175590515137),
 'test_time': (1.5105390548706055,
  1.5323472023010254,
  1.6313700675964355,
  1.5723559856414795,
  1.4923381805419922)}

In [214]:
#Cross-validated RMSE of Surprise's SVD on the full ratings table
A1.svd_model()

{'test_rmse': array([0.94105508, 0.93650033, 0.9268558 , 0.93780555, 0.93030478]),
 'fit_time': (2.596588134765625,
  2.7026121616363525,
  2.582584857940674,
  2.5405755043029785,
  2.5385751724243164),
 'test_time': (0.07001566886901855,
  0.07001638412475586,
  0.07001590728759766,
  0.17003822326660156,
  0.07001566886901855)}