# Import Libraries

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Define the content-based recommendation class. It uses cosine similarity to score how similar two movies are; the top n movies by similarity score are recommended.

In [2]:
def transpose_set(x,col_name):
    """Flag every member of a multi-valued field on the row itself.

    For each element found in ``x[col_name]`` an entry ``x[element] = 1`` is
    written, i.e. the row gains one indicator entry per element.

    Args:
        x: mutable mapping-like row (e.g. a pandas Series handed over by
            ``DataFrame.apply(..., axis=1)``); it is modified in place.
        col_name: key whose value is an iterable of labels.

    Returns:
        The same ``x`` object, after the indicator entries were added.
    """
    # The iterable is looked up once, before any indicator is written.
    for label in x[col_name]:
        x[label] = 1
    return x

def transpose_value(x,col_name):
    """Flag the single value of a field on the row itself.

    The value stored under ``col_name`` is converted with ``str()`` and used
    as the key of a new indicator entry set to 1.

    Args:
        x: mutable mapping-like row (e.g. a pandas Series handed over by
            ``DataFrame.apply(..., axis=1)``); it is modified in place.
        col_name: key whose scalar value becomes the indicator's name.

    Returns:
        The same ``x`` object, after the indicator entry was added.
    """
    x[str(x[col_name])] = 1
    return x

class ImdbRecSys(object):
    """Content-based movie recommender backed by a precomputed similarity matrix.

    ``matrix[i, j]`` holds the similarity score between movie ``i`` and movie
    ``j``; ``movie_names[i]`` is the title of movie ``i``. Instances are
    normally created through :meth:`train` (from raw data) or :meth:`load`
    (from files written by :meth:`save`).
    """

    # File names written by save() and read back by load().
    _MATRIX_FILE = 'matrix.pkl'
    _NAMES_FILE = 'movie_names.pkl'

    def __init__(self,matrix,movie_names):
        """Store the similarity matrix and the index-aligned movie titles."""
        self.matrix = matrix
        self.movie_names = movie_names

    def save(self,path):
        """Pickle the matrix and the movie names into directory ``path``.

        Two files are written: ``matrix.pkl`` and ``movie_names.pkl``.
        ``path`` may carry a trailing separator or be a path-like object.
        """
        # os.path.join avoids the '//' produced by naive concatenation and
        # keeps an empty ``path`` relative instead of pointing at '/'.
        with open(os.path.join(path, self._MATRIX_FILE), 'wb') as handle:
            pickle.dump(self.matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(path, self._NAMES_FILE), 'wb') as handle:
            pickle.dump(self.movie_names, handle, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load(cls, path):
        """Rebuild a recommender from the files written by :meth:`save`.

        Args:
            path: directory containing ``matrix.pkl`` and ``movie_names.pkl``.

        Returns:
            A new instance of ``cls``.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load files that this application wrote itself.
        with open(os.path.join(path, cls._MATRIX_FILE), 'rb') as handle:
            matrix = pickle.load(handle)
        with open(os.path.join(path, cls._NAMES_FILE), 'rb') as handle:
            movie_names = pickle.load(handle)

        # Use ``cls`` so that subclasses get an instance of their own type.
        return cls(matrix=matrix,movie_names=movie_names)

    @classmethod
    def train(cls,path):
        """Build the recommender from a JSON file of movie records.

        The file is read with ``pandas.read_json`` and must provide the
        columns ``movie_name``, ``actors``, ``writers``, ``keywords``,
        ``genres``, ``director``, ``movie_year``, ``critic_count``,
        ``rating`` and ``user_count``. The first four multi-valued columns
        are iterated per row, so each cell is expected to be an iterable of
        labels.

        Args:
            path: location of the JSON data, as accepted by ``pd.read_json``.

        Returns:
            A new instance of ``cls`` holding the pairwise cosine-similarity
            matrix of the encoded movies and their titles.
        """
        # Load the crawled movie data into a dataframe for further processing.
        dataframe = pd.read_json(path)

        # Keep the titles separately so predict() can map row indices back
        # to names without touching the dataframe again.
        movie_names = np.array(dataframe['movie_name'])

        # One-hot encode the categorical columns row by row.
        # NOTE(review): indicator columns are named after the raw value, so
        # identical strings coming from different fields share one column and
        # a value equal to an existing column name (e.g. a keyword "rating")
        # would overwrite that column for the row. Confirm the data cannot
        # trigger this before relying on the scores.
        res = dataframe.apply(transpose_set, axis=1, args=('actors',)).\
                            apply(transpose_set, axis=1, args=('writers',)).\
                            apply(transpose_set, axis=1, args=('keywords',)).\
                            apply(transpose_value, axis=1, args=('director',)).\
                            apply(transpose_value, axis=1, args=('movie_year',)).\
                            apply(transpose_set, axis=1, args=('genres',))

        # The categorical source columns are now redundant: their information
        # lives in the indicator columns.
        res = res.drop(labels=['actors','writers','genres','director','movie_year','movie_name','keywords'],axis=1)

        # Scale the continuous columns by their column maximum.
        continuous = ['critic_count','rating','user_count']
        res[continuous] /= res[continuous].max()

        # A movie lacking a given label has NaN in that indicator column.
        res = res.fillna(value=0.0)

        # Pairwise cosine similarity between all encoded movies.
        similarity_matrix = cosine_similarity(res)

        # Use ``cls`` so that subclasses get an instance of their own type.
        return cls(matrix=similarity_matrix,movie_names=movie_names)

    def predict(self,query,top_n=10):
        """Return the titles most similar to the queried movie(s).

        Args:
            query: a movie title or a list of titles. Titles that are not in
                ``movie_names`` are ignored as long as at least one matches.
            top_n: maximum number of recommendations to return.

        Returns:
            Array of up to ``top_n`` titles in descending order of similarity
            (mean similarity when several movies are queried), never
            containing the queried movies themselves. If none of the queried
            titles is known, a one-element array with an error message is
            returned instead.
        """
        # Accept plain lists of names as well as arrays.
        names = np.asarray(self.movie_names)

        # Row indices of the queried movies.
        query_ix = np.where(np.isin(names, query))[0]
        if query_ix.size == 0:
            return np.array(['one or more than one movies are not found'])

        # With several query movies, average their similarity rows so every
        # candidate gets a single score.
        mean_scores = self.matrix[query_ix,:].mean(axis=0)

        # Candidate indices from most to least similar.
        ranked = np.argsort(mean_scores)[::-1]

        # A movie is always most similar to itself; drop the query rows.
        ranked = ranked[~np.isin(ranked, query_ix)]

        return names[ranked[:top_n]]
    
    

# Train the model on the data of the IMDb top 250 movies

In [3]:
# Fit the recommender: train() reads the crawled JSON file and builds the
# movie-to-movie cosine-similarity matrix.
rec_sys1 = ImdbRecSys.train(path="../rec_platform/data/imdb_data.json")

In [4]:
# Persist the model: save() pickles the similarity matrix and the movie names
# as matrix.pkl and movie_names.pkl under the given directory.
rec_sys1.save(path = "../rec_platform/data/")

In [5]:
# Reload the pickled matrix and movie names into a fresh recommender instance.
rec_sys = ImdbRecSys.load(path = "../rec_platform/data/")

# Get recommendations for a single movie, in descending order of similarity score

In [6]:
# Query with a single title; predict() leaves the queried movie out of the
# result and returns the 10 (default top_n) most similar titles.
query = ['The Shawshank Redemption']
rec_sys.predict(query=query)

array(['The Green Mile', 'Pulp Fiction', 'American History X', 'Léon',
       'Eskiya', 'Se7en', 'Cool Hand Luke', 'The Dark Knight',
       'The Godfather', 'The Shining'], dtype=object)

# Get recommendations for a list of movies, in descending order of similarity score

In [7]:
# Query with several titles; predict() averages the similarity rows of the
# queried movies before ranking the remaining candidates.
queries = ['The Godfather', 'The Dark Knight', 'Cool Hand Luke']
rec_sys.predict(query=queries)

array(['The Godfather: Part II', 'Eskiya', 'Heat',
       'The Shawshank Redemption', 'M', 'Goodfellas', 'Dog Day Afternoon',
       'On the Waterfront', 'The Silence of the Lambs',
       'American History X'], dtype=object)