# Metadata-based Recommender
We aim to calculate the similarity of the movie content using cosine similarity. Accordingly, we recommend a movie with similar content. In Content-based, we only consider the story description.
Here, we improve by calculating cosine similarity from the soup of 'content' and 'metadata' of the movie such as star, director, crew, and genres

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

class MergeCleanData:
    
    def __init__(self, metadata, credits, keywords):

        self.df = pd.read_csv(metadata)
        self.cred_df = pd.read_csv(credits)
        self.key_df = pd.read_csv(keywords)

        
    def clean_as_int(self, x):
        
        """Function to convert 'x' to integers, if can not, return Nan"""
        
        try:
            return int(x)
        except:
            return np.nan
        
    
    def clean_ids(self, df):
        
        """Function to clean df for none-integer data
        
        Args:
            df(object): the dataframe(pandas), which is the dataset
            
        Return:
            df(object): the cleaned data where 'id' was converted as 'int'
            
        """
        
        #Clean the ids of df
        df['id'] = df['id'].apply(self.clean_as_int)    
                                  
        #Filter all rows that have a null ID
        df = df[df['id'].notnull()]
        
        return df
    
    def main(self):
        
        """Function to return combineed with 'id' reference
        
        Args:
            none: 
        
        Return:
            combined dataframe (object):
        """
        self.df = self.clean_ids(self.df)
        self.cred_df = self.clean_ids(self.cred_df)
        self.key_df = self.clean_ids(self.key_df)
        
        # Merge keywords and credits into your main metadata dataframe
        self.df = self.df.merge(self.cred_df, on='id')
        self.df = self.df.merge(self.key_df, on='id')
        
        return self.df 
        
class CreateSoup:
    
    def __init__(self, cleaned_data):
        
        self.df = cleaned_data
        
    def get_native_obj(self, df):
        
        """Function to return combineed with 'id' reference
        
        Args:
            df(object): the dataframe(pandas), which is the dataset that contains 'features'
        
        Return:
            dataframe (object): the dataframe that applied 'literal_eval' function
        """
        from ast import literal_eval
        
        # Convert the stringified objects into the native python objects
        features = ['cast', 'crew', 'keywords', 'genres']
        
        for feature in features:
            df[feature] = df[feature].apply(literal_eval)
        
        return df
    
    def get_director(self, x):
        
        """Function to extract the director's name. If director is not listed, return NaN"""

        for crew_member in x:
            if crew_member['job'] == 'Director':
                return crew_member['name']
        
        return np.nan
    
    def generate_list(self, x, n=3):
        
        """Function to returns the list top 'n' elements or entire list"""
        
        if isinstance(x, list):
            
            names = [i['name'] for i in x]
            #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
            if len(names) > n:
                names = names[:n]
            return names

        #Return empty list in case of missing/malformed data
        
        return []

    def sanitize(self, x):
        
        """Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase"""
        
        if isinstance(x, list):
            #Strip spaces and convert to lowercase
            return [str.lower(i.replace(" ", "")) for i in x]
        
        else:
            #Check if director exists. If not, return empty string
            if isinstance(x, str):
                return str.lower(x.replace(" ", ""))
            else:
                return ''
    
    def create_soup(self, x):
        """Function that creates a soup out of the desired metadata"""
        
        return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

    def main(self):
        
        """Function to returns the soup of keywords, genres, cast, directer"""
        
        self.df = self.get_native_obj(self.df)
        
        #Define the new director feature
        self.df['director'] =  self.df['crew'].apply(self.get_director)
        
        #Apply the generate_list function to cast and keywords
        self.df['cast'] = self.df['cast'].apply(self.generate_list)
        self.df['keywords'] = self.df['keywords'].apply(self.generate_list)
        self.df['genres'] = self.df['genres'].apply(self.generate_list)
        
        #Only consider a maximum of 3 genres
        n=3
        self.df['genres'] = self.df['genres'].apply(lambda x: x[:n])

        #Apply the generate_list function to cast, keywords, director and genres
        for feature in ['cast', 'director', 'genres', 'keywords']:
            #print(feature)
            self.df[feature] = self.df[feature].apply(self.sanitize)
            
        # Create the new soup feature
        self.df['soup'] = self.df.apply(self.create_soup, axis=1)
        
        return self.df
    
class ContentBasedRecommender:
    
    def __init__(self, database_soup):
        
        self.df = database_soup
    
    def cal_tfidf(self, df, stop_words_list=['english']):
        
        """Function to creat the Term Frequency-Inverse Document Frequency (TF-IDF) matrix

        Args:
            df(object): the dataframe(pandas), which is the dataset that contain 'overview' documents of movies
            stop_words(list): the words that extremly commom in the 'overview' documents of movies
        
        Return:
            tfidf_matrix (tensor): the word vecterized-matrix
        """
        
        #Define a TF-IDF Vectorizer Object. Remove all english stopwords
        tfidf = TfidfVectorizer(stop_words=stop_words_list)

        #Replace NaN with an empty string
        df['soup'] = df['soup'].fillna('')

        #Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
        tfidf_matrix = tfidf.fit_transform(df['soup'])
        
        return tfidf_matrix
        
    def get_cosine_sim(self, tfidf_matrix):
        
        """Function to compute the cosine similarity matrix 

        Args:
            tfidf_matrix (tensor): the word vecterized-matrix
        
        Return:
            cosine similarity matrix(tensor)
        """
        
        # Compute the cosine similarity matrix
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

        return cosine_sim
    
    def get_indices(self, df):
        
        """Function to construct a reverse mapping of indices and movie titles, 
        and drop duplicate titles(if any)"""
        
        indices = pd.Series(df.index, index=df['title']).drop_duplicates()
        
        return indices
        
    def main(self, title_input, see_top =25):
        
        """Function to takes in movie title as input and gives recommendations
        
        Args:
            title_input (string): the movie name
        
        Return:
            recommendation (object):
        """
    
        # Obtain the index of the movie that matches the title
        indices = self.get_indices(self.df)
        idx = indices[title_input]

        # Get the pairwsie similarity scores of all movies with that movie
        # And convert it into a list of tuples 
        tfidf_matrix = self.cal_tfidf(self.df)
        cosine_sim = self.get_cosine_sim(tfidf_matrix)
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the movies based on the cosine similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores of the 'see_top' most similar movies. Ignore the first movie(itself).
        sim_scores = sim_scores[1:see_top+1]

        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]

        # Return the top 10 most similar movies
        
        return self.df['title'].iloc[movie_indices]

In [2]:
#set the CSV File into df
metadata = r'C:\Users\MMIL\Panithan\Git_projects\movies_metadata.csv'
credits = r'C:\Users\MMIL\Panithan\Git_projects\credits.csv'
keywords = r'C:\Users\MMIL\Panithan\Git_projects\keywords.csv'

prepro = MergeCleanData(metadata, credits, keywords)
database = prepro.main()

make_soup= CreateSoup(database)
database_soup = make_soup.main()

recommender= ContentBasedRecommender(database_soup)
recommender.main(title_input='The Lion King', see_top =10)

  self.df = pd.read_csv(metadata)


27768                                 The Little Matchgirl
33119                                          The Prophet
29607                                          Cheburashka
40904                   VeggieTales: Josh and the Big Wall
40913    VeggieTales: Minnesota Cuke and the Search for...
270                                       Man of the House
986                                               Infinity
29198                                      Superstar Goofy
811                            The Adventures of Pinocchio
15209             Spiderman: The Ultimate Villain Showdown
Name: title, dtype: object

In [3]:
recommender.main(title_input='Iron Man 2', see_top =10)

12696                      Iron Man
21033                    Iron Man 3
26773       Avengers: Age of Ultron
26782    Captain America: Civil War
4358                           Made
18008                  The Avengers
3698                          X-Men
6264                             X2
26777                       Ant-Man
26780                Thor: Ragnarok
Name: title, dtype: object