# Knowledge Recommender
We calculate the score for the IMDB movies and asking user for preferrence. 
Then, system recommend high score movies according to the preferrence

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

class KnowledgeRecommender:
    
    def __init__(self, database):
        path_database = database
        self.df = pd.read_csv(path_database, low_memory=True)
        #print("keys:", self.df.keys())
    
    def convert_int(self, x):
        """Function to convert NaN to 0 and all other years to integers"""
        try:
            return int(x)
        except:
            return 0
        
    def get_release_year(self, df):
        
        """Function to ruled-out some movie by precondition

        Args:
            df(object): the dataframe(pandas), which is the dataset
        Return:
            df(object): the dataframe(pandas) with the new collum 'year'

        """
        
        #Convert release_date into pandas datetime format
        df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

        #Extract year from the datetime
        df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
        
        #Apply convert_int to the year feature
        df['year'] = df['year'].apply(self.convert_int)

        return df
    
    def get_genre(self, df):
        
        """Function to ruled-out some movie by precondition

        Args:
            df(object): the dataframe(pandas), which is the dataset
        Return:
            df(object): the dataframe(pandas) with the new collum 'genre' 

        """
        #Convert all NaN into stringified empty lists
        df['genres'] = df['genres'].fillna('[]')

        #Apply literal_eval to convert stringified empty lists to the list object
        df['genres'] = df['genres'].apply(literal_eval)

        #Convert list of dictionaries to a list of strings
        df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])
        
        #Create a new feature by exploding genres
        s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)

        #Name the new feature as 'genre'
        s.name = 'genre'

        #Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.
        df = df.join(s)
        
        return df
    
    def precondition(self, df, quantile_num=0.80, runtime=[45,300]):
        
        """Function to ruled-out some movie by precondition

        Args:
            df(object): the dataframe(pandas), which is the dataset
            quantile_num(float): the number to indicate the 80th percentile
            runtime(array): rumtime movies longer than array[0] minutes and shorter than array[1] minutes
            
        Return:
            df_q_movies(object): the movies that made the cut
            
        """
        
        #m (int): the minimum number of votes requred for the movie to be in the chart
        self.m = df['vote_count'].quantile(quantile_num)
        #C (float): the mean rating of all the movies in the dataset
        self.C = df['vote_average'].mean()
        
        #Only consider some movies with runtime conditions 
        df_q_movies = df[(df['runtime'] >= runtime[0]) & (df['runtime'] <= runtime[1])]

        #Only consider movies that have garnered more than m votes
        df_q_movies = df_q_movies[df_q_movies['vote_count'] >= self.m]
        
        return df_q_movies
    
    def weighted_rating(self, df):
        
        """Function to compute the IMDB weighted rating for each movie

        Args:
            df(object): the dataframe(pandas), which is the dataset
        
        Return:
            weight_score(float): the weighted score
            
        """
        v = df['vote_count']
        R = df['vote_average']
        weight_score = (v/(v+self.m) * R) + (self.m/(self.m+v) * self.C)

        return weight_score
    
    def get_preferrence(self):
        
        """Function to get user-preferrence """
            
        #Ask for preferred genres
        print("Input preferred genre")
        self.genre = input()

        #Ask for lower limit of duration
        print("Input shortest duration(mins)")
        self.low_time = int(input())

        #Ask for upper limit of duration
        print("Input longest duration(mins)")
        self.high_time = int(input())

        #Ask for lower limit of timeline
        print("Input earliest year")
        self.low_year = int(input())

        #Ask for upper limit of timeline
        print("Input latest year")
        self.high_year = int(input())
    
    
    def main(self, see_top =25):
        
        """Function to recommend movies from IMDB database relative to the weight_score 

        Args:
            see_top(int): number of movies to display on recommedation 
        
        Return:
            recommendation list(object)
        """
        
        self.df_q_movies = self.precondition(self.df)
        
        self.df_q_movies = self.get_release_year(self.df_q_movies)
        self.df_q_movies = self.get_genre(self.df_q_movies)
        
        self.df_q_movies['score'] = self.df_q_movies.apply(self.weighted_rating, axis=1)
        #Sort movies in descending order of their scores
        self.df_q_movies = self.df_q_movies.sort_values('score', ascending=False)

        #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies
        movies = self.df_q_movies.copy()
        self.get_preferrence()
        
        #Filter based on the condition
        movies = movies[(movies['genre'] == self.genre) & 
                        (movies['runtime'] >= self.low_time) & 
                        (movies['runtime'] <= self.high_time) & 
                        (movies['year'] >= self.low_year) & 
                        (movies['year'] <= self.high_year)]
        
        movies = movies[['title','genre', 'year', 'runtime', 'vote_average', 'vote_count', 'score']]
        #Sort movies in descending order of their scores
        return movies.sort_values('score', ascending=False).head(see_top)

##### genre: 'comedy, romance, drama, crime, drama, action, fantasy, science fiction, thriller, adventure'

In [2]:
#set the CSV File into df
database = r'C:\Users\MMIL\Panithan\Git_projects\movies_metadata.csv'

preference_scored= KnowledgeRecommender(database)
preference_scored.main(see_top =10)

  self.df = pd.read_csv(path_database, low_memory=True)
  s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


Input preferred genre
comedy
Input shortest duration(mins)
40
Input longest duration(mins)
180
Input earliest year
1993
Input latest year
2012


Unnamed: 0,title,genre,year,runtime,vote_average,vote_count,score
2211,Life Is Beautiful,comedy,1997,116.0,8.3,3643.0,8.263691
351,Forrest Gump,comedy,1994,142.0,8.2,8147.0,8.184252
18465,The Intouchables,comedy,2011,112.0,8.2,5410.0,8.176357
13724,Up,comedy,2009,96.0,7.8,7048.0,7.784631
1604,The Truman Show,comedy,1998,103.0,7.8,4702.0,7.777043
4843,Amélie,comedy,2001,122.0,7.8,3403.0,7.768407
1650,The Big Lebowski,comedy,1998,117.0,7.8,3001.0,7.764245
0,Toy Story,comedy,1995,81.0,7.7,5415.0,7.680953
14732,3 Idiots,comedy,2009,170.0,7.8,850.0,7.678789
14310,Mary and Max,comedy,2009,92.0,7.8,596.0,7.631131


In [3]:
preference_scored.main(see_top =10)

  s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


Input preferred genre
action
Input shortest duration(mins)
40
Input longest duration(mins)
150
Input earliest year
1900
Input latest year
2010


Unnamed: 0,title,genre,year,runtime,vote_average,vote_count,score
1154,The Empire Strikes Back,action,1980,124.0,8.2,5998.0,8.178656
15480,Inception,action,2010,148.0,8.1,14075.0,8.091215
256,Star Wars,action,1977,121.0,8.1,6778.0,8.081826
9430,Oldboy,action,2003,120.0,8.0,2000.0,7.941907
2458,The Matrix,action,1999,136.0,7.9,9079.0,7.887503
1167,Return of the Jedi,action,1983,135.0,7.9,4763.0,7.876296
1171,Alien,action,1979,117.0,7.9,4564.0,7.875273
1215,M,action,1931,117.0,8.0,465.0,7.768758
1189,Das Boot,action,1981,149.0,7.9,623.0,7.730476
6212,Castle in the Sky,action,1986,124.0,7.8,877.0,7.68232
