In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import warnings;warnings.simplefilter('ignore')
from IPython.display import HTML,Image
pd.set_option('display.max_colwidth', 150)

## Content
Simple Recommender :
1.1 IMDB Top 250: Top 250 Movies based on calculated IMDB ratings
1.2 Recommendation by Genre: Top Movies for every genre in database 

Content Based Recommender:
2.1 Movie Description Based: Recommend movies based on overview and tagline
2.2 Metadata based: Recommend movies based on Cast, Director and keywords of movie

Collaborative Filtering using Surprise:
The personalised recommender based on users Past history of ratings and similarity between his and other users rating history

Hybrid Recommender:
Recommeder system based on collaberation of Contend based recommender and collaberative filtering, It leverages features of both recommender for better recommendations

In [2]:
df = pd.read_csv('movies_metadata.csv')

In [3]:
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy'...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]",,8844,tt0113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who...",...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1u...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]",,15602,tt0113228,en,Grumpier Old Men,"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée ope...",...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for Love.,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stell...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself... and never let you forget it.,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Collection', 'poster_path': '/nts4iOmNnq7GNicycMJ9pSAn204.jpg', 'backdrop_path': '/7qwE57OVZmMJChBpLEbJ...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is ex...",...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's In For The Surprise Of His Life!,Father of the Bride Part II,False,5.7,173.0


In [4]:
df.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [5]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [7]:
df['genres'] = df['genres'].fillna('[]').apply(literal_eval).apply(lambda x : [i['name'] for i in x] if isinstance(x,list) else [])

# 1. Simple Recommender
This is a very basic model based on movies popularity and critical acclaimation, this model does not give personalised recommendation

I will use IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:
Weighted Rating (WR) = ((v/(v+m)).R)+((m/(v+m)).C)
where,
v is the number of votes for the movie
m is the minimum votes required to be listed in the chart
R is the average rating of the movie
C is the mean vote across the whole report

In [8]:
base_poster_url = 'http://image.tmdb.org/t/p/w185/'
df['poster_path'] = "<img src='" + base_poster_url + df['poster_path'] + "' style='height:100px;'>"

In [9]:
df['vote_count'] = df[df['vote_count'].notnull()]['vote_count'].astype('int')

In [10]:
df['vote_average'] = df[df['vote_average'].notnull()]['vote_average'].astype('int')

In [11]:
c = df['vote_average'].mean()

In [12]:
m = df['vote_count'].quantile(0.95)

In [13]:
df['year'] = pd.to_datetime(df['release_date'],errors='coerce').apply(lambda x : str(x).split('-')[0] if x!= np.nan else np.nan)

In [14]:
qualified = df[(df['vote_count'] >= m) & (df['vote_average'].notnull()) & (df['vote_count'].notnull())][['poster_path','title','year','popularity','vote_count','vote_average','genres']]
HTML(qualified.head(2).to_html(escape=False))

Unnamed: 0,poster_path,title,year,popularity,vote_count,vote_average,genres
0,,Toy Story,1995,21.9469,5415.0,7.0,"[Animation, Comedy, Family]"
1,,Jumanji,1995,17.0155,2413.0,6.0,"[Adventure, Fantasy, Family]"


In [15]:
qualified.shape

(2274, 7)

There are total 2274 movies that are qualified to be considered for the recommender charts.
Lets calculated weighted rating according to mentioned IMDB's formulae

In [16]:
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

In [17]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return(v/(v+m) * R) + (m/(m+v) * c)


In [18]:
qualified['wr'] = qualified.apply(weighted_rating,axis=1)
HTML(qualified.head(2).to_html(escape=False))

Unnamed: 0,poster_path,title,year,popularity,vote_count,vote_average,genres,wr
0,,Toy Story,1995,21.9469,5415,7,"[Animation, Comedy, Family]",6.86977
1,,Jumanji,1995,17.0155,2413,6,"[Adventure, Fantasy, Family]",5.884891


# 1.1 IMDB TOP 250

In [19]:
qualified.sort_values('wr',axis=0,ascending=False).head(250)
HTML(qualified.sort_values('wr',axis=0,ascending=False).head(250).to_html(escape=False))

Unnamed: 0,poster_path,title,year,popularity,vote_count,vote_average,genres,wr
15480,,Inception,2010,29.1081,14075,8,"[Action, Thriller, Science Fiction, Mystery, Adventure]",7.917588
12481,,The Dark Knight,2008,123.167,12269,8,"[Drama, Action, Crime, Thriller]",7.905871
22879,,Interstellar,2014,32.2135,11187,8,"[Adventure, Drama, Science Fiction]",7.897107
2843,,Fight Club,1999,63.8696,9678,8,[Drama],7.881753
4863,,The Lord of the Rings: The Fellowship of the Ring,2001,32.0707,8892,8,"[Adventure, Fantasy, Action]",7.871787
292,,Pulp Fiction,1994,140.95,8670,8,"[Thriller, Crime]",7.86866
314,,The Shawshank Redemption,1994,51.6454,8358,8,"[Drama, Crime]",7.864
7000,,The Lord of the Rings: The Return of the King,2003,29.3244,8226,8,"[Adventure, Fantasy, Action]",7.861927
351,,Forrest Gump,1994,48.3072,8147,8,"[Comedy, Drama, Romance]",7.860656
5814,,The Lord of the Rings: The Two Towers,2002,29.4235,7641,8,"[Adventure, Fantasy, Action]",7.851924


# 1.2 Recommendation by genre

In [20]:
s= df.apply(lambda x : pd.Series(x['genres']),axis=1).stack().reset_index(level=1,drop=True)
s.name = 'genres'

In [21]:
gen_df = df.drop('genres',axis=1).join(s)

In [22]:
gen_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genres
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy'...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.0,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy'...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.0,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy'...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.0,5415.0,1995,Family
1,False,,65000000,,8844,tt0113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who...",17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.0,2413.0,1995,Adventure
1,False,,65000000,,8844,tt0113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who...",17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.0,2413.0,1995,Fantasy


In [23]:
def build_chart(genre,percentile=0.85):
    df=gen_df[gen_df['genres']== genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_average'].notnull()) & (df['vote_count'].notnull())][['poster_path','title','year','popularity','vote_count','vote_average','genres']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr',ascending=False).head(250)
    return qualified
    
    

In [24]:
build_chart('Romance',percentile=0.85)
HTML(build_chart('Romance',percentile=0.85).head(15).to_html(escape=False))

Unnamed: 0,poster_path,title,year,popularity,vote_count,vote_average,genres,wr
10309,,Dilwale Dulhania Le Jayenge,1995,34.457,661,9,Romance,8.565285
351,,Forrest Gump,1994,48.3072,8147,8,Romance,7.971357
876,,Vertigo,1958,18.2082,1162,8,Romance,7.811667
40251,,Your Name.,2016,34.461252,1030,8,Romance,7.789489
883,,Some Like It Hot,1959,11.8451,835,8,Romance,7.745154
1132,,Cinema Paradiso,1988,14.177,834,8,Romance,7.744878
19901,,Paperman,2012,7.19863,734,8,Romance,7.713951
37863,,Sing Street,2016,10.672862,669,8,Romance,7.689483
882,,The Apartment,1960,11.9943,498,8,Romance,7.599317
38718,,The Handmaiden,2016,16.727405,453,8,Romance,7.566166


As we move Percentile value, the movies to qualify for recomendation get selected or dropped, Higher the percentile, higher number of votes required to qualify.

In [25]:
build_chart('Thriller',percentile=0.85).head(15)
HTML(build_chart('Thriller',percentile=0.85).head(15).to_html(escape=False))

Unnamed: 0,poster_path,title,year,popularity,vote_count,vote_average,genres,wr
15480,,Inception,2010,29.1081,14075,8,Thriller,7.95646
12481,,The Dark Knight,2008,123.167,12269,8,Thriller,7.950165
292,,Pulp Fiction,1994,140.95,8670,8,Thriller,7.929996
46,,Se7en,1995,18.4574,5915,8,Thriller,7.898573
24860,,The Imitation Game,2014,31.5959,5895,8,Thriller,7.898242
586,,The Silence of the Lambs,1991,4.30722,4549,8,Thriller,7.869538
11354,,The Prestige,2006,16.9456,4510,8,Thriller,7.868463
289,,Leon: The Professional,1994,20.4773,4293,8,Thriller,7.862142
4099,,Memento,2000,15.4508,4168,8,Thriller,7.858217
1213,,The Shining,1980,19.6116,3890,8,Thriller,7.848633


Well these are quite good thriller  movies, but Pulp Fiction is still my favourite thriller  movie, whatever this list says
The above recommender is not personalised and it will only generate recommendation by genre, irrespective of persons own interest
## 2. Content Based Recommender
To personalise our recommendations more, I am going to build an engine that computes similarity between movies based on certain metrics and suggests movies that are most similar to a particular movie that a user liked. Since we will be using movie metadata (or content) to build this engine, this also known as Content Based Filtering.

Even though content based recommender is still not personalised recommender, we gonna use it letter for hybrid recommender

I will build 2 content based Recommender based on:

1.Movie Overview and tagline

2.Movie cast,director, genre,keywords

For upcoming recommenders, we gonna need sparse matrix formed using Cosine similarities/linear similarities, so for computation conveninece we will subset of dataset

In [26]:
links_small = pd.read_csv('links_small.csv')

In [27]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [28]:
links_small.shape

(9125, 3)

In [29]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [30]:
df = df.drop([19730, 29503, 35587])

In [31]:
df['id'] = df['id'].astype('int')

In [32]:
smd = df[df['id'].isin(links_small)]

In [33]:
smd.shape

(9099, 25)

In [34]:
smd.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy'...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.0,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who...",...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.0,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1u...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée ope...",...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for Love.,Grumpier Old Men,False,6.0,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stell...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself... and never let you forget it.,Waiting to Exhale,False,6.0,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Collection', 'poster_path': '/nts4iOmNnq7GNicycMJ9pSAn204.jpg', 'backdrop_path': '/7qwE57OVZmMJChBpLEbJ...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is ex...",...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's In For The Surprise Of His Life!,Father of the Bride Part II,False,5.0,173.0,1995


 So we will work with 9099 movies

## 2.1 Movie description based recommender
Based on overview and tagline

In [35]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [36]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')

In [37]:
tfid_matrix = tf.fit_transform(smd['description'])

# Cosine Similarity
We Will use cosine similarity to calculate numeric quantity that denote similarity between 2 movies. More info about cosine similarities . https://scikit-learn.org/stable/modules/metrics.html

In [38]:
cosin_sim= linear_kernel(tfid_matrix,tfid_matrix)

In [39]:
cosin_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [40]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index,index=smd['title'])

In [41]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosin_sim[idx]))
    sim_scores = sorted(sim_scores,key = lambda x : x[1] ,reverse= True)
    sim_scores = sim_scores[1:31]
    movies_indices = [i[0] for i in sim_scores]
    return smd.iloc[movies_indices][['poster_path','title','year','description']]
    

In [42]:
get_recommendations('The Godfather')
HTML(get_recommendations('The Godfather').to_html(escape=False))

Unnamed: 0,poster_path,title,year,description
973,,The Godfather: Part II,1974,"In the continuing saga of the Corleone crime family, a young Vito Corleone grows up in Sicily and in 1910s New York. In the 1950s, Michael Corleon..."
8387,,The Family,2013,"The Manzoni family, a notorious mafia clan, is relocated to Normandy, France under the witness protection program, where fitting in soon becomes c..."
3509,,Made,2001,Two aspiring boxers lifelong friends get involved in a money-laundering scheme through a low-level organized crime group.Welcome to disorganized c...
4196,,Johnny Dangerously,1984,"Set in the 1930s, an honest, goodhearted man is forced to turn to a life of crime to finance his neurotic mother's skyrocketing medical bills.Orga..."
29,,Shanghai Triad,1995,A provincial boy related to a Shanghai crime family is recruited by his uncle into cosmopolitan Shanghai in the 1930s to be a servant to a ganglor...
5667,,Fury,1936,"When a prisoner barely survives a lynch mob attack and is presumed dead, he vindictively decides to frame the mob for his murder.TWO LOVERS...VICT..."
2412,,American Movie,1999,"AMERICAN MOVIE is the story of filmmaker Mark Borchardt, his mission, and his dream. Spanning over two years of intense struggle with his film, hi..."
1582,,The Godfather: Part III,1990,"In the midst of trying to legitimize his business dealings in 1979 New York and Italy, aging mafia don, Michael Corleone seeks forgiveness for his..."
4221,,8 Women,2002,"Eight women gather to celebrate Christmas in a snowbound cottage, only to find the family patriarch dead with a knife in his back. Trapped in the ..."
2159,,Summer of Sam,1999,"Spike Lee's take on the ""Son of Sam"" murders in New York City during the summer of 1977 centering on the residents of an Italian-American South Br..."


In [43]:
get_recommendations('The Dark Knight')
HTML(get_recommendations('The Dark Knight').to_html(escape=False))

Unnamed: 0,poster_path,title,year,description
7931,,The Dark Knight Rises,2012,"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation an..."
132,,Batman Forever,1995,"The Dark Knight of Gotham City confronts a dastardly duo: Two-Face and the Riddler. Formerly District Attorney Harvey Dent, Two-Face believes Batm..."
1113,,Batman Returns,1992,"Having defeated the Joker, Batman now faces the Penguin - a warped and deformed individual who is intent on being accepted into Gotham society. Cr..."
8227,,"Batman: The Dark Knight Returns, Part 2",2013,Batman has stopped the reign of terror that The Mutants had cast upon his city. Now an old foe wants a reunion and the government wants The Man o...
7565,,Batman: Under the Red Hood,2010,"Batman faces his ultimate challenge as the mysterious Red Hood takes Gotham City by firestorm. One part vigilante, one part criminal kingpin, Red ..."
524,,Batman,1989,"The Dark Knight of Gotham City begins his war on crime with his first major enemy being the clownishly homicidal Joker, who has seized control of ..."
7901,,Batman: Year One,2011,Two men come to Gotham City: Bruce Wayne after years abroad feeding his lifelong obsession for justice and Jim Gordon after being too honest a cop...
2579,,Batman: Mask of the Phantasm,1993,"An old flame of Bruce Wayne's strolls into town, re-heating up the romance between the two. At the same time, a mass murderer with an axe for one ..."
2696,,JFK,1991,New Orleans District Attorney Jim Garrison discovers there's more to the Kennedy assassination than the official story.The story that won’t go away.
8165,,"Batman: The Dark Knight Returns, Part 1",2012,"Batman has not been seen for ten years. A new breed of criminal ravages Gotham City, forcing 55-year-old Bruce Wayne back into the cape and cowl. ..."


 As we can see the recommeder is recommending the movie whose plotline is similar to 'The Dark Knight', which is no surprise that it is showing most of the batman movies

In [44]:
get_recommendations('Q & A')
HTML(get_recommendations('Q & A').to_html(escape=False))

Unnamed: 0,poster_path,title,year,description
2696,,JFK,1991,New Orleans District Attorney Jim Garrison discovers there's more to the Kennedy assassination than the official story.The story that won’t go away.
8680,,The Young Savages,1961,A district attorney investigates the racially charged case of three teenagers accused of the murder of a blind Puerto Rican boy.HERE IS THE RAW TR...
1135,,Night Falls on Manhattan,1996,A newly elected District attorney finds himself in the middle of a police corruption investigation that may involve his father and his partner.In ...
6667,,Fracture,2007,"A husband is on trial for the attempted murder of his wife, in what is seemingly an open/shut case for the ambitious district attorney trying to p..."
7242,,The File on Thelma Jordon,1950,A woman seduces a District Attorney and pulls him into a web of theft and murder....SHE'LL LIE...KILL OR KISS HER WAY OUT OF ANYTHING!
9060,,I Am Wrath,2016,A man is out for justice after a group of corrupt police officers are unable to catch his wife's killer.I lay my vengeance upon them.
7931,,The Dark Knight Rises,2012,"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation an..."
2624,,Bad Lieutenant,1992,"While investigating a young nun's rape, a corrupt New York City police detective, with a serious drug and gambling addiction, tries to change his ..."
7344,,Law Abiding Citizen,2009,A frustrated man decides to take justice into his own hands after a plea bargain sets one of his family's killers free. He targets not only the ki...
231,,Kiss of Death,1995,"Jimmy Kilmartin is an ex-con living in Astoria in the New York City borough of Queens, trying to stay clean and raising a family with his wife Bev..."


# 2.2 Metadata Based Recommender¶
It will be based on cast, crew, genre, keywords

In [45]:
credits =pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [46]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
df['id'] = df['id'].astype('int')

In [47]:
df.shape

(45463, 25)

In [48]:
df = df.merge(credits,on='id')
df = df.merge(keywords,on='id')

In [49]:
smd = df[df['id'].isin(links_small)]

In [50]:
smd.shape

(9219, 28)

In [51]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x : len(x))
smd['crew_size'] = smd['crew'].apply(lambda x : len(x))

In Cast column, there are total 10-15 actors are present for each movie, but we need only top 3-4 main actors who have worked in movie.
Now we will get director

In [52]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan        

In [53]:
smd['director'] = smd['crew'].apply(get_director)

In [54]:
smd['cast'] = smd['cast'].apply(lambda x : [i['name'] for i in x]if isinstance (x,list) else [])
smd['cast'] = smd['cast'].apply(lambda x : x[:3] if len(x) >= 3 else x)

In [55]:
smd['cast'] = smd['cast'].apply(lambda x : [(str.lower(i.replace(" ","")) for i in x)])

In [56]:
smd['director'] = smd['director'].astype('str').apply(lambda x :str.lower(x.replace(" ","")))
smd['director'] = smd['director'].apply(lambda x : [x,x, x])

In [57]:
k = smd.apply(lambda x : pd.Series(x['keywords']),axis=1).stack().reset_index(level=1,drop=True)
k.name = 'keyword'

In [58]:
k.head()

0       {'id': 931, 'name': 'jealousy'}
0           {'id': 4290, 'name': 'toy'}
0           {'id': 5202, 'name': 'boy'}
0    {'id': 6054, 'name': 'friendship'}
0       {'id': 9713, 'name': 'friends'}
Name: keyword, dtype: object

In [59]:
k = k.value_counts()
k[:5]

{'id': 10183, 'name': 'independent film'}         610
{'id': 187056, 'name': 'woman director'}          550
{'id': 9826, 'name': 'murder'}                    399
{'id': 179431, 'name': 'duringcreditsstinger'}    327
{'id': 818, 'name': 'based on novel'}             318
Name: keyword, dtype: int64

In [60]:
k = k[k>1]

In [61]:
k.head()

{'id': 10183, 'name': 'independent film'}         610
{'id': 187056, 'name': 'woman director'}          550
{'id': 9826, 'name': 'murder'}                    399
{'id': 179431, 'name': 'duringcreditsstinger'}    327
{'id': 818, 'name': 'based on novel'}             318
Name: keyword, dtype: int64

 Now using Snowball stemmer we will stem keywords

In [62]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [63]:
def filter_words(x):
    words = []
    for i in x :
        if i in k:
            words.append(i)
    return words

In [64]:
smd['keywords'] = smd['keywords'].apply(lambda x : [i['name'] for i in x] if isinstance (x,list) else [])

In [65]:
smd['keywords'] = smd['keywords'].apply(filter_words)
smd['keywords'] = smd['keywords'].apply(lambda x : [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x : [str.lower(i.replace(" ","")) for i in x])

In [66]:
smd['soup'] = smd['keywords'] + smd['director'] + smd['cast'] + smd['genres']
smd['soup'] = smd['soup'].astype('str')
smd['soup'] = smd['soup'].apply(lambda x : ''.join(x))

In [67]:
count = CountVectorizer(analyzer = 'word',min_df=0,ngram_range=(1,2),stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [68]:
cosin_sim = linear_kernel(count_matrix,count_matrix)

In [69]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index , index= smd['title'])

In [70]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosin_sim[idx]))
    sim_scores = sorted(sim_scores,key = lambda x : x[1] ,reverse= True)
    sim_scores = sim_scores[1:31]
    movies_indices = [i[0] for i in sim_scores]
    j= smd.iloc[movies_indices][['poster_path','title','year','cast','director']]
    j['director'] = j['director'].apply(lambda x:x[0])
    return j

As Our Cosine function is changed, lets see what recommendation we get now

In [71]:
get_recommendations('The Dark Knight')
HTML(get_recommendations('The Dark Knight').to_html(escape=False))

Unnamed: 0,poster_path,title,year,cast,director
8031,,The Dark Knight Rises,2012,[.. at 0x00000158C1F8AA98>],christophernolan
6218,,Batman Begins,2005,[.. at 0x00000158DCBE5B88>],christophernolan
2085,,Following,1998,[.. at 0x00000158E2CAD7C8>],christophernolan
4145,,Insomnia,2002,[.. at 0x00000158C6AE8E58>],christophernolan
6623,,The Prestige,2006,[.. at 0x00000158CFA7F048>],christophernolan
7648,,Inception,2010,[.. at 0x00000158C1E80138>],christophernolan
3381,,Memento,2000,[.. at 0x00000158DC696C00>],christophernolan
8613,,Interstellar,2014,[.. at 0x00000158E2A4C4F8>],christophernolan
3864,,The Gauntlet,1977,[.. at 0x00000158CFB9E660>],clinteastwood
5943,,Thursday,1998,[.. at 0x00000158E2948660>],skipwoods


In [72]:
get_recommendations('Mean Girls')
HTML(get_recommendations('Mean Girls').to_html(escape=False))

Unnamed: 0,poster_path,title,year,cast,director
1329,,The House of Yes,1997,[.. at 0x00000158DAAF6930>],markwaters
3319,,Head Over Heels,2001,[.. at 0x00000158DC67EDE0>],markwaters
4763,,Freaky Friday,2003,[.. at 0x00000158C5A8FA20>],markwaters
6277,,Just Like Heaven,2005,[.. at 0x00000158DCBC8840>],markwaters
7332,,Ghosts of Girlfriends Past,2009,[.. at 0x00000158C697A7C8>],markwaters
7905,,Mr. Popper's Penguins,2011,[.. at 0x00000158C5C04D68>],markwaters
6959,,The Spiderwick Chronicles,2008,[.. at 0x00000158C1CE9318>],markwaters
0,,Toy Story,1995,[.. at 0x00000158E13BE750>],johnlasseter
2,,Grumpier Old Men,1995,[.. at 0x00000158E13BE390>],howarddeutch
3,,Waiting to Exhale,1995,[.. at 0x00000158E13BE318>],forestwhitaker


# 2.3 improved recommendations 

In [73]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosin_sim[idx]))
    sim_scores = sorted(sim_scores,key=lambda x : x[1],reverse=True)
    sim_scores = sim_scores[1:26]
    movies_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movies_indices][['poster_path','title','vote_count','vote_average','year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [74]:
improved_recommendations('The Dark Knight').head(10)
HTML(improved_recommendations('The Dark Knight').to_html(escape=False))

Unnamed: 0,poster_path,title,vote_count,vote_average,year,wr
7648,,Inception,14075,8,2010,7.917588
8613,,Interstellar,11187,8,2014,7.897107
6623,,The Prestige,4510,8,2006,7.758148
3381,,Memento,4168,8,2000,7.740175
8031,,The Dark Knight Rises,9263,7,2012,6.921448
6218,,Batman Begins,7511,7,2005,6.904127
5,,Heat,1886,7,1995,6.671675
380,,Carlito's Way,805,7,1993,6.385218
4145,,Insomnia,1181,6,2002,5.797081
149,,Hackers,406,6,1995,5.609863


# 3. Collaborative Filtering
The above recommendation models has one limitation, It is not user centric, It will only recommend the movies which are close to the given movies. I will use Surprise library for more user centric recommendations

In [75]:
from surprise import SVD ,Reader,Dataset,evaluate

In [76]:
reader = Reader()

In [77]:
ratings = pd.read_csv('ratings_small.csv')

In [78]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [79]:
data= Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)

In [80]:
data.split(n_folds=5)

In [81]:
svd = SVD()
evaluate(svd,data,measures=['RMSE','MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8954
MAE:  0.6899
------------
Fold 2
RMSE: 0.9018
MAE:  0.6923
------------
Fold 3
RMSE: 0.9008
MAE:  0.6913
------------
Fold 4
RMSE: 0.8942
MAE:  0.6878
------------
Fold 5
RMSE: 0.8904
MAE:  0.6888
------------
------------
Mean RMSE: 0.8965
Mean MAE : 0.6900
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8954385982162387,
                             0.9018135020707515,
                             0.9008467774689152,
                             0.8941614983332761,
                             0.8904498725968937],
                            'mae': [0.6898542646156821,
                             0.6923200416316841,
                             0.6912794457578506,
                             0.6878368018938282,
                             0.6888287700368798]})

In [82]:
trainset = data.build_full_trainset()

In [83]:
svd.train(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x158bc342358>

In [84]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [85]:
svd.predict(1,302,3)

Prediction(uid=1, iid=302, r_ui=3, est=2.717807478839709, details={'was_impossible': False})

 The model predict rating given by userid 1 to movie id 33 is 2.60. This is purely on how other user have given rating to the movie and how their ratings are similar to userid1

## 4. Hybrid Recommender
We will collaborate the content based(Metadata based) and collaborative filtering recommenders

In [86]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [87]:
id_map = pd.read_csv('links_small.csv')[['movieId','tmdbId']]

In [88]:
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)

In [89]:
id_map.columns = ['movieId','id']

In [90]:
id_map= id_map.merge(smd[['title','id']],on='id').set_index('title')

In [91]:
indices_map = id_map.set_index('id')

In [92]:
def hybrid(userId,title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosin_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['poster_path','title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [93]:
hybrid(1,'Avatar')
HTML(hybrid(1,'Avatar').to_html(escape=False))

Unnamed: 0,poster_path,title,vote_count,vote_average,year,id,est
1011,,The Terminator,4208.0,7.0,1984,218,3.098819
8658,,X-Men: Days of Future Past,6155.0,7.0,2014,127585,3.05192
974,,Aliens,3282.0,7.0,1986,679,2.869965
522,,Terminator 2: Judgment Day,4274.0,7.0,1991,280,2.811856
3060,,Sinbad and the Eye of the Tiger,39.0,6.0,1977,11940,2.749137
4278,,Mothra vs. Godzilla,38.0,6.0,1964,1682,2.644592
922,,The Abyss,822.0,7.0,1989,2756,2.618159
4966,,Hercules in New York,63.0,3.0,1969,5227,2.575035
5310,,Frank Herbert's Dune,114.0,6.0,2000,876,2.546498
7265,,Dragonball Evolution,475.0,2.0,2009,14164,2.545947
