# Movies Recommender System

In [1]:
#conda install -c conda-forge scikit-surprise

In [2]:
#conda update -n base -c defaults conda

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD

from surprise.model_selection import KFold
from surprise.model_selection.validation import cross_validate

import warnings; warnings.simplefilter('ignore')

In [4]:
md = pd. read_csv('movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [6]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [7]:
m = vote_counts.quantile(0.95)
m

434.0

In [8]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [9]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [10]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [11]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [12]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [13]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [14]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [15]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [16]:
build_chart('Action').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.955099
12481,The Dark Knight,2008,12269,8,123.167,7.94861
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.929579
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.924031
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.918382
256,Star Wars,1977,6778,8,42.1497,7.908327
1154,The Empire Strikes Back,1980,5998,8,19.471,7.896841
4135,Scarface,1983,3017,8,11.2997,7.802046
9430,Oldboy,2003,2000,8,10.6169,7.711649
1910,Seven Samurai,1954,892,8,15.0178,7.426145


In [17]:
links_small = pd.read_csv('links.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [18]:
md = md.drop([19730, 29503, 35587])

In [19]:
#Check EDA Notebook for how and why I got these indices.
md['id'] = md['id'].astype('int') #Type casting

In [20]:
smd = md[md['id'].isin(links_small)] #preprocessing.
smd.shape

(45463, 25)

In [21]:
smd['tagline'] = smd['tagline'].fillna('') #null values replaced by na
smd['description'] = smd['overview'] + smd['tagline'] #combine col
smd['description'] = smd['description'].fillna('') 

In [22]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')  #ngram:,stop word:removes #tfid:returns freq of words
tfidf_matrix = tf.fit_transform(smd['description']) #transforms col to vector

In [23]:
tfidf_matrix.shape

(45463, 1104495)

In [24]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix) #we can know similarity bten two sentence

In [25]:
cosine_sim[0]

array([1.        , 0.00511811, 0.        , ..., 0.        , 0.00236862,
       0.        ])

In [26]:
smd = smd.reset_index() #rearrange
titles = smd['title'] 
indices = pd.Series(smd.index, index=smd['title']) #

In [27]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx])) #calculate cos for each index
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) #Sorted in desc
    sim_scores = sim_scores[1:31] #take top 30 scores for similarity
    movie_indices = [i[0] for i in sim_scores] #top 30 index
    return titles.iloc[movie_indices]

In [28]:
get_recommendations('The Godfather').head(10)

44027    The Godfather Trilogy: 1972-1990
1178               The Godfather: Part II
31971                    Honor Thy Father
21613                          The Family
23125                          Blood Ties
38027            A Mother Should Be Loved
18322                     The Outside Man
11297                    Household Saints
10821                            Election
4324                                 Made
Name: title, dtype: object

In [29]:
#get_recommendations('The Dark Knight')

In [30]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [31]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [32]:
md.shape

(45463, 25)

In [33]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [34]:
smd = md[md['id'].isin(links_small)]
smd.shape

(46628, 28)

In [35]:
smd['cast'] = smd['cast'].apply(literal_eval) #apply:applies for every row at once  #literal eval
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x)) #cal len of cast
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [36]:
def get_director(x):  #return dirctr name
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [37]:
smd['director'] = smd['crew'].apply(get_director)

In [38]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])  #dirctr name storing in list
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x) #

In [39]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])  #dirctr name storing in list

In [40]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x]) #removed spaces and appended in list

In [41]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", ""))) #removed space and dirctr name in string
smd['director'] = smd['director'].apply(lambda x: [x,x, x]) #duplicate elements

In [42]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True) #Store rows which are droped #stack:implements stack
s.name = 'keyword'

In [43]:
s = s.value_counts() #unique count
s[:5]

woman director      3128
independent film    1942
murder              1314
based on novel       841
musical              734
Name: keyword, dtype: int64

In [44]:
s = s[s > 1] #Creates unique list

In [45]:
stemmer = SnowballStemmer('english') #conversion of word to root words
stemmer.stem('dogs')

'dog'

In [46]:
def filter_keywords(x):     #appending root word to list
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [47]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [48]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres'] #
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x)) #join by space

In [49]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english') #Bag of words #CV #tfid vs CV
count_matrix = count.fit_transform(smd['soup']) #Fiting model

In [50]:
#cosine_sim = cosine_similarity(count_matrix, count_matrix) 

In [51]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [52]:
#get_recommendations('The Dark Knight').head(10)

In [53]:
get_recommendations('Mean Girls').head(10)

16629                           Seven Thieves
32946                                 Phantom
7299                                    Shade
11554                                  Eragon
38101                        The Black Castle
11575                       The Good Shepherd
29448                               Hot Moves
27475                          Son of Morning
1034                       That Thing You Do!
31066    Scooby-Doo! Legend of the Phantosaur
Name: title, dtype: object

In [54]:
def improved_recommendations(title):  #vote consideration
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60) #simi Above 60 percentile
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [55]:
#improved_recommendations('The Dark Knight')

In [56]:
improved_recommendations('Mean Girls')

Unnamed: 0,title,vote_count,vote_average,year,wr
1276,Forbidden Planet,238,7,1956,5.866496
1154,"The Cook, the Thief, His Wife & Her Lover",137,7,1989,5.665998
11575,The Good Shepherd,342,6,2006,5.577687
1983,The 'Burbs,312,6,1989,5.560704
1034,That Thing You Do!,230,6,1996,5.506454
2681,The Yards,98,6,2000,5.383995
7299,Shade,74,6,2003,5.354892
33257,Intruders,153,5,2016,5.181065
4480,American Ninja 3: Blood Hunt,33,4,1989,5.156927
11554,Eragon,990,4,2006,4.379414


In [57]:
reader = Reader() #Reader:

In [58]:
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [59]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
kf=KFold(n_splits=5)
kf.split(data)
#data.split(n_folds=5)


<generator object KFold.split at 0x00000234B8C28DD0>

In [None]:
svd = SVD()
#evaluate(svd, data, measures=['RMSE', 'MAE'])
cross_validate(svd, data, measures=['RMSE','MAE'], cv=5)
#evaluate(svd, data, measures=['RMSE','MAE'], cv=5)

In [None]:
trainset = data.build_full_trainset() 
#svd.train(trainset)

Let us pick user 5000 and check the ratings s/he has given.

In [None]:
ratings[ratings['userId'] == 1]

In [None]:
svd.predict(1, 302, 3) #

In [None]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [None]:
id_map = pd.read_csv('links.csv')[['movieId', 'tmdbId']] 
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title') #title la index set karun merge
#id_map = id_map.set_index('tmdbId')

In [None]:
indices_map = id_map.set_index('id') #mapping index

In [None]:
def hybrid(userId, title):
    try:
        idx = indices[title]
        tmdbId = id_map.loc[title]['id']
        #print(idx)
        movie_id = id_map.loc[title]['movieId']
    
        sim_scores = list(enumerate(cosine_sim[int(idx)]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:26]
        movie_indices = [i[0] for i in sim_scores]
    
        movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
        movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
        movies = movies.sort_values('est', ascending=False)
        return movies.head(10)
    except:
        return "404!! NO MOVIES FOUND !!!"

In [None]:
hybrid(600, 'Inception')

In [None]:
hybrid(100, 'Avatar')