In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Dataset,SVD,Reader
from surprise.model_selection import cross_validate
import warnings; warnings.simplefilter('ignore')

In [2]:
md=pd.read_csv('movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
md['genres']=md['genres'].fillna('[]').apply(literal_eval).apply(lambda x:[i['name'] for i in x] if isinstance(x, list)else[])

vote_counts=md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages=md[md['vote_average'].notnull()]['vote_average'].astype('int')
C=vote_averages.mean()
C

5.244896612406511

In [4]:
m=vote_counts.quantile(0.95)
m

434.0

In [5]:
md['year']=pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x:str(x).split('-')[0] if x != np.nan else np.nan)

In [6]:
qualified = md[(md['vote_count']>= m) & (md['vote_count'].notnull())][['title','year','vote_count','vote_average','popularity','genres']]

qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [7]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return ( v/(v+m) * R ) + (m/(m+v)*C) 

qualified['wr'] = qualified.apply(weighted_rating, axis=1)
qualified = qualified.sort_values('wr',ascending=False).head(250)

In [8]:
#TOP MOVIES
qualified.head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [9]:
# recommendation based over genres 

s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1,drop=True) 
s.name='genre'
gen_md = md.drop('genres',axis=1).join(s)

def build_chart(genre,percentile=0.85): 
    df= gen_md[gen_md['genre']==genre]
    vote_counts=df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_average=df[df['vote_average'].notnull()]['vote_average'].astype('int')

    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    qualified=df[(df['vote_count']>=m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title','year','vote_count','vote_average','popularity']]
    qualified['vote_count']=qualified['vote_count'].astype('int')
    qualified['vote_average']=qualified['vote_average'].astype('int')

    qualified['wr']= qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m)* x['vote_average'])+(m/(m+x['vote_count']) * C), axis=1)
    
    qualified = qualified.sort_values('wr',ascending=False).head(250)
    
    return qualified


    

In [10]:
build_chart('Action').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.956262
12481,The Dark Knight,2008,12269,8,123.167,7.94994
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.931402
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.925998
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.920495
256,Star Wars,1977,6778,8,42.1497,7.910701
1154,The Empire Strikes Back,1980,5998,8,19.471,7.899511
4135,Scarface,1983,3017,8,11.2997,7.807171
9430,Oldboy,2003,2000,8,10.6169,7.719114
1910,Seven Samurai,1954,892,8,15.0178,7.441002


In [11]:
build_chart('Romance').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.546032
351,Forrest Gump,1994,8147,8,48.3072,7.969599
876,Vertigo,1958,1162,8,18.2082,7.800113
40251,Your Name.,2016,1030,8,34.461252,7.776573
883,Some Like It Hot,1959,835,8,11.8451,7.729518
1132,Cinema Paradiso,1988,834,8,14.177,7.729226
19901,Paperman,2012,734,8,7.19863,7.696401
37863,Sing Street,2016,669,8,10.672862,7.670432
882,The Apartment,1960,498,8,11.9943,7.574734
38718,The Handmaiden,2016,453,8,16.727405,7.53955


In [12]:
build_chart('Comedy').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.451774
351,Forrest Gump,1994,8147,8,48.3072,7.962309
1225,Back to the Future,1985,6239,8,25.7785,7.950988
18465,The Intouchables,2011,5410,8,16.0869,7.943631
22841,The Grand Budapest Hotel,2014,4644,8,14.442,7.934554
2211,Life Is Beautiful,1997,3643,8,39.395,7.917112
732,Dr. Strangelove or: How I Learned to Stop Worr...,1964,1472,8,9.80398,7.803579
3342,Modern Times,1936,881,8,8.15956,7.686794
883,Some Like It Hot,1959,835,8,11.8451,7.671596
1236,The Great Dictator,1940,756,8,9.24175,7.641741


In [13]:
build_chart('Thriller').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.957087
12481,The Dark Knight,2008,12269,8,123.167,7.950882
292,Pulp Fiction,1994,8670,8,140.95,7.931004
46,Se7en,1995,5915,8,18.4574,7.900034
24860,The Imitation Game,2014,5895,8,31.5959,7.899707
586,The Silence of the Lambs,1991,4549,8,4.30722,7.871417
11354,The Prestige,2006,4510,8,16.9456,7.870357
289,Leon: The Professional,1994,4293,8,20.4773,7.864127
4099,Memento,2000,4168,8,15.4508,7.860259
1213,The Shining,1980,3890,8,19.6116,7.850813


In [14]:
# recommendation based on plot 

links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

md = md.drop([19730,29503,35587])
md['id'] = md['id'].astype('int')

smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [15]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

tfidf_matrix.shape

(9099, 268124)

In [16]:
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [17]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [18]:
# defining the recommending function

def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key= lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [19]:
get_recommendations('The Godfather')

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
618                     Thinner
3609              Harlem Nights
8816              Run All Night
3288          Jaws: The Revenge
2192           The Color Purple
5406            The Kid Brother
3715                   3 Ninjas
7657          The Tillman Story
3607            Family Business
6398                Renaissance
Name: title, dtype: object

In [20]:
# now we work with credits and keywords

credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

md['id'] = md['id'].astype('int')
md.shape

(45463, 25)

In [21]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [22]:
# evaluation based upon cast,crew,keywords etc.

smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)

smd['cast_size'] = smd['cast'].apply(lambda x:len(x))
smd['crew_size'] = smd['crew'].apply(lambda x:len(x))

#extract the director

def get_director(x):
    for i in x:
        if i['job']=='Director':
            return i['name']
    return np.nan

smd['director'] = smd['crew'].apply(get_director)

smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance( x , list ) else [])

smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

smd['keywords'] = smd['keywords'].apply(lambda x:[i['name'] for i in x ]if isinstance(x, list) else [])


In [23]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" "," ")) for i in x])

smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" "," ")))
smd['director'] = smd['director'].apply(lambda x: [x,x,x])

# let's count frequency of keywords
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1,drop=True)
s.name='keyword'

s = s.value_counts()
s[:10]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
violence                264
love                    222
musical                 219
sex                     219
suspense                212
Name: keyword, dtype: int64

In [24]:
# we are going to filter keywords now

s = s[s>1]
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')  #test case

def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [ stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" "," ")) for i in x])

smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

count = CountVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

cosine_sim = cosine_similarity(count_matrix,count_matrix)

smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])


In [25]:
get_recommendations('The Godfather')

994                           The Godfather: Part II
3616                   Tucker: The Man and His Dream
1346                                   The Rainmaker
1602                         The Godfather: Part III
3300                                Gardens of Stone
3705                                 The Cotton Club
4518                              One from the Heart
2998                                The Conversation
5867                                     Rumble Fish
1992                           Peggy Sue Got Married
981                                   Apocalypse Now
642                                             Jack
1691                                   The Outsiders
1100                                         Dracula
5907    Hearts of Darkness: A Filmmaker's Apocalypse
4631                    A Decade Under the Influence
2858                             The Virgin Suicides
2174                                 The Dinner Game
8422                                  The Blin

In [26]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate (cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key= lambda x: x[1] , reverse=True)
    sim_scores = sim_scores[1:26]
    
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title','vote_count','vote_average','year']]
    
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending= False).head(15)
    return qualified

In [27]:
improved_recommendations('The Godfather')

Unnamed: 0,title,vote_count,vote_average,year,wr
994,The Godfather: Part II,3418,8,1974,7.689586
981,Apocalypse Now,2112,8,1979,7.530356
1602,The Godfather: Part III,1589,7,1990,6.623473
1100,Dracula,1087,7,1992,6.499201
2858,The Virgin Suicides,841,7,1999,6.402577
2174,The Dinner Game,408,7,1998,6.095351
2998,The Conversation,377,7,1974,6.060771
7867,Water for Elephants,879,6,2011,5.750408
6510,Marie Antoinette,662,6,2006,5.70099
8422,The Bling Ring,1205,5,2013,5.064848


In [28]:
improved_recommendations('Inception')

Unnamed: 0,title,vote_count,vote_average,year,wr
6981,The Dark Knight,12269,8,2008,7.905871
8613,Interstellar,11187,8,2014,7.897107
6623,The Prestige,4510,8,2006,7.758148
3381,Memento,4168,8,2000,7.740175
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127
8673,Mission: Impossible - Rogue Nation,3274,7,2015,6.794575
4145,Insomnia,1181,6,2002,5.797081
8570,Paranormal Activity: The Marked Ones,455,5,2014,5.119556
8500,Don Jon,1708,5,2013,5.04962


In [29]:
improved_recommendations('Pulp Fiction')

Unnamed: 0,title,vote_count,vote_average,year,wr
898,Reservoir Dogs,3821,8,1992,7.718986
8310,Django Unchained,10297,7,2012,6.929017
7280,Inglourious Basterds,6598,7,2009,6.891679
4903,Kill Bill: Vol. 1,5091,7,2003,6.862133
8905,The Hateful Eight,4405,7,2015,6.842588
5200,Kill Bill: Vol. 2,4061,7,2004,6.830542
1381,Jackie Brown,1580,7,1997,6.62179
8110,The Raid,1076,7,2011,6.495553
65,From Dusk Till Dawn,1644,6,1996,5.842293
6788,Death Proof,1359,6,2007,5.817225


In [30]:
#now we will use different present algoritms to imrove our recommendations

reader = Reader()

In [31]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [32]:
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], Reader())

In [33]:
# we will use singular value decomposition algoritm
svd = SVD()
cross_validate(svd,data, measures=['RMSE','MAE'],cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8907  0.8936  0.8959  0.9001  0.8994  0.8959  0.0035  
MAE (testset)     0.6894  0.6879  0.6887  0.6923  0.6939  0.6904  0.0023  
Fit time          4.03    3.66    3.81    3.91    3.78    3.84    0.12    
Test time         0.33    0.11    0.11    0.22    0.12    0.18    0.09    


{'test_rmse': array([0.89072841, 0.89361226, 0.89589084, 0.90012985, 0.89935247]),
 'test_mae': array([0.68939444, 0.68785259, 0.68870941, 0.69228892, 0.69393717]),
 'fit_time': (4.032122611999512,
  3.6610875129699707,
  3.8134148120880127,
  3.9067349433898926,
  3.7814955711364746),
 'test_time': (0.3281219005584717,
  0.10936331748962402,
  0.1093895435333252,
  0.21949100494384766,
  0.12499117851257324)}

In [34]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21fd2c80848>

In [35]:
ratings[ratings['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [36]:
svd.predict(1,302,3)

Prediction(uid=1, iid=302, r_ui=3, est=2.8837411418673202, details={'was_impossible': False})

In [37]:
#now we will take user id and title as input and give output
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan
    
    
id_map = pd.read_csv('links_small.csv')[['movieId','tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId','id']
id_map = id_map.merge(smd[['title','id']], on='id').set_index('title')

indices_map = id_map.set_index('id')

In [38]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId=id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title','vote_count','vote_average','year','id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est',ascending=False)
    return movies.head(15)

In [39]:
hybrid(69,'The Godfather')

Unnamed: 0,title,vote_count,vote_average,year,id,est
994,The Godfather: Part II,3418.0,8.3,1974,240,4.826312
2454,Mister Roberts,43.0,7.3,1955,37853,4.793299
5907,Hearts of Darkness: A Filmmaker's Apocalypse,61.0,8.0,1991,4539,4.612456
981,Apocalypse Now,2112.0,8.0,1979,28,4.541308
2998,The Conversation,377.0,7.5,1974,592,4.468514
4631,A Decade Under the Influence,9.0,8.0,2003,38868,4.458487
1602,The Godfather: Part III,1589.0,7.1,1990,242,4.358917
2707,The Searchers,332.0,7.7,1956,3114,4.305157
2858,The Virgin Suicides,841.0,7.1,1999,1443,4.270102
1100,Dracula,1087.0,7.1,1992,6114,4.206351


In [40]:
hybrid(100,'The Godfather')

Unnamed: 0,title,vote_count,vote_average,year,id,est
994,The Godfather: Part II,3418.0,8.3,1974,240,4.294044
2454,Mister Roberts,43.0,7.3,1955,37853,4.142863
981,Apocalypse Now,2112.0,8.0,1979,28,4.100313
2998,The Conversation,377.0,7.5,1974,592,3.832855
5907,Hearts of Darkness: A Filmmaker's Apocalypse,61.0,8.0,1991,4539,3.786323
2707,The Searchers,332.0,7.7,1956,3114,3.714173
2174,The Dinner Game,408.0,7.7,1998,9421,3.709337
1346,The Rainmaker,239.0,6.7,1997,11975,3.695832
4631,A Decade Under the Influence,9.0,8.0,2003,38868,3.628383
2858,The Virgin Suicides,841.0,7.1,1999,1443,3.615123
