In [0]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD, evaluate, SlopeOne, SVDpp, KNNBaseline
from collections import defaultdict 

import warnings; warnings.simplefilter('ignore')

In [0]:
movie_db = pd. read_csv('~/movies_metadata.csv')
movie_db.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [0]:
movie_db['genres'] = movie_db['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [0]:
vote_counts = movie_db[movie_db['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movie_db[movie_db['vote_average'].notnull()]['vote_average'].astype('int')


In [0]:
C = vote_averages.mean()  #C = the mean average vote across the dataset
C

5.244896612406511

In [0]:
m = vote_counts.quantile(0.90)  #minimum votes required to be eigible for the list
m

160.0

In [0]:
movie_db['year'] = pd.to_datetime(movie_db['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [0]:
eligible = movie_db[(movie_db['vote_count'] >= m) & (movie_db['vote_count'].notnull()) & (movie_db['vote_average'].notnull())][['title', 'year', 'genres', 'vote_count', 'vote_average', 'popularity' ]]

eligible['vote_count'] = eligible['vote_count'].astype('int')
eligible['vote_average'] = eligible['vote_average'].astype('int')

eligible.shape

(4555, 6)

In [0]:
def weighted_rating(x):
    
    v = x['vote_count']
    R = x['vote_average']
    
    return (v/(v+m) * R) + (m/(m+v) * C)

In [0]:
eligible['wr'] = eligible.apply(weighted_rating, axis=1)

In [0]:
eligible = eligible.sort_values('wr', ascending=False)

In [0]:
eligible.head(250)

Unnamed: 0,title,year,genres,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,"[Comedy, Drama, Romance]",661,9,34.457,8.268189
15480,Inception,2010,"[Action, Thriller, Science Fiction, Mystery, A...",14075,8,29.1081,7.969033
12481,The Dark Knight,2008,"[Drama, Action, Crime, Thriller]",12269,8,123.167,7.964533
22879,Interstellar,2014,"[Adventure, Drama, Science Fiction]",11187,8,32.2135,7.961151
2843,Fight Club,1999,[Drama],9678,8,63.8696,7.955192
4863,The Lord of the Rings: The Fellowship of the Ring,2001,"[Adventure, Fantasy, Action]",8892,8,32.0707,7.951302
292,Pulp Fiction,1994,"[Thriller, Crime]",8670,8,140.95,7.950077
314,The Shawshank Redemption,1994,"[Drama, Crime]",8358,8,51.6454,7.948249
7000,The Lord of the Rings: The Return of the King,2003,"[Adventure, Fantasy, Action]",8226,8,29.3244,7.947434
351,Forrest Gump,1994,"[Comedy, Drama, Romance]",8147,8,48.3072,7.946934


In [0]:
gen = movie_db.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
gen.name = 'genre'

gen_movie_db = movie_db.drop('genres', axis=1).join(gen) # identifying the main genre of a movie

In [0]:
gen_movie_db

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Family
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Adventure
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Fantasy
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Family
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,Romance
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,Comedy
3,False,,16000000,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,Comedy
3,False,,16000000,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,Drama


In [0]:
def genre_list(genre):
    
    df = gen_movie_db[gen_movie_db['genre'] == genre]
    
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.90)
    
    eligible = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'genre', 'vote_count', 'vote_average', 'popularity']]
    eligible['vote_count'] = eligible['vote_count'].astype('int')
    eligible['vote_average'] = eligible['vote_average'].astype('int')
    
    eligible['wr'] = eligible.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    eligible = eligible.sort_values('wr', ascending=False).head(250)
    
    return eligible

In [0]:
# recommendation list for a particular genre
genre_list('Crime').head(10)

Unnamed: 0,title,year,genre,vote_count,vote_average,popularity,wr
12481,The Dark Knight,2008,Crime,12269,8,123.167,7.926567
292,Pulp Fiction,1994,Crime,8670,8,140.95,7.897324
314,The Shawshank Redemption,1994,Crime,8358,8,51.6454,7.893652
834,The Godfather,1972,Crime,6024,8,41.1093,7.854816
46,Se7en,1995,Crime,5915,8,18.4574,7.852298
586,The Silence of the Lambs,1991,Crime,4549,8,4.30722,7.811259
289,Leon: The Professional,1994,Crime,4293,8,20.4773,7.800891
3030,The Green Mile,1999,Crime,4166,8,19.9668,7.795313
1057,Reservoir Dogs,1992,Crime,3821,8,12.2203,7.778453
1178,The Godfather: Part II,1974,Crime,3418,8,36.6293,7.754867


In [0]:
genre_list('Animation').head(10)  

Unnamed: 0,title,year,genre,vote_count,vote_average,popularity,wr
359,The Lion King,1994,Animation,5520,8,21.6058,7.837175
5481,Spirited Away,2001,Animation,3968,8,41.0489,7.780031
9698,Howl's Moving Castle,2004,Animation,2049,8,16.136,7.611398
2884,Princess Mononoke,1997,Animation,2041,8,17.1667,7.610152
5833,My Neighbor Totoro,1988,Animation,1730,8,13.5073,7.554643
40251,Your Name.,2016,Animation,1030,8,34.461252,7.344597
5553,Grave of the Fireflies,1988,Animation,974,8,0.010902,7.318899
19901,Paperman,2012,Animation,734,8,7.19863,7.181326
39386,Piper,2016,Animation,487,8,11.243161,6.96648
20779,Wolf Children,2012,Animation,483,8,10.2495,6.962069


In [0]:
credits = pd.read_csv('~/credits.csv')
keywords = pd.read_csv('~/keywords.csv')

In [0]:
movie_db = movie_db.drop([19730, 29503, 35587])

In [0]:
keywords['id'] = keywords['id'].astype('int')

credits['id'] = credits['id'].astype('int')

movie_db['id'] = movie_db['id'].astype('int')

In [0]:
movie_db.shape

(45463, 25)

In [0]:
movie_db = movie_db.merge(credits, on='id')

movie_db = movie_db.merge(keywords, on='id')

In [0]:
movie_db.shape

(46628, 28)

In [0]:
movie_db['cast'] = movie_db['cast'].apply(literal_eval)
movie_db['crew'] = movie_db['crew'].apply(literal_eval)

movie_db['keywords'] = movie_db['keywords'].apply(literal_eval)

movie_db['cast_size'] = movie_db['cast'].apply(lambda x: len(x))
movie_db['crew_size'] = movie_db['crew'].apply(lambda x: len(x))

In [0]:
def filter_director(x):   # finding the director from the crew list
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [0]:
movie_db['director'] = movie_db['crew'].apply(filter_director)

In [0]:
movie_db['cast'] = movie_db['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

movie_db['cast'] = movie_db['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)  # finding the top 3 cast members from the cast list

In [0]:
movie_db['keywords'] = movie_db['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [0]:
key_word = movie_db.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)

key_word.name = 'keyword'            

In [0]:
key_word = key_word.value_counts()  

key_word[:10]

woman director      3128
independent film    1942
murder              1314
based on novel       841
musical              734
sex                  685
violence             651
biography            641
nudity               636
revenge              626
Name: keyword, dtype: int64

In [0]:
key_word = key_word[key_word > 5]      # finding frequently appeared keywords

In [0]:
key_word[:10]

woman director      3128
independent film    1942
murder              1314
based on novel       841
musical              734
sex                  685
violence             651
biography            641
nudity               636
revenge              626
Name: keyword, dtype: int64

In [0]:
def filter_keywords(x):        
    words = []
    for i in x:
        if i in key_word:
            words.append(i)
    return words


In [0]:
movie_db['keywords'] = movie_db['keywords'].apply(filter_keywords)

In [0]:
movie_db['keywords']=movie_db['keywords'].to_frame()  #converting keyword list to a dataframe

In [0]:
movie_db['keywords']

0        [jealousy, toy, boy, friendship, friends, riva...
1        [board game, disappearance, based on children'...
2             [fishing, best friend, duringcreditsstinger]
3        [based on novel, interracial relationship, sin...
4        [baby, midlife crisis, confidence, aging, daug...
5        [robbery, detective, bank, obsession, chase, s...
6        [paris, brother brother relationship, chauffeu...
7                                                       []
8                          [terrorist, hostage, explosive]
9        [cuba, falsely accused, secret identity, compu...
10         [white house, usa president, new love, widower]
11                                        [dracula, spoof]
12                        [wolf, alaska, dog, bear attack]
13       [usa president, presidential election, waterga...
14            [exotic island, treasure, map, ship, pirate]
15       [poker, drug abuse, 1970s, overdose, illegal p...
16       [bowling, based on novel, servant, country lif.

In [0]:
movie_db['genres']= movie_db['genres'].to_frame()           #converting genre list to a dataframe

In [0]:
movie_db['genres']

0                        [Animation, Comedy, Family]
1                       [Adventure, Fantasy, Family]
2                                  [Romance, Comedy]
3                           [Comedy, Drama, Romance]
4                                           [Comedy]
5                   [Action, Crime, Drama, Thriller]
6                                  [Comedy, Romance]
7                 [Action, Adventure, Drama, Family]
8                      [Action, Adventure, Thriller]
9                      [Adventure, Action, Thriller]
10                          [Comedy, Drama, Romance]
11                                  [Comedy, Horror]
12                    [Family, Animation, Adventure]
13                                  [History, Drama]
14                               [Action, Adventure]
15                                    [Drama, Crime]
16                                  [Drama, Romance]
17                                   [Crime, Comedy]
18                        [Crime, Comedy, Adve

In [0]:
movie_db['cast']= movie_db['cast'].to_frame()       #converting cast list to a dataframe

In [0]:
movie_db['cast']

0                      [Tom Hanks, Tim Allen, Don Rickles]
1           [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2               [Walter Matthau, Jack Lemmon, Ann-Margret]
3        [Whitney Houston, Angela Bassett, Loretta Devine]
4               [Steve Martin, Diane Keaton, Martin Short]
5                  [Al Pacino, Robert De Niro, Val Kilmer]
6              [Harrison Ford, Julia Ormond, Greg Kinnear]
7        [Jonathan Taylor Thomas, Brad Renfro, Rachael ...
8        [Jean-Claude Van Damme, Powers Boothe, Dorian ...
9           [Pierce Brosnan, Sean Bean, Izabella Scorupco]
10       [Michael Douglas, Annette Bening, Michael J. Fox]
11               [Leslie Nielsen, Mel Brooks, Amy Yasbeck]
12               [Kevin Bacon, Bob Hoskins, Bridget Fonda]
13            [Anthony Hopkins, Joan Allen, Powers Boothe]
14           [Geena Davis, Matthew Modine, Frank Langella]
15               [Robert De Niro, Sharon Stone, Joe Pesci]
16               [Kate Winslet, Emma Thompson, Hugh Gran

In [0]:
movie_db['director']= movie_db['director'].to_frame()          #converting director list to a dataframe

In [0]:
movie_db['director']

0              John Lasseter
1               Joe Johnston
2              Howard Deutch
3            Forest Whitaker
4              Charles Shyer
5               Michael Mann
6             Sydney Pollack
7               Peter Hewitt
8                Peter Hyams
9            Martin Campbell
10                Rob Reiner
11                Mel Brooks
12               Simon Wells
13              Oliver Stone
14              Renny Harlin
15           Martin Scorsese
16                   Ang Lee
17            Allison Anders
18            Steve Oedekerk
19              Joseph Ruben
20          Barry Sonnenfeld
21                 Jon Amiel
22            Richard Donner
23              Victor Salva
24               Mike Figgis
25             Oliver Parker
26       Lesli Linka Glatter
27             Roger Michell
28        Jean-Pierre Jeunet
29               Zhang Yimou
                ...         
46598               Uwe Boll
46599             Beth David
46600       Diederick Koopal
46601       Re

In [0]:
movie_db['combined'] =  movie_db['cast'] + movie_db['genres']

In [0]:
movie_db['combined']

0        [Tom Hanks, Tim Allen, Don Rickles, Animation,...
1        [Robin Williams, Jonathan Hyde, Kirsten Dunst,...
2        [Walter Matthau, Jack Lemmon, Ann-Margret, Rom...
3        [Whitney Houston, Angela Bassett, Loretta Devi...
4        [Steve Martin, Diane Keaton, Martin Short, Com...
5        [Al Pacino, Robert De Niro, Val Kilmer, Action...
6        [Harrison Ford, Julia Ormond, Greg Kinnear, Co...
7        [Jonathan Taylor Thomas, Brad Renfro, Rachael ...
8        [Jean-Claude Van Damme, Powers Boothe, Dorian ...
9        [Pierce Brosnan, Sean Bean, Izabella Scorupco,...
10       [Michael Douglas, Annette Bening, Michael J. F...
11       [Leslie Nielsen, Mel Brooks, Amy Yasbeck, Come...
12       [Kevin Bacon, Bob Hoskins, Bridget Fonda, Fami...
13       [Anthony Hopkins, Joan Allen, Powers Boothe, H...
14       [Geena Davis, Matthew Modine, Frank Langella, ...
15       [Robert De Niro, Sharon Stone, Joe Pesci, Dram...
16       [Kate Winslet, Emma Thompson, Hugh Grant, Dram.

In [0]:
movie_db['combined']= movie_db['combined'] + movie_db['keywords']

In [0]:
movie_db['combined']

0        [Tom Hanks, Tim Allen, Don Rickles, Animation,...
1        [Robin Williams, Jonathan Hyde, Kirsten Dunst,...
2        [Walter Matthau, Jack Lemmon, Ann-Margret, Rom...
3        [Whitney Houston, Angela Bassett, Loretta Devi...
4        [Steve Martin, Diane Keaton, Martin Short, Com...
5        [Al Pacino, Robert De Niro, Val Kilmer, Action...
6        [Harrison Ford, Julia Ormond, Greg Kinnear, Co...
7        [Jonathan Taylor Thomas, Brad Renfro, Rachael ...
8        [Jean-Claude Van Damme, Powers Boothe, Dorian ...
9        [Pierce Brosnan, Sean Bean, Izabella Scorupco,...
10       [Michael Douglas, Annette Bening, Michael J. F...
11       [Leslie Nielsen, Mel Brooks, Amy Yasbeck, Come...
12       [Kevin Bacon, Bob Hoskins, Bridget Fonda, Fami...
13       [Anthony Hopkins, Joan Allen, Powers Boothe, H...
14       [Geena Davis, Matthew Modine, Frank Langella, ...
15       [Robert De Niro, Sharon Stone, Joe Pesci, Dram...
16       [Kate Winslet, Emma Thompson, Hugh Grant, Dram.

In [0]:
movie_db['director'] = movie_db['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))


In [0]:
movie_db['director'] = movie_db['director'].apply(lambda x: [x, x, x])

In [0]:
movie_db['director']

0               [johnlasseter, johnlasseter, johnlasseter]
1                  [joejohnston, joejohnston, joejohnston]
2               [howarddeutch, howarddeutch, howarddeutch]
3         [forestwhitaker, forestwhitaker, forestwhitaker]
4               [charlesshyer, charlesshyer, charlesshyer]
5                  [michaelmann, michaelmann, michaelmann]
6            [sydneypollack, sydneypollack, sydneypollack]
7                  [peterhewitt, peterhewitt, peterhewitt]
8                     [peterhyams, peterhyams, peterhyams]
9         [martincampbell, martincampbell, martincampbell]
10                       [robreiner, robreiner, robreiner]
11                       [melbrooks, melbrooks, melbrooks]
12                    [simonwells, simonwells, simonwells]
13                 [oliverstone, oliverstone, oliverstone]
14                 [rennyharlin, rennyharlin, rennyharlin]
15        [martinscorsese, martinscorsese, martinscorsese]
16                                [anglee, anglee, angle

In [0]:
movie_db['combined']= movie_db['combined'] + movie_db['director']

In [0]:
movie_db['combined']           #top 3 cast, keyword, genre and director in one dataframe

0        [Tom Hanks, Tim Allen, Don Rickles, Animation,...
1        [Robin Williams, Jonathan Hyde, Kirsten Dunst,...
2        [Walter Matthau, Jack Lemmon, Ann-Margret, Rom...
3        [Whitney Houston, Angela Bassett, Loretta Devi...
4        [Steve Martin, Diane Keaton, Martin Short, Com...
5        [Al Pacino, Robert De Niro, Val Kilmer, Action...
6        [Harrison Ford, Julia Ormond, Greg Kinnear, Co...
7        [Jonathan Taylor Thomas, Brad Renfro, Rachael ...
8        [Jean-Claude Van Damme, Powers Boothe, Dorian ...
9        [Pierce Brosnan, Sean Bean, Izabella Scorupco,...
10       [Michael Douglas, Annette Bening, Michael J. F...
11       [Leslie Nielsen, Mel Brooks, Amy Yasbeck, Come...
12       [Kevin Bacon, Bob Hoskins, Bridget Fonda, Fami...
13       [Anthony Hopkins, Joan Allen, Powers Boothe, H...
14       [Geena Davis, Matthew Modine, Frank Langella, ...
15       [Robert De Niro, Sharon Stone, Joe Pesci, Dram...
16       [Kate Winslet, Emma Thompson, Hugh Grant, Dram.

In [0]:
movie_db['combined'] = movie_db['combined'].apply(lambda x: ' '.join(x))

In [0]:
movie_db['combined']

0        Tom Hanks Tim Allen Don Rickles Animation Come...
1        Robin Williams Jonathan Hyde Kirsten Dunst Adv...
2        Walter Matthau Jack Lemmon Ann-Margret Romance...
3        Whitney Houston Angela Bassett Loretta Devine ...
4        Steve Martin Diane Keaton Martin Short Comedy ...
5        Al Pacino Robert De Niro Val Kilmer Action Cri...
6        Harrison Ford Julia Ormond Greg Kinnear Comedy...
7        Jonathan Taylor Thomas Brad Renfro Rachael Lei...
8        Jean-Claude Van Damme Powers Boothe Dorian Har...
9        Pierce Brosnan Sean Bean Izabella Scorupco Adv...
10       Michael Douglas Annette Bening Michael J. Fox ...
11       Leslie Nielsen Mel Brooks Amy Yasbeck Comedy H...
12       Kevin Bacon Bob Hoskins Bridget Fonda Family A...
13       Anthony Hopkins Joan Allen Powers Boothe Histo...
14       Geena Davis Matthew Modine Frank Langella Acti...
15       Robert De Niro Sharon Stone Joe Pesci Drama Cr...
16       Kate Winslet Emma Thompson Hugh Grant Drama Ro.

In [0]:
count = CountVectorizer(analyzer='word', decode_error='ignore', encoding='utf-8',ngram_range=(1, 2), min_df=0, stop_words='english')

In [0]:
count

CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding='utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [0]:
term_document_matrix = count.fit_transform(movie_db['combined'])  # returns term-document matrix by feature extraction

In [0]:
cosine_sim = cosine_similarity(term_document_matrix, term_document_matrix)

In [0]:
cosine_sim

array([[1.        , 0.01923433, 0.02302046, ..., 0.        , 0.        ,
        0.        ],
       [0.01923433, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02302046, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [0]:
movie_db = movie_db.reset_index()


In [0]:
titles = movie_db['title']

indice = pd.Series(movie_db.index, index = movie_db['title'])

In [0]:
def movie_recommendation(title):          # to get the recommendation for a particular movie
    idx = indice[title]
    
    sim_score = list(enumerate(cosine_sim[idx]))
    sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)
    
    sim_score = sim_score[1:26]
    movie_indice = [i[0] for i in sim_score]
    
    movies = movie_db.iloc[movie_indice][['title', 'year', 'genres', 'vote_count', 'vote_average', 'popularity']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    
    eligible = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())][['title', 'year', 'genres', 'vote_count', 'vote_average', 'popularity', ]]
    eligible['vote_count'] = eligible['vote_count'].astype('int')
    eligible['vote_average'] = eligible['vote_average'].astype('int')
    eligible['wr'] = eligible.apply(weighted_rating, axis=1)
    eligible = eligible.sort_values('wr', ascending=False)
    
    return eligible

In [0]:
#recommendation based on a particular movie
movie_recommendation('Memento').head(10)

Unnamed: 0,title,year,genres,vote_count,vote_average,popularity,wr
15651,Inception,2010,"[Action, Thriller, Science Fiction, Mystery, A...",14075,8,29.1081,7.969033
12589,The Dark Knight,2008,"[Drama, Action, Crime, Thriller]",12269,8,123.167,7.964533
23076,Interstellar,2014,"[Adventure, Drama, Science Fiction]",11187,8,32.2135,7.961151
11463,The Prestige,2006,"[Drama, Mystery, Thriller]",4510,8,16.9456,7.905607
18442,The Dark Knight Rises,2012,"[Action, Crime, Drama, Thriller]",9263,7,20.5826,6.970199
10210,Batman Begins,2005,"[Action, Crime, Drama]",7511,7,28.5053,6.963392
45843,Dunkirk,2017,"[Action, Drama, History, Thriller, War]",2712,7,30.938854,6.902223
5302,Insomnia,2002,"[Crime, Mystery, Thriller]",1181,6,11.425,5.909906
11855,Disturbia,2007,"[Thriller, Drama, Mystery]",1038,6,18.0069,5.899151
29787,Regression,2015,"[Horror, Mystery, Thriller]",600,5,10.0363,5.051557


In [0]:
movie_recommendation('The Godfather').head(10)

Unnamed: 0,title,year,genres,vote_count,vote_average,popularity,wr
1199,The Godfather: Part II,1974,"[Drama, Crime]",3418,8,36.6293,7.876798
1186,Apocalypse Now,1979,"[Drama, War]",2112,8,13.5963,7.805979
1934,The Godfather: Part III,1990,"[Crime, Drama, Thriller]",1589,7,17.1853,6.839442
1312,Dracula,1992,"[Romance, Horror]",1087,7,16.7777,6.774806
3635,The Conversation,1974,"[Crime, Drama, Mystery]",377,7,13.2456,6.477064
24284,The Drop,2014,"[Drama, Crime]",859,6,11.695,5.881436
2025,The Outsiders,1983,"[Crime, Drama]",293,6,6.43593,5.733297
1614,The Rainmaker,1997,"[Drama, Crime, Thriller]",239,6,6.6834,5.697202
8911,Rumble Fish,1983,"[Action, Adventure, Crime, Drama, Romance]",141,6,8.20519,5.598616
754,Jack,1996,"[Comedy, Drama, Science Fiction]",340,5,6.28724,5.078367


In [0]:
#Collaborative Filtering based Recommendation
reader = Reader()

In [0]:
ratings = pd.read_csv('~/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [0]:
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies)

Number of users = 671 | Number of movies = 9066


In [0]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)
data.split(n_folds=5)

In [0]:
svd = SVD()
perf1= evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8984
MAE:  0.6904
------------
Fold 2
RMSE: 0.8993
MAE:  0.6908
------------
Fold 3
RMSE: 0.8955
MAE:  0.6890
------------
Fold 4
RMSE: 0.8998
MAE:  0.6935
------------
Fold 5
RMSE: 0.8913
MAE:  0.6883
------------
------------
Mean RMSE: 0.8968
Mean MAE : 0.6904
------------
------------


In [0]:
slp = SlopeOne()
perf2= evaluate(slp, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SlopeOne.

------------
Fold 1
RMSE: 0.9224
MAE:  0.7082
------------
Fold 2
RMSE: 0.9358
MAE:  0.7156
------------
Fold 3
RMSE: 0.9278
MAE:  0.7072
------------
Fold 4
RMSE: 0.9325
MAE:  0.7153
------------
Fold 5
RMSE: 0.9269
MAE:  0.7106
------------
------------
Mean RMSE: 0.9291
Mean MAE : 0.7114
------------
------------


In [0]:
knn = KNNBaseline()
perf3= evaluate(knn, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm KNNBaseline.

------------
Fold 1
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8947
MAE:  0.6862
------------
Fold 2
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9021
MAE:  0.6894
------------
Fold 3
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8963
MAE:  0.6858
------------
Fold 4
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8994
MAE:  0.6900
------------
Fold 5
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8940
MAE:  0.6861
------------
------------
Mean RMSE: 0.8973
Mean MAE : 0.6875
------------
------------


In [0]:
trainset = data.build_full_trainset()
svd.train(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a2ff13d50>

In [0]:
testset = trainset.build_anti_testset()  
predictions = svd.test(testset)

In [0]:
#predicting ratings for unseen movies using SVD
df = pd.DataFrame(predictions)
df= df.sort_values(['est','uid'], ascending=[False, True])
df

Unnamed: 0,uid,iid,r_ui,est,details
1037906,4,527,3.543608,5.0,{u'was_impossible': False}
1037912,4,923,3.543608,5.0,{u'was_impossible': False}
1037941,4,4993,3.543608,5.0,{u'was_impossible': False}
1037986,4,593,3.543608,5.0,{u'was_impossible': False}
1038018,4,1304,3.543608,5.0,{u'was_impossible': False}
1038048,4,608,3.543608,5.0,{u'was_impossible': False}
1038114,4,1172,3.543608,5.0,{u'was_impossible': False}
1038118,4,2571,3.543608,5.0,{u'was_impossible': False}
1038146,4,48516,3.543608,5.0,{u'was_impossible': False}
1038191,4,2973,3.543608,5.0,{u'was_impossible': False}


In [0]:
def get_top_n_recommendation(predictions, n=10):
    
    top_n = defaultdict(list)
    
    for uid, iid, r_ui, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [0]:
top_n = defaultdict(list)    # getting top-10 recommendation by predicted ratings 

top_n = get_top_n_recommendation(predictions, n=10)

for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

(1, [4973, 318, 899, 2542, 3462, 1197, 2858, 4011, 48516, 1228])
(2, [6016, 1252, 908, 1203, 3462, 2064, 926, 2318, 1197, 923])
(3, [1221, 913, 2064, 3462, 858, 969, 904, 1252, 923, 1276])
(4, [527, 923, 4993, 593, 1304, 608, 1172, 2571, 48516, 2973])
(5, [3462, 2064, 926, 1945, 904, 1252, 1212, 1224, 232, 2692])
(6, [58559, 3462, 913, 969, 318, 2064, 1203, 3504, 1221, 1223])
(7, [858, 3462, 926, 913, 7502, 969, 1252, 1221, 922, 1304])
(8, [1221, 2064, 1204, 3462, 1193, 1217, 2019, 2318, 2973, 913])
(9, [858, 1252, 1203, 2858, 50, 1228, 296, 1945, 926, 969])
(10, [969, 750, 1221, 3462, 858, 1204, 913, 912, 1172, 922])
(11, [858, 232, 745, 58559, 2064, 1221, 1172, 1208, 904, 1719])
(12, [4993, 3037, 1172, 5952, 58559, 2064, 7502, 260, 1254, 3462])
(13, [858, 2064, 5995, 1172, 1276, 922, 926, 954, 745, 1945])
(14, [3683, 2064, 1617, 913, 318, 50, 750, 1252, 1060, 3359])
(15, [1927, 1104, 3037, 1080, 7502, 89774, 1537, 475, 1278, 31658])
(16, [1212, 926, 858, 2019, 608, 2186, 1304, 953, 8