In [12]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning a later cell would
# raise (including ones coming from pandas) is hidden as well.
warnings.filterwarnings('ignore')

In [13]:
import pandas as pd

In [14]:
from ast import literal_eval

In [15]:
# Load the movie metadata from a CSV file. The path is relative, so it is
# resolved against the current working directory. Then preview three rows.
path_to_dataset = 'movies_metadata_fixed.csv'
dataset = pd.read_csv(path_to_dataset)
dataset.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92


In [16]:
# Inspect the raw `genres` value of the first row to see how it is stored.
dataset['genres'].iloc[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [17]:
# Prototype of the parsing step on a single row: evaluate the stored string as
# a Python literal and keep only the 'name' entry of each element.
[i['name'] for i in literal_eval(dataset['genres'].iloc[0])]

['Animation', 'Comedy', 'Family']

In [18]:
def _genre_names(raw):
    """Return the list of genre names held in one `genres` cell.

    Accepts the three shapes the cell can hold:
    * a string containing a Python literal (parsed with ``literal_eval``),
    * a list (dict elements are reduced to their 'name' entry, anything else
      is kept as-is, so names that were already extracted survive a re-run),
    * anything else, e.g. a missing value, which yields an empty list.
    """
    if isinstance(raw, str):
        raw = literal_eval(raw)
    if not isinstance(raw, list):
        return []
    return [item['name'] if isinstance(item, dict) else item for item in raw]


# Idempotent on purpose: running this cell a second time leaves the column
# unchanged instead of failing because literal_eval is handed a list.
dataset['genres'] = dataset['genres'].apply(_genre_names)

In [19]:
# Check the parsed column: each row is expected to hold a list of genre names.
dataset['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

In [20]:
import functools
# Collect every distinct genre name that occurs in any film's genre list.
# A set comprehension always yields a set, including for an empty or
# single-row frame, where a reduce() without an initial value would raise or
# hand back the original list.
available_genres = {name for film_genres in dataset['genres'] for name in film_genres}
available_genres

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

In [21]:
# Derive the release year from `release_date`. pd.to_datetime is called with
# its default error handling, so an unparseable date would raise here.
dataset['year'] = pd.to_datetime(dataset['release_date']).dt.year

In [22]:
# Check the dtypes of the two vote columns that the ranking formulas read.
dataset[['vote_count', 'vote_average']].dtypes

vote_count        int64
vote_average    float64
dtype: object

In [26]:
def process_dataset(df):
    """Return the rows and columns of ``df`` that the ranking functions need.

    Rows where ``vote_count`` or ``vote_average`` is null are dropped, and only
    the title, year, vote and genre columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'title', 'year', 'vote_count',
        'vote_average' and 'genres'.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    usecols_ = ['title', 'year', 'vote_count', 'vote_average', 'genres']

    # Operate on the argument. Reading the notebook-level `dataset` here would
    # silently ignore whatever frame the caller passed in.
    ok_rows = df['vote_count'].notnull() & df['vote_average'].notnull()

    # .copy() hands the caller an independent frame it can freely modify.
    return df.loc[ok_rows, usecols_].copy()

In [27]:
# Replace `dataset` with the reduced ranking frame and preview it.
# NOTE(review): this rebinds the name the earlier cells used, so the full
# metadata frame is no longer reachable afterwards without re-reading the CSV.
dataset  = process_dataset(dataset.copy())
dataset.head()

Unnamed: 0,title,year,vote_count,vote_average,genres
0,Toy Story,1995.0,5415,7.7,"[Animation, Comedy, Family]"
1,Jumanji,1995.0,2413,6.9,"[Adventure, Fantasy, Family]"
2,Grumpier Old Men,1995.0,92,6.5,"[Romance, Comedy]"
3,Waiting to Exhale,1995.0,34,6.1,"[Comedy, Drama, Romance]"
4,Father of the Bride Part II,1995.0,173,5.7,[Comedy]


In [28]:
def simple_formula(x):
    """Score a film by its average rating alone.

    ``x`` is any mapping-like row that can be indexed with 'vote_average';
    that value is returned unchanged.
    """
    return x['vote_average']

def recommend_simple(input_films, n_items):
    """Return the ``n_items`` films with the highest ``simple_formula`` score.

    The input frame is copied, so the caller's frame is left untouched. The
    result carries an extra 'formula' column with the score, is ordered from
    highest to lowest score, and has a fresh 0..n-1 index.
    """
    scored = input_films.copy()
    scored['formula'] = scored.apply(simple_formula, axis=1)
    ordered = scored.sort_values('formula', ascending=False)
    top = ordered.head(n_items)
    return top.reset_index(drop=True)

# Top 10 films when ranking by the raw average rating only.
recommend_simple(dataset, 10)

Unnamed: 0,title,year,vote_count,vote_average,genres,formula
0,Zig Zag Story,1983.0,1,10.0,"[Drama, Comedy]",10.0
1,Tall Story,1960.0,1,10.0,[Comedy],10.0
2,Birch Interval,1976.0,1,10.0,[Drama],10.0
3,Mad at the Moon,1992.0,1,10.0,[],10.0
4,The White Shadow,1924.0,1,10.0,[],10.0
5,Forever,2006.0,1,10.0,[Documentary],10.0
6,Chilly Scenes of Winter,1979.0,1,10.0,"[Comedy, Drama, Romance]",10.0
7,The Great Kidnapping,1973.0,1,10.0,[Crime],10.0
8,Backyard Dogs,2001.0,1,10.0,"[Action, Comedy]",10.0
9,"Oh, Bomb!",1964.0,1,10.0,"[Music, Comedy, Action, Crime]",10.0


In [29]:
def better_formula(x):
    """Score a film as (number of votes) * (average rating).

    ``x`` is any mapping-like row indexable with 'vote_count' and
    'vote_average'. The vote count is converted to ``float`` before the
    multiplication.
    """
    votes = float(x['vote_count'])
    rating = x['vote_average']
    return votes * rating

def recommend_better(input_films, n_items):
    """Return the ``n_items`` films with the highest ``better_formula`` score.

    Works on a copy of ``input_films``. The result has an added 'formula'
    column, is sorted by it in descending order, and is re-indexed from 0.
    """
    scored = input_films.copy()
    scored['formula'] = scored.apply(better_formula, axis=1)
    ordered = scored.sort_values('formula', ascending=False)
    top = ordered.head(n_items)
    return top.reset_index(drop=True)

# Top 10 films when ranking by vote_count * vote_average.
recommend_better(dataset, 10)

Unnamed: 0,title,year,vote_count,vote_average,genres,formula
0,Inception,2010.0,14075,8.1,"[Action, Thriller, Science Fiction, Mystery, A...",114007.5
1,The Dark Knight,2008.0,12269,8.3,"[Drama, Action, Crime, Thriller]",101832.7
2,Interstellar,2014.0,11187,8.1,"[Adventure, Drama, Science Fiction]",90614.7
3,The Avengers,2012.0,12000,7.4,"[Science Fiction, Action, Adventure]",88800.0
4,Avatar,2009.0,12114,7.2,"[Action, Adventure, Fantasy, Science Fiction]",87220.8
5,Deadpool,2016.0,11444,7.4,"[Action, Adventure, Comedy]",84685.6
6,Fight Club,1999.0,9678,8.3,[Drama],80327.4
7,Django Unchained,2012.0,10297,7.8,"[Drama, Western]",80316.6
8,Guardians of the Galaxy,2014.0,10014,7.9,"[Action, Science Fiction, Adventure]",79110.6
9,Pulp Fiction,1994.0,8670,8.3,"[Thriller, Crime]",71961.0


In [30]:
def the_best_formula(x, Q, C):
    """Blend a film's own rating with a reference rating ``C``.

    With v = float(x['vote_count']) and R = x['vote_average'], the result is

        v / (v + Q) * R  +  Q / (Q + v) * C

    so the two weights sum to one, and the weight of the film's own rating
    grows with its vote count relative to ``Q``.
    """
    v = float(x['vote_count'])
    R = x['vote_average']
    total = v + Q
    own_part = v / total * R
    reference_part = Q / total * C
    return own_part + reference_part

def recommend_best(input_films, n_items):
    """Return the ``n_items`` films with the highest ``the_best_formula`` score.

    The reference rating is the mean of 'vote_average' over all input films and
    the vote threshold is the 0.95 quantile of 'vote_count'. Both values are
    printed before scoring. The input frame is copied, not modified. The result
    has an added 'formula' column, is sorted by it in descending order, and is
    re-indexed from 0.
    """
    films = input_films.copy()

    mean_rating = films['vote_average'].mean()
    vote_threshold = films['vote_count'].quantile(0.95)
    print("Minimum number of votes: {}. Average overall rating: {}".format(int(vote_threshold), mean_rating))

    def score(row):
        return the_best_formula(row, vote_threshold, mean_rating)

    films['formula'] = films.apply(score, axis=1)
    ordered = films.sort_values('formula', ascending=False)
    return ordered.head(n_items).reset_index(drop=True)

# Top 10 films when ranking by the weighted rating.
recommend_best(dataset, 10)

Minimum number of votes: 433. Average overall rating: 5.618217011635541


Unnamed: 0,title,year,vote_count,vote_average,genres,formula
0,The Shawshank Redemption,1994.0,8358,8.5,"[Drama, Crime]",8.357778
1,The Godfather,1972.0,6024,8.5,"[Drama, Crime]",8.306376
2,The Dark Knight,2008.0,12269,8.3,"[Drama, Action, Crime, Thriller]",8.208397
3,Fight Club,1999.0,9678,8.3,[Drama],8.184925
4,Pulp Fiction,1994.0,8670,8.3,"[Thriller, Crime]",8.172184
5,Forrest Gump,1994.0,8147,8.2,"[Comedy, Drama, Romance]",8.06945
6,Schindler's List,1993.0,4436,8.3,"[Drama, History, War]",8.061058
7,Whiplash,2014.0,4376,8.3,[Drama],8.058077
8,Spirited Away,2001.0,3968,8.3,"[Fantasy, Adventure, Animation, Family]",8.035654
9,The Empire Strikes Back,1980.0,5998,8.2,"[Adventure, Action, Science Fiction]",8.025831


In [31]:
def recommend_by_genre(input_films, n_items, genres):
    """Return the best-scored films that carry *all* of the given genres.

    Scores come from ``the_best_formula``. Its reference rating (mean of
    'vote_average') and vote threshold (0.95 quantile of 'vote_count') are
    computed over every film in ``input_films``, before the genre filter is
    applied, and are printed.

    Parameters
    ----------
    input_films : pandas.DataFrame
        Frame with 'vote_count', 'vote_average' and a 'genres' column holding
        a collection of genre names per row. It is copied, not modified.
    n_items : int
        Maximum number of rows to return.
    genres : iterable of str
        Genre names a film must all have. An empty iterable matches every film.

    Returns
    -------
    pandas.DataFrame
        Matching films sorted by descending 'formula', re-indexed from 0.
    """
    rank = input_films.copy()

    C = rank.vote_average.mean()
    Q = rank.vote_count.quantile(0.95)
    print("Minimum number of votes: {}. Average overall rating: {}".format(int(Q),C))
    rank['formula'] = rank.apply(lambda x: the_best_formula(x, Q, C), axis=1)
    rank = rank.sort_values('formula', ascending=False)

    # Select with a boolean mask instead of adding a helper column and dropping
    # it in place from a filtered slice, which pandas flags as a modification
    # of a copy.
    has_all_genres = [all(genre in film_genres for genre in genres)
                      for film_genres in rank['genres']]
    rank = rank.loc[has_all_genres]

    return rank.head(n_items).reset_index(drop=True)

In [32]:
# Top 10 films tagged with both Adventure and Science Fiction.
recommend_by_genre(dataset, 10, genres=['Adventure', 'Science Fiction'])

Minimum number of votes: 433. Average overall rating: 5.618217011635541


Unnamed: 0,title,year,vote_count,vote_average,genres,formula
0,The Empire Strikes Back,1980.0,5998,8.2,"[Adventure, Action, Science Fiction]",8.025831
1,Inception,2010.0,14075,8.1,"[Action, Thriller, Science Fiction, Mystery, A...",8.02578
2,Interstellar,2014.0,11187,8.1,"[Adventure, Drama, Science Fiction]",8.007335
3,Star Wars,1977.0,6778,8.1,"[Adventure, Action, Science Fiction]",7.950685
4,Back to the Future,1985.0,6239,8.0,"[Adventure, Comedy, Science Fiction, Family]",7.845126
5,Guardians of the Galaxy,2014.0,10014,7.9,"[Action, Science Fiction, Adventure]",7.805238
6,Return of the Jedi,1983.0,4763,7.9,"[Adventure, Action, Science Fiction]",7.709489
7,2001: A Space Odyssey,1968.0,3075,7.9,"[Science Fiction, Mystery, Adventure]",7.617842
8,The Martian,2015.0,7442,7.6,"[Drama, Adventure, Science Fiction]",7.490819
9,Captain America: The Winter Soldier,2014.0,5881,7.6,"[Action, Adventure, Science Fiction]",7.463831


In [33]:
def recommend_by_genre_and_year(input_films, n_items, min_votes, genres, not_older_than):
    """Return the best-scored recent films that carry *any* of the given genres.

    Scores come from ``the_best_formula``. Its reference rating (mean of
    'vote_average') and vote threshold (0.95 quantile of 'vote_count') are
    computed over every film in ``input_films``, before any filter is applied,
    and are printed.

    Parameters
    ----------
    input_films : pandas.DataFrame
        Frame with 'vote_count', 'vote_average', 'year' and a 'genres' column
        holding a collection of genre names per row. It is copied, not modified.
    n_items : int
        Maximum number of rows to return.
    min_votes : number or None
        Films with fewer votes than this are excluded. ``None`` disables the
        vote filter.
    genres : iterable of str
        A film is kept when it has at least one of these genres.
    not_older_than : number
        Films whose 'year' is below this value (or missing) are excluded.

    Returns
    -------
    pandas.DataFrame
        Matching films sorted by descending 'formula', re-indexed from 0.
    """
    rank = input_films.copy()

    C = rank.vote_average.mean()
    Q = rank.vote_count.quantile(0.95)
    print("Minimum number of votes: {}. Average overall rating: {}".format(int(Q),C))
    rank['formula'] = rank.apply(lambda x: the_best_formula(x, Q, C), axis=1)
    rank = rank.sort_values('formula', ascending=False)

    # Boolean mask instead of a helper column that is later dropped in place
    # from a filtered slice.
    has_any_genre = [any(genre in film_genres for genre in genres)
                     for film_genres in rank['genres']]
    rank = rank.loc[has_any_genre]

    # A missing year compares as False and is therefore excluded.
    rank = rank[rank['year'] >= not_older_than]

    # Honour the min_votes argument, which the caller is required to supply.
    if min_votes is not None:
        rank = rank[rank['vote_count'] >= min_votes]

    return rank.head(n_items).reset_index(drop=True)

In [34]:
# Up to 15 films from 2015 onwards tagged with Adventure or Science Fiction.
recommend_by_genre_and_year(dataset, 15, 150, genres=['Adventure', 'Science Fiction'], not_older_than=2015)

Minimum number of votes: 433. Average overall rating: 5.618217011635541


Unnamed: 0,title,year,vote_count,vote_average,genres,formula
0,Zootopia,2016.0,4961,7.7,"[Animation, Adventure, Family, Comedy]",7.532567
1,The Martian,2015.0,7442,7.6,"[Drama, Adventure, Science Fiction]",7.490819
2,Logan,2017.0,6310,7.6,"[Action, Drama, Science Fiction]",7.472493
3,Kingsman: The Secret Service,2015.0,6069,7.6,"[Crime, Comedy, Action, Adventure]",7.467767
4,Ex Machina,2015.0,4862,7.6,"[Drama, Science Fiction]",7.43763
5,Guardians of the Galaxy Vol. 2,2017.0,4858,7.6,"[Action, Adventure, Comedy, Science Fiction]",7.437507
6,Captain Fantastic,2016.0,1569,7.9,"[Adventure, Comedy, Drama, Romance]",7.405684
7,Star Wars: The Force Awakens,2015.0,7993,7.5,"[Action, Adventure, Science Fiction, Fantasy]",7.403107
8,Deadpool,2016.0,11444,7.4,"[Action, Adventure, Comedy]",7.334911
9,Rogue One: A Star Wars Story,2016.0,5111,7.4,"[Action, Adventure, Science Fiction]",7.260572
