In [2]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('data/movies_metadata.csv', low_memory=False)

In [6]:
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [30]:
# only 20 percent of the movies have more than 50 votes
m = df['vote_count'].quantile(0.95)
m

434.0

In [31]:
q_movies = df[(df['runtime'] >= 45) & (df['runtime'] <= 300)]
q_movies = q_movies[q_movies['vote_count'] >= m]
q_movies.shape

(2269, 24)

In [32]:
C = df['vote_average'].mean()
C

5.618207215134185

In [33]:
# Function to compute the IMDB weighted rating for each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Compute the weighted score
    return (v/(v+m) * R) + (m/(m+v) * C)

In [34]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [35]:
q_movies.sort_values(by='score', ascending=False)[['title','vote_count', 'vote_average', 'score', 'runtime']]

Unnamed: 0,title,vote_count,vote_average,score,runtime
314,The Shawshank Redemption,8358.0,8.5,8.357746,142.0
834,The Godfather,6024.0,8.5,8.306334,175.0
12481,The Dark Knight,12269.0,8.3,8.208376,152.0
2843,Fight Club,9678.0,8.3,8.184899,139.0
292,Pulp Fiction,8670.0,8.3,8.172155,154.0
...,...,...,...,...,...
21238,Sharknado,484.0,3.8,4.659588,86.0
26563,Fantastic Four,2322.0,4.4,4.591837,100.0
28207,The Boy Next Door,1034.0,4.1,4.548843,91.0
1491,Batman & Robin,1447.0,4.2,4.527221,125.0


In [72]:
new_df = df.copy()[['title','genres', 'release_date', 'runtime', 'vote_average','vote_count']]
new_df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [73]:
#Convert release_date into pandas datetime format
new_df['release_date'] = pd.to_datetime(new_df['release_date'], errors='coerce')
#Extract year from the datetime
new_df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [74]:
#Helper function to convert NaT to 0 and all other years to integers.
def convert_int(x):
    try:
        return int(x)
    except:
        return 0
#Apply convert_int to the year feature
new_df['year'] = new_df['year'].apply(convert_int)

In [75]:
#Drop the release_date column
new_df = new_df.drop('release_date', axis=1)
#Display the dataframe
new_df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [76]:
from ast import literal_eval
#Convert all NaN into stringified empty lists
new_df['genres'] = new_df['genres'].fillna('[]')
#Apply literal_eval to convert to the list object
new_df['genres'] = new_df['genres'].apply(literal_eval)

In [77]:
new_df['genres'] = new_df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
new_df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995


In [81]:
s = new_df.apply(lambda x: pd.Series(x['genres'], dtype='object'),axis=1).stack().reset_index(level=1, drop=True)
s

0        Animation
0           Comedy
0           Family
1        Adventure
1          Fantasy
           ...    
45461       Family
45462        Drama
45463       Action
45463        Drama
45463     Thriller
Length: 91106, dtype: object

In [82]:
s.name = 'genre'

In [83]:
gen_df = new_df.drop('genres', axis=1).join(s)
gen_df.head()

Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,Animation
0,Toy Story,81.0,7.7,5415.0,1995,Comedy
0,Toy Story,81.0,7.7,5415.0,1995,Family
1,Jumanji,104.0,6.9,2413.0,1995,Adventure
1,Jumanji,104.0,6.9,2413.0,1995,Fantasy


In [87]:
def build_chart(gen_df, percentile=0.8):
    #Ask for preferred genres
    print("Input preferred genre")
    genre = input()
    #Ask for lower limit of duration
    print("Input shortest duration")
    low_time = int(input())
    #Ask for upper limit of duration
    print("Input longest duration")
    high_time = int(input())
    #Ask for lower limit of timeline
    print("Input earliest year")
    low_year = int(input())
    #Ask for upper limit of timeline
    print("Input latest year")
    high_year = int(input())
    #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies
    movies = gen_df.copy()
    #Filter based on the condition
    movies = movies[(movies['genre'] == genre) & (movies['runtime'] >= low_time) & (movies['runtime'] <= high_time) & (movies['year'] >= low_year) & (movies['year'] <= high_year)]
    #Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies
    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    #Calculate score using the IMDB formula
    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C),axis=1)
    #Sort movies in descending order of their scores
    q_movies = q_movies.sort_values('score', ascending=False)
    return q_movies

In [88]:
build_chart(gen_df).head()

Input preferred genre
animation
Input shortest duration
30
Input longest duration
120
Input earliest year
1990
Input latest year
2005


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
