In [61]:
import pandas as pd
import ast

In [246]:
movie_md = pd.read_csv('dataset/movies_metadata.csv', low_memory=False)
movie_cr = pd.read_csv('dataset/credits.csv')
movie_kw = pd.read_csv('dataset/keywords.csv')
movie_ra = pd.read_csv('dataset/ratings_small.csv')
movie_lk = pd.read_csv('dataset/links_small.csv')

In [247]:
movie_md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [248]:
movie_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [249]:
movie_md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [250]:
# Selecting columns of interest
movie_md = movie_md[['genres', 'id', 'imdb_id', 'release_date', 'title', 'vote_average', 'vote_count', 'popularity', 'runtime']]

# extracting the genres
movie_md['genres'] = movie_md['genres'].apply(lambda x: [genre['name'] for genre in ast.literal_eval(x)])

# change date data type
movie_md['release_date'] = pd.to_datetime(movie_md['release_date'], errors='coerce')
# dropped invalid id
movie_md = movie_md.drop([19730, 29503, 35587])
movie_md['id'] = movie_md['id'].astype('int')

# extract release year
movie_md['release_year'] = movie_md['release_date'].dt.year.fillna(0).astype('int')

movie_md.head()


Unnamed: 0,genres,id,imdb_id,release_date,title,vote_average,vote_count,popularity,runtime,release_year
0,"[Animation, Comedy, Family]",862,tt0114709,1995-10-30,Toy Story,7.7,5415.0,21.946943,81.0,1995
1,"[Adventure, Fantasy, Family]",8844,tt0113497,1995-12-15,Jumanji,6.9,2413.0,17.015539,104.0,1995
2,"[Romance, Comedy]",15602,tt0113228,1995-12-22,Grumpier Old Men,6.5,92.0,11.7129,101.0,1995
3,"[Comedy, Drama, Romance]",31357,tt0114885,1995-12-22,Waiting to Exhale,6.1,34.0,3.859495,127.0,1995
4,[Comedy],11862,tt0113041,1995-02-10,Father of the Bride Part II,5.7,173.0,8.387519,106.0,1995


## POPULARITY BASED

Weighted Rating (WR) = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$

where,

* v is the number of votes for the movie
* m is the minimum votes required to be listed in the chart
* R is the average rating of the movie
* C is the mean vote across the whole report

Let's use 90th percentile as our cutoff point for m.

In [183]:
m = movie_md['vote_count'].quantile(0.9)
m

160.0

In [184]:
C = movie_md['vote_average'].mean()
C

5.618207215133889

In [185]:
# Function to calculate WR
def WR(data, m=m, C=C):
    v = data['vote_count']
    R = data['vote_average']
    wr = ((v/(v+m)*R) + (m/(v+m)*C)).round(2)
    return wr

In [199]:
# Function to return top overall movies
def top_x_movie(data, m=m, val=100):
    filterd_movie_md = data[data['vote_count'] >= m].copy()
    filterd_movie_md['wr'] = filterd_movie_md.apply(WR, 1)
    top_x = filterd_movie_md.sort_values('wr', ascending=False).loc[:, 'title':'wr'].head(val).reset_index(drop=True)
    return top_x

### Overall Top 10 Movies

In [200]:
top_x_movie(movie_md, val=10)

Unnamed: 0,title,vote_average,vote_count,popularity,runtime,release_year,wr
0,The Shawshank Redemption,8.5,8358.0,51.645403,142.0,1994,8.45
1,The Godfather,8.5,6024.0,41.109264,175.0,1972,8.43
2,Dilwale Dulhania Le Jayenge,9.1,661.0,34.457024,190.0,1995,8.42
3,The Dark Knight,8.3,12269.0,123.167259,152.0,2008,8.27
4,Fight Club,8.3,9678.0,63.869599,139.0,1999,8.26
5,Pulp Fiction,8.3,8670.0,140.950236,154.0,1994,8.25
6,Whiplash,8.3,4376.0,64.29999,105.0,2014,8.21
7,Schindler's List,8.3,4436.0,41.725123,195.0,1993,8.21
8,Spirited Away,8.3,3968.0,41.048867,125.0,2001,8.2
9,Life Is Beautiful,8.3,3643.0,39.39497,116.0,1997,8.19


### Top Movies by Genres

In [195]:
gn_movie_md = movie_md.explode('genres')
gn_movie_md.genres.unique()

array(['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance',
       'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History',
       'Science Fiction', 'Mystery', 'War', 'Foreign', nan, 'Music',
       'Documentary', 'Western', 'TV Movie', 'Carousel Productions',
       'Vision View Entertainment', 'Telescene Film Group Productions',
       'Aniplex', 'GoHands', 'BROSTA TV',
       'Mardock Scramble Production Committee', 'Sentai Filmworks',
       'Odyssey Media', 'Pulser Productions', 'Rogue State', 'The Cartel'],
      dtype=object)

In [197]:
# Function to return top genre movies
def top_x_genre_movie(data, genre='Drama', m=m, val=100):
    data = data[data['genres'] == genre]
    filterd_movie_md = data[data['vote_count'] >= m].copy()
    filterd_movie_md['wr'] = filterd_movie_md.apply(WR, axis=1)
    top_x = filterd_movie_md.sort_values('wr', ascending=False).loc[:, 'title':'wr'].head(val).reset_index(drop=True)
    return top_x

In [220]:
top_x_genre_movie(gn_movie_md, genre='Documentary', val=10)

Unnamed: 0,title,vote_average,vote_count,popularity,runtime,release_year,wr
0,Citizenfour,7.9,516.0,6.584217,114.0,2014,7.36
1,Blackfish,7.9,456.0,5.495138,83.0,2013,7.31
2,Planet Earth,8.8,176.0,4.501137,550.0,2006,7.28
3,Senna,8.1,282.0,11.331859,106.0,2010,7.2
4,Searching for Sugar Man,7.9,286.0,6.245191,85.0,2012,7.08
5,Amy,7.5,482.0,11.12024,128.0,2015,7.03
6,The Cove,7.9,241.0,12.492389,92.0,2009,6.99
7,Going Clear: Scientology and the Prison of Belief,7.7,306.0,5.887113,119.0,2015,6.99
8,One Direction: This Is Us,8.0,209.0,5.988929,92.0,2013,6.97
9,Inside Job,7.7,287.0,10.30535,109.0,2010,6.95


In [225]:
movie_cr

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [245]:
movie_md[movie_md.id.str.contains('-')]

Unnamed: 0,genres,id,imdb_id,release_date,title,vote_average,vote_count,popularity,runtime
19730,"[Carousel Productions, Vision View Entertainme...",1997-08-20,0,NaT,,,,,
29503,"[Aniplex, GoHands, BROSTA TV, Mardock Scramble...",2012-09-29,0,NaT,,,,,
35587,"[Odyssey Media, Pulser Productions, Rogue Stat...",2014-01-01,0,NaT,,,,Beware Of Frost Bites,


In [233]:
movie_md.merge(movie_cr, on='id')

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat