In [61]:
import pandas as pd
import ast

In [178]:
movie_md = pd.read_csv('dataset/movies_metadata.csv', low_memory=False)
movie_cr = pd.read_csv('dataset/credits.csv')
movie_kw = pd.read_csv('dataset/keywords.csv')
movie_ra = pd.read_csv('dataset/ratings_small.csv')
movie_lk = pd.read_csv('dataset/links_small.csv')

In [179]:
movie_md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [180]:
movie_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [181]:
movie_md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [182]:
# Selecting columns of interest
movie_md = movie_md[['genres', 'id', 'imdb_id', 'release_date', 'title', 'vote_average', 'vote_count', 'popularity', 'runtime']]

# extracting the genres
movie_md['genres'] = movie_md['genres'].apply(lambda x: [genre['name'] for genre in ast.literal_eval(x)])

# change date data type
movie_md['release_date'] = pd.to_datetime(movie_md['release_date'], errors='coerce')

# extract release year
movie_md['release_year'] = movie_md['release_date'].dt.year.fillna(0).astype('int')

movie_md.head()


Unnamed: 0,genres,id,imdb_id,release_date,title,vote_average,vote_count,popularity,runtime,release_year
0,"[Animation, Comedy, Family]",862,tt0114709,1995-10-30,Toy Story,7.7,5415.0,21.946943,81.0,1995
1,"[Adventure, Fantasy, Family]",8844,tt0113497,1995-12-15,Jumanji,6.9,2413.0,17.015539,104.0,1995
2,"[Romance, Comedy]",15602,tt0113228,1995-12-22,Grumpier Old Men,6.5,92.0,11.7129,101.0,1995
3,"[Comedy, Drama, Romance]",31357,tt0114885,1995-12-22,Waiting to Exhale,6.1,34.0,3.859495,127.0,1995
4,[Comedy],11862,tt0113041,1995-02-10,Father of the Bride Part II,5.7,173.0,8.387519,106.0,1995


POPULARITY

Weighted Rating (WR) = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$

where,

* v is the number of votes for the movie
* m is the minimum votes required to be listed in the chart
* R is the average rating of the movie
* C is the mean vote across the whole report

Let's use 90th percentile as our cutoff point for m.

In [183]:
m = movie_md['vote_count'].quantile(0.9)
m

160.0

In [184]:
C = movie_md['vote_average'].mean()
C

5.618207215133889

In [185]:
# Function to calculate WR
def WR(data, m=m, C=C):
    v = data['vote_count']
    R = data['vote_average']
    wr = ((v/(v+m)*R) + (m/(v+m)*C)).round(2)
    return wr

In [188]:
# Function to return top overall movies
def top_x_movie(data, m=m, val=100):
    filterd_movie_md = data[data['vote_count'] >= m].copy()
    filterd_movie_md['wr'] = filterd_movie_md.apply(WR, 1)
    top_x = filterd_movie_md.sort_values('wr', ascending=False).loc[:, 'title':'wr'].head(val).reset_index(drop=True)
    return top_x

In [189]:
top_x_movie(movie_md, val=20)

Unnamed: 0,title,vote_average,vote_count,popularity,runtime,release_year,wr
0,The Shawshank Redemption,8.5,8358.0,51.645403,142.0,1994,8.45
1,The Godfather,8.5,6024.0,41.109264,175.0,1972,8.43
2,Dilwale Dulhania Le Jayenge,9.1,661.0,34.457024,190.0,1995,8.42
3,The Dark Knight,8.3,12269.0,123.167259,152.0,2008,8.27
4,Fight Club,8.3,9678.0,63.869599,139.0,1999,8.26
5,Pulp Fiction,8.3,8670.0,140.950236,154.0,1994,8.25
6,Whiplash,8.3,4376.0,64.29999,105.0,2014,8.21
7,Schindler's List,8.3,4436.0,41.725123,195.0,1993,8.21
8,Spirited Away,8.3,3968.0,41.048867,125.0,2001,8.2
9,Life Is Beautiful,8.3,3643.0,39.39497,116.0,1997,8.19


In [112]:
filterd_movie_md = movie_md[movie_md['vote_count'] >= m].copy()
filterd_movie_md['wr'] = filterd_movie_md.apply(WR, 1)
filterd_movie_md.head()

Unnamed: 0,genres,id,imdb_id,popularity,release_date,runtime,title,vote_average,vote_count,release_year,wr
0,"[Animation, Comedy, Family]",862,tt0114709,21.946943,1995-10-30,81.0,Toy Story,7.7,5415.0,1995,7.64
1,"[Adventure, Fantasy, Family]",8844,tt0113497,17.015539,1995-12-15,104.0,Jumanji,6.9,2413.0,1995,6.82
4,[Comedy],11862,tt0113041,8.387519,1995-02-10,106.0,Father of the Bride Part II,5.7,173.0,1995,5.66
5,"[Action, Crime, Drama, Thriller]",949,tt0113277,17.924927,1995-12-15,170.0,Heat,7.7,1886.0,1995,7.54
8,"[Action, Adventure, Thriller]",9091,tt0114576,5.23158,1995-12-22,106.0,Sudden Death,5.5,174.0,1995,5.56


Overall Top 100 Movies

In [168]:
top_100 = filterd_movie_md.sort_values('wr', ascending=False)[['title', 'genres', 'release_year', 'runtime', 'popularity', 'wr']].head(100)
top_100.reset_index(drop=True)

Unnamed: 0,title,genres,release_year,runtime,popularity,wr
0,The Shawshank Redemption,"[Drama, Crime]",1994,142.0,51.645403,8.45
1,The Godfather,"[Drama, Crime]",1972,175.0,41.109264,8.43
2,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]",1995,190.0,34.457024,8.42
3,The Dark Knight,"[Drama, Action, Crime, Thriller]",2008,152.0,123.167259,8.27
4,Fight Club,[Drama],1999,139.0,63.869599,8.26
...,...,...,...,...,...,...
95,Vertigo,"[Mystery, Romance, Thriller]",1958,128.0,18.20822,7.71
96,The Theory of Everything,"[Drama, Romance]",2014,123.0,11.85302,7.70
97,Amélie,"[Comedy, Romance]",2001,122.0,12.879381,7.70
98,Gran Torino,[Drama],2008,116.0,14.794228,7.70


In [154]:
gn_movie_md = movie_md.explode('genres')
gn_movie_md

Unnamed: 0,genres,id,imdb_id,popularity,release_date,runtime,title,vote_average,vote_count,release_year
0,Animation,862,tt0114709,21.946943,1995-10-30,81.0,Toy Story,7.7,5415.0,1995
0,Comedy,862,tt0114709,21.946943,1995-10-30,81.0,Toy Story,7.7,5415.0,1995
0,Family,862,tt0114709,21.946943,1995-10-30,81.0,Toy Story,7.7,5415.0,1995
1,Adventure,8844,tt0113497,17.015539,1995-12-15,104.0,Jumanji,6.9,2413.0,1995
1,Fantasy,8844,tt0113497,17.015539,1995-12-15,104.0,Jumanji,6.9,2413.0,1995
...,...,...,...,...,...,...,...,...,...,...
45463,Action,67758,tt0303758,0.903007,2003-08-01,90.0,Betrayal,3.8,6.0,2003
45463,Drama,67758,tt0303758,0.903007,2003-08-01,90.0,Betrayal,3.8,6.0,2003
45463,Thriller,67758,tt0303758,0.903007,2003-08-01,90.0,Betrayal,3.8,6.0,2003
45464,,227506,tt0008536,0.003503,1917-10-21,87.0,Satan Triumphant,0.0,0.0,1917


In [150]:
se = movie_md.head(100)

In [147]:
sd = se.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
sd.name = 'genre'
se.join(sd)

  sd = se.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)


In [153]:
se.explode('genres')

Unnamed: 0,genres,id,imdb_id,popularity,release_date,runtime,title,vote_average,vote_count,release_year
0,Animation,862,tt0114709,21.946943,1995-10-30,81.0,Toy Story,7.7,5415.0,1995
0,Comedy,862,tt0114709,21.946943,1995-10-30,81.0,Toy Story,7.7,5415.0,1995
0,Family,862,tt0114709,21.946943,1995-10-30,81.0,Toy Story,7.7,5415.0,1995
1,Adventure,8844,tt0113497,17.015539,1995-12-15,104.0,Jumanji,6.9,2413.0,1995
1,Fantasy,8844,tt0113497,17.015539,1995-12-15,104.0,Jumanji,6.9,2413.0,1995
...,...,...,...,...,...,...,...,...,...,...
98,Drama,11062,tt0115907,4.807223,1996-02-16,111.0,City Hall,6.0,67.0,1996
98,Thriller,11062,tt0115907,4.807223,1996-02-16,111.0,City Hall,6.0,67.0,1996
99,Comedy,13685,tt0115734,6.904831,1996-02-21,91.0,Bottle Rocket,6.8,285.0,1996
99,Crime,13685,tt0115734,6.904831,1996-02-21,91.0,Bottle Rocket,6.8,285.0,1996


In [151]:
import ast

# Convert string representation of 'genres' column to a list of dictionaries
se['genres'] = se['genres'].apply(ast.literal_eval)

# Extract genre names and explode the DataFrame
se = se.assign(genre=se['genres'].apply(lambda x: [genre['name'] for genre in x])).explode('genre')


TypeError: string indices must be integers