In [1]:
import os
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Processing movies upto year 2016

In [16]:
DATA_PATH = ".\data\movie_metadata_2016.csv"
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [17]:
data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [18]:
data.shape

(5043, 28)

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-

In [20]:
sorted(data["title_year"].unique(), reverse = True)[1]

2016.0

In [21]:
required_columns = ['movie_title', 'genres', 'director_name','actor_1_name','actor_2_name','actor_3_name']
data = data[required_columns]

In [22]:
data.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Avatar,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport
2,Spectre,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman
3,The Dark Knight Rises,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt
4,Star Wars: Episode VII - The Force Awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,


In [23]:
data.shape

(5043, 6)

In [24]:
data.isnull().sum()

movie_title        0
genres             0
director_name    104
actor_1_name       7
actor_2_name      13
actor_3_name      23
dtype: int64

In [25]:
for column in required_columns[2:]:
    data[column] = data[column].replace(np.nan, "unknown")

In [26]:
data.isnull().sum()

movie_title      0
genres           0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
dtype: int64

In [27]:
data["genres"] = data["genres"].str.replace("|", " ")

In [29]:
all_movies = list(data["movie_title"].apply(lambda x: x[:-1]).values)
all_movies[:5]

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'Star Wars: Episode VII - The Force Awakens\xa0           ']

In [30]:
data['movie_title'] = data['movie_title'].str.lower()

In [31]:
data.head(3)

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman


In [32]:
data['movie_title'][1], data['movie_title'][1][-1]

("pirates of the caribbean: at world's end\xa0", '\xa0')

In [33]:
data["movie_title"] = data["movie_title"].apply(lambda x: x[:-1])

In [34]:
data['movie_title'][1]

"pirates of the caribbean: at world's end"

In [35]:
df_2016 = data.copy()
df_2016.head(2)

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport


## Processing movies upto year 2017

In [36]:
CREDITS_DATA_PATH = "data/credits_2017.csv"
METADATA_PATH = "data/movies_metadata_2017.csv"

In [37]:
credits_data = pd.read_csv(CREDITS_DATA_PATH)
data  = pd.read_csv(METADATA_PATH)

  interactivity=interactivity, compiler=compiler, result=result)


In [38]:
credits_data.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [39]:
data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [40]:
data["release_date"] = pd.to_datetime(data["release_date"], errors="coerce")
data["year"] = data["release_date"].dt.year

In [41]:
data = data.loc[data.year == 2017, ['id','title','genres','year']]
data.head()

Unnamed: 0,id,title,genres,year
26560,166426,Pirates of the Caribbean: Dead Men Tell No Tales,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",2017.0
26561,141052,Justice League,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2017.0
26565,284053,Thor: Ragnarok,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2017.0
26566,283995,Guardians of the Galaxy Vol. 2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2017.0
30536,245842,The King's Daughter,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",2017.0


In [42]:
data["id"] = data["id"].astype(int)

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 532 entries, 26560 to 45465
Data columns (total 4 columns):
id        532 non-null int32
title     532 non-null object
genres    532 non-null object
year      532 non-null float64
dtypes: float64(1), int32(1), object(2)
memory usage: 18.7+ KB


In [44]:
data.shape

(532, 4)

In [45]:
data = pd.merge(data, credits_data, on="id")

In [46]:
data.head()

Unnamed: 0,id,title,genres,year,cast,crew
0,166426,Pirates of the Caribbean: Dead Men Tell No Tales,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",2017.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de..."
1,141052,Justice League,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2017.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de..."
2,284053,Thor: Ragnarok,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2017.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de..."
3,283995,Guardians of the Galaxy Vol. 2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2017.0,"[{'cast_id': 3, 'character': 'Peter Quill / St...","[{'credit_id': '59171547925141583c0315a6', 'de..."
4,245842,The King's Daughter,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",2017.0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de..."


In [47]:
data.shape

(531, 6)

In [48]:
data['genres'] = data['genres'].map(lambda x: ast.literal_eval(x))
data['cast'] = data['cast'].map(lambda x: ast.literal_eval(x))
data['crew'] = data['crew'].map(lambda x: ast.literal_eval(x))

In [49]:
def genre_generate(g):
    genres = []
    for i in g:
        gen = i.get("name")
        if gen == 'Science Fiction':
            genres.append('Sci-Fi')
        else:
            genres.append(gen)
    if genres == []:
        return np.nan
    else:
        return " ".join(genres)

In [50]:
data["genres"] = data["genres"].map(genre_generate)

In [51]:
data.head()

Unnamed: 0,id,title,genres,year,cast,crew
0,166426,Pirates of the Caribbean: Dead Men Tell No Tales,Adventure Action Fantasy Comedy,2017.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de..."
1,141052,Justice League,Action Adventure Fantasy Sci-Fi,2017.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de..."
2,284053,Thor: Ragnarok,Action Adventure Fantasy Sci-Fi,2017.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de..."
3,283995,Guardians of the Galaxy Vol. 2,Action Adventure Comedy Sci-Fi,2017.0,"[{'cast_id': 3, 'character': 'Peter Quill / St...","[{'credit_id': '59171547925141583c0315a6', 'de..."
4,245842,The King's Daughter,Fantasy Action Adventure,2017.0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de..."


In [52]:
def get_cast(x, actor_index = None, director = False):
    if director:
        direct = []
        for i in x:
            if i.get('job') == "Director":
                direct.append(i.get("name"))
        if direct == []:
            return np.nan
        else:
            return " ".join(direct)
    else:
        casts = []
        for i in x:
            casts.append(i.get('name'))
        if casts == []:
            return np.nan
        else:
            index = actor_index-1
            if index == 0:
                return (casts[index])
            else:
                if len(casts)<actor_index:
                    return np.nan
                else:
                    return casts[index]

In [53]:
data["actor_1_name"] = data["cast"].map(lambda x: get_cast(x, actor_index = 1))
data["actor_2_name"] = data["cast"].map(lambda x: get_cast(x, actor_index = 2))
data["actor_3_name"] = data["cast"].map(lambda x: get_cast(x, actor_index = 3))
data["director_name"] = data['crew'].map(lambda x: get_cast(x, director=True))

In [54]:
data.head()

Unnamed: 0,id,title,genres,year,cast,crew,actor_1_name,actor_2_name,actor_3_name,director_name
0,166426,Pirates of the Caribbean: Dead Men Tell No Tales,Adventure Action Fantasy Comedy,2017.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de...",Johnny Depp,Javier Bardem,Geoffrey Rush,Joachim Rønning Espen Sandberg
1,141052,Justice League,Action Adventure Fantasy Sci-Fi,2017.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de...",Ben Affleck,Henry Cavill,Gal Gadot,Zack Snyder
2,284053,Thor: Ragnarok,Action Adventure Fantasy Sci-Fi,2017.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de...",Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Taika Waititi
3,283995,Guardians of the Galaxy Vol. 2,Action Adventure Comedy Sci-Fi,2017.0,"[{'cast_id': 3, 'character': 'Peter Quill / St...","[{'credit_id': '59171547925141583c0315a6', 'de...",Chris Pratt,Zoe Saldana,Dave Bautista,James Gunn
4,245842,The King's Daughter,Fantasy Action Adventure,2017.0,"[{'cast_id': 0, 'character': 'King Louis XIV',...","[{'credit_id': '5431de49c3a36825d300007e', 'de...",Pierce Brosnan,William Hurt,Benjamin Walker,Sean McNamara


In [55]:
data.columns

Index(['id', 'title', 'genres', 'year', 'cast', 'crew', 'actor_1_name',
       'actor_2_name', 'actor_3_name', 'director_name'],
      dtype='object')

In [56]:
data.rename(columns={
    'title':'movie_title'
}, inplace=True)

In [57]:
required_columns

['movie_title',
 'genres',
 'director_name',
 'actor_1_name',
 'actor_2_name',
 'actor_3_name']

In [58]:
data = data[required_columns]

In [59]:
data.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Pirates of the Caribbean: Dead Men Tell No Tales,Adventure Action Fantasy Comedy,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush
1,Justice League,Action Adventure Fantasy Sci-Fi,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot
2,Thor: Ragnarok,Action Adventure Fantasy Sci-Fi,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett
3,Guardians of the Galaxy Vol. 2,Action Adventure Comedy Sci-Fi,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista
4,The King's Daughter,Fantasy Action Adventure,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker


In [60]:
data.isnull().sum()

movie_title       0
genres            7
director_name     4
actor_1_name     22
actor_2_name     55
actor_3_name     70
dtype: int64

In [61]:
data.dropna(inplace=True)

In [62]:
data.isnull().sum()

movie_title      0
genres           0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
dtype: int64

In [63]:
data.shape

(458, 6)

In [64]:
print(len(all_movies))
all_movies+=list(data["movie_title"])
print(len(all_movies))

5043
5501


In [65]:
data["movie_title"] = data["movie_title"].str.lower()

In [66]:
df_2017 = data.copy()
df_2017.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,pirates of the caribbean: dead men tell no tales,Adventure Action Fantasy Comedy,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush
1,justice league,Action Adventure Fantasy Sci-Fi,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot
2,thor: ragnarok,Action Adventure Fantasy Sci-Fi,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett
3,guardians of the galaxy vol. 2,Action Adventure Comedy Sci-Fi,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista
4,the king's daughter,Fantasy Action Adventure,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker


## Processing movies upto year 2018,2019,2020

In [68]:
from tmdbv3api import TMDb
import json
import requests
from tmdbv3api import Movie


In [69]:
tmdb = TMDb()
tmdb.api_key = os.getenv('TMDB_API_KEY')
from dotenv import load_dotenv
load_dotenv()


In [70]:
WIKI_LINK_2018 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2018"
WIKI_LINK_2019 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2019"
WIKI_LINK_2020 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"

In [80]:
class CreateData:
    def __init__(self, wiki_link):
        self.wiki_link = wiki_link
        self.tmdb_movie = Movie()
        self.tmdb_movie_link = "https://api.themoviedb.org/3/movie/"
    
    def __get_genre(self, x):
        genres = []
        result = self.tmdb_movie.search(x)
        if not result:
            return np.nan
        movie_id = result[0].id
        response = requests.get(self.tmdb_movie_link+"{}?api_key={}".format(movie_id, tmdb.api_key))
        data_json = response.json()
        gen = data_json["genres"]
        if gen:
            for i in range(len(gen)):
                genres.append(gen[i]['name'])
            return ' '.join(genres)
        else:
            return np.nan
    
    def __get_actor_director(self, x, actor_index = None, director = False):
        if director:
            if " (director)" in x:
                return x.split(" (director)")[0]
            elif " (directors)" in x:
                return x.split(" (directors)")[0]
            else:
                return x.split(" (director/screenplay)")[0]
        else:
            actors = (x.split("screenplay); ")[-1]).split(", ")
            index = actor_index - 1
            if index == 0:
                return actors[0]
            else:
                if len(actors) < actor_index:
                    return np.nan
                else:
                    return actors[index]

    def _create_dataframe(self):
        WEB_LINK = self.wiki_link
        df1 = pd.read_html(WEB_LINK, header=0)[2]
        df2 = pd.read_html(WEB_LINK, header=0)[3]
        df3 = pd.read_html(WEB_LINK, header=0)[4]
        df4 = pd.read_html(WEB_LINK, header=0)[5]
        df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
        df['genres'] = df['Title'].map(lambda x: self.__get_genre(str(x)))
        df = df[['Title','Cast and crew','genres']]
        df["director_name"] = df["Cast and crew"].map(lambda x: self.__get_actor_director(x, director = True))
        for i in range(1,4):
            df[f"actor_{i}_name"] = df["Cast and crew"].map(lambda x: self.__get_actor_director(x, actor_index=i))
            df[f"actor_{i}_name"] = df[f"actor_{i}_name"].replace(np.nan, "unknown")
        df.rename(columns = {"Title":"movie_title"}, inplace = True)
        required_columns = ['movie_title', 'genres', 'director_name', 'actor_1_name', 'actor_2_name','actor_3_name']
        df = df[required_columns]
        df["movie_title"] = df["movie_title"].str.lower()
        return df

In [81]:
temp = CreateData(WIKI_LINK_2018)
df_2018 = temp._create_dataframe()
df_2018.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Insidious: The Last Key,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell
1,The Strange Ones,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus
2,Stratton,Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan
3,Sweet Country,Drama History Western,Warwick Thornton,Bryan Brown,Sam Neill,unknown
4,The Commuter,Action Thriller,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson


In [82]:
temp = CreateData(WIKI_LINK_2019)
df_2019 = temp._create_dataframe()
df_2019.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Escape Room,Thriller Action Mystery Adventure Horror,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll
1,Rust Creek,Thriller Drama,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan
2,American Hangman,Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis
3,A Dog's Way Home,Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp
4,The Upside,Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman


In [83]:
df_2018.isnull().sum()

movie_title      0
genres           3
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
dtype: int64

In [84]:
df_2019.isnull().sum()

movie_title      0
genres           0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
dtype: int64

In [85]:
import bs4 as bs
import urllib.request

In [86]:
class CreateData1:
    def __init__(self, wiki_link):
        self.wiki_link = wiki_link
        self.tmdb_movie = Movie()
        self.tmdb_movie_link = "https://api.themoviedb.org/3/movie/"
    
    def __get_genre(self, x):
        genres = []
        result = self.tmdb_movie.search(x)
        if not result:
            return np.nan
        movie_id = result[0].id
        response = requests.get(self.tmdb_movie_link+"{}?api_key={}".format(movie_id, tmdb.api_key))
        data_json = response.json()
        gen = data_json["genres"]
        if gen:
            for i in range(len(gen)):
                genres.append(gen[i]['name'])
            return ' '.join(genres)
        else:
            return np.nan
    
    def __get_actor_director(self, x, actor_index = None, director = False):
        if director:
            if " (director)" in x:
                return x.split(" (director)")[0]
            elif " (directors)" in x:
                return x.split(" (directors)")[0]
            else:
                return x.split(" (director/screenplay)")[0]
        else:
            actors = (x.split("screenplay); ")[-1]).split(", ")
            index = actor_index - 1
            if index == 0:
                return actors[0]
            else:
                if len(actors) < actor_index:
                    return np.nan
                else:
                    return actors[index]

    def _create_dataframe(self):
        WEB_LINK = self.wiki_link
        source = urllib.request.urlopen(WEB_LINK).read()
        soup = bs.BeautifulSoup(source,'lxml')
        tables = soup.find_all('table',class_='wikitable sortable')
        df1 = pd.read_html(str(tables[0]))[0]
        df2 = pd.read_html(str(tables[1]))[0]
        df3 = pd.read_html(str(tables[2]))[0]
        df4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'))[0] # avoided "ValueError: invalid literal for int() with base 10: '1"'
        df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
        df['genres'] = df['Title'].map(lambda x: self.__get_genre(str(x)))
        df = df[['Title','Cast and crew','genres']]
        df["director_name"] = df["Cast and crew"].map(lambda x: self.__get_actor_director(x, director = True))
        for i in range(1,4):
            df[f"actor_{i}_name"] = df["Cast and crew"].map(lambda x: self.__get_actor_director(x, actor_index=i))
            df[f"actor_{i}_name"] = df[f"actor_{i}_name"].replace(np.nan, "unknown")
        df.rename(columns = {"Title":"movie_title"}, inplace = True)
        required_columns = ['movie_title', 'genres', 'director_name', 'actor_1_name', 'actor_2_name','actor_3_name']
        df = df[required_columns]
        df["movie_title"] = df["movie_title"].str.lower()
        return df

In [87]:
temp = CreateData1(WIKI_LINK_2020)
df_2020 = temp._create_dataframe()
df_2020.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,The Grudge,Horror Mystery Thriller,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,Underwater,Action Horror Science Fiction Thriller,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,Like a Boss,Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,Three Christs,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins
4,Inherit the Viper,Drama Thriller Crime,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs


In [88]:
df_2020.isnull().sum()

movie_title      0
genres           1
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
dtype: int64

In [89]:
df_2020.shape

(272, 6)

In [93]:
print(len(all_movies))
new=list(df_2018["movie_title"])+list(df_2019["movie_title"])+list(df_2020["movie_title"])
all_movies+=new
print(len(all_movies))

5501
6284


## Final Dataset Preparation

In [96]:
final_df = df_2016.append(df_2017.append(df_2018.append(df_2019.append(df_2020, ignore_index=True),ignore_index=True),ignore_index=True),ignore_index=True)
final_df.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown


In [173]:
final_df.shape

(6284, 6)

In [174]:
final_df.isnull().sum()

movie_title      0
genres           4
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
dtype: int64

In [175]:
final_df.dropna(inplace=True)


In [176]:
final_df.isnull().sum()


movie_title      0
genres           0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
dtype: int64

In [177]:
final_df["movie_feature"] = final_df["actor_1_name"] + " " + final_df["actor_2_name"] + \
    " " + final_df["actor_3_name"] + " " + \
    final_df["director_name"] + " " + final_df["genres"]


In [178]:
final_df.head()


Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,movie_feature
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,CCH Pounder Joel David Moore Wes Studi James C...
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown,Doug Walker Rob Walker unknown Doug Walker Doc...


In [179]:
final_df.shape


(6280, 7)

In [180]:
final_data = final_df.drop_duplicates(
  subset = ['movie_title'],
  keep = 'last').reset_index(drop = True)
final_data.shape


(6113, 7)

In [181]:
final_df["movie_title"] = final_df["movie_title"].map(lambda x: x.strip())


In [185]:
final_df.to_csv("final_data.csv", index=False)


In [99]:
all_movies = list(set(all_movies))
print(len(all_movies))

6117


In [120]:
movie_data = {"movie_names":[]}
for index, name in enumerate(all_movies):
    movie_data["movie_names"].append({"title":str(name).strip()})

In [121]:
import json

def convert(o):
    if isinstance(o, np.generic):
        return o.item()
    raise TypeError

In [122]:
movie_names_json = json.dumps(movie_data, default = convert)

In [123]:
with open('movie_names.json', 'w') as outfile:
    json.dump(movie_data, outfile)