# Download data from "The Movie Database" (TMDB)

We connect with "The Movie Database" API to get data about the cast and popularity of every movie on our original dataset

In [1]:
import pandas as pd
import numpy as np
import requests
import datetime

In [2]:
# Import DataFrame
df = pd.read_excel("netflix_movies_dataset.xlsx")

df.sample(10)

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
549,To All the Boys I've Loved Before,Romantic comedy,"August 17, 2018",99,7.1,English
650,Untold: Breaking Point,Documentary,"September 7, 2021",79,7.6,English
563,Loudon Wainwright III: Surviving Twin,One-man show,"November 13, 2018",91,7.1,English
679,Cuba and the Cameraman,Documentary,"November 24, 2017",114,8.3,English
486,Long Live Brij Mohan,Comedy,"August 3, 2018",105,6.8,Hindi
223,15 August,Comedy-drama,"March 29, 2019",124,5.8,Marathi
285,Deidra & Laney Rob a Train,Drama,"March 17, 2017",94,6.1,English
220,All Day and a Night,Drama,"May 1, 2020",121,5.8,English
93,Rim of the World,Science fiction adventure,"May 24, 2019",98,5.2,English
332,Get the Goat,Comedy,"March 18, 2021",97,6.3,Portuguese


In [3]:
df.drop_duplicates(inplace = True)
df.reset_index(drop = True, inplace = True)

In [4]:
# Prepare "Premiere column": transform to datetime and get year
df["Premiere"] = pd.to_datetime(df["Premiere"])
df["premiere_year"] = df["Premiere"].dt.year
df.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year
0,Enter the Anime,Documentary,2019-08-05,58,2.5,English/Japanese,2019
1,Dark Forces,Thriller,2020-08-21,81,2.6,Spanish,2020
2,The App,Science fiction/Drama,2019-12-26,79,2.6,Italian,2019
3,Sex: Unzipped,Comedy,2021-10-26,59,3.1,English,2021
4,Swallow,Drama,2021-10-01,128,3.2,English,2021


### 1. Get every movie id

We first get every movie ID and we append the result to the DataFrame

In [5]:
def get_movie_data(movie):
    '''
    
    Function that gets the name of the movie, connects with the TMDB API and gets all movies data with same name
    
    Parameters
    ==========
    
    movie: str
        Movie's name
    
    Returns
    =======
    
    dict
        movie's data
    '''
    
    # Prepare the string for the ID query replacing " " for "%20"
    movie.replace(" ", "%20")
    
    # Communicate with the API
    response = requests.get(f"http://api.tmdb.org/3/search/movie?api_key=f062d4d3bef1ed6f224531125e4c20c7&query={movie}")
    
    # If the status_code is 200 the function communicated correctly with the API
    if response.status_code == 200:
        
        #Gets movie's data and returns it
        try:
            movie_data = response.json()['results']
        
            return movie_data
        
        # If the program can't get data it prints a phrase
        except:
            movie_data = "No data was found"
            
            return movie_data
    
    else:
        return "The program failed to communicate with the API"

In [6]:
get_movie_data("7 prisoners")

[{'adult': False,
  'backdrop_path': '/pKgfWzxOpvGV3MQ0kLjLdjKAzUe.jpg',
  'genre_ids': [18, 80],
  'id': 785538,
  'original_language': 'pt',
  'original_title': '7 Prisioneiros',
  'overview': 'To provide a better life for his family in the country, 18-year-old Mateus accepts a job in a junkyard in São Paulo for his new boss Luca. But when he and a few other boys become trapped in the dangerous world of contemporary slavery, Mateus will be forced to decide between working for the very man who enslaved him or risk his and his family’s future.',
  'popularity': 34.285,
  'poster_path': '/5svMKCGnR6Yvj8wxldvDvgUi0Jk.jpg',
  'release_date': '2021-11-11',
  'title': '7 Prisoners',
  'video': False,
  'vote_average': 6,
  'vote_count': 1},
 {'adult': False,
  'backdrop_path': '/gv0p6gQM6tDkJFx2UpURWyCAjPn.jpg',
  'genre_ids': [18, 53, 80],
  'id': 146233,
  'original_language': 'en',
  'original_title': 'Prisoners',
  'overview': "Keller Dover faces a parent's worst nightmare when his 6-ye

In [7]:
def get_movie_id(title, release_year):
    '''
    Function that takes in input a movie title and its release year, communicates with TMDB API and returns the ID of
    the movie that matches the release year of the title in input
    
    Parameters
    ==========

    title: str 
        movie's title
    
    release_year: int 
        movie's release year
    
    Returns
    =======
    
    int 
        movie's id
    '''
    
    # Applies get_movie_data function to get data about every movie with a same title
    data = get_movie_data(title)
    
    # Make a for cicle to return de ID of the movie that matches the year in input or the year before (teather's premiere)
    try:
        if len(data) == 1:
            return data[0]["id"]
        
        elif len(data) > 1:
            for i in data:
                date = pd.to_datetime(i["release_date"])
                if release_year == date.year or release_year == (date.year) + 1:
                    return i["id"]
                else:
                    pass
        
        else:
            pass
        
    # If the program can't find the release_date returns "?"
    except:
        return "?"

In [8]:
# Prove
get_movie_id("7 prisoners", 2021)

785538

#### 1.1 Build the ID column

In [9]:
# List comprehension that builds the "id" column: we iterate with ".iterrows()"
df["tmdb_id"] = [get_movie_id(i[1]["Title"], i[1]["premiere_year"]) for i in df.iterrows()]

df.sample(10)

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
159,Òlòt?ré,Crime drama,2020-10-02,106,5.5,English,2020,
25,After the Raid,Documentary,2019-12-19,25,4.3,Spanish,2019,650029
96,#REALITYHIGH,Comedy,2017-09-08,99,5.2,English,2017,?
516,The Claudia Kishi Club,Documentary,2020-07-10,17,6.9,English,2020,686677
124,A Perfect Fit,Romantic comedy / drama,2021-07-15,112,5.3,Indonesian,2021,800669
643,The Bleeding Edge,Documentary,2018-07-27,100,7.6,English,2018,511815
588,Procession,Documentary,2021-11-19,118,7.2,English,2021,869623
361,Street Flow,Drama,2019-10-12,96,6.4,French,2019,614488
191,The Holiday Calendar,Romantic comedy,2018-11-02,95,5.7,English,2018,555850
331,Stuck Together,Comedy,2021-10-20,126,6.3,French,2021,785516


#### 1.2 Data validation

In [10]:
# Rows to solve
rows_to_solve = df[df["tmdb_id"].duplicated(keep = False)]
rows_to_solve

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
11,Paradox,Musical/Western/Fantasy,2018-03-23,73,3.9,English,2018,384521.0
45,Porta dos Fundos: The First Temptation of Christ,Comedy,2019-12-03,46,4.6,Portuguese,2019,
88,The Beast,Drama,2020-11-27,99,5.2,Italian,2020,
150,Sweet Girl,Action,2021-08-20,110,5.5,English,2021,619297.0
159,Òlòt?ré,Crime drama,2020-10-02,106,5.5,English,2020,
163,The Cloverfield Paradox,Science fiction,2018-02-04,102,5.5,English,2018,384521.0
299,The Lovebirds,Romantic comedy,2020-05-22,87,6.1,English,2020,
314,Voyuer,Documentary,2017-12-01,95,6.2,English,2017,
321,Octonauts & the Caves of Sac Actun,Animation,2020-08-14,72,6.2,English,2020,726940.0
358,Porta dos Fundos: The Last Hangover,Comedy,2018-12-21,44,6.3,Portuguese,2018,


In [12]:
rows_to_solve.shape

(23, 8)

##### Duplicates and "?: we solve it manually

In [13]:
rows_to_solve["tmdb_id"].value_counts()

384521    2
672745    2
861604    2
726940    2
619297    2
Name: tmdb_id, dtype: int64

In [14]:
df[df["tmdb_id"] == "?"]

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
96,#REALITYHIGH,Comedy,2017-09-08,99,5.2,English,2017,?


In [15]:
# Solve rows that failed to receive data
get_movie_data("#realityhigh")

'The program failed to communicate with the API'

In [16]:
get_movie_data("realityhigh")

[{'adult': False,
  'backdrop_path': '/smgZYp49OB6xo4hZewxzryrh5xN.jpg',
  'genre_ids': [35],
  'id': 455656,
  'original_language': 'en',
  'original_title': '#realityhigh',
  'overview': 'When nerdy high schooler Dani finally attracts the interest of her longtime crush, she lands in the cross hairs of his ex, a social media celebrity.',
  'popularity': 11.151,
  'poster_path': '/9TbjIF1p5a3EJXUFzX63Coa2JRM.jpg',
  'release_date': '2017-07-17',
  'title': '#realityhigh',
  'video': False,
  'vote_average': 6.4,
  'vote_count': 935}]

In [17]:
df.iloc[96, 7] = get_movie_id("realityhigh", 2017)
df.iloc[96, :]

Title                   #REALITYHIGH
Genre                         Comedy
Premiere         2017-09-08 00:00:00
Runtime                           99
IMDB Score                       5.2
Language                     English
premiere_year                   2017
tmdb_id                       455656
Name: 96, dtype: object

In [18]:
df[df["tmdb_id"] == 619297]

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
150,Sweet Girl,Action,2021-08-20,110,5.5,English,2021,619297
459,Sweet & Sour,Romantic drama,2021-06-04,102,6.7,Korean,2021,619297


In [19]:
get_movie_data("Sweet girl")

[{'adult': False,
  'backdrop_path': '/nprqOIEfiMMQx16lgKeLf3rmPrR.jpg',
  'genre_ids': [28],
  'id': 619297,
  'original_language': 'en',
  'original_title': 'Sweet Girl',
  'overview': "A devastated husband vows to bring justice to the people responsible for his wife's death while protecting the only family he has left, his daughter.",
  'popularity': 296.569,
  'poster_path': '/cP7odDzzFBD9ycxj2laTeFWGLjD.jpg',
  'release_date': '2021-08-18',
  'title': 'Sweet Girl',
  'video': False,
  'vote_average': 6.9,
  'vote_count': 697},
 {'adult': False,
  'backdrop_path': '/cwdeWOqC0dIKphNLhjENWB7GFQd.jpg',
  'genre_ids': [18, 27, 53],
  'id': 407757,
  'original_language': 'en',
  'original_title': 'Sweet, Sweet Lonely Girl',
  'overview': "Soon after moving in with her aging aunt Dora, Adele meets Beth, seductive and mysterious, who tests the limits of Adele's moral ground and sends her spiraling down a psychologically unstable and phantasmagoric path.",
  'popularity': 4.341,
  'poster_

In [20]:
get_movie_data("Sweet & sour")

[{'adult': False,
  'backdrop_path': '/nprqOIEfiMMQx16lgKeLf3rmPrR.jpg',
  'genre_ids': [28],
  'id': 619297,
  'original_language': 'en',
  'original_title': 'Sweet Girl',
  'overview': "A devastated husband vows to bring justice to the people responsible for his wife's death while protecting the only family he has left, his daughter.",
  'popularity': 296.569,
  'poster_path': '/cP7odDzzFBD9ycxj2laTeFWGLjD.jpg',
  'release_date': '2021-08-18',
  'title': 'Sweet Girl',
  'video': False,
  'vote_average': 6.9,
  'vote_count': 697},
 {'adult': False,
  'backdrop_path': '/5A3c0R5L2CUY4Dj9VJkm4zdFG7H.jpg',
  'genre_ids': [10751, 35],
  'id': 654974,
  'original_language': 'en',
  'original_title': 'Home Sweet Home Alone',
  'overview': 'After being left at home by himself for the holidays, 10-year-old Max Mercer must work to defend his home from a married couple who tries to steal back a valuable heirloom.',
  'popularity': 72.759,
  'poster_path': '/fP3VvqUjEBjawxZHL4sYCq2ZdJD.jpg',
  'r

In [21]:
df.iloc[459, 7] = 662237
df.iloc[459, :]

Title                   Sweet & Sour
Genre                 Romantic drama
Premiere         2021-06-04 00:00:00
Runtime                          102
IMDB Score                       6.7
Language                      Korean
premiere_year                   2021
tmdb_id                       662237
Name: 459, dtype: object

In [22]:
df[df["tmdb_id"] == 726940]

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
321,Octonauts & the Caves of Sac Actun,Animation,2020-08-14,72,6.2,English,2020,726940
604,Octonauts & the Great Barrier Reef,Animation,2020-10-13,47,7.3,English,2020,726940


In [23]:
get_movie_data("Octonauts & the Caves of Sac Actun")

[{'adult': False,
  'backdrop_path': '/zG4dbBx3oO37yBkmLCiG81FxAzW.jpg',
  'genre_ids': [10751, 16],
  'id': 726940,
  'original_language': 'en',
  'original_title': 'Octonauts and the Caves of Sac Actun',
  'overview': 'The Octonauts embark on an underwater adventure, navigating a set of challenging caves to help a small octopus friend return to the Caribbean Sea.',
  'popularity': 11.584,
  'poster_path': '/wlizMbZByBYPOC4i2AN64e4W1iM.jpg',
  'release_date': '2020-08-14',
  'title': 'Octonauts and the Caves of Sac Actun',
  'video': False,
  'vote_average': 6.6,
  'vote_count': 23},
 {'adult': False,
  'backdrop_path': '/gXNXD1eou71RWNUQx37bUbjbPmA.jpg',
  'genre_ids': [10751, 16],
  'id': 765684,
  'original_language': 'en',
  'original_title': 'Octonauts: The Ring of Fire',
  'overview': 'In this film\'s adventure, a volcano erupts suddenly, suddenly awakening the Pacific Ocean floor of hundreds of volcanoes composed of the "ring of fire", this sudden event quickly turned into a gl

In [24]:
df.iloc[604, 7] = 752885
df.iloc[604, :]

Title            Octonauts & the Great Barrier Reef
Genre                                     Animation
Premiere                        2020-10-13 00:00:00
Runtime                                          47
IMDB Score                                      7.3
Language                                    English
premiere_year                                  2020
tmdb_id                                      752885
Name: 604, dtype: object

In [25]:
df[df["tmdb_id"] == 384521]

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
11,Paradox,Musical/Western/Fantasy,2018-03-23,73,3.9,English,2018,384521
163,The Cloverfield Paradox,Science fiction,2018-02-04,102,5.5,English,2018,384521


In [26]:
get_movie_data("Paradox")

[{'adult': False,
  'backdrop_path': '/iSLXnEeZpkZQnxeCzp9ZjdhvAKK.jpg',
  'genre_ids': [878, 16, 28, 12],
  'id': 183011,
  'original_language': 'en',
  'original_title': 'Justice League: The Flashpoint Paradox',
  'overview': 'The Flash finds himself in a war-torn alternate timeline and teams up with alternate versions of his fellow heroes to restore the timeline.',
  'popularity': 37.478,
  'poster_path': '/zmpE3mfhv0NOnI872q66kuuGeZW.jpg',
  'release_date': '2013-07-30',
  'title': 'Justice League: The Flashpoint Paradox',
  'video': False,
  'vote_average': 7.9,
  'vote_count': 1341},
 {'adult': False,
  'backdrop_path': '/lWrmOhS5WH1650mHJhwcm91jQZR.jpg',
  'genre_ids': [27, 878, 28, 53],
  'id': 384521,
  'original_language': 'en',
  'original_title': 'The Cloverfield Paradox',
  'overview': 'Orbiting above a planet on the brink of war, scientists test a device to solve an energy crisis and end up face-to-face with a dark alternate reality.',
  'popularity': 21.134,
  'poster_pa

In [27]:
df.iloc[11, 7] = 502140
df.iloc[11, :]

Title                            Paradox
Genre            Musical/Western/Fantasy
Premiere             2018-03-23 00:00:00
Runtime                               73
IMDB Score                           3.9
Language                         English
premiere_year                       2018
tmdb_id                           502140
Name: 11, dtype: object

In [28]:
df[df["tmdb_id"] == 861604]

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
492,Blood Brothers: Malcolm X & Muhammad Ali,Documentary,2021-09-09,96,6.8,English,2021,861604
559,Ali & Ratu Ratu Queens,Comedy-drama,2021-06-17,100,7.1,Indonesian,2021,861604


In [29]:
get_movie_data("Blood Brothers: Malcolm X & Muhammad Ali")

[{'adult': False,
  'backdrop_path': '/8ETxVtj2apApnVZeRLUNGZaFeh.jpg',
  'genre_ids': [99, 36],
  'id': 861604,
  'original_language': 'en',
  'original_title': 'Blood Brothers: Malcolm X and Muhammad Ali',
  'overview': "From a chance meeting to a tragic fallout, Malcolm X and Muhammad Ali's extraordinary bond cracks under the weight of distrust and shifting ideals.",
  'popularity': 16.473,
  'poster_path': '/kdOXdPIgbbCHXb51tWJZ0r8kZfe.jpg',
  'release_date': '2021-09-09',
  'title': 'Blood Brothers: Malcolm X and Muhammad Ali',
  'video': False,
  'vote_average': 7.6,
  'vote_count': 15}]

In [30]:
get_movie_data("Ali & Ratu Ratu Queens")

[{'adult': False,
  'backdrop_path': '/gVtm7G5NtQxjFNA5ydqxslxjCuk.jpg',
  'genre_ids': [18],
  'id': 8489,
  'original_language': 'en',
  'original_title': 'Ali',
  'overview': "In 1964, a brash, new pro boxer, fresh from his Olympic gold medal victory, explodes onto the scene: Cassius Clay. Bold and outspoken, he cuts an entirely new image for African Americans in sport with his proud public self-confidence and his unapologetic belief that he is the greatest boxer of all time. Yet at the top of his game, both Ali's personal and professional lives face the ultimate test.",
  'popularity': 15.221,
  'poster_path': '/egzfj0JpgrVfpMXF4tfHpzEYM70.jpg',
  'release_date': '2001-12-10',
  'title': 'Ali',
  'video': False,
  'vote_average': 6.8,
  'vote_count': 1271},
 {'adult': False,
  'backdrop_path': '/ll5ktqAG3EOlCS8bpyoC0kYSSe9.jpg',
  'genre_ids': [35],
  'id': 9298,
  'original_language': 'en',
  'original_title': 'Ali G Indahouse',
  'overview': "Ali G unwittingly becomes a pawn in t

In [31]:
df.iloc[559, 7] = 650129
df.iloc[559, :]

Title            Ali & Ratu Ratu Queens
Genre                      Comedy-drama
Premiere            2021-06-17 00:00:00
Runtime                             100
IMDB Score                          7.1
Language                     Indonesian
premiere_year                      2021
tmdb_id                          650129
Name: 559, dtype: object

In [32]:
df[df["tmdb_id"] == 672745]

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
382,The Heartbreak Club,Comedy-drama,2021-01-14,101,6.4,Indonesian,2021,672745
417,The Heartbreak Club,Comedy-drama,2021-01-14,101,6.5,Indonesian,2021,672745


In [33]:
df.drop(382, inplace = True)

In [34]:
df.reset_index(drop = True, inplace = True)

In [35]:
# We redefine rows_to_solve
rows_to_solve = df[df["tmdb_id"].duplicated(keep = False)]
rows_to_solve

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
45,Porta dos Fundos: The First Temptation of Christ,Comedy,2019-12-03,46,4.6,Portuguese,2019,
88,The Beast,Drama,2020-11-27,99,5.2,Italian,2020,
159,Òlòt?ré,Crime drama,2020-10-02,106,5.5,English,2020,
299,The Lovebirds,Romantic comedy,2020-05-22,87,6.1,English,2020,
314,Voyuer,Documentary,2017-12-01,95,6.2,English,2017,
358,Porta dos Fundos: The Last Hangover,Comedy,2018-12-21,44,6.3,Portuguese,2018,
484,Long Live Brij Mohan,Comedy,2018-08-03,105,6.8,Hindi,2018,
526,I'm No Longer Here: A Discussion with Guillerm...,Aftershow / Interview,2020-11-03,14,7.0,English,2020,
584,The Road to El Camino: A Breaking Bad Movie,Making-of,2019-10-29,13,7.2,English,2019,
589,Seventeen,Coming-of-age comedy-drama,2019-10-18,99,7.2,Spanish,2019,


In [36]:
get_movie_data("The Lovebirds")

[{'adult': False,
  'backdrop_path': '/dwcazDHJQmC1euuc7oVvDHMRuCv.jpg',
  'genre_ids': [35, 28, 10749],
  'id': 576156,
  'original_language': 'en',
  'original_title': 'The Lovebirds',
  'overview': 'A couple experiences a defining moment in their relationship when they are unintentionally embroiled in a murder mystery. As their journey to clear their names takes them from one extreme – and hilarious - circumstance to the next, they must figure out how they, and their relationship, can survive the night.',
  'popularity': 7.496,
  'poster_path': '/5jdLnvALCpK1NkeQU1z4YvOe2dZ.jpg',
  'release_date': '2021-05-22',
  'title': 'The Lovebirds',
  'video': False,
  'vote_average': 6,
  'vote_count': 31},
 {'adult': False,
  'backdrop_path': '/xjdAkXQSRQNGk7j1VbzW7AWOjzt.jpg',
  'genre_ids': [18],
  'id': 586554,
  'original_language': 'en',
  'original_title': 'The Lovebirds',
  'overview': 'THE LOVEBIRDS intertwines six stories about love, friendship and survival. The film stars an intern

In [37]:
# "Lovebirds" has an error in "Premier" column
df.iloc[299, 7] = 576156
df.iloc[299, 2] = pd.to_datetime("2021-05-22")
df.iloc[299, :]

Title                  The Lovebirds
Genre                Romantic comedy
Premiere         2021-05-22 00:00:00
Runtime                           87
IMDB Score                       6.1
Language                     English
premiere_year                   2020
tmdb_id                       576156
Name: 299, dtype: object

In [38]:
get_movie_data("City of joy")

[{'adult': False,
  'backdrop_path': '/w6Zgh1kFxLzeQFLOR8hC2JTwEon.jpg',
  'genre_ids': [18],
  'id': 47821,
  'original_language': 'en',
  'original_title': 'City of Joy',
  'overview': 'Max Lowe is a Houston surgeon who has grown weary of the bureaucracy of American medicine. When he loses a patient on the operating table, Max impulsively decides to leave America and travel to India in the hope of finding himself. Not long after he arrives in Calcutta, Max is attacked by a group of thugs and left without money or a passport.',
  'popularity': 7.98,
  'poster_path': '/8aEfWFECThq5A1VnXsyzX8x2y72.jpg',
  'release_date': '1992-04-15',
  'title': 'City of Joy',
  'video': False,
  'vote_average': 6.6,
  'vote_count': 114},
 {'adult': False,
  'backdrop_path': '/ylUBY5u08TUt4mSYNW30Sw9wK8D.jpg',
  'genre_ids': [99],
  'id': 450005,
  'original_language': 'en',
  'original_title': 'City of Joy',
  'overview': 'The Democratic Republic of Congo has endured 20 years of devastating violence. R

In [39]:
# "City of joy" where released two years delayed
df.iloc[629, 7] = 450005
df.iloc[629, :]

Title                    City of Joy
Genre                    Documentary
Premiere         2018-09-07 00:00:00
Runtime                           74
IMDB Score                       7.5
Language                     English
premiere_year                   2018
tmdb_id                       450005
Name: 629, dtype: object

In [40]:
# Get the ID's manually from movies with "title" problems
movies_without_id = [["The First temptation of Chris", 2019],
                    ["La Belva", 2020],
                    ["Òlòtūré", 2020],
                    ["Voyeur", 2017],
                    ["The Last Hangover", 2017],
                    ["Brij Mohan Amar Rahe!", 2018],
                    ["I’m No Longer Here: A Conversation with Guillermo Del Toro & Alfonso Cuarón", 2020],
                    ["The Road to El Camino: Behind the Scenes of El Camino: A Breaking Bad Movie ", 2019],
                    ["Diecisiete", 2019],
                    ["Rolling Thunder Revue: A Bob Dylan Story by Martin Scorsese", 2019],
                    ["Struggle: The Life and Lost Art of Szukalski", 2018]]

id_list = [get_movie_id(i[0], i[1]) for i in movies_without_id]

id_list

[652020,
 654905,
 642803,
 473415,
 571721,
 539470,
 760880,
 683021,
 623627,
 574638,
 565255]

In [41]:
# Put the id's on the original DataFrame
movies_without_id_index = [45, 88, 159, 314, 358, 484, 526, 584, 589, 647, 667]
pack = zip(movies_without_id_index, id_list)

for x in pack:
    df.iloc[x[0], 7] = x[1]
    
df.iloc[movies_without_id_index, :]

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id
45,Porta dos Fundos: The First Temptation of Christ,Comedy,2019-12-03,46,4.6,Portuguese,2019,652020
88,The Beast,Drama,2020-11-27,99,5.2,Italian,2020,654905
159,Òlòt?ré,Crime drama,2020-10-02,106,5.5,English,2020,642803
314,Voyuer,Documentary,2017-12-01,95,6.2,English,2017,473415
358,Porta dos Fundos: The Last Hangover,Comedy,2018-12-21,44,6.3,Portuguese,2018,571721
484,Long Live Brij Mohan,Comedy,2018-08-03,105,6.8,Hindi,2018,539470
526,I'm No Longer Here: A Discussion with Guillerm...,Aftershow / Interview,2020-11-03,14,7.0,English,2020,760880
584,The Road to El Camino: A Breaking Bad Movie,Making-of,2019-10-29,13,7.2,English,2019,683021
589,Seventeen,Coming-of-age comedy-drama,2019-10-18,99,7.2,Spanish,2019,623627
647,Rolling Thunder Revue: A bob Dylan Story by Ma...,Documentary,2019-06-12,144,7.6,English,2019,574638


In [42]:
# Save the new dataframe as a checkpoint
# df.to_excel("temporary_netflix_df.xlsx", index = False)

### 2. Get useful movies data

In [43]:
def get_movie_data(movie_id, *args):
    '''
    Takes a movie ID and a list of useful movie data we need, connects with TMDB API, requests the data and it returns
    the data required.
    
    Parameters
    ==========
    
    movie_id: int 
        movie's id we use to connect to the API
    
    *args: str 
        details we want to get
        
    Returns
    =======
    
    list 
        list with the field required passed in *args
    
    >>> The details can be found here: https://developers.themoviedb.org/3/movies/get-movie-details'''
    
    # Connect with the API
    response = requests.get(f"https://api.themoviedb.org/3/movie/{movie_id}?api_key=f062d4d3bef1ed6f224531125e4c20c7")
    
    # Make list of details with a for iteration calling the keys in args
    details= []
    for detail in args:
        try:
            details.append(response.json()[detail])
        except:
            details.append("?")
    
    return details

In [44]:
# Prove
get_movie_data(152601, "adult", "original_language", "original_title")

[False, 'en', 'Her']

#### 2.1 Movie's data

##### Get data we want:
- Genres: because we want to compare the results with the genres on the original dataset
- Imdb id: to try to get more data from IMDB site
- Overview: description of the movie
- Production countries
- Vote average
- Vote count
- Title: useful for data validation
- Original title: useful for data validation

In [45]:
movie_data = [get_movie_data(movie_id, 
                             "genres", 
                             "imdb_id", 
                             "overview", 
                             "production_countries", 
                             "vote_average", 
                             "vote_count",
                             "title",
                             "original_title") for movie_id in df["tmdb_id"]]

movie_data

[[[{'id': 99, 'name': 'Documentary'}],
  'tt10662450',
  'What is anime? Through deep-dives with notable masterminds of this electrifying genre, this fast-paced documentary seeks to find the answers.',
  [{'iso_3166_1': 'US', 'name': 'United States of America'},
   {'iso_3166_1': 'JP', 'name': 'Japan'}],
  5.8,
  51,
  'Enter the Anime',
  'Enter the Anime'],
 [[{'id': 27, 'name': 'Horror'},
   {'id': 28, 'name': 'Action'},
   {'id': 53, 'name': 'Thriller'},
   {'id': 9648, 'name': 'Mystery'}],
  'tt12536776',
  'In search of his sister, a renegade criminal seeks answers at a sordid hotel where he encounters a sinister guest and romances a mysterious waitress.',
  [],
  4.4,
  25,
  'Dark Forces',
  'Fuego negro'],
 [[{'id': 18, 'name': 'Drama'}, {'id': 878, 'name': 'Science Fiction'}],
  'tt11385066',
  'Loving girlfriend, family fortune, breakout movie role: he\'s got it all. Until an app awakens a powerful new yearning. While in Rome to shoot his first movie, actor and industrial he

In [46]:
# Transform the result to DataFrame
data_df = pd.DataFrame(movie_data, columns=["genres", 
                             "imdb_id", 
                             "overview", 
                             "production_countries", 
                             "vote_average", 
                             "vote_count",
                             "title",
                             "original_title"])

data_df.head()

Unnamed: 0,genres,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title
0,"[{'id': 99, 'name': 'Documentary'}]",tt10662450,What is anime? Through deep-dives with notable...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5.8,51,Enter the Anime,Enter the Anime
1,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",tt12536776,"In search of his sister, a renegade criminal s...",[],4.4,25,Dark Forces,Fuego negro
2,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...",tt11385066,"Loving girlfriend, family fortune, breakout mo...","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2.8,105,The App,The App
3,"[{'id': 35, 'name': 'Comedy'}]",tt15523050,Rap superstar Saweetie hosts a celebration of ...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2.7,3,Sex: Unzipped,Sex: Unzipped
4,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",tt14391622,"Set in the 1980s, Tolani Ajao is a bank secret...",[],4.7,3,Swallow,Swallow


##### Clean the DataFrame

In [47]:
# Build first genre column
genres_1 = []
for genre in data_df["genres"]:
    try:
        genres_1.append(genre[0])
    except:
        genres_1.append("?")
        
genres_1

[{'id': 99, 'name': 'Documentary'},
 {'id': 27, 'name': 'Horror'},
 {'id': 18, 'name': 'Drama'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 18, 'name': 'Drama'},
 {'id': 27, 'name': 'Horror'},
 {'id': 27, 'name': 'Horror'},
 {'id': 28, 'name': 'Action'},
 {'id': 9648, 'name': 'Mystery'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 28, 'name': 'Action'},
 {'id': 10402, 'name': 'Music'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 27, 'name': 'Horror'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 99, 'name': 'Documentary'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 27, 'name': 'Horror'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 99, 'name': 'Documentary'},
 {'id': 27, 'name': 'Horror'},
 {'id': 18, 'name': 'Drama'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 18, 'name': 'Drama'},
 {'id': 99, 'name': 'Documentary'},
 {'id': 14,

In [48]:
genres_1_clean = []

for genres in genres_1:
    try:
        genres_1_clean.append(genres['name'])
    except:
        genres_1_clean.append("?")
        
genres_1_clean

['Documentary',
 'Horror',
 'Drama',
 'Comedy',
 'Drama',
 'Horror',
 'Horror',
 'Action',
 'Mystery',
 'Comedy',
 'Action',
 'Music',
 'Comedy',
 'Horror',
 'Comedy',
 'Documentary',
 'Adventure',
 'Horror',
 'Comedy',
 'Comedy',
 'Thriller',
 'Comedy',
 'Thriller',
 'Comedy',
 'Comedy',
 'Documentary',
 'Horror',
 'Drama',
 'Comedy',
 'Drama',
 'Documentary',
 'Fantasy',
 'Drama',
 'Comedy',
 'Action',
 'Comedy',
 'Action',
 'Comedy',
 'Thriller',
 'Documentary',
 'Thriller',
 'Comedy',
 'Horror',
 'Comedy',
 'Drama',
 'Comedy',
 'Documentary',
 'History',
 'Horror',
 'Thriller',
 'Drama',
 'Action',
 'Animation',
 'Romance',
 'Documentary',
 'Science Fiction',
 'Western',
 'Comedy',
 'Horror',
 'Family',
 'Drama',
 'Thriller',
 'Horror',
 'Horror',
 'Comedy',
 'Family',
 'Drama',
 'Family',
 'Action',
 'Drama',
 'Comedy',
 'Thriller',
 'Drama',
 'Science Fiction',
 'Comedy',
 'Comedy',
 'Horror',
 'Animation',
 'Drama',
 'Comedy',
 'Comedy',
 'Documentary',
 'Action',
 'Comedy',
 'T

In [49]:
data_df["genre_1"] = genres_1_clean

data_df.head()

Unnamed: 0,genres,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title,genre_1
0,"[{'id': 99, 'name': 'Documentary'}]",tt10662450,What is anime? Through deep-dives with notable...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5.8,51,Enter the Anime,Enter the Anime,Documentary
1,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",tt12536776,"In search of his sister, a renegade criminal s...",[],4.4,25,Dark Forces,Fuego negro,Horror
2,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...",tt11385066,"Loving girlfriend, family fortune, breakout mo...","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2.8,105,The App,The App,Drama
3,"[{'id': 35, 'name': 'Comedy'}]",tt15523050,Rap superstar Saweetie hosts a celebration of ...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2.7,3,Sex: Unzipped,Sex: Unzipped,Comedy
4,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",tt14391622,"Set in the 1980s, Tolani Ajao is a bank secret...",[],4.7,3,Swallow,Swallow,Drama


In [50]:
# Build second genre column
genres_2 = []
for genre in data_df["genres"]:
    try:
        genres_2.append(genre[1])
    except:
        genres_2.append("?")
        
genres_2

['?',
 {'id': 28, 'name': 'Action'},
 {'id': 878, 'name': 'Science Fiction'},
 '?',
 {'id': 36, 'name': 'History'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 18, 'name': 'Drama'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 53, 'name': 'Thriller'},
 '?',
 {'id': 80, 'name': 'Crime'},
 {'id': 37, 'name': 'Western'},
 '?',
 {'id': 53, 'name': 'Thriller'},
 {'id': 18, 'name': 'Drama'},
 '?',
 {'id': 10751, 'name': 'Family'},
 {'id': 9648, 'name': 'Mystery'},
 '?',
 {'id': 18, 'name': 'Drama'},
 '?',
 {'id': 28, 'name': 'Action'},
 '?',
 '?',
 {'id': 10749, 'name': 'Romance'},
 '?',
 '?',
 {'id': 53, 'name': 'Thriller'},
 '?',
 {'id': 53, 'name': 'Thriller'},
 '?',
 {'id': 53, 'name': 'Thriller'},
 {'id': 53, 'name': 'Thriller'},
 '?',
 {'id': 12, 'name': 'Adventure'},
 {'id': 10749, 'name': 'Romance'},
 {'id': 35, 'name': 'Comedy'},
 '?',
 '?',
 '?',
 {'id': 18, 'name': 'Drama'},
 '?',
 {'id': 53, 'name': 'Thriller'},
 '?',
 '?',
 {'id': 14, 'name': 'Fantasy'},
 '?',
 {'id': 10402, 'name': 'M

In [51]:
genres_2_clean = []

for genres in genres_2:
    try:
        genres_2_clean.append(genres['name'])
    except:
        genres_2_clean.append("?")
        
genres_2_clean

['?',
 'Action',
 'Science Fiction',
 '?',
 'History',
 'Thriller',
 'Drama',
 'Thriller',
 'Thriller',
 '?',
 'Crime',
 'Western',
 '?',
 'Thriller',
 'Drama',
 '?',
 'Family',
 'Mystery',
 '?',
 'Drama',
 '?',
 'Action',
 '?',
 '?',
 'Romance',
 '?',
 '?',
 'Thriller',
 '?',
 'Thriller',
 '?',
 'Thriller',
 'Thriller',
 '?',
 'Adventure',
 'Romance',
 'Comedy',
 '?',
 '?',
 '?',
 'Drama',
 '?',
 'Thriller',
 '?',
 '?',
 'Fantasy',
 '?',
 'Music',
 'Thriller',
 'Action',
 '?',
 'Fantasy',
 'Action',
 'Comedy',
 '?',
 '?',
 'Comedy',
 'Crime',
 '?',
 'Crime',
 'Science Fiction',
 '?',
 'Thriller',
 'Thriller',
 '?',
 'Comedy',
 'Comedy',
 'Animation',
 'Thriller',
 'Horror',
 'Romance',
 '?',
 'Romance',
 'Thriller',
 '?',
 'Romance',
 'Fantasy',
 'Family',
 'Romance',
 'Action',
 '?',
 '?',
 'Thriller',
 'Drama',
 'Horror',
 'Romance',
 'Drama',
 '?',
 '?',
 '?',
 'Drama',
 '?',
 '?',
 'Adventure',
 'Adventure',
 'Mystery',
 '?',
 'Drama',
 'Music',
 'Drama',
 '?',
 '?',
 '?',
 'Music

In [52]:
data_df["genre_2"] = genres_2_clean

In [53]:
# Drop genres column
data_df.drop(columns = "genres", inplace = True)
data_df.head()

Unnamed: 0,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title,genre_1,genre_2
0,tt10662450,What is anime? Through deep-dives with notable...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5.8,51,Enter the Anime,Enter the Anime,Documentary,?
1,tt12536776,"In search of his sister, a renegade criminal s...",[],4.4,25,Dark Forces,Fuego negro,Horror,Action
2,tt11385066,"Loving girlfriend, family fortune, breakout mo...","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2.8,105,The App,The App,Drama,Science Fiction
3,tt15523050,Rap superstar Saweetie hosts a celebration of ...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2.7,3,Sex: Unzipped,Sex: Unzipped,Comedy,?
4,tt14391622,"Set in the 1980s, Tolani Ajao is a bank secret...",[],4.7,3,Swallow,Swallow,Drama,History


In [54]:
# Build first production country column
country_1 = []
for country in data_df["production_countries"]:
    try:
        country_1.append(country[0])
    except:
        country_1.append("?")
        
country_1

[{'iso_3166_1': 'US', 'name': 'United States of America'},
 '?',
 {'iso_3166_1': 'IT', 'name': 'Italy'},
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 '?',
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 {'iso_3166_1': 'IN', 'name': 'India'},
 {'iso_3166_1': 'IN', 'name': 'India'},
 '?',
 {'iso_3166_1': 'TR', 'name': 'Turkey'},
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 {'iso_3166_1': 'BR', 'name': 'Brazil'},
 {'iso_3166_1': 'PL', 'name': 'Poland'},
 {'iso_3166_1': 'IN', 'name': 'India'},
 {'iso_3166_1': 'IN', 'name': 'India'},
 {'iso_3166_1': 'CA', 'name': 'Canada'},
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 {'iso_3166_1': 'MX', 'name': 'Mexico'},
 {'iso_3166_1': 'ID', 'name': 'Indonesia'},
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 {'iso_3166_1': 'MY', 'name': 'Malaysia'},
 {'iso_3166_1': 'DE', 'name': 'Germany'},
 {'iso_3166_1': 'ZA', 'name': 'Sout

In [55]:
country_1_clean = []

for country in country_1:
    try:
        country_1_clean.append(country['name'])
    except:
        country_1_clean.append("?")
        
country_1_clean

['United States of America',
 '?',
 'Italy',
 'United States of America',
 '?',
 'United States of America',
 'India',
 'India',
 '?',
 'Turkey',
 'United States of America',
 'United States of America',
 'Brazil',
 'Poland',
 'India',
 'India',
 'Canada',
 'United States of America',
 'Mexico',
 'Indonesia',
 'United States of America',
 'Malaysia',
 'Germany',
 'South Africa',
 'United States of America',
 'Mexico',
 'India',
 'United Kingdom',
 'South Korea',
 'United States of America',
 'United States of America',
 'United States of America',
 'India',
 'United States of America',
 'United States of America',
 'Netherlands',
 'France',
 'South Africa',
 'United States of America',
 'United States of America',
 'United States of America',
 'Italy',
 'United States of America',
 'Italy',
 'France',
 'Brazil',
 'France',
 '?',
 'Italy',
 'France',
 'Philippines',
 'United States of America',
 'Japan',
 'South Africa',
 'United Kingdom',
 'United States of America',
 'United States of

In [56]:
data_df["country_1"] = country_1_clean

data_df.head()

Unnamed: 0,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1
0,tt10662450,What is anime? Through deep-dives with notable...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5.8,51,Enter the Anime,Enter the Anime,Documentary,?,United States of America
1,tt12536776,"In search of his sister, a renegade criminal s...",[],4.4,25,Dark Forces,Fuego negro,Horror,Action,?
2,tt11385066,"Loving girlfriend, family fortune, breakout mo...","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2.8,105,The App,The App,Drama,Science Fiction,Italy
3,tt15523050,Rap superstar Saweetie hosts a celebration of ...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2.7,3,Sex: Unzipped,Sex: Unzipped,Comedy,?,United States of America
4,tt14391622,"Set in the 1980s, Tolani Ajao is a bank secret...",[],4.7,3,Swallow,Swallow,Drama,History,?


In [57]:
# Build second production country column
country_2 = []
for country in data_df["production_countries"]:
    try:
        country_2.append(country[1])
    except:
        country_2.append("?")
        
country_2

[{'iso_3166_1': 'JP', 'name': 'Japan'},
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 '?',
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 {'iso_3166_1': 'GB', 'name': 'United Kingdom'},
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 '?',
 '?',
 '?',
 {'iso_3166_1': 'FR', 'name': 'France'},
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 {'iso_3166_1': 'FR', 'name': 'France'},
 {'iso_3166_1': 'MX', 'name': 'Mexico'},
 '?',
 '?',
 '?',
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 

In [58]:
country_2_clean = []

for country in country_2:
    try:
        country_2_clean.append(country['name'])
    except:
        country_2_clean.append("?")
        
country_2_clean

['Japan',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 'United States of America',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 'United States of America',
 '?',
 'United States of America',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 'United Kingdom',
 'United States of America',
 '?',
 '?',
 '?',
 'France',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 'France',
 'Mexico',
 '?',
 '?',
 '?',
 'United States of America',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 'United States of America',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 'United States of America',
 'Peru',
 '?',
 '?',
 '?',
 'Belgium',
 '?',
 '?',
 '?',
 'United States of America',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 

In [59]:
data_df["country_2"] = country_2_clean

data_df.head()

Unnamed: 0,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1,country_2
0,tt10662450,What is anime? Through deep-dives with notable...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5.8,51,Enter the Anime,Enter the Anime,Documentary,?,United States of America,Japan
1,tt12536776,"In search of his sister, a renegade criminal s...",[],4.4,25,Dark Forces,Fuego negro,Horror,Action,?,?
2,tt11385066,"Loving girlfriend, family fortune, breakout mo...","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2.8,105,The App,The App,Drama,Science Fiction,Italy,?
3,tt15523050,Rap superstar Saweetie hosts a celebration of ...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2.7,3,Sex: Unzipped,Sex: Unzipped,Comedy,?,United States of America,?
4,tt14391622,"Set in the 1980s, Tolani Ajao is a bank secret...",[],4.7,3,Swallow,Swallow,Drama,History,?,?


In [60]:
data_df.drop(columns = "production_countries", inplace = True)

data_df.head()

Unnamed: 0,imdb_id,overview,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1,country_2
0,tt10662450,What is anime? Through deep-dives with notable...,5.8,51,Enter the Anime,Enter the Anime,Documentary,?,United States of America,Japan
1,tt12536776,"In search of his sister, a renegade criminal s...",4.4,25,Dark Forces,Fuego negro,Horror,Action,?,?
2,tt11385066,"Loving girlfriend, family fortune, breakout mo...",2.8,105,The App,The App,Drama,Science Fiction,Italy,?
3,tt15523050,Rap superstar Saweetie hosts a celebration of ...,2.7,3,Sex: Unzipped,Sex: Unzipped,Comedy,?,United States of America,?
4,tt14391622,"Set in the 1980s, Tolani Ajao is a bank secret...",4.7,3,Swallow,Swallow,Drama,History,?,?


##### Merge the DataFrames

In [61]:
df = df.merge(data_df, left_index = True, right_index = True)

df.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id,imdb_id,overview,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1,country_2
0,Enter the Anime,Documentary,2019-08-05,58,2.5,English/Japanese,2019,616904,tt10662450,What is anime? Through deep-dives with notable...,5.8,51,Enter the Anime,Enter the Anime,Documentary,?,United States of America,Japan
1,Dark Forces,Thriller,2020-08-21,81,2.6,Spanish,2020,735110,tt12536776,"In search of his sister, a renegade criminal s...",4.4,25,Dark Forces,Fuego negro,Horror,Action,?,?
2,The App,Science fiction/Drama,2019-12-26,79,2.6,Italian,2019,653522,tt11385066,"Loving girlfriend, family fortune, breakout mo...",2.8,105,The App,The App,Drama,Science Fiction,Italy,?
3,Sex: Unzipped,Comedy,2021-10-26,59,3.1,English,2021,890280,tt15523050,Rap superstar Saweetie hosts a celebration of ...,2.7,3,Sex: Unzipped,Sex: Unzipped,Comedy,?,United States of America,?
4,Swallow,Drama,2021-10-01,128,3.2,English,2021,874562,tt14391622,"Set in the 1980s, Tolani Ajao is a bank secret...",4.7,3,Swallow,Swallow,Drama,History,?,?


In [62]:
# Save the new dataframe as a checkpoint
# df.to_excel("temporary_netflix_df.xlsx", index = False)

##### Data validation 

In [63]:
# Films with different title columns
pd.set_option("display.max_rows", None, "display.max_columns", None)
validate_df = df[df["Title"] != df["title"]]
validate_df.loc[:, ("Title", "Language", "title", "original_title", "country_1")]

Unnamed: 0,Title,Language,title,original_title,country_1
13,Nobody Sleeps in the Woods Tonight Part 2,Polish,Nobody Sleeps in the Woods Tonight 2,W lesie dziś nie zaśnie nikt 2,Poland
16,The Call,Korean,The Call of the Wild,The Call of the Wild,Canada
17,Escape the Undertaker,English,Escape The Undertaker,Escape The Undertaker,United States of America
28,What Happened to Mr. Cha?,Korean,What Happened to Mr Cha?,차인표,South Korea
30,"Hello Privilege. It's Me, Chelsea",English,"Hello, Privilege. It's Me, Chelsea","Hello, Privilege. It's Me, Chelsea",United States of America
45,Porta dos Fundos: The First Temptation of Christ,Portuguese,The First Temptation of Christ,A Primeira Tentação de Cristo,Brazil
58,Deep,Thai,The Deep House,The Deep House,Belgium
75,Trippin' with the Kandasamys,English,Trippin’ with the Kandasamys,Trippin’ with the Kandasamys,South Africa
76,Cadaver,Norwegian,The Unburied,El cadáver insepulto,Argentina
96,#REALITYHIGH,English,#realityhigh,#realityhigh,United States of America


##### List of movies with problems
- The Call: 575604
- Deep: 430474
- Cadaver: 692969
- Grudge: 874948
- Last Summer: 785537
- Back to School: 624060
- The Decline: 674607
- The Killer: 466190
- The Occupant: 674944
- Monster: 489932
- Sometimes: 411852
- John Mulaney & the Sack Lunch Bunch: 650073
- Jim & Andy: The Great Beyond - Featuring a Very Special, Contractually Obligated Mention of Tony Cliffton: 469019

In [64]:
# Replace the correct id's
ids_to_correct = [16, 58, 76, 109, 120, 122, 258, 300, 389, 409, 580, 637, 657]
id_list_2 = [575604, 430474, 692969, 874948, 785537, 624060, 674607, 466190, 674944, 489932, 411852, 650073, 469019]
pack_2 = zip(ids_to_correct, id_list_2)

for x in pack_2:
    df.iloc[x[0], 7] = x[1]
    
df.iloc[ids_to_correct, :]

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id,imdb_id,overview,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1,country_2
16,The Call,Drama,2020-11-27,112,4.1,Korean,2020,575604,tt7504726,Buck is a big-hearted dog whose blissful domes...,7.6,2678,The Call of the Wild,The Call of the Wild,Adventure,Family,Canada,United States of America
58,Deep,Science fiction,2021-07-16,101,4.8,Thai,2021,430474,tt11686490,"While diving in a remote French lake, a couple...",5.5,116,The Deep House,The Deep House,Horror,?,Belgium,France
76,Cadaver,Horror,2020-10-22,86,5.1,Norwegian,2020,692969,,Maximiliano is a psychiatrist who suffers from...,4.5,6,The Unburied,El cadáver insepulto,Horror,Fantasy,Argentina,?
109,Grudge,Thriller,2021-10-08,105,5.3,Turkish,2021,874948,tt3612126,After a young mother murders her family in her...,5.6,852,The Grudge,The Grudge,Horror,Mystery,Canada,United States of America
120,Last Summer,Romantic drama,2021-07-09,101,5.3,Turkish,2021,785537,tt11864226,"Westchester, Summer 1991. Closeted teenager Da...",0.0,0,Last Summer with Uncle Ira,Last Summer with Uncle Ira,?,?,?,?
122,Back to School,Comedy,2019-08-30,83,5.3,French,2019,624060,tt10687588,The Super Monsters welcome Vida to her new hom...,6.5,2,Super Monsters Back to School,Super Monsters Back to School,Animation,?,?,?
258,The Decline,Thriller,2020-03-27,83,5.9,French,2020,674607,,NOFX perform The Decline live with Baz's Orche...,0.0,0,NOFX: The Decline Live At Red Rocks,NOFX: The Decline Live At Red Rocks,?,?,?,?
300,The Killer,Western,2017-11-10,99,6.1,Portuguese,2017,466190,tt4480398,A chemical accident turns ordinary donuts into...,4.2,79,Attack of the Killer Donuts,Attack of the Killer Donuts,Comedy,Horror,United States of America,?
389,The Occupant,Thriller,2020-03-25,103,6.4,Spanish,2020,674944,tt11629124,"Trapped in a mysterious psychiatric ward, a ma...",5.7,16,The Current Occupant,The Current Occupant,Thriller,Mystery,United States of America,?
409,Monster,Drama,2021-05-07,98,6.5,English,2021,489932,tt6475714,A portal transports Cpt. Artemis and an elite ...,6.9,2241,Monster Hunter,Monster Hunter,Action,Fantasy,Germany,Canada


In [65]:
# Get the correct data from films that had the wrong id
id_list_2 = [575604, 430474, 692969, 874948, 785537, 624060, 674607, 466190, 674944, 489932, 411852, 650073, 469019]
movie_data_2 = [get_movie_data(movie_id, 
                              "genres", 
                              "imdb_id", 
                              "overview", 
                              "production_countries", 
                              "vote_average", 
                              "vote_count",
                              "title",
                              "original_title") for movie_id in id_list_2]

movie_data_2

[[[{'id': 53, 'name': 'Thriller'},
   {'id': 9648, 'name': 'Mystery'},
   {'id': 878, 'name': 'Science Fiction'}],
  'tt10530176',
  'Connected by phone in the same home but 20 years apart, a serial killer puts another woman’s past — and life — on the line to change her own fate.',
  [{'iso_3166_1': 'KR', 'name': 'South Korea'}],
  7.6,
  407,
  'The Call',
  '콜'],
 [[{'id': 16, 'name': 'Animation'}, {'id': 10751, 'name': 'Family'}],
  'tt4105584',
  'In 2100, when humanity has abandoned the earth, a colony of extravagant creatures still thrives in the deepest abyss of the ocean.',
  [{'iso_3166_1': 'BE', 'name': 'Belgium'},
   {'iso_3166_1': 'CN', 'name': 'China'},
   {'iso_3166_1': 'ES', 'name': 'Spain'},
   {'iso_3166_1': 'CH', 'name': 'Switzerland'},
   {'iso_3166_1': 'GB', 'name': 'United Kingdom'},
   {'iso_3166_1': 'US', 'name': 'United States of America'}],
  6.0,
  120,
  'Deep',
  'Deep'],
 [[{'id': 18, 'name': 'Drama'},
   {'id': 27, 'name': 'Horror'},
   {'id': 53, 'name': 

In [66]:
# Clean the new downloaded data
data_df_2 = pd.DataFrame(movie_data_2, columns=["genres", 
                                                "imdb_id", 
                                                "overview", 
                                                "production_countries", 
                                                "vote_average", 
                                                "vote_count",
                                                "title",
                                                "original_title"])
data_df_2.head()

Unnamed: 0,genres,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title
0,"[{'id': 53, 'name': 'Thriller'}, {'id': 9648, ...",tt10530176,Connected by phone in the same home but 20 yea...,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",7.6,407,The Call,콜
1,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",tt4105584,"In 2100, when humanity has abandoned the earth...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",6.0,120,Deep,Deep
2,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",tt11284280,"In the aftermath of a nuclear disaster, a star...","[{'iso_3166_1': 'FI', 'name': 'Finland'}, {'is...",5.6,413,Cadaver,Kadaver
3,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",tt13717980,"Ahead of a promotion, a police chief becomes e...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",6.6,32,Grudge,Kin
4,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",tt13067004,"For Deniz, a 16-year-old teenager, this summer...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",5.8,31,Last Summer,Geçen Yaz


In [67]:
# Build first genre column
genres_1 = []
for genre in data_df_2["genres"]:
    try:
        genres_1.append(genre[0])
    except:
        genres_1.append("?")
        
genres_1

[{'id': 53, 'name': 'Thriller'},
 {'id': 16, 'name': 'Animation'},
 {'id': 18, 'name': 'Drama'},
 {'id': 80, 'name': 'Crime'},
 {'id': 10749, 'name': 'Romance'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 28, 'name': 'Action'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 80, 'name': 'Crime'},
 {'id': 18, 'name': 'Drama'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 99, 'name': 'Documentary'}]

In [68]:
genres_1_clean = []

for genres in genres_1:
    try:
        genres_1_clean.append(genres['name'])
    except:
        genres_1_clean.append("?")
        
genres_1_clean

['Thriller',
 'Animation',
 'Drama',
 'Crime',
 'Romance',
 'Comedy',
 'Thriller',
 'Action',
 'Thriller',
 'Crime',
 'Drama',
 'Comedy',
 'Documentary']

In [69]:
data_df_2["genre_1"] = genres_1_clean

data_df_2.head()

Unnamed: 0,genres,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title,genre_1
0,"[{'id': 53, 'name': 'Thriller'}, {'id': 9648, ...",tt10530176,Connected by phone in the same home but 20 yea...,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",7.6,407,The Call,콜,Thriller
1,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",tt4105584,"In 2100, when humanity has abandoned the earth...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",6.0,120,Deep,Deep,Animation
2,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",tt11284280,"In the aftermath of a nuclear disaster, a star...","[{'iso_3166_1': 'FI', 'name': 'Finland'}, {'is...",5.6,413,Cadaver,Kadaver,Drama
3,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",tt13717980,"Ahead of a promotion, a police chief becomes e...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",6.6,32,Grudge,Kin,Crime
4,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",tt13067004,"For Deniz, a 16-year-old teenager, this summer...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",5.8,31,Last Summer,Geçen Yaz,Romance


In [70]:
# Build second genre column
genres_2 = []
for genre in data_df_2["genres"]:
    try:
        genres_2.append(genre[1])
    except:
        genres_2.append("?")
        
genres_2

[{'id': 9648, 'name': 'Mystery'},
 {'id': 10751, 'name': 'Family'},
 {'id': 27, 'name': 'Horror'},
 {'id': 18, 'name': 'Drama'},
 {'id': 18, 'name': 'Drama'},
 '?',
 '?',
 {'id': 37, 'name': 'Western'},
 {'id': 18, 'name': 'Drama'},
 {'id': 18, 'name': 'Drama'},
 '?',
 {'id': 10751, 'name': 'Family'},
 {'id': 35, 'name': 'Comedy'}]

In [71]:
genres_2_clean = []

for genres in genres_2:
    try:
        genres_2_clean.append(genres['name'])
    except:
        genres_2_clean.append("?")
        
genres_2_clean

['Mystery',
 'Family',
 'Horror',
 'Drama',
 'Drama',
 '?',
 '?',
 'Western',
 'Drama',
 'Drama',
 '?',
 'Family',
 'Comedy']

In [72]:
data_df_2["genre_2"] = genres_2_clean

In [73]:
# Drop genres column
data_df_2.drop(columns = "genres", inplace = True)
data_df_2.head()

Unnamed: 0,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title,genre_1,genre_2
0,tt10530176,Connected by phone in the same home but 20 yea...,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",7.6,407,The Call,콜,Thriller,Mystery
1,tt4105584,"In 2100, when humanity has abandoned the earth...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",6.0,120,Deep,Deep,Animation,Family
2,tt11284280,"In the aftermath of a nuclear disaster, a star...","[{'iso_3166_1': 'FI', 'name': 'Finland'}, {'is...",5.6,413,Cadaver,Kadaver,Drama,Horror
3,tt13717980,"Ahead of a promotion, a police chief becomes e...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",6.6,32,Grudge,Kin,Crime,Drama
4,tt13067004,"For Deniz, a 16-year-old teenager, this summer...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",5.8,31,Last Summer,Geçen Yaz,Romance,Drama


In [74]:
# Build first production country column
country_1 = []
for country in data_df_2["production_countries"]:
    try:
        country_1.append(country[0])
    except:
        country_1.append("?")
        
country_1

[{'iso_3166_1': 'KR', 'name': 'South Korea'},
 {'iso_3166_1': 'BE', 'name': 'Belgium'},
 {'iso_3166_1': 'FI', 'name': 'Finland'},
 {'iso_3166_1': 'TR', 'name': 'Turkey'},
 {'iso_3166_1': 'TR', 'name': 'Turkey'},
 {'iso_3166_1': 'FR', 'name': 'France'},
 {'iso_3166_1': 'CA', 'name': 'Canada'},
 {'iso_3166_1': 'BR', 'name': 'Brazil'},
 {'iso_3166_1': 'ES', 'name': 'Spain'},
 {'iso_3166_1': 'CA', 'name': 'Canada'},
 {'iso_3166_1': 'IN', 'name': 'India'},
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 {'iso_3166_1': 'CA', 'name': 'Canada'}]

In [75]:
country_1_clean = []

for country in country_1:
    try:
        country_1_clean.append(country['name'])
    except:
        country_1_clean.append("?")
        
country_1_clean

['South Korea',
 'Belgium',
 'Finland',
 'Turkey',
 'Turkey',
 'France',
 'Canada',
 'Brazil',
 'Spain',
 'Canada',
 'India',
 'United States of America',
 'Canada']

In [76]:
data_df_2["country_1"] = country_1_clean

data_df_2.head()

Unnamed: 0,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1
0,tt10530176,Connected by phone in the same home but 20 yea...,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",7.6,407,The Call,콜,Thriller,Mystery,South Korea
1,tt4105584,"In 2100, when humanity has abandoned the earth...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",6.0,120,Deep,Deep,Animation,Family,Belgium
2,tt11284280,"In the aftermath of a nuclear disaster, a star...","[{'iso_3166_1': 'FI', 'name': 'Finland'}, {'is...",5.6,413,Cadaver,Kadaver,Drama,Horror,Finland
3,tt13717980,"Ahead of a promotion, a police chief becomes e...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",6.6,32,Grudge,Kin,Crime,Drama,Turkey
4,tt13067004,"For Deniz, a 16-year-old teenager, this summer...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",5.8,31,Last Summer,Geçen Yaz,Romance,Drama,Turkey


In [77]:
# Build second production country column
country_2 = []
for country in data_df_2["production_countries"]:
    try:
        country_2.append(country[1])
    except:
        country_2.append("?")
        
country_2

['?',
 {'iso_3166_1': 'CN', 'name': 'China'},
 {'iso_3166_1': 'NO', 'name': 'Norway'},
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 {'iso_3166_1': 'US', 'name': 'United States of America'},
 '?',
 '?',
 {'iso_3166_1': 'US', 'name': 'United States of America'}]

In [78]:
country_2_clean = []

for country in country_2:
    try:
        country_2_clean.append(country['name'])
    except:
        country_2_clean.append("?")
        
country_2_clean

['?',
 'China',
 'Norway',
 '?',
 '?',
 '?',
 '?',
 '?',
 '?',
 'United States of America',
 '?',
 '?',
 'United States of America']

In [79]:
data_df_2["country_2"] = country_2_clean

data_df_2.head()

Unnamed: 0,imdb_id,overview,production_countries,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1,country_2
0,tt10530176,Connected by phone in the same home but 20 yea...,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",7.6,407,The Call,콜,Thriller,Mystery,South Korea,?
1,tt4105584,"In 2100, when humanity has abandoned the earth...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",6.0,120,Deep,Deep,Animation,Family,Belgium,China
2,tt11284280,"In the aftermath of a nuclear disaster, a star...","[{'iso_3166_1': 'FI', 'name': 'Finland'}, {'is...",5.6,413,Cadaver,Kadaver,Drama,Horror,Finland,Norway
3,tt13717980,"Ahead of a promotion, a police chief becomes e...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",6.6,32,Grudge,Kin,Crime,Drama,Turkey,?
4,tt13067004,"For Deniz, a 16-year-old teenager, this summer...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",5.8,31,Last Summer,Geçen Yaz,Romance,Drama,Turkey,?


In [80]:
data_df_2.drop(columns = "production_countries", inplace = True)

data_df_2.head()

Unnamed: 0,imdb_id,overview,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1,country_2
0,tt10530176,Connected by phone in the same home but 20 yea...,7.6,407,The Call,콜,Thriller,Mystery,South Korea,?
1,tt4105584,"In 2100, when humanity has abandoned the earth...",6.0,120,Deep,Deep,Animation,Family,Belgium,China
2,tt11284280,"In the aftermath of a nuclear disaster, a star...",5.6,413,Cadaver,Kadaver,Drama,Horror,Finland,Norway
3,tt13717980,"Ahead of a promotion, a police chief becomes e...",6.6,32,Grudge,Kin,Crime,Drama,Turkey,?
4,tt13067004,"For Deniz, a 16-year-old teenager, this summer...",5.8,31,Last Summer,Geçen Yaz,Romance,Drama,Turkey,?


In [81]:
# Replace with this DataFrame the original one
ids_to_correct = [16, 58, 76, 109, 120, 122, 258, 300, 389, 409, 580, 637, 657]

n = 0
for i in ids_to_correct:
    df.iloc[i, 8:] = data_df_2.iloc[n, :]
    n += 1
    
df.iloc[16, :]    

Title                                                      The Call
Genre                                                         Drama
Premiere                                        2020-11-27 00:00:00
Runtime                                                         112
IMDB Score                                                      4.1
Language                                                     Korean
premiere_year                                                  2020
tmdb_id                                                      575604
imdb_id                                                  tt10530176
overview          Connected by phone in the same home but 20 yea...
vote_average                                                    7.6
vote_count                                                      407
title                                                      The Call
original_title                                                    콜
genre_1                                         

### 3. Get cast data

#### 3.1 Full cast

In [82]:
def get_cast(movie_id):
    '''
    Takes movie's id, communicates with TMDB API and gets its entire cast and director
    
    Parameters
    ==========
    
    movie_id: int 
        movie's id we use to connect to the API
   
    Returns
    =======
    
    list
        list with actor's name 
    
    >>> The details can be found here: https://developers.themoviedb.org/3/movies/get-movie-credits'''
    
    # Connect with the API
    response = requests.get(f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key=f062d4d3bef1ed6f224531125e4c20c7&language=en-US")
    
    # Save the cast and crew in respective lists
    cast = list(response.json()["cast"])
    crew = list(response.json()["crew"])
    
    # List to fullfill
    actor_list = []
    directors = []
    
    ## Loop that creates a list with the original name of every actor (if actor is relevant)
    for actor in cast:
        if actor["gender"] != 0:
            actor_list.append(actor["original_name"])
        else:
            pass
        
        
    ## Get the director
    for person in crew:
        if person["job"] == "Director":
            directors.append(person["original_name"])
        else:
            pass
        
        
    return actor_list, directors

In [83]:
# Prove
get_cast(398978)

(['Robert De Niro',
  'Al Pacino',
  'Joe Pesci',
  'Stephen Graham',
  'Ray Romano',
  'Harvey Keitel',
  'Bobby Cannavale',
  'Anna Paquin',
  'Stephanie Kurtzuba',
  'Kathrine Narducci',
  'Welker White',
  'Jesse Plemons',
  'Jack Huston',
  'Domenick Lombardozzi',
  'Paul Herman',
  'Louis Cancelmi',
  'Gary Basaraba',
  'Marin Ireland',
  'Sebastian Maniscalco',
  'Steven Van Zandt',
  'Lucy Gallina',
  'Dascha Polanco',
  'Bo Dietl',
  'Aleksa Palladino',
  'Daniel Jenkins',
  'Jim Norton',
  'Billy Smith',
  "Kevin O'Rourke",
  'Action Bronson',
  'Glenn Cunningham',
  'Paul Ben-Victor',
  'Patrick Gallo',
  'Jake Hoffman',
  'Barry Primus',
  'Danny A. Abeckaser',
  'Anthony J. Gallo',
  'J. C. MacKenzie',
  'John Polce',
  'Joseph Riccobene',
  'Vinny Vella',
  'Thomas E. Sullivan',
  'John Cenatiempo',
  'Robert Funaro',
  'Jennifer Mudge',
  'India Ennenga',
  'Jordyn DiNatale',
  'Kate Arrington',
  'Philip Suriano',
  'James P. Harkins',
  'Garry Pastore',
  'Frank Pietra

In [84]:
actors_directors = [get_cast(tmdb_id) for tmdb_id in df["tmdb_id"]]

In [85]:
actors_directors[0][0]

['Kouzou Morishita',
 'Kenji Kamiyama',
 'Youko Takahashi',
 'Alex Burunova',
 'Adi Shankar',
 'LeSean Thomas']

In [86]:
len(actors_directors)

684

In [87]:
df.shape

(684, 18)

##### Build the columns "cast" and "director"

In [88]:
actors_list = []

for actors in actors_directors:
    actors_list.append(actors[0])
    
df["cast"] = actors_list
df["cast"].sample(10)

549    [Carey Mulligan, Ralph Fiennes, Lily James, Jo...
675    [Jason Schwartzman, J.K. Simmons, Rashida Jone...
243    [Rishi Kapoor, Amyra Dastur, Nirmal Rishi, Apa...
200    [Tiya Sircar, Chad Connell, Marco Grazzini, Je...
272    [Dhanush, James Cosmo, Joju George, Kalaiyaras...
107    [Nasim Pedrad, Anna Camp, Lamorne Morris, Sara...
603    [Simon Greenall, Rob Rackstraw, Jo Wyatt, Keit...
121    [Rose McIver, Ben Lamb, Alice Krige, Honor Kne...
613                                                   []
130    [Vanessa Hudgens, Sam Palladio, Nick Sagar, Su...
Name: cast, dtype: object

In [89]:
df.iloc[117,:]

Title                                                     Intuition
Genre                                                      Thriller
Premiere                                        2020-05-28 00:00:00
Runtime                                                         116
IMDB Score                                                      5.3
Language                                                    Spanish
premiere_year                                                  2020
tmdb_id                                                      620883
imdb_id                                                  tt12282598
overview          Police officer Pipa works on her first big cas...
vote_average                                                    6.3
vote_count                                                      149
title                                                     Intuition
original_title                                        La corazonada
genre_1                                         

In [90]:
directors_list = []

for actors in actors_directors:
    directors_list.append(actors[1])
    
df["director"] = directors_list
df["director"].sample(10)

399            [Sachin Yardi]
395             [Brett Haley]
218               [Ben Young]
222    [Shaun Paul Piccinino]
200     [Kate Miles Melville]
192      [Steven K. Tsuchida]
79            [Michael Dowse]
307              [Jim Mickle]
405               [Adam Leon]
19             [Chandra Liow]
Name: director, dtype: object

#### 3.3 Gender column

In [91]:
def get_male(movie_id):
    '''
    Gets a movie id and get the sum of male actors in the movie
    
    Parameters
    ==========
    
    movie_id: int
        Movie's ID
    
    Returns
    =======
    int
        Sum of male actors
    '''
    
    # Connect with the API
    response = requests.get(f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key=f062d4d3bef1ed6f224531125e4c20c7&language=en-US")

    # Save the cast in a list
    cast = list(response.json()["cast"])
    
    # Sum of male actors
    male_actors = 0
    
    ## Loop that builds the sum of male actors
    for actor in cast:
        if actor["gender"] == 2:
            male_actors += 1
        else:
            pass
        
    # Return the sum
    return male_actors

In [93]:
def get_female(movie_id):
    '''
    Gets a movie id and get the sum of female actors in the movie
    
    Parameters
    ==========
    
    movie_id: int
        Movie's ID
    
    Returns
    =======
    int
        Sum of female actors
    '''
    
    # Connect with the API
    response = requests.get(f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key=f062d4d3bef1ed6f224531125e4c20c7&language=en-US")

    # Save the cast in a list
    cast = list(response.json()["cast"])
    
    # Sum of male actors
    female_actors = 0
    
    ## Loop that builds the sum of male actors
    for actor in cast:
        if actor["gender"] == 1:
            female_actors += 1
        else:
            pass
        
    # Return the sum
    return female_actors

In [95]:
# Prove
print(get_male(398978), get_female(398978))

78 24


In [97]:
# Data validation: the sum of get_male and get female must be equal to the length of get_cast's actors_list
get_male(398978) + get_female(398978) == len(get_cast(398978)[0])

True

In [99]:
# Build columns
df["male_actors"] = [get_male(movie_id) for movie_id in df["tmdb_id"] ]
df["female_actors"] = [get_female(movie_id) for movie_id in df["tmdb_id"] ]

In [100]:
df.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language,premiere_year,tmdb_id,imdb_id,overview,vote_average,vote_count,title,original_title,genre_1,genre_2,country_1,country_2,cast,director,male_actors,female_actors
0,Enter the Anime,Documentary,2019-08-05,58,2.5,English/Japanese,2019,616904,tt10662450,What is anime? Through deep-dives with notable...,5.8,51,Enter the Anime,Enter the Anime,Documentary,?,United States of America,Japan,"[Kouzou Morishita, Kenji Kamiyama, Youko Takah...",[Alex Burunova],4,2
1,Dark Forces,Thriller,2020-08-21,81,2.6,Spanish,2020,735110,tt12536776,"In search of his sister, a renegade criminal s...",4.4,25,Dark Forces,Fuego negro,Horror,Action,?,?,"[Tenoch Huerta, Eréndira Ibarra, Ariane Pellic...",[Bernardo Arellano],2,2
2,The App,Science fiction/Drama,2019-12-26,79,2.6,Italian,2019,653522,tt11385066,"Loving girlfriend, family fortune, breakout mo...",2.8,105,The App,The App,Drama,Science Fiction,Italy,?,"[Vincenzo Crea, Greta Scarano, Maya Sansa, Abe...",[Elisa Fuksas],3,3
3,Sex: Unzipped,Comedy,2021-10-26,59,3.1,English,2021,890280,tt15523050,Rap superstar Saweetie hosts a celebration of ...,2.7,3,Sex: Unzipped,Sex: Unzipped,Comedy,?,United States of America,?,"[Saweetie, Joel Kim Booster, Michelle Buteau, ...",[],3,7
4,Swallow,Drama,2021-10-01,128,3.2,English,2021,874562,tt14391622,"Set in the 1980s, Tolani Ajao is a bank secret...",4.7,3,Swallow,Swallow,Drama,History,?,?,[],[Kunle Afolayan],0,0


In [102]:
# Data validation: the sum of "male_actors" and "female" actors must be equal to the lenght of cast

wrong = 0

for row in df:
    if row[-2] + row[-1] == len(row[-4]):
        wrong += 1
    else:
        pass
    
wrong

0

### 4. Save final DF

In [103]:
df.to_excel("final_netflix_df.xlsx", index = False)