In [1]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

from config import db_password

import time

In [2]:
# Clean a single raw Wikipedia movie record (called once per movie by the ETL).
def clean_movie(movie):
    """Return a cleaned copy of one raw movie record.

    Two clean-ups are applied to a shallow copy of ``movie`` (the caller's
    mapping is never mutated):

    1. Every alternative-title key that is present (e.g. ``'French'``,
       ``'Also known as'``) is removed and collected into a single
       ``'alt_titles'`` dict. ``'alt_titles'`` is only added when at least one
       such key was found.
    2. Redundant column names are consolidated onto one canonical name
       (e.g. ``'Directed by'`` -> ``'Director'``).

    Args:
        movie: Mapping of field name -> value for one movie.

    Returns:
        dict: The cleaned record.
    """
    movie = dict(movie)  # shallow copy so the input is left untouched
    alt_titles = {}

    # Move every alternative-title field that is present into one nested dict.
    for key in ['Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
                'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally', 'Mandarin',
                'McCune-Reischauer', "Original title", 'Polish', 'Revised Romanization',
                'Romanized', 'Russian', 'Simplified', 'Traditional', 'Yiddish']:
        if key in movie:
            alt_titles[key] = movie.pop(key)
    if alt_titles:
        movie['alt_titles'] = alt_titles

    def change_column_name(old_name, new_name):
        """Rename ``old_name`` to ``new_name`` in ``movie`` if it is present."""
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)

    # (old name, canonical name) pairs, applied in order. When several old names
    # map to the same canonical name, a later match overwrites an earlier one.
    # The trailing spaces in the 'Productioncompan...' keys are intentional and
    # reproduced exactly from the original rename list.
    column_renames = (
        ('Directed by', 'Director'),
        ('Adaptation by', 'Writer(s)'),
        ('Country of origin', 'Country'),
        ('Distributed by', 'Distributor'),
        ('Edited by', 'Editor(s)'),
        ('Length', 'Running time'),
        ('Original release', 'Release date'),
        ('Music by', 'Composer(s)'),
        ('Produced by', 'Producer(s)'),
        ('Producer', 'Producer(s)'),
        ('Productioncompanies ', 'Production company(s)'),
        ('Productioncompany ', 'Production company(s)'),
        ('Released', 'Release date'),
        # Previously mapped to 'Released date', which created a stray column
        # instead of consolidating into 'Release date' like the entries above.
        ('Released Date', 'Release date'),
        ('Screenplay by', 'Writer(s)'),
        ('Screen story by', 'Writer(s)'),
        ('Story by', "Writer(s)"),
        ('Theme music composer', 'Composer(s)'),
        ('Written by', 'Writer(s)'),
    )
    for old_name, new_name in column_renames:
        change_column_name(old_name, new_name)

    return movie
    

In [3]:
# 1. ETL entry point. The three source files (Wikipedia JSON, Kaggle metadata CSV,
# MovieLens ratings CSV) may be passed explicitly; each one defaults to its usual
# file name under the module-level file_dir.

def extract_transform_load(wiki_file=None, kaggle_file=None, ratings_file=None):
    """Extract, clean, merge and load the Wikipedia / Kaggle / MovieLens movie data.

    Args:
        wiki_file: Path of the Wikipedia movies JSON file. Defaults to
            ``f'{file_dir}/wikipedia-movies.json'``.
        kaggle_file: Path of the Kaggle metadata CSV file. Defaults to
            ``f'{file_dir}/movies_metadata.csv'``.
        ratings_file: Path of the MovieLens ratings CSV file. Defaults to
            ``f'{file_dir}/ratings.csv'``.

    Returns:
        tuple: ``(wiki_movies_df, movies_with_ratings_df, movies_df)`` --
        the cleaned Wikipedia frame, the merged frame with per-rating counts,
        and the merged Wikipedia/Kaggle frame.

    Side effects:
        Replaces the ``movies`` table with ``movies_df`` and rewrites the
        ``ratings`` table (in 1,000,000-row chunks) in the PostgreSQL database
        ``movie_data`` at 127.0.0.1:5432, authenticating with ``db_password``.
        Prints the name of every merged column holding a single value and the
        import progress.
    """
    # Fall back to the original hard-wired locations when no path is supplied.
    if wiki_file is None:
        wiki_file = f'{file_dir}/wikipedia-movies.json'
    if kaggle_file is None:
        kaggle_file = f'{file_dir}/movies_metadata.csv'
    if ratings_file is None:
        ratings_file = f'{file_dir}/ratings.csv'

    def join_if_list(value):
        """Join a list of strings with spaces; pass any other value through."""
        return ' '.join(value) if isinstance(value, list) else value

    # ---------------------------- Extract ----------------------------------

    # Read in the Kaggle metadata and MovieLens ratings CSV files.
    kaggle_metadata = pd.read_csv(kaggle_file, low_memory=False)
    ratings = pd.read_csv(ratings_file)

    # Open and read the Wikipedia data JSON file (JSON text is UTF-8).
    with open(wiki_file, encoding='utf-8') as file:
        wiki_movies_raw = json.load(file)

    # ---------------------- Transform: Wikipedia ---------------------------

    # Keep only entries that have a director and an IMDb link and are not TV shows.
    wiki_movies = [movie for movie in wiki_movies_raw
                   if ('Director' in movie or "Directed by" in movie)
                   and 'imdb_link' in movie
                   and 'No. of episodes' not in movie]

    # Clean every remaining movie and build a DataFrame from the results.
    clean_movies = [clean_movie(movie) for movie in wiki_movies]
    wiki_movies_df = pd.DataFrame(clean_movies)

    # Extract the IMDb ID from the IMDb link and drop duplicate IDs.
    # Any error is captured and printed rather than silently replaced by a fixed message.
    try:
        wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
        wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
    except Exception as e:
        print(f'IMDb ID extraction failed: {e!r}')

    # Keep the columns that are null in fewer than 90% of the rows.
    notnull_columns = [column for column in wiki_movies_df.columns
                       if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
    # .copy() so the column assignments below act on an independent frame.
    wiki_movies_df = wiki_movies_df[notnull_columns].copy()

    # Box office values, with list entries joined into single strings.
    box_office = wiki_movies_df['Box office'].apply(join_if_list)

    # "form one": $123.4 million / billion
    form_one = r'\$\d+\.?\d*\s*[mb]illion'
    # "form two": $123,456,789
    form_two = r'\$\d{1,3}(?:,\d{3})+'

    def parse_dollars(s):
        """Convert a dollar-amount string to a float; return NaN if unparseable."""
        # if s is not a string, return NaN
        if type(s) != str:
            return np.nan

        # if input is of the form $###.# million
        if re.match(r'\$\s*\d+\.?\d*\s*milli?on', s, flags=re.IGNORECASE):
            # remove dollar sign and " million", then scale by a million
            s = re.sub(r'\$|\s|[a-zA-Z]', '', s)
            return float(s) * 10**6

        # if input is of the form $###.# billion
        elif re.match(r'\$\s*\d+\.?\d*\s*billi?on', s, flags=re.IGNORECASE):
            # remove dollar sign and " billion", then scale by a billion
            s = re.sub(r'\$|\s|[a-zA-Z]', '', s)
            return float(s) * 10**9

        # if input is of the form $###,###,###
        elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)', s, flags=re.IGNORECASE):
            # remove dollar sign and commas
            s = re.sub(r'\$|,', '', s)
            return float(s)

        # otherwise, return NaN
        else:
            return np.nan

    # Clean the box office column, then drop the raw 'Box office' column.
    wiki_movies_df['box_office'] = box_office.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df.drop('Box office', axis=1, inplace=True)

    # Clean the budget column.
    budget = wiki_movies_df['Budget'].dropna()
    budget = budget.map(join_if_list)
    # Replace everything between a $ sign and a hyphen/dash with a $ sign.
    budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
    # Remove citation references such as "[4]". regex=True is required:
    # without it newer pandas treats the pattern as a literal string.
    budget = budget.str.replace(r'\[\d+\]\s*', '', regex=True)
    wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df.drop('Budget', axis=1, inplace=True)

    # Clean the release date column.
    release_date = wiki_movies_df['Release date'].dropna().apply(join_if_list)
    # The four recognised date forms: "Month D, YYYY", "YYYY?MM?DD", "Month YYYY", "YYYY".
    date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]?\d,\s\d{4}'
    date_form_two = r'\d{4}.[01]\d.[0123]\d'
    date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
    date_form_four = r'\d{4}'
    # NOTE(review): infer_datetime_format is deprecated in pandas 2.x; kept unchanged
    # here because the replacement depends on the installed pandas version.
    wiki_movies_df['release_date'] = pd.to_datetime(release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0], infer_datetime_format=True)
    wiki_movies_df.drop('Release date', axis=1, inplace=True)

    # Clean the running time column.
    running_time = wiki_movies_df['Running time'].dropna().apply(join_if_list)
    # Groups 0/1 capture "<hours> h <minutes>", group 2 captures "<minutes> m".
    running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')
    # Convert to numbers; empty strings become NaN, and NaN becomes 0.
    running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)
    # Convert extracted times to minutes.
    wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)
    wiki_movies_df.drop('Running time', axis=1, inplace=True)

    # ------------------------ Transform: Kaggle ----------------------------

    # Keep rows whose 'adult' value is the string 'False', then drop the column.
    kaggle_metadata = kaggle_metadata[kaggle_metadata['adult'] == 'False'].drop('adult', axis='columns')

    # Convert 'video' to boolean.
    kaggle_metadata['video'] = kaggle_metadata['video'] == "True"

    # Convert 'budget', 'id', 'popularity' to numeric data types.
    kaggle_metadata['budget'] = kaggle_metadata['budget'].astype(int)
    kaggle_metadata['id'] = pd.to_numeric(kaggle_metadata['id'], errors='raise')
    kaggle_metadata['popularity'] = pd.to_numeric(kaggle_metadata['popularity'], errors='raise')

    # Convert 'release_date' to datetime.
    kaggle_metadata['release_date'] = pd.to_datetime(kaggle_metadata['release_date'])

    # Convert the ratings 'timestamp' column (given in seconds) to datetime.
    ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

    # ------------------------------ Merge ----------------------------------

    movies_df = pd.merge(wiki_movies_df, kaggle_metadata, on='imdb_id', suffixes=['wiki', 'kaggle'])

    # Drop the outlier row(s) where the wiki date is after 1996 but the Kaggle date is before 1965.
    movies_df = movies_df.drop(movies_df[(movies_df['release_datewiki'] > '1996-01-01') & (movies_df['release_datekaggle'] < '1965-01-01')].index)

    # Drop the redundant wiki columns. The result must be assigned back:
    # drop() returns a new frame and leaves the original untouched.
    movies_df = movies_df.drop(columns=['titlewiki', 'release_datewiki', 'Language', 'Production company(s)'])

    def fill_missing_kaggle_data(df, kaggle_column, wiki_column):
        """Fill zero Kaggle values from the wiki column, then drop the wiki column (in place)."""
        df[kaggle_column] = df.apply(
            lambda row: row[wiki_column] if row[kaggle_column] == 0 else row[kaggle_column], axis=1)

        df.drop(columns=wiki_column, inplace=True)

    fill_missing_kaggle_data(movies_df, 'runtime', 'running_time')
    fill_missing_kaggle_data(movies_df, 'budgetkaggle', 'budgetwiki')
    fill_missing_kaggle_data(movies_df, 'revenue', 'box_office')

    # Print every column that holds a single value (lists are made hashable first).
    def lists_to_tuples(value):
        return tuple(value) if type(value) == list else value

    for col in movies_df.columns:
        value_counts = movies_df[col].apply(lists_to_tuples).value_counts(dropna=False)
        if len(value_counts) == 1:
            print(col)

    # Drop the 'video' column.
    movies_df = movies_df.drop(columns='video')

    # Select and reorder the final columns, then rename them.
    movies_df = movies_df.loc[:, ['imdb_id', 'id', 'titlekaggle', 'original_title', 'tagline', 'belongs_to_collection', 'url', 'imdb_link', 'runtime', 'budgetkaggle', 'revenue', 'release_datekaggle', 'popularity', 'vote_average', 'vote_count', 'genres', 'original_language', 'overview', 'spoken_languages', 'Country', 'production_companies', 'production_countries', 'Distributor', 'Producer(s)', 'Director', 'Starring', 'Cinematography', 'Editor(s)', 'Writer(s)', 'Composer(s)', 'Based on']]

    movies_df = movies_df.rename(columns={'id': 'kaggle_id',
                                          'titlekaggle': 'title',
                                          'url': 'wikipedia_url',
                                          'budgetkaggle': 'budget',
                                          'release_datekaggle': 'release_date',
                                          'Country': 'country',
                                          'Distributor': 'distributor',
                                          'Producer(s)': 'producers',
                                          'Director': 'director',
                                          'Starring': 'starring',
                                          'Cinematography': 'cinematography',
                                          'Editor(s)': 'editors',
                                          'Writer(s)': 'writers',
                                          'Composer(s)': 'composers',
                                          'Based on': 'based_on'})

    # ------------------------ Transform: ratings ---------------------------

    # Count ratings per (movieId, rating), then pivot so that each movie is a row
    # and each rating value is a column holding its count.
    rating_counts = ratings.groupby(['movieId', 'rating'], as_index=False).count()\
        .rename({'userId': 'count'}, axis=1)\
        .pivot(index='movieId', columns='rating', values='count')

    # Prefix every rating column with 'rating_'.
    rating_counts.columns = ['rating_' + str(col) for col in rating_counts.columns]

    # Left-merge the rating counts onto the movies and fill missing counts with 0.
    movies_with_ratings_df = pd.merge(movies_df, rating_counts, left_on='kaggle_id', right_index=True, how='left')
    movies_with_ratings_df[rating_counts.columns] = movies_with_ratings_df[rating_counts.columns].fillna(0)

    # ------------------------------- Load ----------------------------------

    # Create the SQLAlchemy connection string and database engine.
    db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/movie_data"
    engine = create_engine(db_string)

    # Import and save the movie data.
    movies_df.to_sql(name='movies', con=engine, if_exists='replace')

    rows_imported = 0
    start_time = time.time()

    # Import the ratings data in chunks.
    for data in pd.read_csv(ratings_file, chunksize=1000000):

        # Announce the range of rows being imported.
        print(f'Importing rows {rows_imported} to {rows_imported + len(data)}...', end=' ')

        # The first chunk replaces the table so that re-running the ETL does not
        # duplicate every rating; the remaining chunks are appended.
        data.to_sql(name='ratings', con=engine,
                    if_exists='replace' if rows_imported == 0 else 'append')

        rows_imported += len(data)

        # Report completion together with the total elapsed time.
        print(f'Done. {time.time() - start_time} total seconds elapsed')

    return wiki_movies_df, movies_with_ratings_df, movies_df

In [4]:
# 10. Create the path to your file directory and variables for the three files.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
file_dir = '/Users/Tracari/Desktop/Desktop Air/UM Projects/Movies-ETL'
# The Wikipedia data. The name is hyphenated to match the file that
# extract_transform_load actually opens (the underscore variant did not match it).
wiki_file = f'{file_dir}/wikipedia-movies.json'
# The Kaggle metadata
kaggle_file = f'{file_dir}/movies_metadata.csv'
# The MovieLens rating data.
ratings_file = f'{file_dir}/ratings.csv'

In [10]:
# 11. Run the ETL pipeline. NOTE: the three names below are rebound from file paths to
# the DataFrames returned by the function, in this order:
# wiki_movies_df, movies_with_ratings_df, movies_df.
wiki_file, kaggle_file, ratings_file = extract_transform_load()



video
Importing rows 0 to 1000000... Done. 25.155937910079956 total seconds elapsed
Importing rows 1000000 to 2000000... Done. 51.59819006919861 total seconds elapsed
Importing rows 2000000 to 3000000... Done. 78.24881410598755 total seconds elapsed
Importing rows 3000000 to 4000000... Done. 102.86910891532898 total seconds elapsed
Importing rows 4000000 to 5000000... Done. 126.22461581230164 total seconds elapsed
Importing rows 5000000 to 6000000... Done. 150.02155208587646 total seconds elapsed
Importing rows 6000000 to 7000000... Done. 173.3605728149414 total seconds elapsed
Importing rows 7000000 to 8000000... Done. 196.92515897750854 total seconds elapsed
Importing rows 8000000 to 9000000... Done. 220.1025788784027 total seconds elapsed
Importing rows 9000000 to 10000000... Done. 243.000746011734 total seconds elapsed
Importing rows 10000000 to 11000000... Done. 265.47998690605164 total seconds elapsed
Importing rows 11000000 to 12000000... Done. 289.44111585617065 total seconds e

In [6]:
# 12. Give the three returned DataFrames (currently bound to the file-name
# variables from Step 11) their descriptive names.
wiki_movies_df, movies_with_ratings_df, movies_df = wiki_file, kaggle_file, ratings_file

In [7]:
# 13. Preview the first 10 rows of the wiki_movies_df DataFrame.
wiki_movies_df.head(10)

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Country,Language,Director,...,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id,box_office,budget,release_date,running_time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,United States,English,Renny Harlin,...,Michael Tronick,"[Cliff Eidelman, Yello]","[Steve Perry, Joel Silver]",Silver Pictures,"[David Arnott, James Cappe]",tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,United States,English,James Foley,...,Howard E. Smith,Maurice Jarre,"[Ric Kidney, Robert Redlin]",Avenue Pictures,"[James Foley, Robert Redlin]",tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,United States,"[English, Lao]",Roger Spottiswoode,...,"[John Bloom, Lois Freeman-Fox]",Charles Gross,Daniel Melnick,"[Carolco Pictures, IndieProd Company]","[John Eskow, Richard Rush]",tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,United States,English,Woody Allen,...,Susan E. Morse,,Robert Greenhut,,Woody Allen,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,US,English,John Cornell,...,David Stiven,Maurice Jarre,John Cornell,,Paul Hogan,tt0099018,6939946.0,25000000.0,1990-12-19,95.0
5,https://en.wikipedia.org/wiki/The_Ambulance,1990,https://www.imdb.com/title/tt0099026/,The Ambulance,,"[Eric Roberts, James Earl Jones, Red Buttons, ...",Jacques Haitkin,United States,English,Larry Cohen,...,"[Claudia Finkle, Armond Leibowitz]",Jay Chattaway,"[Larry Cohen, Moctesuma Esparza, Robert Katz]",Epic Productions,Larry Cohen,tt0099026,,,1990-03-22,95.0
6,https://en.wikipedia.org/wiki/American_Dream_(...,1990,https://www.imdb.com/title/tt0099028/,American Dream,,,"[Tom Hurwitz, Mathieu Roberts, Nesya Shapiro]","[United States, United Kingdom]",English,"[Barbara Kopple, Co-directors:, Cathy Caplan, ...",...,"[Cathy Caplan, Thomas Haneke, Lawrence Silk]",Michael Small,"[Arthur Cohn, Barbara Kopple]","[Cabin Creek, Catholic Communication Campaign,...",,tt0099028,,,1990-10-06,100.0
7,https://en.wikipedia.org/wiki/American_Ninja_4...,1990,https://www.imdb.com/title/tt0101326/,American Ninja 4: The Annihilation,,"[Michael Dudikoff, David Bradley, James Booth,...",,"[United States, Lesotho]",English,Cedric Sundstrom,...,,,Ovidio G Assonitis,,David Geeves,tt0101326,,,1991-03-08,99.0
8,https://en.wikipedia.org/wiki/Andre%27s_Mother,1990,https://www.imdb.com/title/tt0099037/,Andre's Mother,,"[Richard Thomas, Sada Thompson, Sylvia Sidney]",Bobby Bukowski,United States,English,Deborah Reinisch,...,Jeffrey Wolf,Jonathan Sheffer,"[Sarah Green, Deborah Reinisch]",,Terrence McNally,tt0099037,,,1990-03-07,50.0
9,https://en.wikipedia.org/wiki/Angel_Town_(film),1990,https://www.imdb.com/title/tt0099039/,Angel Town,,"[Olivier Gruner, Theresa Saldana, Frank Aragon...",John LeBlanc,United States,English,Eric Karson,...,Duane Hartzell,Terry Plumeri,"[Ash R. Shah, Eric Karson]",,S. Warren,tt0099039,855810.0,,1990-02-23,102.0


In [8]:
# 14. Display the movies_with_ratings_df DataFrame (movies plus per-rating counts).
movies_with_ratings_df

Unnamed: 0,imdb_id,kaggle_id,title,original_title,tagline,belongs_to_collection,wikipedia_url,imdb_link,runtime,budget,...,rating_0.5,rating_1.0,rating_1.5,rating_2.0,rating_2.5,rating_3.0,rating_3.5,rating_4.0,rating_4.5,rating_5.0
0,tt0098987,9548,The Adventures of Ford Fairlane,The Adventures of Ford Fairlane,Kojak. Columbo. Dirty Harry. Wimps.,,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.imdb.com/title/tt0098987/,104.0,49000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0098994,25501,"After Dark, My Sweet","After Dark, My Sweet",All they risked was everything.,,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",https://www.imdb.com/title/tt0098994/,114.0,6000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt0099005,11856,Air America,Air America,The few. The proud. The totally insane.,,https://en.wikipedia.org/wiki/Air_America_(film),https://www.imdb.com/title/tt0099005/,112.0,35000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0099012,8217,Alice,Alice,,,https://en.wikipedia.org/wiki/Alice_(1990_film),https://www.imdb.com/title/tt0099012/,102.0,12000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0099018,25943,Almost an Angel,Almost an Angel,Who does he think he is?,,https://en.wikipedia.org/wiki/Almost_an_Angel,https://www.imdb.com/title/tt0099018/,95.0,25000000.0,...,3.0,0.0,3.0,2.0,5.0,26.0,37.0,46.0,16.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6047,tt5639354,429191,A Fantastic Woman,Una mujer fantástica,,,https://en.wikipedia.org/wiki/A_Fantastic_Woman,https://www.imdb.com/title/tt5639354/,104.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6048,tt5390066,390059,Permission,Permission,,,https://en.wikipedia.org/wiki/Permission_(film),https://www.imdb.com/title/tt5390066/,96.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6049,tt6304162,429174,Loveless,Нелюбовь,,,https://en.wikipedia.org/wiki/Loveless_(film),https://www.imdb.com/title/tt6304162/,128.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6050,tt5795086,412302,Gemini,Gemini,,,https://en.wikipedia.org/wiki/Gemini_(2017_film),https://www.imdb.com/title/tt5795086/,92.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# 15. Preview the first 20 rows of the movies_df DataFrame.
movies_df.head(20)

Unnamed: 0,imdb_id,kaggle_id,title,original_title,tagline,belongs_to_collection,wikipedia_url,imdb_link,runtime,budget,...,production_countries,distributor,producers,director,starring,cinematography,editors,writers,composers,based_on
0,tt0098987,9548,The Adventures of Ford Fairlane,The Adventures of Ford Fairlane,Kojak. Columbo. Dirty Harry. Wimps.,,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.imdb.com/title/tt0098987/,104.0,49000000.0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",20th Century Fox,"[Steve Perry, Joel Silver]",Renny Harlin,"[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,Michael Tronick,"[David Arnott, James Cappe]","[Cliff Eidelman, Yello]","[Characters, by Rex Weiner]"
1,tt0098994,25501,"After Dark, My Sweet","After Dark, My Sweet",All they risked was everything.,,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",https://www.imdb.com/title/tt0098994/,114.0,6000000.0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Avenue Pictures,"[Ric Kidney, Robert Redlin]",James Foley,"[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,Howard E. Smith,"[James Foley, Robert Redlin]",Maurice Jarre,"[the novel, After Dark, My Sweet, by, Jim Thom..."
2,tt0099005,11856,Air America,Air America,The few. The proud. The totally insane.,,https://en.wikipedia.org/wiki/Air_America_(film),https://www.imdb.com/title/tt0099005/,112.0,35000000.0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",TriStar Pictures,Daniel Melnick,Roger Spottiswoode,"[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,"[John Bloom, Lois Freeman-Fox]","[John Eskow, Richard Rush]",Charles Gross,"[Air America, by, Christopher Robbins]"
3,tt0099012,8217,Alice,Alice,,,https://en.wikipedia.org/wiki/Alice_(1990_film),https://www.imdb.com/title/tt0099012/,102.0,12000000.0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Orion Pictures,Robert Greenhut,Woody Allen,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,Susan E. Morse,Woody Allen,,
4,tt0099018,25943,Almost an Angel,Almost an Angel,Who does he think he is?,,https://en.wikipedia.org/wiki/Almost_an_Angel,https://www.imdb.com/title/tt0099018/,95.0,25000000.0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Paramount Pictures,John Cornell,John Cornell,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,David Stiven,Paul Hogan,Maurice Jarre,
5,tt0099026,79509,The Ambulance,The Ambulance,You'll be in perfect health before you die.,,https://en.wikipedia.org/wiki/The_Ambulance,https://www.imdb.com/title/tt0099026/,91.0,,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Triumph Releasing Corporation,"[Larry Cohen, Moctesuma Esparza, Robert Katz]",Larry Cohen,"[Eric Roberts, James Earl Jones, Red Buttons, ...",Jacques Haitkin,"[Claudia Finkle, Armond Leibowitz]",Larry Cohen,Jay Chattaway,
6,tt0099028,41326,American Dream,American Dream,"The award-winning film of American lives, Amer...",,https://en.wikipedia.org/wiki/American_Dream_(...,https://www.imdb.com/title/tt0099028/,98.0,,...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",Prestige Films,"[Arthur Cohn, Barbara Kopple]","[Barbara Kopple, Co-directors:, Cathy Caplan, ...",,"[Tom Hurwitz, Mathieu Roberts, Nesya Shapiro]","[Cathy Caplan, Thomas Haneke, Lawrence Silk]",,Michael Small,
7,tt0101326,25528,American Ninja 4: The Annihilation,American Ninja 4: The Annihilation,,"{'id': 91945, 'name': 'American Ninja Collecti...",https://en.wikipedia.org/wiki/American_Ninja_4...,https://www.imdb.com/title/tt0101326/,99.0,,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Cannon Group,Ovidio G Assonitis,Cedric Sundstrom,"[Michael Dudikoff, David Bradley, James Booth,...",,,David Geeves,,
8,tt0099044,11595,Another 48 Hrs.,Another 48 Hrs.,The boys are back in town.,"{'id': 93295, 'name': '48 Hrs. Collection', 'p...",https://en.wikipedia.org/wiki/Another_48_Hrs.,https://www.imdb.com/title/tt0099044/,95.0,38000000.0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Paramount Pictures,"[Lawrence Gordon, Robert D. Wachs]",Walter Hill,"[Eddie Murphy, Nick Nolte, Brion James, Ed O'R...",Matthew F. Leonetti,"[Donn Aron, Carmel Davies, Freeman A. Davies, ...",Fred Braughton,James Horner,"[Characters by, Roger Spottiswoode, Walter Hil..."
9,tt0099052,6488,Arachnophobia,Arachnophobia,"Eight legs, two fangs, and an attitude.",,https://en.wikipedia.org/wiki/Arachnophobia_(f...,https://www.imdb.com/title/tt0099052/,103.0,31000000.0,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Buena Vista Pictures,"[Kathleen Kennedy, Richard Vane]",Frank Marshall,"[Jeff Daniels, Julian Sands, Harley Jane Kozak...",Mikael Salomon,Michael Kahn,"[Don Jakoby, Al Williams]",Trevor Jones,
