<br><br><font size=6><b>Project 3: </b></font><font size=6>Part 2 - <span style="color:#7e6c94;font-size:29px;">Extract from TMDB</span></font>

<font size=3><ul><li>Suganthan Thamotharampillai
<br><li>02.08.2023

<br><font size=5><b>Imports

In [2]:
# LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

<br><font size=5><b>Data

In [45]:
# Load Dataframe:'basics'
# Read the gzip-compressed title-basics CSV in a single pass (low_memory=False)
basics = pd.read_csv(
    "Data/title_basics.csv.gz",
    low_memory=False,
)

<br><font size=5><b>API Login

In [12]:
# API login:TMDB
# The key is read from a JSON file under the current user's home directory;
# '~' is expanded at run time so the path is not tied to one machine account.
with open(os.path.expanduser('~/.secret/TMDB_api.json'), 'r') as f:
    login = json.load(f)

# API_Key
tmdb.API_KEY = login['api-key']
# Display only the key names, never the secret values
login.keys()

dict_keys(['api-key'])

<br><font size=5><b>Functions

In [13]:
# Revenue-Budget-Rating
def revenueBudgetRating(movie_id):
    """Fetch a movie's TMDB info dictionary and attach its US certification.

    Args:
        movie_id: Identifier handed to ``tmdb.Movies``.

    Returns:
        The dictionary returned by ``movie.info()``. A 'certification' key is
        added when the releases list contains an entry whose 'iso_3166_1' is
        'US'; if there are several such entries, the last one wins.
    """
    film = tmdb.Movies(movie_id)

    # Request the details first, then the release data
    details = film.info()
    release_data = film.releases()

    # Certifications of every US release, in the order they are listed
    us_certifications = [
        entry['certification']
        for entry in release_data['countries']
        if entry['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        details['certification'] = us_certifications[-1]

    return details

In [49]:
# Dumping to Json
def write_json(new_data, filename):
    """Add ``new_data`` to the JSON array stored in ``filename``.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    The file is loaded, updated in memory and rewritten in place.

    Args:
        new_data: A single record to append. If it is a list and the stored
            data is a list as well, its items are added one by one instead.
        filename: Path of an existing file that contains valid JSON.

    Raises:
        FileNotFoundError: If ``filename`` does not exist ('r+' never creates it).
        json.JSONDecodeError: If the current file content is not valid JSON.
    """
    with open(filename, 'r+') as file:
        # Load existing data
        file_data = json.load(file)

        # Choose extend or append
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)

        # Rewind and overwrite the file from the start
        file.seek(0)
        json.dump(file_data, file)
        # Cut off any bytes left over from the previous content; without this
        # a shorter dump (e.g. the old file was pretty-printed) leaves trailing
        # garbage and the file is no longer valid JSON.
        file.truncate()

<br><font size=5><b>Designation

In [36]:
# Designate a folder
FOLDER = "Data/"
# Create the folder when it is missing, then show what it currently holds
os.makedirs(name=FOLDER, exist_ok=True)
os.listdir(path=FOLDER)

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

<br><font size=5><b>Loop Components

In [46]:
# Required Lists for the Loop
# The Years List: start years whose titles are requested from the API
YEARS_TO_GET = [
    2000,
    2001,
]

In [69]:
# The Errors list: receives one [movie_id, error] entry per failed API call
errors = list()

<br><font size=5><b>API Call

In [81]:
## API Call
## OUTER LOOP: one pass per year; each year has its own JSON results file and CSV export
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0, colour='violet'):

    # Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

    # If it does not exist: create it with a placeholder record, so that
    # pd.read_json below always yields an 'imdb_id' column to filter on.
    # NOTE: the placeholder row (imdb_id == 0) is also part of the exported CSV.
    if not file_exists:
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Define/filter the IDs to call
    df = basics.loc[basics['startYear'] == YEAR].copy()

    # Saving movie ids to list
    movie_ids = df['tconst'].copy()

    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)

    # Filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    ## INNER LOOP
    # Request every movie id that is not stored yet
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve the data for the movie id
            temp = revenueBudgetRating(movie_id)
            # Append/extend results to existing file using a pre-made function
            write_json(temp, JSON_FILE)
            # Short 10 ms sleep to prevent overwhelming server
            time.sleep(0.01)

        except Exception as e:
            # Best effort: record the failure and continue with the next id.
            # The error text embeds the request URL including 'api_key=<key>';
            # keep only the part before it so the key never reaches the
            # errors table or the saved notebook output.
            message = str(e).split('api_key=')[0].rstrip('?&')
            errors.append([movie_id, message])

    ## END OF INNER LOOP

    # Export everything collected for this year as a gzip-compressed CSV
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

## END OF OUTER LOOP

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1429 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1545 [00:00<?, ?it/s]

<br><font size=5><b>Result

<b>1. Revenue-Budget-Rating

In [121]:
# Revenue-Budget-Rating
# Show only the id, financial and certification columns of the last exported year
final_year_df.loc[:, ['imdb_id', 'revenue', 'budget', 'certification']]

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0035423,76019048.0,48000000.0,PG-13
2,tt0114447,0.0,0.0,
3,tt0118589,5271666.0,22000000.0,PG-13
4,tt0118652,0.0,1000000.0,R
...,...,...,...,...
1297,tt7797670,0.0,0.0,
1298,tt7797790,0.0,0.0,
1299,tt8665056,0.0,0.0,
1300,tt8795764,0.0,0.0,NR


<b>2. Errors

In [82]:
# Errors
# One row per failed request; the second column header carries the total count
pd.DataFrame(
    data=errors,
    columns=['Movie_Id', f'Errors: Total {len(errors)}'],
)

Unnamed: 0,Movie_Id,Errors: Total 469
0,tt0115937,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0115937?api_key=<redacted>
1,tt0116628,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0116628?api_key=<redacted>
2,tt0118710,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0118710?api_key=<redacted>
3,tt0115937,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0115937?api_key=<redacted>
4,tt0116628,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0116628?api_key=<redacted>
...,...,...
464,tt8846956,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt8846956?api_key=<redacted>
465,tt9212730,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt9212730?api_key=<redacted>
466,tt9228234,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt9228234?api_key=<redacted>
467,tt9555974,404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt9555974?api_key=<redacted>


Information courtesy of
IMDb
(https://www.imdb.com). Used with permission.
<br><img src="https://www.themoviedb.org/assets/2/v4/logos/v2/blue_long_2-9665a76b1ae401a510ec1e0ca40ddcb3b0cfe45f1d51b77a308fea0845885648.svg" align="left" width=470 alt="The Movie Database (TMDB) logo" />