# **More Data** -pt2

Import Modules:

In [1]:
import os
import time
import json
import pandas as pd
import tmdbsimple as tmdb
from tqdm import tqdm_notebook

Login to the tmdb_API:

In [2]:
with open("moviesDatabase.json", "r") as f:
    a = json.load(f)

# Set your TMDB API key
tmdb.API_KEY = a["API-key"]

- **Define Custom Functions:**

In [3]:
# Function to add certification (MPAA Rating) to movie.info
def get_movie_with_certification(movie_id):
    movie = tmdb.Movies(movie_id)
    response = movie.info()
    certification = movie.release_dates()["results"]
    for result in certification:
        if result["iso_3166_1"] == "US":
            movie["certification"] = result["release_dates"][0]["certification"]
            break
    return movie

# Function to write data to a JSON file
def write_json(new_data, filename):
    if not os.path.isfile(filename):
        with open(filename, "w") as file:
            json.dump([{"imdb_id": 0}], file)

    with open(filename, "r+") as file:
        file_data = json.load(file)
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)

        file.seek(0)
        json.dump(file_data, file)

In [4]:
#Function From: https://github.com/coding-dojo-data-science/data-enrichment-helper-functions

# Function to read and fix JSON file
def read_and_fix_json(JSON_FILE):
    try:
        with open(JSON_FILE, "r") as f:
            file_contents = f.read()
        print("File Contents:")
        print(file_contents)
        previous_df = pd.read_json(JSON_FILE)
    except ValueError:
        with open(JSON_FILE, "r+") as f:
            data = json.load(f)
            if len(data) == 1 and "imdb_id" in data[0]:
                data.pop(0)
                f.seek(0)
                json.dump(data, f)
                f.truncate()
                previous_df = pd.DataFrame(data)
            else:
                raise Exception("Invalid JSON format")
    return previous_df

In [5]:
# Check if folder Data exists
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)

- **Create Outer and Inner Loop:**

In [6]:
# Load the filtered dataframe from Part 1
basics = pd.read_csv("Data/title_basics.csv")

# Define the years to extract from the API
YEARS_TO_GET = [2000, 2001]

# List of errors
errors = []

In [7]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc="YEARS", position=0):
    # Define the JSON file to store results for each year
    JSON_FILE = f"{FOLDER}tmdb_api_results_{YEAR}.json"

    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

    # If it does not exist, save an empty dict with just imdb_id to the new json file.
    if not file_exists:
        with open(JSON_FILE, "w") as f:
            json.dump({"imdb_id": 0}, f)
    
    # Saving new year as the current dataframe
    df = basics.loc[basics["startYear"] == YEAR].copy()
    
    # Saving movie ids to a list
    movie_ids = df["tconst"].copy()

    # Load existing data from JSON into a dataframe called "previous_df"
    previous_df = read_and_fix_json(JSON_FILE)
    print("Previous DataFrame:")
    print(previous_df)

    # Filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df["imdb_id"])]

    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get, desc=f"Movies from {YEAR}", position=1, leave=True):
        try:
            # Retrieve the data for the movie id and add certification
            temp = get_movie_with_certification(movie_id)
            
            # Append/extend results to the existing file using the write_json function
            write_json(temp, JSON_FILE)
            
            # Short sleep to prevent overwhelming the server
            time.sleep(0.02)
        except Exception as e:
            errors.append([movie_id, e])
    
    final_year_df = read_and_fix_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)
    
    print("Final Year DataFrame:")
    print(final_year_df)

print(f"- Total errors: {len(errors)}")
print("Inner Loop Done")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for YEAR in tqdm_notebook(YEARS_TO_GET, desc="YEARS", position=0):


YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

File Contents:
[{"imdb_id": 0}]
Previous DataFrame:
   imdb_id
0        0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie_id in tqdm_notebook(movie_ids_to_get, desc=f"Movies from {YEAR}", position=1, leave=True):


Movies from 2000:   0%|          | 0/1450 [00:00<?, ?it/s]

File Contents:
[{"imdb_id": 0}, 


JSONDecodeError: Expecting value: line 1 column 18 (char 17)