# **More Data** - Part 2

Import Modules:

In [1]:
import os
import time
import json
import pandas as pd
import tmdbsimple as tmdb

Log in to the TMDB API:

In [2]:
# Read the locally stored credentials file and parse it as JSON
with open("moviesDatabase.json", "r") as f:
    a = json.loads(f.read())

# Register the key found under "API-key" with the tmdbsimple module
tmdb.API_KEY = a["API-key"]

- **Define Custom Functions:**

In [3]:
# Function to add certification (MPAA Rating) to movie.info
def get_movie_with_certification(movie_id):
    """Fetch a movie's TMDB details and attach its US certification.

    Parameters
    ----------
    movie_id : str or int
        Identifier handed to ``tmdb.Movies``. The download loop in this
        notebook passes values taken from the ``tconst`` column.

    Returns
    -------
    dict
        The dictionary returned by ``movie.info()``. When the release-date
        results contain an entry whose ``iso_3166_1`` is ``'US'``, a
        ``'certification'`` key is added holding the first non-empty
        certification listed for that entry (``''`` if all are blank or
        the entry lists no releases). Without a US entry the key is absent.

    Raises
    ------
    Exception
        Whatever ``tmdb.Movies.info`` / ``release_dates`` raise is propagated
        unchanged; the caller is expected to handle it.
    """
    movie = tmdb.Movies(movie_id)
    # info() yields the plain details dict; that dict (not the Movies
    # wrapper object) is what must be annotated and returned so it can be
    # serialized to JSON by the caller.
    response = movie.info()
    for result in movie.release_dates()['results']:
        if result['iso_3166_1'] == 'US':
            certifications = [
                release.get('certification')
                for release in result.get('release_dates', [])
            ]
            # Some US entries carry a blank certification; skip those
            # instead of blindly taking the first one.
            response['certification'] = next(
                (value for value in certifications if value), ''
            )
            break
    return response

# Function to append/extend a JSON file with Python
def write_json(new_data, filename):
    """Append or extend the JSON array stored in ``filename`` with ``new_data``.

    Parameters
    ----------
    new_data : object
        A JSON-serializable record. If both it and the file's content are
        lists, the file's list is extended with its items; otherwise it is
        appended as a single item.
    filename : str
        Path of an existing file containing JSON (opened in ``'r+'`` mode,
        so it must already exist).

    Raises
    ------
    TypeError
        If the combined data cannot be serialized. The file is left
        untouched in that case.
    """
    with open(filename, 'r+') as file:
        file_data = json.load(file)
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Serialize up front: json.dump streams chunks into the file, so a
        # non-serializable record would otherwise leave it half-overwritten.
        payload = json.dumps(file_data)
        file.seek(0)
        file.write(payload)
        # Discard leftover bytes when the new text is shorter than the old
        # content (e.g. the file was previously pretty-printed).
        file.truncate()

In [4]:
# Directory used for the files this notebook reads and writes;
# create it when missing, and do nothing when it is already there.
FOLDER = "Data/"
os.makedirs(name=FOLDER, exist_ok=True)

- **Create Outer and Inner Loop:**

In [None]:
# Read the title basics table saved by Part 1 of this project
basics = pd.read_csv(filepath_or_buffer='Data/title_basics.csv')

# Release years to request from the API, one download pass per year
YEARS_TO_GET = list(range(2000, 2002))

# Filled by the download loop with [movie_id, exception] pairs
errors = list()

In [5]:
# Start of OUTER loop: one pass per year, each year with its own results file
for YEAR in YEARS_TO_GET:
    # Defining the JSON file to store results for each year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

    # If it does not exist, seed it with a one-item list holding a placeholder
    # record, so the 'imdb_id' column exists when the file is read back below.
    if not file_exists:
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Saving new year as the current dataframe
    df = basics.loc[basics['startYear'] == YEAR].copy()

    # Saving movie ids to a list
    movie_ids = df['tconst'].copy()

    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)

    # Filter out any ids that are already in the JSON_FILE, which lets an
    # interrupted run resume without requesting the same movie twice.
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # INNER Loop
    for movie_id in movie_ids_to_get:
        try:
            # Retrieve the data for the movie id and add certification
            temp = get_movie_with_certification(movie_id)

            # Append/extend results to the existing file using the write_json function
            write_json(temp, JSON_FILE)

            # Short sleep to prevent overwhelming the server
            time.sleep(0.02)
        except Exception as e:
            # Best effort: record the failure and continue with the next id
            errors.append([movie_id, e])

    final_year_df = pd.read_json(JSON_FILE)
    # Drop the {'imdb_id': 0} placeholder that seeded the JSON file so it does
    # not end up as a bogus row in the exported data.
    final_year_df = final_year_df.loc[final_year_df['imdb_id'] != 0]
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

print(f"- Total errors: {len(errors)}")
print("Inner Loop Done")

PermissionError: [Errno 13] Permission denied: 'Data/title_basics.csv'

In [None]:
# Read each year's compressed export, then stack them into one dataframe for EDA
yearly_paths = [f"{FOLDER}final_tmdb_data_{year}.csv.gz" for year in YEARS_TO_GET]
yearly_frames = [pd.read_csv(path) for path in yearly_paths]
all_data = pd.concat(yearly_frames)

In [None]:
# Perform EDA
# Keep only the rows that report a positive budget or a positive revenue
has_budget = all_data['budget'] > 0
has_revenue = all_data['revenue'] > 0
movies_with_financial_info = all_data[has_budget | has_revenue]

# Per-certification counts and means, with both means taken from one grouping
valid_certifications = movies_with_financial_info['certification'].value_counts()
by_certification = movies_with_financial_info.groupby('certification')
average_revenue_per_certification = by_certification['revenue'].mean()
average_budget_per_certification = by_certification['budget'].mean()

# Each heading is printed on its own line, followed by the matching result
report = [
    ("Number of movies with valid financial information:", len(movies_with_financial_info)),
    ("Number of movies in each certification category:", valid_certifications),
    ("Average revenue per certification category:", average_revenue_per_certification),
    ("Average budget per certification category:", average_budget_per_certification),
]
for heading, value in report:
    print(heading)
    print(value)