# Part 1

In [40]:
# Import Libraries
import pandas as pd
import numpy as np
import requests
import json
import matplotlib.pyplot as plt

In [3]:
# Load the datasets
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

In [10]:
# Load AKAs dataset
akas = pd.read_csv(akas_url, sep='\t', low_memory=False, na_values='\\N')

# Filter AKAs dataset to include only US movies
us_movies_akas = akas[(akas['region'] == 'US') & (akas['types'] != 'alternative')]

In [11]:
# Load Ratings dataset
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False, na_values='\\N')

# Filter Basics dataset based on AKAs dataset to include only US movies
basics = basics.merge(us_movies_akas[['titleId']], left_on='tconst', right_on='titleId', how='inner')

In [12]:
# Load Basics dataset
basics = pd.read_csv(basics_url, sep='\t', low_memory=False, na_values='\\N')

# Filter Basics dataset to include only full-length movies released between 2000 and 2021 in the US and exclude documentaries
basics = basics[(basics['titleType'] == 'movie') &
                (basics['startYear'].between(2000, 2021)) &
                (~basics['genres'].str.contains('documentary', case=False, na=False))]

# Filter Basics dataset based on AKAs dataset to include only US movies
basics = basics.merge(us_movies_akas[['titleId']], left_on='tconst', right_on='titleId', how='inner')

In [13]:
# Check remaining movies and data types in Basics dataset
print(basics.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105648 entries, 0 to 105647
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          105648 non-null  object 
 1   titleType       105648 non-null  object 
 2   primaryTitle    105648 non-null  object 
 3   originalTitle   105648 non-null  object 
 4   isAdult         105648 non-null  float64
 5   startYear       105648 non-null  float64
 6   endYear         0 non-null       float64
 7   runtimeMinutes  91160 non-null   object 
 8   genres          102630 non-null  object 
 9   titleId         105648 non-null  object 
dtypes: float64(3), object(7)
memory usage: 8.9+ MB
None


In [14]:
# Check remaining movies and data types in AKAs dataset
print(us_movies_akas.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1429678 entries, 5 to 36516552
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1429678 non-null  object 
 1   ordering         1429678 non-null  int64  
 2   title            1429678 non-null  object 
 3   region           1429678 non-null  object 
 4   language         3851 non-null     object 
 5   types            959918 non-null   object 
 6   attributes       46946 non-null    object 
 7   isOriginalTitle  1428336 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 98.2+ MB
None


In [15]:
# Check remaining movies and data types in Ratings dataset
print(ratings.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1329143 entries, 0 to 1329142
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1329143 non-null  object 
 1   averageRating  1329143 non-null  float64
 2   numVotes       1329143 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.4+ MB
None


In [16]:
# Save filtered datasets as compressed CSV files
basics.to_csv('Data/basics.csv.gz', compression='gzip', index=False)
us_movies_akas.to_csv('Data/akas.csv.gz', compression='gzip', index=False)
ratings.to_csv('Data/ratings.csv.gz', compression='gzip', index=False)

# API Connection

In [17]:
from tqdm import tqdm

In [18]:
# Function to extract movie info from TMDB API
def get_movie_info(movie_id, api_key):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
    response = requests.get(url)
    movie_data = response.json()
    return movie_data

In [19]:
# Function to append data to a JSON file
def append_to_json(data, filename):
    with open(filename, 'a') as file:
        json.dump(data, file)
        file.write('\n')

api_key = "6ae1796c8924400f82273b37dedcdd66"

avengers_info = get_movie_info("tt0848228", api_key)
notebook_info = get_movie_info("tt0332280", api_key)

print("The Avengers:")
print(avengers_info)
print("\n")
print("The Notebook:")
print(notebook_info)

The Avengers:
{'adult': False, 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg', 'belongs_to_collection': {'id': 86311, 'name': 'The Avengers Collection', 'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg', 'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'}, 'budget': 220000000, 'genres': [{'id': 878, 'name': 'Science Fiction'}, {'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}], 'homepage': 'https://www.marvel.com/movies/the-avengers', 'id': 24428, 'imdb_id': 'tt0848228', 'original_language': 'en', 'original_title': 'The Avengers', 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!', 'popularity': 129.163, 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg', 'production_companies': [{'id': 420, 'logo_path': '/hUzeosd

In [20]:
# Load the filtered basics dataset
basics = pd.read_csv('Data/basics.csv.gz', compression='gzip')

In [21]:
# Extract financial data for movies starting in 2000 or 2001
years = [2000, 2001]

for year in years:
    # Filter basics dataset for the specific year
    movies_year = basics[basics['startYear'] == year]

    # Create an empty list to store movie data
    movie_data_list = []

    # Loop through the movies and extract financial data
    for index, movie in tqdm(movies_year.iterrows(), total=movies_year.shape[0], desc=f"Year {year}"):
        movie_id = movie['tconst']
        movie_info = get_movie_info(movie_id, api_key)
        movie_data = {
            'movie_id': movie_id,
            'budget': movie_info.get('budget'),
            'revenue': movie_info.get('revenue'),
            'certification': movie_info.get('certification'),
        }
        movie_data_list.append(movie_data)

Year 2000: 100%|███████████████████████████| 2148/2148 [05:30<00:00,  6.50it/s]
Year 2001: 100%|███████████████████████████| 2345/2345 [05:57<00:00,  6.56it/s]


In [23]:
# Save the movie data as a JSON file
filename = f"financial_data_{year}.json"
with open(filename, 'w') as file:
    json.dump(movie_data_list, file)

## Financial Data

In [24]:
# Function to extract movie info from TMDB API
def get_movie_info(movie_id, api_key):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
    response = requests.get(url)
    movie_data = response.json()
    return movie_data

In [25]:
# Function to append data to a JSON file
def append_to_json(data, filename):
    with open(filename, 'a') as file:
        json.dump(data, file)
        file.write('\n')

In [26]:
# API Key
api_key = "6ae1796c8924400f82273b37dedcdd66"

In [27]:
# Test API calls
avengers_info = get_movie_info("tt0848228", api_key)
notebook_info = get_movie_info("tt0332280", api_key)

print("The Avengers:")
print(avengers_info)
print("\n")
print("The Notebook:")
print(notebook_info)

The Avengers:
{'adult': False, 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg', 'belongs_to_collection': {'id': 86311, 'name': 'The Avengers Collection', 'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg', 'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'}, 'budget': 220000000, 'genres': [{'id': 878, 'name': 'Science Fiction'}, {'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}], 'homepage': 'https://www.marvel.com/movies/the-avengers', 'id': 24428, 'imdb_id': 'tt0848228', 'original_language': 'en', 'original_title': 'The Avengers', 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!', 'popularity': 129.163, 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg', 'production_companies': [{'id': 420, 'logo_path': '/hUzeosd

In [28]:
# Load the filtered basics dataset
basics = pd.read_csv('Data/basics.csv.gz', compression='gzip')

In [29]:
# Extract financial data for movies that meet the criteria in Part 1
filtered_movies = basics[(basics['startYear'] >= 2000) & (basics['startYear'] <= 2001)]

for year in [2000, 2001]:
    movies_year = filtered_movies[filtered_movies['startYear'] == year]
    movie_data_list = []

    for _, movie in movies_year.iterrows():
        movie_id = movie['tconst']
        movie_info = get_movie_info(movie_id, api_key)
        movie_data = {
            'movie_id': movie_id,
            'budget': movie_info.get('budget', 0),
            'revenue': movie_info.get('revenue', 0),
            'certification': movie_info.get('certification'),
        }
        movie_data_list.append(movie_data)

    filename = f"financial_data_{year}.json"
    with open(filename, 'w') as file:
        json.dump(movie_data_list, file)

    df = pd.DataFrame(movie_data_list)
    csv_filename = f"financial_data_{year}.csv.gz"
    df.to_csv(csv_filename, compression='gzip', index=False)

## Exploratory Data Analysis

In [46]:
# Load the financial data for each year
years = [2000, 2001]
dfs = []

for year in years:
    csv_filename = f"financial_data_{year}.csv.gz"
    df = pd.read_csv(csv_filename, compression='gzip')
    dfs.append(df)

In [47]:
# Concatenate the data into one dataframe
financial_data = pd.concat(dfs, ignore_index=True)

In [48]:
# Filter movies with valid financial information (non-zero budget or revenue)
valid_financial_data = financial_data[(financial_data['budget'] > 0) | (financial_data['revenue'] > 0)]

In [49]:
# Count of movies with valid financial information
valid_movies_count = valid_financial_data['certification'].value_counts().reset_index()

In [50]:
# Count of movies in each certification category
certification_counts = valid_financial_data['certification'].value_counts()

In [51]:
# Average revenue per certification category
average_revenue = valid_financial_data.groupby('certification')['revenue'].mean()

In [52]:
# Average budget per certification category
average_budget = valid_financial_data.groupby('certification')['budget'].mean()

In [56]:
# Save the merged TMDB API data as a CSV file
financial_data.to_csv('tmdb_results_combined.csv.gz', compression='gzip', index=False)