In [19]:
# Import Libraries
import json
import os

import numpy as np
import pandas as pd
import requests

In [20]:
# Download locations of the three IMDb dataset dumps used below
# (nothing is fetched in this cell).
IMDB_DATASETS_BASE = "https://datasets.imdbws.com"

akas_url = f"{IMDB_DATASETS_BASE}/title.akas.tsv.gz"
ratings_url = f"{IMDB_DATASETS_BASE}/title.ratings.tsv.gz"
basics_url = f"{IMDB_DATASETS_BASE}/title.basics.tsv.gz"

In [21]:
# Read the tab-separated AKAs dump and turn the literal '\N' placeholder
# into NaN in one chained expression.
akas = pd.read_csv(akas_url, sep='\t', low_memory=False).replace({'\\N': np.nan})

# Keep only the rows whose region is 'US'
is_us_region = akas['region'] == 'US'
us_movies_akas = akas[is_us_region]

In [22]:
# Read the tab-separated ratings dump and turn the literal '\N' placeholder
# into NaN in one chained expression.
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False).replace({'\\N': np.nan})

In [23]:
# Load Basics dataset; the literal '\N' placeholder becomes NaN
basics = pd.read_csv(basics_url, sep='\t', low_memory=False).replace({'\\N': np.nan})

# Keep only rows typed as 'movie'. The explicit .copy() makes the result an
# independent frame, so the startYear assignment below does not write into a
# slice of the full table (which raises SettingWithCopyWarning).
basics = basics[basics['titleType'] == 'movie'].copy()

# startYear is read as text; convert it to float so it can be range-filtered
# while still allowing NaN for missing years.
basics['startYear'] = basics['startYear'].astype(float)

# Keep movies whose start year is within 2000-2021 (inclusive on both ends)
basics = basics[basics['startYear'].between(2000, 2021)]

# Exclude documentaries; rows with a missing genre are kept (na=False)
basics = basics[~basics['genres'].str.contains('documentary', case=False, na=False)]

# Keep only titles that also appear in the US subset of the AKAs dataset
basics = basics[basics['tconst'].isin(us_movies_akas['titleId'])]

In [24]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line after each report.

# Check remaining movies and data types in Basics dataset
basics.info()

# Check remaining movies and data types in AKAs dataset
us_movies_akas.info()

# Check remaining movies and data types in Ratings dataset
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97482 entries, 34803 to 9985716
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          97482 non-null  object 
 1   titleType       97482 non-null  object 
 2   primaryTitle    97482 non-null  object 
 3   originalTitle   97482 non-null  object 
 4   isAdult         97482 non-null  object 
 5   startYear       97482 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  83340 non-null  object 
 8   genres          94538 non-null  object 
dtypes: float64(1), object(8)
memory usage: 7.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1449468 entries, 5 to 36461217
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1449468 non-null  object
 1   ordering         1449468 non-null  int64 
 2   title            1449468 non

In [25]:
# Save filtered datasets as compressed CSV files.
# to_csv() does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs('Data', exist_ok=True)

basics.to_csv('Data/basics.csv.gz', compression='gzip', index=False)
us_movies_akas.to_csv('Data/akas.csv.gz', compression='gzip', index=False)
ratings.to_csv('Data/ratings.csv.gz', compression='gzip', index=False)

In [26]:
# Function to extract movie info from TMDB API
def get_movie_info(movie_id, api_key, timeout=10):
    """Fetch the TMDB ``/3/movie/{movie_id}`` record and return the decoded JSON.

    Parameters
    ----------
    movie_id : str or int
        Identifier placed in the request path. This notebook passes IMDb
        ``tt...`` ids.
    api_key : str
        TMDB API key, sent as the ``api_key`` query parameter.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single stalled connection would block the
        calling loop indefinitely.

    Returns
    -------
    object
        Whatever ``response.json()`` decodes. The HTTP status is deliberately
        not checked here, so callers that read fields with ``.get()`` keep
        receiving the decoded body even for unsuccessful responses.

    Raises
    ------
    requests.RequestException
        On connection problems or when the timeout expires.
    ValueError
        When the response body is not valid JSON.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    # Let requests build and encode the query string instead of formatting
    # the key into the URL by hand.
    response = requests.get(url, params={"api_key": api_key}, timeout=timeout)
    return response.json()

# Function to append data to a JSON file
def append_to_json(data, filename):
    """Append ``data`` to ``filename`` as one JSON document on its own line.

    Parameters
    ----------
    data : object
        Any value ``json.dumps`` can serialise.
    filename : str or path-like
        File opened in append mode; it is created if it does not exist.

    Raises
    ------
    TypeError
        If ``data`` is not JSON serialisable. Nothing is written in that case.
    """
    # Serialise before opening the file: json.dump() streams the encoding in
    # chunks, so a value that fails part-way through would leave a truncated
    # record at the end of the file.
    line = json.dumps(data) + '\n'
    with open(filename, 'a') as file:
        file.write(line)

In [27]:
# Read the TMDB key from the environment so the secret is never stored in the
# notebook. The key that used to be written here has been exposed in the file
# and should be treated as leaked and rotated.
api_key = os.environ.get("TMDB_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the TMDB_API_KEY environment variable before running this notebook."
    )

# Smoke-test the API helper with two known IMDb ids
avengers_info = get_movie_info("tt0848228", api_key)
notebook_info = get_movie_info("tt0332280", api_key)

print("The Avengers:")
print(avengers_info)
print("\n")
print("The Notebook:")
print(notebook_info)

The Avengers:
{'adult': False, 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg', 'belongs_to_collection': {'id': 86311, 'name': 'The Avengers Collection', 'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg', 'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'}, 'budget': 220000000, 'genres': [{'id': 878, 'name': 'Science Fiction'}, {'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}], 'homepage': 'https://www.marvel.com/movies/the-avengers', 'id': 24428, 'imdb_id': 'tt0848228', 'original_language': 'en', 'original_title': 'The Avengers', 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!', 'popularity': 155.899, 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg', 'production_companies': [{'id': 420, 'logo_path': '/hUzeosd

In [None]:
# NOTE(review): this cell repeats the smoke test from the preceding cell and
# is a candidate for removal. It reuses the `api_key` defined there instead
# of embedding the secret a second time.
avengers_info = get_movie_info("tt0848228", api_key)
notebook_info = get_movie_info("tt0332280", api_key)

print("The Avengers:")
print(avengers_info)
print("\n")
print("The Notebook:")
print(notebook_info)

In [None]:
# Load the filtered basics dataset
basics = pd.read_csv('Data/basics.csv.gz', compression='gzip')

# Extract financial data for movies starting in 2000 or 2001
years = [2000, 2001]

for year in years:
    # Filter basics dataset for the specific year
    movies_year = basics[basics['startYear'] == year]

    # Records collected for this year, and ids whose request failed
    movie_data_list = []
    failed_ids = []

    # Only the id column is needed, so iterate over it directly instead of
    # building a full row object per movie with iterrows().
    for movie_id in movies_year['tconst']:
        try:
            movie_info = get_movie_info(movie_id, api_key)
        except (requests.RequestException, ValueError):
            # A single network failure or non-JSON body must not abort the
            # year and throw away every record collected so far.
            failed_ids.append(movie_id)
            continue

        movie_data_list.append({
            'movie_id': movie_id,
            'budget': movie_info.get('budget'),
            'revenue': movie_info.get('revenue'),
            # NOTE(review): the TMDB movie-details payload does not appear to
            # have a top-level 'certification' field (certifications seem to
            # live under the release-dates data), so this is probably always
            # None - confirm against the TMDB API reference.
            'certification': movie_info.get('certification'),
        })

    if failed_ids:
        print(f"{year}: skipped {len(failed_ids)} movies after request errors")

    # Save the movie data as a JSON file
    filename = f"financial_data_{year}.json"
    with open(filename, 'w') as file:
        json.dump(movie_data_list, file)

# Exploratory Data Analysis

In [None]:
# Load the financial data for each year and concatenate into one dataframe
years = [2000, 2001]

dfs = []
for year in years:
    filename = f"financial_data_{year}.json"
    with open(filename, 'r') as file:
        data = json.load(file)
    df = pd.DataFrame(data)
    dfs.append(df)

financial_data = pd.concat(dfs, ignore_index=True)

# Make sure budget and revenue are numeric before comparing them with 0.
# When every value in a column is missing the column is left as object
# dtype, and `> 0` on it raises TypeError. Columns that are already numeric
# pass through unchanged.
financial_data = financial_data.assign(
    budget=pd.to_numeric(financial_data['budget'], errors='coerce'),
    revenue=pd.to_numeric(financial_data['revenue'], errors='coerce'),
)

# Filter movies with valid financial information (non-zero budget or revenue).
# Missing values compare as False and are therefore excluded.
has_budget = financial_data['budget'] > 0
has_revenue = financial_data['revenue'] > 0
valid_financial_data = financial_data[has_budget | has_revenue]

# Count of movies with valid financial information
valid_movies_count = len(valid_financial_data)

# Count of movies in each certification category
certification_counts = valid_financial_data['certification'].value_counts()

# Average revenue per certification category
average_revenue = valid_financial_data.groupby('certification')['revenue'].mean()

# Average budget per certification category
average_budget = valid_financial_data.groupby('certification')['budget'].mean()

In [None]:
# Write the combined TMDB results to a gzip-compressed CSV without the index
combined_output_path = 'tmdb_results_combined.csv.gz'
financial_data.to_csv(combined_output_path, index=False, compression='gzip')