# Load the data into your project using pandas:

- Open a Python environment or **Jupyter Notebook**.
- Import the pandas library: **import pandas as pd**.
- Use the copied link addresses to **read each file into a DataFrame()**:

In [1]:
import json as jn
import multiprocessing
import os
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import requests

In [2]:
# Download locations of the three IMDb tables used in this notebook
# (gzip-compressed, tab-separated files).
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

# Size of the worker pool used later for parallel processing: one worker per
# CPU reported by the operating system.
num_processes = multiprocessing.cpu_count()

In [3]:
# Download and parse the three tab-separated IMDb tables. The generator is
# consumed left to right by the tuple unpacking, so the files are fetched in
# the order basics -> akas -> ratings.
basics, akas, ratings = (
    pd.read_csv(source_url, sep='\t', low_memory=False)
    for source_url in (basics_url, akas_url, ratings_url)
)
print("CSV Readings complete")

# Preview the first rows of the title table.
basics.head()

CSV Readings complete


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Perform data preprocessing and filtering:

- Replace "\N" with np.nan in each DataFrame: basics.replace({'\\N': np.nan}, inplace=True), akas.replace({'\\N': np.nan}, inplace=True), ratings.replace({'\\N': np.nan}, inplace=True).
- Filter the basics DataFrame based on the provided specifications:

In [4]:
# Replace "\N" with np.nan in each DataFrame
# The raw tables use the literal string "\N" for missing values (see endYear in
# the preview above). Converting it to a real NaN lets dropna / to_numeric
# treat those cells as missing.
# The names are rebound to the returned frames instead of using inplace=True,
# which pandas discourages and which gives no benefit here.
imdb_missing_marker = {'\\N': np.nan}
basics = basics.replace(imdb_missing_marker)
akas = akas.replace(imdb_missing_marker)
ratings = ratings.replace(imdb_missing_marker)

print("NaNs were Replaced Successfully")

NaNs were Replaced Successfully


In [5]:
# Filter basics DataFrame based on the provided specifications
# Each step returns a new frame, so no column is ever assigned onto a slice of
# another frame (the original assignment of startYear onto a boolean-indexed
# slice triggers pandas' SettingWithCopyWarning).
basics = (
    basics
    # Titles without a runtime or a genre cannot be analysed.
    .dropna(subset=['runtimeMinutes', 'genres'])
    # Feature films only.
    .loc[lambda df: df['titleType'] == 'movie']
    # startYear is read as text; unparsable values become NaN and are then
    # excluded by the range check below.
    .assign(startYear=lambda df: pd.to_numeric(df['startYear'], errors='coerce'))
    # Inclusive on both ends: 2000 <= startYear <= 2021.
    .loc[lambda df: df['startYear'].between(2000, 2021)]
    # Drop anything tagged as a documentary (case-insensitive substring match).
    .loc[lambda df: ~df['genres'].str.contains('documentary', case=False)]
)

### Filter the basics DataFrame to include only US movies based on the akas DataFrame:

In [6]:
# Filter basics DataFrame to include only US movies based on the akas DataFrame
# akas can contain several rows for the same title and region (one per
# alternative title). Merging against all of them would repeat each movie once
# per US row and inflate every later count and average, so keep a single US
# row per titleId before the inner merge.
us_akas = akas[akas['region'] == 'US'].drop_duplicates(subset='titleId')
us_movies = basics.merge(us_akas, left_on='tconst', right_on='titleId')

# Save the filtered DataFrames to compressed CSV files:

- Create a "Data" folder within your repository if it doesn't already exist.
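- Use the `to_csv` method to save each DataFrame with compression. The save itself happens in the per-year cell further below; the next cells first define the helper functions that retrieve each movie's certification: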

In [7]:
# Function to fetch certification (MPAA Rating) for a batch of movies
def fetch_certifications(movie_ids, session=None):
    """Return the US certification for every movie in ``movie_ids`` that has one.

    TMDB has no endpoint that accepts a comma-separated list of movie ids, so
    the release-date information is requested one movie at a time from
    ``/3/movie/{movie_id}/release_dates``.

    Parameters
    ----------
    movie_ids : iterable
        Movie identifiers to look up.
    session : object with a ``get(url, params=...)`` method, optional
        HTTP client to use, e.g. a ``requests.Session`` for connection reuse.
        Defaults to the ``requests`` module.

    Returns
    -------
    dict
        ``{movie_id: certification}``. Movies without a US entry, or whose US
        entry carries no non-empty certification, are omitted.
    """
    client = requests if session is None else session
    # NOTE(review): the literal fallback keeps the notebook runnable as before,
    # but a key committed to a notebook should be rotated and supplied only
    # through the TMDB_API_KEY environment variable.
    api_key = os.environ.get("TMDB_API_KEY", "e6439afc327b0d52d1d2b3517a3493f8")

    certifications = {}
    for movie_id in movie_ids:
        response = client.get(
            f"https://api.themoviedb.org/3/movie/{movie_id}/release_dates",
            params={"api_key": api_key},
        )
        data = response.json()

        # Error payloads (e.g. unknown id) have no 'results' key.
        for result in data.get('results', []):
            if result.get('iso_3166_1') != 'US':
                continue
            # A country can list several releases and some carry an empty
            # certification; use the first one that is actually filled in.
            rated = [
                release['certification']
                for release in result.get('release_dates', [])
                if release.get('certification')
            ]
            if rated:
                certifications[movie_id] = rated[0]
            break

    return certifications

- **Functions**

In [8]:
# Split movie IDs into batches for parallel processing
def split_into_batches(lst, batch_size):
    """Cut ``lst`` into consecutive slices of at most ``batch_size`` items.

    The slices keep the original order; the last one is shorter when
    ``len(lst)`` is not a multiple of ``batch_size``. An empty ``lst`` yields
    an empty list.
    """
    batches = []
    for start in range(0, len(lst), batch_size):
        stop = start + batch_size
        batches.append(lst[start:stop])
    return batches

# Apply certification function to each batch of movies
def apply_certification_batch(movie_batch):
    """Look up the certification of every movie id in ``movie_batch``.

    Calls ``add_certification`` once per id, in order, and returns the results
    as a ``{movie_id: certification}`` dictionary.
    """
    return {movie_id: add_certification(movie_id) for movie_id in movie_batch}

# Add certification (MPAA Rating) to movie information
def add_certification(movie_id, session=None):
    """Look up the US certification (MPAA rating) of a single movie on TMDB.

    Parameters
    ----------
    movie_id : str or int
        Identifier inserted into the ``/3/movie/{movie_id}`` URL.
    session : object with a ``get(url, params=...)`` method, optional
        HTTP client to use, e.g. a ``requests.Session`` for connection reuse.
        Defaults to the ``requests`` module.

    Returns
    -------
    str or None
        The first non-empty certification listed for the US, or ``None`` when
        the response has no release-date data, no US entry, or only empty
        certifications.
    """
    client = requests if session is None else session
    # NOTE(review): the literal fallback keeps the notebook runnable as before,
    # but a key committed to a notebook should be rotated and supplied only
    # through the TMDB_API_KEY environment variable.
    api_key = os.environ.get("TMDB_API_KEY", "e6439afc327b0d52d1d2b3517a3493f8")

    response = client.get(
        f"https://api.themoviedb.org/3/movie/{movie_id}",
        # The plain details payload carries no release dates; they must be
        # requested explicitly, otherwise the lookup below never finds a
        # 'release_dates' key and every movie ends up with None.
        params={"api_key": api_key, "append_to_response": "release_dates"},
    )
    data = response.json()

    results = data.get('release_dates', {}).get('results', [])
    for result in results:
        if result.get('iso_3166_1') != 'US':
            continue
        # A country can list several releases and some carry an empty
        # certification; take the first one that is filled in. Iterating also
        # avoids the IndexError that [0] raises on an empty list.
        for release in result.get('release_dates', []):
            if release.get('certification'):
                return release['certification']
        # Only the first US entry is considered.
        return None

    return None

In [9]:
# Parallelize certification retrieval using a pool of workers
def get_movie_certifications(movie_ids):
    """Fetch certifications for many movies concurrently.

    Parameters
    ----------
    movie_ids : iterable of hashable
        Movie identifiers; duplicates are looked up only once.

    Returns
    -------
    dict
        ``{movie_id: certification}`` merged from all batches.
    """
    # Repeated ids would only cause repeated API calls for the same answer.
    unique_ids = list(dict.fromkeys(movie_ids))
    if not unique_ids:
        return {}

    batches = split_into_batches(unique_ids, batch_size=100)

    # The work is network-bound, so threads are sufficient. A thread pool also
    # works with functions defined in notebook cells, which worker processes
    # cannot import when the "spawn" start method is in use.
    # The context manager releases the pool even if a batch raises.
    with ThreadPool(processes=num_processes) as pool:
        batch_results = pool.map(apply_certification_batch, batches)

    movie_certifications = {}
    for batch_certifications in batch_results:
        movie_certifications.update(batch_certifications)

    return movie_certifications

# **<span style="color:#266d07">Part 2[MORE DATA]</span>**

In [None]:
# Apply certification function to each movie
# Collect every tconst in us_movies, look the certifications up through the
# parallel helper, and attach the result as a new 'certification' column.
movie_ids = us_movies['tconst'].tolist()
movie_certifications = get_movie_certifications(movie_ids)
# Mapping through the dict leaves NaN for any tconst missing from it.
us_movies['certification'] = us_movies['tconst'].map(movie_certifications)

# Filter movies with valid financial information
# A row is kept when at least one of budget / revenue is strictly positive.
# NOTE(review): no visible cell adds 'budget' or 'revenue' columns to
# us_movies (the basics preview above shows neither), so this line presumably
# raises KeyError unless they are added elsewhere — confirm where they come from.
financial_info = us_movies[(us_movies['budget'] > 0) | (us_movies['revenue'] > 0)]

- Creating a CSV that will contain our results

In [None]:
# Save results to separate CSV files for each year
# The same range drives both the write and the read-back below, so the two
# steps cannot drift apart. range(2000, 2002) covers 2000 and 2001.
years = range(2000, 2002)

# to_csv does not create missing directories; make sure the target exists.
os.makedirs("Data", exist_ok=True)

for year in years:
    year_filtered = financial_info[financial_info['startYear'] == year]
    year_filtered.to_csv(f"Data/year_{year}_financial_info.csv.gz", compression='gzip', index=False)

# Concatenate the data into one DataFrame
# The files are read back from disk so that all_data reflects exactly what
# was saved.
all_data = pd.concat([pd.read_csv(f"Data/year_{year}_financial_info.csv.gz") for year in years])


### Questions:

In [None]:
# How many movies had at least some valid financial information?
# all_data only holds rows that passed the budget/revenue filter, so its
# length is the answer.
valid_financial_info_movies = len(all_data)
print(f"Number of movies with valid financial information: {valid_financial_info_movies}")

# How many movies are there in each certification category?
# value_counts() skips rows whose certification is missing.
certification_counts = all_data['certification'].value_counts()
print("Number of movies in each certification category:")
print(certification_counts)

# What is the average revenue per certification category?
average_revenue_per_certification = all_data.groupby('certification')['revenue'].mean()
print("Average revenue per certification category:")
print(average_revenue_per_certification)

# What is the average budget per certification category?
average_budget_per_certification = all_data.groupby('certification')['budget'].mean()
print("Average budget per certification category:")
# Print the series computed above (the original referenced the undefined name
# `average_budget_per`, which raises NameError).
print(average_budget_per_certification)