# Load the data into your project using pandas:

- Open a Python environment or **Jupyter Notebook**.
- Import the pandas library: **import pandas as pd**.
- Use the copied link addresses to **read each file into a DataFrame** with `pd.read_csv`; the files are tab-separated, so pass `sep='\t'`:

In [1]:
# Imports
import os

import numpy as np
import pandas as pd

In [2]:
# IMDb dataset download links (gzip-compressed, tab-separated files).
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

# Read the three tables one after another, in the order basics -> akas -> ratings.
basics, akas, ratings = (
    pd.read_csv(dataset_url, sep='\t', low_memory=False)
    for dataset_url in (basics_url, akas_url, ratings_url)
)
print("CSV Readings complete")

CSV Readings complete


In [3]:
# Preview the first rows of the title basics table as a quick check of the load.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Perform data preprocessing and filtering:

- Replace the missing-value marker "\N" with np.nan in each DataFrame: `basics = basics.replace({'\\N': np.nan})`, `akas = akas.replace({'\\N': np.nan})`, `ratings = ratings.replace({'\\N': np.nan})`.
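- Filter the basics DataFrame: drop rows with missing `runtimeMinutes` or `genres`, keep only `titleType == 'movie'`, keep `startYear` from 2000 through 2021 (inclusive), and exclude any title whose genres include "Documentary":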

In [4]:
# The raw files use the literal string "\N" for missing values; turn every
# occurrence into NaN so pandas treats it as missing.
basics = basics.replace('\\N', np.nan)
akas = akas.replace('\\N', np.nan)
ratings = ratings.replace('\\N', np.nan)

print("NaNs were Replaced Succesfully")

NaNs were Replaced Succesfully


In [5]:
# Keep only movies released 2000-2021 (inclusive) that have both a runtime and
# genres recorded and are not documentaries.
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

# .copy() makes the filtered result an independent frame, so the column
# assignment below does not trigger SettingWithCopyWarning on a slice.
basics = basics[basics['titleType'] == 'movie'].copy()

# startYear is read as text; unparseable values become NaN and are dropped by
# the range filter that follows.
basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[basics['startYear'].between(2000, 2021)]

# Case-insensitive match so any genre list containing "Documentary" is removed.
basics = basics[~basics['genres'].str.contains('documentary', case=False)]

### Filter the basics DataFrame to include only US movies based on the akas DataFrame:

In [6]:
# Restrict the alternate-titles table to rows whose region is the US.
akas = akas.loc[akas['region'].eq('US')]
print(akas)

# Boolean mask: True for every movie that has at least one US row in akas.
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics.loc[keepers]

            titleId  ordering                                      title  \
5         tt0000001         6                                 Carmencita   
14        tt0000002         7                     The Clown and His Dogs   
33        tt0000005        10                           Blacksmith Scene   
36        tt0000005         1                        Blacksmithing Scene   
41        tt0000005         6                        Blacksmith Scene #1   
...             ...       ...                                        ...   
36220084  tt9916560         1  March of Dimes Presents: Once Upon a Dime   
36220154  tt9916620         1                          The Copeland Case   
36220243  tt9916702         1              Loving London: The Playground   
36220286  tt9916756         1                   Pretty Pretty Black Girl   
36220302  tt9916764         1                                         38   

         region language        types             attributes isOriginalTitle  
5       

In [7]:
# Boolean mask: True for every rating whose title has a US row in akas.
keepers2 = ratings['tconst'].isin(akas['titleId'])
ratings = ratings.loc[keepers2]

In [8]:
# Show row count, column dtypes and non-null counts of the filtered basics table.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81762 entries, 34803 to 9930477
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81762 non-null  object 
 1   titleType       81762 non-null  object 
 2   primaryTitle    81762 non-null  object 
 3   originalTitle   81762 non-null  object 
 4   isAdult         81762 non-null  object 
 5   startYear       81762 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  81762 non-null  object 
 8   genres          81762 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.2+ MB


In [9]:
# Show row count, column dtypes and non-null counts of the filtered ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 501175 entries, 0 to 1319567
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         501175 non-null  object 
 1   averageRating  501175 non-null  float64
 2   numVotes       501175 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [10]:
# Show row count, column dtypes and non-null counts of the US-only akas table.
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1444985 entries, 5 to 36220302
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1444985 non-null  object
 1   ordering         1444985 non-null  int64 
 2   title            1444985 non-null  object
 3   region           1444985 non-null  object
 4   language         3951 non-null     object
 5   types            980119 non-null   object
 6   attributes       46763 non-null    object
 7   isOriginalTitle  1443643 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.2+ MB


# Save the filtered DataFrames to compressed CSV files:

- Create a "Data" folder within your repository if it doesn't already exist.
- Use the to_csv method to save each DataFrame with compression:

In [None]:
# to_csv does not create missing directories, so make sure the output folder
# exists first (a no-op when it is already there).
os.makedirs("Data", exist_ok=True)

# Write each filtered table as a gzip-compressed CSV without the index column.
basics.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)
akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)
ratings.to_csv("Data/title_ratings.csv.gz", compression='gzip', index=False)

In [None]:
# Point the reader to the folder the compressed CSV files are written to.
print("Check \"Data\" Folder")

In [None]:
import requests
import time

max_retries = 3       # total number of attempts per URL
retry_delay = 5       # seconds to wait between attempts
request_timeout = 10  # seconds before a single attempt is abandoned

def make_api_request(url):
    """Fetch ``url`` and return its decoded JSON body, retrying on failure.

    Up to ``max_retries`` attempts are made. An attempt fails when the request
    raises a ``requests`` exception (including a timeout), when the status code
    is not 200, or when the body is not valid JSON. The function waits
    ``retry_delay`` seconds between attempts, but not after the last one.

    Parameters
    ----------
    url : str
        Address to request with HTTP GET.

    Returns
    -------
    object or None
        The parsed JSON payload of the first successful attempt, or ``None``
        when every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=request_timeout)
            if response.status_code == 200:
                return response.json()
        except (requests.exceptions.RequestException, ValueError):
            # Network error or undecodable JSON body: treat as a failed
            # attempt and fall through to the retry logic below.
            pass

        # Only pause when another attempt is still to come.
        if attempt < max_retries:
            time.sleep(retry_delay)

    return None

# Usage
# NOTE(review): `url` is not assigned anywhere in this cell; set it to the
# endpoint you want to query before running.
response = make_api_request(url)
if response is not None:
    # Process the response data here. `pass` keeps the branch syntactically
    # valid until real handling is added.
    pass
else:
    print("Unable to retrieve data from the API.")


# **<span style="color:#266d07">Part 2[MORE DATA]</span>**

In [None]:
import requests
import json as jn

# Read back the three compressed CSV files produced in Part 1.
basics, akas, ratings = map(
    pd.read_csv,
    (
        "Data/title_basics.csv.gz",
        "Data/title_akas.csv.gz",
        "Data/title_ratings.csv.gz",
    ),
)

# Function to add certification (MPAA Rating) and financials to movie information
def add_certification(movie_info):
    """Look a movie up on TMDB and attach its US certification and financials.

    The movie is requested by its IMDb id (``movie_info['tconst']``) with
    ``append_to_response=release_dates``; without that parameter the movie
    endpoint does not include release/certification data.

    Parameters
    ----------
    movie_info : mapping (e.g. a DataFrame row)
        Must contain the key ``'tconst'``. It is modified in place.

    Returns
    -------
    The same ``movie_info`` object with three keys added:
    ``'certification'`` (first non-empty US certification, else ``None``),
    ``'budget'`` and ``'revenue'`` (``0`` when the response has no value).

    Raises
    ------
    RuntimeError
        If the ``TMDB_API_KEY`` environment variable is not set.
    requests.exceptions.RequestException
        If the HTTP request itself fails or times out.
    """
    # Read the key from the environment instead of committing it to the notebook.
    api_key = os.environ.get("TMDB_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the TMDB_API_KEY environment variable to your TMDB API key."
        )

    movie_id = movie_info['tconst']
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {"api_key": api_key, "append_to_response": "release_dates"}
    response = requests.get(url, params=params, timeout=10)
    data = response.json()

    certification = None
    release_info = data.get('release_dates') or {}
    for result in release_info.get('results', []):
        if result.get('iso_3166_1') != 'US':
            continue
        # A country can list several releases and some carry an empty
        # certification, so take the first one that is actually filled in.
        for release in result.get('release_dates', []):
            if release.get('certification'):
                certification = release['certification']
                break
        break

    movie_info['certification'] = certification
    # The later filtering step selects on these two fields; default to 0 so
    # titles without data fail the "> 0" test instead of raising.
    movie_info['budget'] = data.get('budget') or 0
    movie_info['revenue'] = data.get('revenue') or 0
    return movie_info

# Years whose financial data is collected and saved (2000 and 2001).
FINANCIAL_YEARS = range(2000, 2002)

# Only query the API for movies from the years that are saved below.
# add_certification makes one HTTP request per row, so applying it to every
# title in basics would spend most requests on rows that are then discarded.
basics_for_years = basics[basics['startYear'].isin(list(FINANCIAL_YEARS))]

# Apply certification function to each movie
movies_with_certification = basics_for_years.apply(add_certification, axis=1)

# Filter movies with valid financial information
# NOTE(review): this needs 'budget' and 'revenue' columns on the rows returned
# by add_certification; confirm that function provides them.
has_budget = movies_with_certification['budget'] > 0
has_revenue = movies_with_certification['revenue'] > 0
financial_info = movies_with_certification[has_budget | has_revenue]

# Save results to separate CSV files for each year
for year in FINANCIAL_YEARS:
    year_filtered = financial_info[financial_info['startYear'] == year]
    year_filtered.to_csv(f"Data/year_{year}_financial_info.csv.gz", compression='gzip', index=False)

# Concatenate the data into one dataframe
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row numbers.
all_data = pd.concat(
    [pd.read_csv(f"Data/year_{year}_financial_info.csv.gz") for year in FINANCIAL_YEARS],
    ignore_index=True,
)

# How many movies had at least some valid financial information?
valid_financial_info_movies = len(all_data)
print(f"Number of movies with valid financial information: {valid_financial_info_movies}")

# How many movies are there in each certification category?
certification_counts = all_data['certification'].value_counts()
print("Number of movies in each certification category:")
print(certification_counts)

# What is the average revenue per certification category?
average_revenue_per_certification = all_data.groupby('certification')['revenue'].mean()
print("Average revenue per certification category:")
print(average_revenue_per_certification)

# What is the average budget per certification category?
average_budget_per_certification = all_data.groupby('certification')['budget'].mean()
print("Average budget per certification category:")
print(average_budget_per_certification)