In [29]:
import requests
import pandas as pd
import gzip
import json
from datetime import datetime
from time import sleep


In [30]:
# Read the TMDB credentials from a local JSON file so the key never appears
# in the notebook source.
with open('secret/tmdb_api.json', 'r') as f:
    login = json.loads(f.read())
# Show only the key names of the loaded dict (not the secret values).
login.keys()

dict_keys(['api-key'])

In [31]:
def tmdb_api_call(endpoint, params):
    """Send a GET request to a TMDB v3 endpoint and return the decoded JSON.

    Args:
        endpoint: Path relative to ``https://api.themoviedb.org/3``
            (for example ``"movie/603"``).
        params: Dict of query-string parameters, or ``None``. It is copied
            before the API key is added, so the caller's dict is not modified.

    Returns:
        The decoded JSON payload, or ``None`` when the response body is empty
        or is not valid JSON.

    Note:
        The HTTP status code is not inspected here; a JSON error payload is
        returned to the caller like any other response.
    """
    base_url = "https://api.themoviedb.org/3"
    # Work on a copy: writing the key into the caller's dict would leak the
    # credential into whatever the caller does with that dict afterwards.
    query = dict(params or {})
    query["api_key"] = login["api-key"]
    response = requests.get(f"{base_url}/{endpoint}", params=query)
    # Check if the response is not empty before attempting to decode as JSON
    if not response.text:
        return None
    try:
        return response.json()
    except ValueError:
        # Non-empty but non-JSON body (e.g. an HTML error page): treat it the
        # same as an empty response instead of raising.
        return None

def tmdb_movies_api_call(endpoint, params):
    """Fetch every page of a paginated TMDB v3 listing and merge the results.

    A single call to a listing endpoint returns one page only, so this
    function keeps requesting successive pages and concatenates their
    ``results`` lists.

    Args:
        endpoint: Path relative to ``https://api.themoviedb.org/3``
            (for example ``"discover/movie"``).
        params: Dict of query-string parameters, or ``None``. It is copied,
            so the ``api_key`` and ``page`` entries added here do not leak
            back into the caller's dict.

    Returns:
        A list with the ``results`` entries of all pages that were fetched.
        Paging stops at the reported ``total_pages``, or earlier if a response
        is not JSON or carries no ``results`` key.
    """
    base_url = "https://api.themoviedb.org/3"
    query = dict(params or {})
    query["api_key"] = login["api-key"]

    results = []
    page = 1

    while True:
        query["page"] = page
        response = requests.get(f"{base_url}/{endpoint}", params=query)
        try:
            data = response.json()
        except ValueError:
            # Body was not JSON; keep what has been collected so far.
            break

        # NOTE(review): a payload without "results" is presumably an error
        # response (e.g. a page number the API refuses to serve) - confirm.
        if not isinstance(data, dict) or "results" not in data:
            break
        results.extend(data["results"])

        # Stop once the last page has been consumed. The previous check
        # (`page <= total_pages`) always requested one page past the end.
        if page >= data.get("total_pages", page):
            break
        page += 1

    return results


In [32]:

def get_movie_details(movie_id):
    """Look up one movie via ``tmdb_api_call``, asking for credits as well.

    Args:
        movie_id: Identifier interpolated into the ``movie/{movie_id}`` path.

    Returns:
        Whatever ``tmdb_api_call`` returns for that endpoint.
    """
    return tmdb_api_call(f"movie/{movie_id}", {"append_to_response": "credits"})

def get_movie_mpaa_rating(movie_id):
    """Request the ``movie/{movie_id}/releases`` resource for one movie.

    The caller in this notebook reads the per-country ``certification``
    values from the ``countries`` list of the returned payload.

    Args:
        movie_id: Identifier interpolated into the endpoint path.

    Returns:
        Whatever ``tmdb_api_call`` returns for that endpoint.
    """
    releases_endpoint = f"movie/{movie_id}/releases"
    return tmdb_api_call(releases_endpoint, {})

In [33]:
def _us_certification_from_release_dates(data):
    """Return the first non-empty US certification in ``data['release_dates']``, or ''."""
    release_info = data.get("release_dates") or {}
    for country in release_info.get("results", []):
        if country.get("iso_3166_1") != "US":
            continue
        for release in country.get("release_dates", []):
            if release.get("certification"):
                return release["certification"]
    return ''


def add_certification(movie):
    """Add a ``certification`` entry (US MPAA rating) to a movie dict.

    Args:
        movie: Dict describing a movie; its ``'id'`` value is used for the
            lookup. The dict is modified in place.

    Returns:
        The same ``movie`` dict with ``movie['certification']`` set (an empty
        string when no certification is found), or ``None`` when the response
        body is empty.
    """
    tmdb_movie_id = movie['id']

    # Pass the key through `params` rather than formatting it into the URL,
    # and ask for the release dates in the same request: the plain details
    # payload does not carry a top-level certification field.
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{tmdb_movie_id}',
        params={
            'api_key': login["api-key"],
            'language': 'en-US',
            'append_to_response': 'release_dates',
        },
    )
    if not response.text:
        return None
    # The decoded payload used to be overwritten with `{}` right here, which
    # made the certification always come out empty.
    data = response.json()

    # Extract MPAA Rating (Certification) and add it to the movie info
    certification = data.get('certification', '')
    if not certification:
        certification = _us_certification_from_release_dates(data)
    movie['certification'] = certification

    return movie

In [44]:
def _is_us_feature_film(details):
    """Return True when a movie-details payload passes the local filters."""
    genres = details.get("genres")
    runtime = details.get("runtime")
    if not genres or not runtime or runtime < 60:
        return False
    if any(genre["name"] == "Documentary" for genre in genres):
        return False
    # Only movies produced exclusively in the United States are kept.
    return details.get("production_countries", []) == [
        {"iso_3166_1": "US", "name": "United States of America"}
    ]


def _last_us_certification(releases):
    """Return the certification of the last US entry in a releases payload, or None."""
    countries = (releases or {}).get("countries", [])
    return next(
        (
            entry.get("certification")
            for entry in reversed(countries)
            if entry.get("iso_3166_1") == "US"
        ),
        None,
    )


def extract_and_save_movies(start_year, end_year):
    """Collect movies per release year and save them as JSON and gzipped CSV.

    For every year in ``start_year..end_year`` (inclusive) the
    ``discover/movie`` listing is fetched, each hit is enriched with its
    details and release information, and the movies that pass the filters
    (has genres, runtime of at least 60, not a documentary, produced only in
    the US) are kept.

    Side effects:
        * Each accepted movie is appended to ``output.json``. Running the
          function again appends to the existing file.
        * One ``movies_{year}.csv.gz`` file is written per year, holding the
          accepted movies whose ``release_date`` starts with that year.

    Args:
        start_year: First primary release year to query.
        end_year: Last primary release year to query (inclusive).

    Returns:
        None.
    """
    movies_data = []

    for year in range(start_year, end_year + 1):
        # Filter criteria sent to the discover endpoint
        endpoint = "discover/movie"
        params = {
            "primary_release_year": year,
            "include_adult": False,
            "with_original_language": "en",
            "with_runtime.gte": 60,
            "with_genres": "",
            "certification_country": "US",
            "region": "US",
        }

        # Get the list of movies
        for movie in tmdb_movies_api_call(endpoint, params):
            movie_id = movie["id"]
            # Get additional details for the movie; the lookup returns None
            # when the API answers with an empty body, so skip those.
            movie_details = get_movie_details(movie_id)
            if not movie_details or not _is_us_feature_film(movie_details):
                continue

            # Only pay for the releases request once the movie is accepted.
            us_certification = _last_us_certification(get_movie_mpaa_rating(movie_id))
            # add_certification can return None; fall back to the discover
            # record so the vote fields below are still available.
            certificate = add_certification(movie) or movie

            movie_info = {
                "tconst": movie_details.get("imdb_id"),
                "title": movie_details.get("title"),
                "release_date": movie_details.get("release_date"),
                "runtime": movie_details.get("runtime"),
                "genres": movie_details.get("genres"),
                "budget": movie_details.get("budget"),
                "revenue": movie_details.get("revenue"),
                "mpaa_rating": us_certification,
                "vote_average": certificate.get("vote_average"),
                "vote_count": certificate.get("vote_count"),
                "certificate": certificate,
            }

            movies_data.append(movie_info)
            # Write only the new record. Passing the whole `movies_data` list
            # here re-appended every earlier movie on each iteration.
            append_to_json("output.json", [movie_info])

    # Save the data to a CSV file for each year
    for year in range(start_year, end_year + 1):
        year_prefix = str(year)
        year_data = [
            movie
            for movie in movies_data
            # release_date may be missing or None in the details payload
            if (movie.get("release_date") or "").startswith(year_prefix)
        ]
        df = pd.DataFrame(year_data)
        file_name = f"movies_{year}.csv.gz"
        df.to_csv(file_name, index=False, compression="gzip")
       
        
       

In [45]:
def append_to_json(file_path, data):
    """Append records to the JSON list stored at ``file_path``.

    The current file content is loaded first; when the file does not exist or
    cannot be parsed as JSON (for example because it is empty), an empty list
    is used instead. ``data`` is added to the end of that list and the file
    is rewritten with a two-space indent.

    Args:
        file_path: Path of the JSON file to read and rewrite.
        data: Iterable of JSON-serialisable items to append.
    """
    try:
        with open(file_path, "r") as source:
            records = json.load(source)
    except FileNotFoundError:
        # No file yet: start a new list.
        records = []
    except json.JSONDecodeError:
        # File exists but holds no valid JSON (e.g. it is empty).
        records = []

    records.extend(data)

    with open(file_path, "w") as sink:
        json.dump(records, sink, indent=2)
        

In [None]:
# Extract movies for 2000 and 2001; this writes one gzipped CSV per year
# and appends the records to output.json.
extract_and_save_movies(start_year=2000, end_year=2001)

In [28]:
# Smoke test: look up two known movies and print a few of their fields.
test_movie_ids = ['tt0848228', 'tt0332280']

for movie_id in test_movie_ids:
    movie_details = get_movie_details(movie_id)

    # Guard clause: report the failed lookup and move on to the next ID.
    if movie_details is None:
        print(f"Error retrieving details for movie ID {movie_id}")
        continue

    # Build the whole report first, then emit it with a single print; the
    # trailing "\n" entry reproduces the blank lines between movies.
    report_lines = [
        f"Movie ID: {movie_id}",
        f"Title: {movie_details.get('title')}",
        f"Overview: {movie_details.get('overview')}",
        f"Budget: {movie_details.get('budget')}",
        f"Revenue: {movie_details.get('revenue')}",
        f"Certification: {movie_details.get('certification')}",
        "\n",
    ]
    print("\n".join(report_lines))

Movie ID: tt0848228
Title: The Avengers
Overview: When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!
Budget: 220000000
Revenue: 1518815515
Certification: None


Movie ID: tt0332280
Title: The Notebook
Overview: An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.
Budget: 29000000
Revenue: 115603229
Certification: None


