In [None]:
import requests
import os
# TMDB API key. Taken from the TMDB_API_KEY environment variable when it is set, so a
# real key never has to be saved inside the notebook; the 'XXXX' placeholder remains
# the fallback when the variable is unset or empty.
api_key = os.environ.get('TMDB_API_KEY') or 'XXXX'
import csv
from time import sleep
from random import randint
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import time

In [None]:
# Crawl size and the accumulators filled by the fetch loop
num_pages = 100  # pages 1..num_pages of the Top Rated list are requested
top_rated_movies_data = list()  # one dict per kept film
release_date_list, type_list = list(), list()  # release date / release type per film

In [None]:
# Crawl TMDB's Top Rated list page by page. A film is kept when its US release is
# dated 2000-2022, or when no usable US release record is available; for every kept
# film the details, keywords, IMDb ID and key credits are fetched.
#
# The three accumulator lists are appended to together, exactly once per kept film,
# so they always have equal length: a later cell attaches release_date_list and
# type_list as columns to the DataFrame built from top_rated_movies_data.
url = 'https://api.themoviedb.org/3'
desired_country_code = 'US'
first_release_year, last_release_year = 2000, 2022
request_timeout = 30  # seconds; without a timeout one stalled connection hangs the crawl


def _tmdb_get(endpoint, **params):
    """Send a GET request to a TMDB endpoint.

    Returns a (status_code, payload) tuple. payload is the decoded JSON body when
    the server answered 200 and None otherwise. A network-level failure or an
    undecodable body is reported as (None, None) instead of raising, so one bad
    request does not abort the whole crawl.
    """
    try:
        response = requests.get(
            url + endpoint,
            params={'api_key': api_key, **params},
            timeout=request_timeout,
        )
        if response.status_code != 200:
            return response.status_code, None
        return response.status_code, response.json()
    except (requests.RequestException, ValueError) as error:
        # Only the exception type is printed: the exception text can contain the
        # full request URL, including the API key.
        print(f"Request to {endpoint} failed ({type(error).__name__})")
        return None, None


def _us_release(movie_id):
    """Return (release_date, type) of the film's first US release record.

    Returns None when the lookup fails, when the film has no US entry, or when
    the US entry carries no release records.
    """
    status, release_data = _tmdb_get(f'/movie/{movie_id}/release_dates')
    if release_data is None:
        print(f"Failed to retrieve release dates for movie {movie_id}. Status Code: {status}")
        return None
    for entry in release_data.get('results', []):
        if entry.get('iso_3166_1') == desired_country_code:
            releases = entry.get('release_dates') or []
            if not releases:
                return None
            return releases[0]['release_date'], releases[0]['type']
    return None


def _first_crew_name(crew, job):
    """Return the name of the first crew member whose job equals `job`, else 'n/a'."""
    member = next((item for item in crew if item.get('job') == job), None)
    if member is None:
        return 'n/a'
    return member.get('name', 'n/a')


def _movie_record(movie_id, title):
    """Build the output row (a dict) for one film.

    Returns None when the film's details cannot be fetched. Failures of the
    secondary lookups (keywords, external IDs, credits) are reported and replaced
    by empty / 'n/a' values, never by values left over from a previous film.
    """
    status, movie_details = _tmdb_get(f'/movie/{movie_id}')
    if movie_details is None:
        print(f"Failed to retrieve details for movie {movie_id}. Status Code: {status}")
        return None

    # List-valued fields fall back to an empty list so that the later cells, which
    # call len() on them and index into them, keep working.
    genres = movie_details.get('genres') or []
    genre_1 = genres[0]['name'] if len(genres) > 0 else 'n/a'
    genre_2 = genres[1]['name'] if len(genres) > 1 else 'n/a'
    production_countries = movie_details.get('production_countries') or []
    spoken_languages = movie_details.get('spoken_languages') or []

    # Keep the first five keywords
    status, keywords_data = _tmdb_get(f'/movie/{movie_id}/keywords')
    if keywords_data is None:
        print(f"Failed to retrieve keywords for movie {movie_id}. Status Code: {status}")
        keywords = []
    else:
        keywords = [keyword['name'] for keyword in (keywords_data.get('keywords') or [])[:5]]

    # IMDb ID: the external_ids answer wins; the details value is the fallback
    imdb_id = movie_details.get('imdb_id', 'n/a')
    status, imdb_data = _tmdb_get(f'/movie/{movie_id}/external_ids')
    if imdb_data is None:
        print(f"Failed to retrieve IMDB ID for movie {movie_id}. Status Code: {status}")
    else:
        imdb_id = imdb_data.get('imdb_id', 'n/a')

    # Director, screenplay writer and the first two cast members
    status, credits_data = _tmdb_get(f'/movie/{movie_id}/credits', language='en-US')
    if credits_data is None:
        print("Director not found in the data.")
        credits_data = {}
    crew = credits_data.get('crew') or []
    cast = credits_data.get('cast') or []

    return {'Title': title,
            'ID': movie_id,
            'IMDB_ID': imdb_id,
            'Revenue': movie_details.get('revenue', 'n/a'),
            'Budget': movie_details.get('budget', 'n/a'),
            'Runtime': movie_details.get('runtime', 'n/a'),
            'Genre_1': genre_1,
            'Genre_2': genre_2,
            'Vote': movie_details.get('vote_average', 'n/a'),
            'Vote_count': movie_details.get('vote_count', 'n/a'),
            'Production_countries': production_countries,
            'spoken_languages': spoken_languages,
            'keywords': keywords,
            'director': _first_crew_name(crew, 'Director'),
            'screenplay': _first_crew_name(crew, 'Screenplay'),
            'actor_1': cast[0]['name'] if len(cast) > 0 else 'N/A',
            'actor_2': cast[1]['name'] if len(cast) > 1 else 'N/A',
            }


for page in range(1, num_pages + 1):

    print(f"Entering page {page}")

    status, page_data = _tmdb_get('/movie/top_rated', language='en-US', page=page)
    if page_data is None:
        print(f"Error: {status} - Unable to fetch data from page {page}")
        continue
    time.sleep(2)  # pause between successfully fetched pages

    for movie in page_data.get('results', []):
        movie_id = movie['id']
        title = movie['title']

        us_release = _us_release(movie_id)
        if us_release is None:
            # Unknown US release: keep the film with placeholder values
            release_date, release_type = '0000-00-00T00:00:00.000Z', 'N/A'
        else:
            release_date, release_type = us_release
            release_year = int(release_date.split('-')[0])
            if not (first_release_year <= release_year <= last_release_year):
                continue  # outside the requested period; skip before fetching details

        record = _movie_record(movie_id, title)
        if record is None:
            continue  # no details: skip the film instead of storing stale values

        # Append to all three lists at one point so they cannot drift apart
        top_rated_movies_data.append(record)
        release_date_list.append(release_date)
        type_list.append(release_type)
  


In [None]:
# Build the DataFrame from the per-film dicts and attach the release columns
df = pd.DataFrame(top_rated_movies_data).assign(
    Release_Date=release_date_list,
    RDtype=type_list,
)

In [None]:
# Keep an untouched copy of the nested columns that later cells flatten and drop
columns_to_select = ['Production_countries', 'spoken_languages', 'keywords']
df_spare = df.loc[:, columns_to_select].copy()

In [None]:
# Keep only the part of the timestamp before the 'T' separator
df['Release_Date'] = df['Release_Date'].str.split(pat='T').str.get(0)

In [None]:
# Country_1 / Country_2: ISO 3166-1 codes of the first two production countries,
# 'n/a' when the film lists fewer
for country_rank in (0, 1):
    df[f'Country_{country_rank + 1}'] = df['Production_countries'].apply(
        lambda countries, rank=country_rank: (
            countries[rank]['iso_3166_1'] if len(countries) > rank else 'n/a'
        )
    )

In [None]:
# Language_1 / Language_2: English names of the first two spoken languages,
# 'n/a' when the film lists fewer
for language_rank in (0, 1):
    df[f'Language_{language_rank + 1}'] = df['spoken_languages'].apply(
        lambda languages, rank=language_rank: (
            languages[rank]['english_name'] if len(languages) > rank else 'n/a'
        )
    )

In [None]:
# keyword_1 .. keyword_5: one column per stored keyword, 'n/a' where a film has fewer
for keyword_rank in range(5):
    df[f'keyword_{keyword_rank + 1}'] = df['keywords'].apply(
        lambda names, rank=keyword_rank: names[rank] if len(names) > rank else 'n/a'
    )

In [None]:
# Remove the nested source columns now that they have been flattened
df.drop(labels=columns_to_select, axis='columns', inplace=True)

In [None]:
# Save the flattened table as CSV, without the index column
csv_file = "movies.csv"
df.to_csv(path_or_buf=csv_file, index=False)
