In [None]:
import getpass
import json
import os
import time

import pandas as pd
import requests
from pandas import json_normalize

# API credentials are never stored in the notebook itself: each key is read
# from an environment variable, and when that variable is unset (or empty)
# the user is prompted for it. getpass does not echo what is typed, so the
# key does not end up in the saved cell output either.

# New York Times API key
nyt_api_key = os.environ.get("NYT_API_KEY") or getpass.getpass("NYT API key: ")
# TMDb API key
tmdb_api_key = os.environ.get("TMDB_API_KEY") or getpass.getpass("TMDb API key: ")

# Ask the user for the keyword (movie title) to search for
search_title = input("Please type your keyword here: ")

# Article Search endpoint of the New York Times API
nyt_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Only keep movie reviews whose headline contains the keyword; the three
# conditions are combined with AND
filter_query = " AND ".join([
    'section_name:"Movies"',
    'type_of_material:"Review"',
    f'headline:"{search_title}"',
])

# Newest reviews first, restricted to the fields used later on
sort = "newest"
field_list = ",".join([
    "headline", "web_url", "snippet", "source",
    "keywords", "pub_date", "byline", "word_count",
])

# Wide publication window (YYYYMMDD) to maximize the number of results
begin_date, end_date = "20130101", "20240531"

# Accumulates the reviews retrieved from every page
all_reviews = []

# Request up to 20 result pages, starting from page 0, and collect every
# review returned. Pagination stops early on an empty page, on a non-200
# status code, or when the request raises.
#
# The query parameters are handed to requests through `params` instead of
# being spliced into the URL by hand: the filter contains spaces, quotes and
# the user's keyword, and requests URL-encodes each value so that characters
# such as "&" or "#" in the keyword cannot corrupt the query string.
query_params = {
    "api-key": nyt_api_key,
    "begin_date": begin_date,
    "end_date": end_date,
    "fq": filter_query,
    "sort": sort,
    "fl": field_list,
}

for page in range(20):
    try:
        # The timeout keeps a stalled connection from hanging the cell
        response = requests.get(
            nyt_url, params={**query_params, "page": page}, timeout=30
        )

        # Verification step to make sure the GET request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
            print(response.text)
            break

        reviews = response.json().get('response', {}).get('docs', [])

        # An empty page means there is nothing further to fetch
        if not reviews:
            print(f"Page {page} had no results.")
            break

        # Adding every review of this page to the all_reviews list
        all_reviews.extend(reviews)
        print(f"Page {page} retrieved successfully with {len(reviews)} reviews.")

        # 12-second interval between requests to stay within the API
        # query rate limits
        time.sleep(12)

    except Exception as e:
        # Best effort: report the problem and keep whatever was collected
        print(f"An error occurred on page {page}: {e}")
        break

# Report and stop when the search produced no reviews at all; everything
# below runs only when there is something to process
if not all_reviews:
    print(f"No reviews found for the movie title '{search_title}' in the NYT API.")
else:
    # Convert all_reviews to a Pandas DataFrame using json_normalize()
    df = json_normalize(all_reviews)
    # Show the resulting column names. A bare `df.columns` expression is not
    # displayed from inside an if/else branch, so it has to be printed.
    print(df.columns.to_list())

    # 6. Build a "title" column from "headline.main" with the Pandas apply()
    # method and the supplied lambda. The lambda keeps the text that sits
    # between the opening curly quote (U+2018) and the closing curly quote
    # (U+2019) followed by " Review".
    # NOTE: str.find() returns -1 for a marker that is absent, so a headline
    # that does not contain both markers yields a different slice of the
    # headline rather than a missing value.

    if 'headline.main' not in df.columns:
        print("The 'headline.main' column unfortunately wasn't found.")
    else:
        df['title'] = df['headline.main'].apply(
            lambda headline: headline[
                headline.find("\u2018") + 1:headline.find("\u2019 Review")
            ]
        )

    # 7. Use the supplied extract_keywords function to convert the
    # "keywords" column from a list of dictionaries to strings using 
    # the apply() method.
    
    def extract_the_keywords(keyword_list):
        """Join the 'value' entries of a list of keyword dictionaries.

        Entries without a 'value' key are skipped. Anything that is not a
        list (for example a missing cell) produces an empty string.
        """
        if not isinstance(keyword_list, list):
            return ''

        values = []
        for entry in keyword_list:
            if 'value' in entry:
                values.append(entry['value'])
        return ', '.join(values)

    # Replace each row's list of keyword dictionaries in the "keywords"
    # column with a single string; report when the column is absent
    if 'keywords' not in df.columns:
        print("The 'keywords' column is not found in the DataFrame.")
    else:
        df['keywords'] = df['keywords'].apply(extract_the_keywords)

# PART TWO

    # Create an empty list called tmdb_movies_list to store the results
    # from your API requests. This will contain a list of dictionaries.
    
    # Collect the extracted titles as a plain list; fall back to an empty
    # list (with a message) when no "title" column was created
    if 'title' not in df.columns:
        titles = []
        print("The 'title' column is not found in the DataFrame.")
    else:
        titles = df['title'].to_list()

    tmdb_search_url = "https://api.themoviedb.org/3/search/movie"

    # List to store movie details (one dictionary per movie found)
    tmdb_movies_list = []

    # request_counter starts at 1 and goes up by one for every title in the
    # titles list. Whenever it reaches a multiple of 50 the loop prints a
    # message and sleeps for one second.
    request_counter = 1

    def _fetch_tmdb_details(title):
        """Search TMDb for ``title`` and return the details of the first hit.

        The ID of the first search result is used to request the full movie
        record. Query parameters are passed through ``params`` so that
        requests URL-encodes the title; titles containing characters such as
        "&" or "#" would otherwise break the query string.

        Returns a dictionary with title, original_title, budget,
        original_language, homepage, overview, popularity, runtime, revenue,
        release_date, vote_average and vote_count, plus the lists genres,
        spoken_languages and production_countries. Returns None, after
        printing the reason, when the search request fails, the search has
        no results, or the details request fails.
        """
        search_response = requests.get(
            tmdb_search_url,
            params={'query': title, 'api_key': tmdb_api_key},
            timeout=30,  # do not let a stalled connection hang the cell
        )
        if search_response.status_code != 200:
            print(f"Unfortunately,failed to search for the movie '{title}'. Status code: {search_response.status_code}")
            return None

        search_results = search_response.json().get('results', [])
        if not search_results:
            print(f"Unfortunately, no results were found for the movie '{title}'.")
            return None

        # Take the first result's ID and request the full movie details
        movie_id = search_results[0]['id']
        details_response = requests.get(
            f"https://api.themoviedb.org/3/movie/{movie_id}",
            params={'api_key': tmdb_api_key},
            timeout=30,
        )
        if details_response.status_code != 200:
            print(f"Unfortunately, failed to retrieve details for the movie '{title}'. Status code: {details_response.status_code}")
            return None

        movie_details = details_response.json()

        # Fields copied as they are; a missing field becomes None
        copied_fields = [
            'title', 'original_title', 'budget', 'original_language',
            'homepage', 'overview', 'popularity', 'runtime', 'revenue',
            'release_date', 'vote_average', 'vote_count',
        ]
        movie_info = {field: movie_details.get(field) for field in copied_fields}

        # Nested lists of dictionaries are reduced to lists of names
        movie_info['genres'] = [
            genre['name'] for genre in movie_details.get('genres', [])
        ]
        movie_info['spoken_languages'] = [
            lang['english_name'] for lang in movie_details.get('spoken_languages', [])
        ]
        movie_info['production_countries'] = [
            country['name'] for country in movie_details.get('production_countries', [])
        ]
        return movie_info

    # Loop through each title to get the movie ID and then fetch movie details
    for title in titles:
        try:
            if not isinstance(title, str) or not title.strip():
                # Nothing usable was extracted from the headline, so there
                # is nothing to search for
                print(f"Skipping an empty or invalid title: {title!r}")
            else:
                movie_info = _fetch_tmdb_details(title)
                if movie_info is not None:
                    # Appending the dictionary to the tmdb_movies_list
                    tmdb_movies_list.append(movie_info)
                    print(f"Details for the movie '{title}' successfully retrieved.")
        except Exception as e:
            # Best effort: one failing title must not stop the whole run
            print(f"Unfortunately, an error occurred while trying to process the movie '{title}': {e}")

        # The check happens before the increment, so the pause follows the
        # 50th, 100th, ... title, and it sits outside the try block so that
        # titles that raised are counted as well.
        if request_counter % 50 == 0:
            print("Seems that we've reached 50 requests, time to sleep for 1 second, else we may hit a limit...")
            time.sleep(1)
        request_counter += 1

    movie_details_df = pd.json_normalize(tmdb_movies_list)
    print(movie_details_df.head(5))

# PART THREE
    
    # 1. Merge the New York Times reviews and TMDB DataFrames on the title column.
    merged_df = pd.merge(df, movie_details_df, on='title', how='inner')

    # 2a. The list of columns that need fixing
    columns_to_fix = ['genres', 'spoken_languages', 'production_countries']

    # 2b. List of characters to remove
    characters_to_remove = ["[", "]", "'"]

    # 2c. Loop through the characters_to_remove and use the Pandas
    # str.replace() method to remove the character from the string.
    for column in columns_to_fix:
        merged_df[column] = merged_df[column].astype(str)
        for char in characters_to_remove:
            merged_df[column] = merged_df[column].str.replace(char, '')

    # 2d. Print the head of the updated DataFrame to confirm the list
    # characters were removed
    print("Updated DataFrame:")
    merged_df.head()

In [31]:
# Preview the first five TMDb results as indented JSON. The string is
# printed because a notebook only displays a cell's final expression, and
# here that is the DataFrame preview below.
print(json.dumps(tmdb_movies_list[:5], indent=4))

# This is where tmdb_movies_list is converted to a DataFrame
tmdb_df = pd.DataFrame(tmdb_movies_list)

# Display the first few rows of the DataFrame
tmdb_df.head(5)

Unnamed: 0,title,original_title,budget,original_language,homepage,overview,popularity,runtime,revenue,release_date,vote_average,vote_count,genres,spoken_languages,production_countries
0,Rebel Moon - Part Two: The Scargiver,Rebel Moon - Part Two: The Scargiver,83000000,en,https://www.netflix.com/title/81624666,The rebels gear up for battle against the ruth...,818.825,123,0,2024-04-19,6.097,782,"[Science Fiction, Action, Drama]",[English],[United States of America]
1,Arcadian,Arcadian,0,en,,"In a near future, normal life on Earth has bee...",141.448,92,859453,2024-04-12,6.349,126,"[Science Fiction, Thriller, Horror]",[English],"[Ireland, United States of America]"
2,Glitter,Glitter,22000000,en,,"A young woman is catapulted into pop stardom, ...",13.298,104,5271666,2001-09-21,4.404,135,"[Drama, Music, Romance]",[English],[United States of America]
3,Dune: Part Two,Dune: Part Two,190000000,en,https://www.dunemovie.com,Follow the mythic journey of Paul Atreides as ...,1242.512,167,704300257,2024-02-27,8.184,3976,"[Science Fiction, Adventure]",[English],[United States of America]
4,Io Capitano,Io capitano,13272819,it,https://cohenmedia.net/product/io-capitano,"Longing for a brighter future, two Senegalese ...",108.517,121,0,2023-09-07,7.793,509,"[Adventure, Drama]","[English, Italian, French, Wolof]","[Belgium, France, Italy]"


In [34]:
# Write the merged reviews/TMDb table to a CSV file in the working
# directory, leaving out the index column, then confirm on screen
merged_df.to_csv(path_or_buf='merged_movie_data.csv', index=False)
print('Data has been exported')

Data has been exported
