In [1]:
# To use the URL to python
import requests
import pandas as pd
import time  # For handling delays

In [2]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [3]:
api_key = 'api_key_here'

In [4]:
# Sending a get request to TMDB API to fetch the list of popular movies (page 1) in English
requests.get(f'https://api.themoviedb.org/3/movie/popular?api_key={api_key}&language=en-US&page=1')

<Response [200]>

In [5]:
# To see the data - use .json() after the above 
requests.get(f'https://api.themoviedb.org/3/movie/popular?api_key={api_key}&language=en-US&page=1').json()

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/3V4kLQg0kSqPLctI5ziYWabAZYF.jpg',
   'genre_ids': [878, 28, 12],
   'id': 912649,
   'original_language': 'en',
   'original_title': 'Venom: The Last Dance',
   'overview': "Eddie and Venom are on the run. Hunted by both of their worlds and with the net closing in, the duo are forced into a devastating decision that will bring the curtains down on Venom and Eddie's last dance.",
   'popularity': 3047.508,
   'poster_path': '/aosm8NMQ3UyoBVpSxyimorCQykC.jpg',
   'release_date': '2024-10-22',
   'title': 'Venom: The Last Dance',
   'video': False,
   'vote_average': 6.489,
   'vote_count': 789},
  {'adult': False,
   'backdrop_path': '/18TSJF1WLA4CkymvVUcKDBwUJ9F.jpg',
   'genre_ids': [27, 53, 9648],
   'id': 1034541,
   'original_language': 'en',
   'original_title': 'Terrifier 3',
   'overview': "Five years after surviving Art the Clown's Halloween massacre, Sienna and Jonathan are still struggling to rebuild their shatter

In [6]:
response = requests.get(f'https://api.themoviedb.org/3/movie/popular?api_key={api_key}&language=en-US&page=1').json()

In [7]:
response['total_results']

944689

In [8]:
response['total_pages']

47235

In [9]:
# Extracting relevant movie details from the API response and storing them in a list of dictionaries
movie_data = []
for i in response['results']:
    movie_data.append({
        'title': i['title'],
        'overview': i.get('overview', ''),
        'original_language': i['original_language'],
        'release_date': i.get('release_date', ''),
        'popularity': i['popularity'],
        'vote_count': i['vote_count'],
        'vote_average': i['vote_average']
    })
df = pd.DataFrame(movie_data)

In [10]:
df

Unnamed: 0,title,overview,original_language,release_date,popularity,vote_count,vote_average
0,Venom: The Last Dance,Eddie and Venom are on the run. Hunted by both...,en,2024-10-22,3047.508,789,6.489
1,Terrifier 3,Five years after surviving Art the Clown's Hal...,en,2024-10-09,1929.351,1031,6.909
2,The Wild Robot,"After a shipwreck, an intelligent robot called...",en,2024-09-12,1808.363,2938,8.471
3,Apocalypse Z: The Beginning of the End,When a kind of rabies that transforms people i...,es,2024-10-04,1638.618,498,6.784
4,Gladiator II,Years after witnessing the death of the revere...,en,2024-11-13,1742.5,450,6.791
5,Deadpool & Wolverine,A listless Wade Wilson toils away in civilian ...,en,2024-07-24,1327.058,5488,7.7
6,The Substance,A fading celebrity decides to use a black mark...,en,2024-09-07,1205.748,2082,7.274
7,Classified,Operating alone in the field for more than 20 ...,en,2024-09-19,1158.653,56,5.545
8,Transformers One,The untold origin story of Optimus Prime and M...,en,2024-09-11,1113.84,727,8.1
9,Red One,After Santa Claus (codename: Red One) is kidna...,en,2024-10-31,920.72,136,6.588


In [12]:
all_movie_data = []

for j in range(1, 501): # Data for pages 1 to 500
    response = requests.get(f'https://api.themoviedb.org/3/movie/popular?api_key={api_key}&language=en-US&page={j}').json()

    movie_data = []
    for i in response['results']:
        movie_data.append({
            'title': i.get('title', 'N/A'),
            'movie_ID': i.get('id', None),
            'overview': i.get('overview', ''),
            'original_language': i.get('original_language', 'N/A'),
            'release_date': i.get('release_date', 'N/A'),
            'popularity': i.get('popularity', 0),
            'vote_count': i.get('vote_count', 0),
            'vote_average': i.get('vote_average', 0)
        })
    
    all_movie_data.extend(movie_data)

In [14]:
# Convert the collected movie data to a DataFrame
df = pd.DataFrame(all_movie_data)

# Save the DataFrame to a CSV file
df.to_csv('Movies.csv', index=False)

print("Data collection complete. Data saved to 'Movies.csv'.")

Data collection complete. Data saved to 'Movies.csv'.


In [15]:
df['overview'] = df['overview'].replace({r'\n': ' ', r'\r': ' '}, regex=True)

# Save the cleaned DataFrame to a new CSV file
df.to_csv('Cleaned_Movies.csv', index=False)

print("Line breaks in 'overview' column replaced. Data saved to 'Cleaned_Movies.csv'.")

Line breaks in 'overview' column replaced. Data saved to 'Cleaned_Movies.csv'.


### Extracting the details of the movie using movie id


In [16]:
movies_df = pd.read_csv('Cleaned_Movies.csv')
movie_ids = movies_df['movie_ID'].dropna().unique()

In [17]:
print(len(movie_ids))

9130


In [None]:
movie_details = []

# Loop through movie IDs and fetch details
for idx, movie_id in enumerate(movie_ids, 1):
    try:
        # Make the API request
        response = requests.get(
            f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}').json()

        # Append the details for each movie
        movie_details.append({
            'movie_ID': movie_id,
            'budget': response.get('budget', 0),
            'genres': [genre['name'] for genre in response.get('genres', [])],
            'homepage': response.get('homepage', ''),
            'production_companies': [company['name'] for company in response.get('production_companies', [])],
            'production_countries': [country['name'] for country in response.get('production_countries', [])],
            'revenue': response.get('revenue', 0),
            'runtime': response.get('runtime', None)
        })

        # Print progress every 100 movies
        if idx % 100 == 0:
            print(f"Processed {idx}/{len(movie_ids)} movies...")

        # Add a delay to respect API rate limits
        time.sleep(0.4)

    except Exception as e:
        print(f"Error fetching details for movie ID {movie_id}: {e}")

# Convert the movie details into a DataFrame
details_df = pd.DataFrame(movie_details)

# Save the DataFrame to a CSV file
details_df.to_csv('Details.csv', index=False)

print("Movie details collection complete. Data saved to 'Details.csv'.")

All the movies were processed and stored in Details.csv

### Dropping the dupliacted rows in Cleaned_movies.csv

In [33]:
cleaned_movies_df = pd.read_csv('Cleaned_Movies.csv')

# Drop duplicate rows based on the 'movie_ID' column
cleaned_movies_df = cleaned_movies_df.drop_duplicates(subset='movie_ID')

# Save the cleaned data back to CSV
cleaned_movies_df.to_csv('Cleaned_Movies_1.csv', index=False)

# Print the shape to verify the number of rows and columns
print(f"Cleaned data: {cleaned_movies_df.shape[0]} rows, {cleaned_movies_df.shape[1]} columns")

Cleaned data: 9130 rows, 8 columns


### Combining the movies and details Datasets

In [34]:
updated_details_df

Unnamed: 0,movie_ID,budget,genres,homepage,production_companies,production_countries,revenue,runtime
0,912649,120000000,"['Science Fiction', 'Action', 'Adventure']",https://venom.movie,"['Columbia Pictures', 'Pascal Pictures', 'Matt...",['United States of America'],394000000,109
1,1034541,2000000,"['Horror', 'Thriller', 'Mystery']",https://terrifier3.com/,"['Cineverse', 'Bloody Disgusting', 'Dark Age C...",['United States of America'],78573405,125
2,1184918,78000000,"['Animation', 'Science Fiction', 'Family']",https://www.thewildrobotmovie.com,['DreamWorks Animation'],['United States of America'],308583746,102
3,1118031,0,"['Drama', 'Action', 'Horror']",https://nostromopictures.com/en/movies/coming-...,['Nostromo Pictures'],['Spain'],0,119
4,558449,310000000,"['Action', 'Adventure', 'Drama']",https://www.gladiator.movie,"['Paramount Pictures', 'Red Wagon Entertainmen...",['United States of America'],87000000,148
...,...,...,...,...,...,...,...,...
9125,25633,1758000,"[Crime, Drama, Romance]",,"[Euterpe Productions, Metro-Goldwyn-Mayer]",[United States of America],2380000,99
9126,1048522,3000000,"[Comedy, Drama]",https://www.musicboxfilms.com/film/fremont/,"[Butimar Productions, Extra A Productions, Blu...",[United States of America],600376,92
9127,38745,112000000,"[Family, Comedy, Adventure, Fantasy]",,"[Dune Entertainment, Davis Entertainment, Dune...",[United States of America],237382724,85
9128,373569,42000000,"[Action, Comedy]",,"[Chernin Entertainment, Feigco Entertainment, ...",[United States of America],60800000,90


In [35]:
cleaned_movies_df

Unnamed: 0,title,movie_ID,overview,original_language,release_date,popularity,vote_count,vote_average
0,Venom: The Last Dance,912649,Eddie and Venom are on the run. Hunted by both...,en,2024-10-22,3047.508,789,6.489
1,Terrifier 3,1034541,Five years after surviving Art the Clown's Hal...,en,2024-10-09,1929.351,1031,6.909
2,The Wild Robot,1184918,"After a shipwreck, an intelligent robot called...",en,2024-09-12,1808.363,2938,8.471
3,Apocalypse Z: The Beginning of the End,1118031,When a kind of rabies that transforms people i...,es,2024-10-04,1638.618,498,6.784
4,Gladiator II,558449,Years after witnessing the death of the revere...,en,2024-11-13,1742.500,450,6.791
...,...,...,...,...,...,...,...,...
9173,Death Hunt,26444,"Yukon Territory, Canada, November 1931. Albert...",en,1981-04-15,20.825,177,6.600
9174,New Female Secretary,918487,"Choi Kyeong-ri, a job seeker, succeeds in find...",ko,2021-12-16,8.924,2,7.000
9175,Get the Gringo,80389,A career criminal nabbed by Mexican authoritie...,en,2012-03-15,20.822,1622,6.600
9176,Trouble,570480,A pampered dog named Trouble must learn to liv...,en,2019-08-08,19.490,220,6.582


In [36]:
# Merge the two DataFrames on the 'movie_ID' column
combined_df = pd.merge(cleaned_movies_df, updated_details_df, on='movie_ID', how='left')

# Save the combined dataset to a new CSV file
combined_df.to_csv('../data/Combined_Movies_Details.csv', index=False)

# Printing the shape of the combined DataFrame to verify
print(f"Combined data: {combined_df.shape[0]} rows, {combined_df.shape[1]} columns")

Combined data: 9130 rows, 15 columns


## Adding cast and crew Details

In [37]:
details_df = pd.read_csv('../data/Combined_Movies_Details.csv')

movie_ids = details_df['movie_ID'].values

In [None]:
# List to store the newly fetched movie details (movie_id, director, top_3_actors)
new_movie_details = []

# Loop through each movie ID in your dataset
for movie_id in movie_ids:
    try:
        # Fetch cast and crew data from the TMDB API using the movie ID
        response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}')
        movie_data = response.json()

        # Extract the director's name from the crew
        director = None
        for member in movie_data.get('crew', []):
            if member['job'] == 'Director':
                director = member['name']
                break
        
        # Extract the top 3 actors' names from the cast (limit to the first 3 actors)
        actors = [actor['name'] for actor in movie_data.get('cast', [])[:3]]

        # Create a dictionary with movie details (ID, Director, Top 3 Actors)
        movie_detail = {
            'movie_ID': movie_id,
            'director': director,
            'top_actors': ', '.join(actors)  # Join top 3 actors' names with commas
        }

        # Add the movie details to the list
        new_movie_details.append(movie_detail)

        # Adding a delay to prevent hitting the API too frequently
        time.sleep(0.4)

    except Exception as e:
        print(f"Error fetching details for movie ID {movie_id}: {e}")

# Convert the new movie details to a DataFrame
new_details_df = pd.DataFrame(new_movie_details)

# Save the DataFrame to a CSV file
new_details_df.to_csv('../data/Movie_Crew_Details.csv', index=False)

In [49]:
# Load the datasets
combined_movies_df = pd.read_csv('../data/Combined_Movies_Details.csv')
crew_df = pd.read_csv('../data/Movie_Crew_Details.csv')

# Merge the two datasets based on 'movie_ID'
final_df = pd.merge(combined_movies_df, crew_df, on='movie_ID', how='left')

# Save the final dataset
final_df.to_csv('../data/complete_movie_dataset.csv', index=False)

print("Datasets have been successfully merged and saved as 'complete_movie_dataset.csv'.")

Datasets have been successfully merged and saved as 'complete_movie_dataset.csv'.


### Adding Keywords

In [None]:
details_df = pd.read_csv('../data/Combined_Movies_Details.csv')

movie_ids = details_df['movie_ID'].values

# List to store the newly fetched movie details (movie_id, keywords)
new_movie_keywords = []

# Loop through each movie ID in your dataset
for movie_id in movie_ids:
    try:
        # Fetch movie keywords from the TMDB API using the movie ID
        response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}&language=en-US')
        movie_data = response.json()

        # Extract the keywords (list of dictionaries)
        keywords = [keyword['name'] for keyword in movie_data.get('keywords', [])]

        # Create a dictionary with movie details (ID, Keywords)
        movie_keyword_detail = {
            'movie_ID': movie_id,
            'keywords': ', '.join(keywords)  # Join keywords with commas
        }

        # Add the movie details to the list
        new_movie_keywords.append(movie_keyword_detail)

        # Adding a delay to prevent hitting the API too frequently
        time.sleep(0.4)

    except Exception as e:
        print(f"Error fetching keywords for movie ID {movie_id}: {e}")

# Convert the new movie keywords to a DataFrame
new_keywords_df = pd.DataFrame(new_movie_keywords)

# Save the DataFrame to a CSV file
new_keywords_df.to_csv('movie_keywords.csv', index=False)

### Adding missing movie Keywords

In [7]:
# Load the details and existing movie keywords data
details_df = pd.read_csv('../data/Combined_Movies_Details.csv')
keywords_df = pd.read_csv('movie_keywords.csv')

# Extract movie IDs
movie_ids_in_details = details_df['movie_ID'].values
movie_ids_in_keywords = keywords_df['movie_ID'].values

# Find the missing movie IDs (in details but not in keywords)
missing_movie_ids = [movie_id for movie_id in movie_ids_in_details if movie_id not in movie_ids_in_keywords]

# List to store the new movie keyword details
new_movie_keywords = []

# Loop through the missing movie IDs and fetch keywords from TMDB API
for movie_id in missing_movie_ids:
    try:
        # Fetch movie keywords data from the TMDB API
        response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}')
        movie_data = response.json()

        # Extract the keywords from the response
        keywords = [keyword['name'] for keyword in movie_data.get('keywords', [])]

        # Create a dictionary with movie ID and its keywords
        movie_keywords = {
            'movie_ID': movie_id,
            'keywords': ', '.join(keywords)  # Join the keywords as a string
        }

        # Add the movie keyword details to the list
        new_movie_keywords.append(movie_keywords)

    except Exception as e:
        print(f"Error fetching details for movie ID {movie_id}: {e}")

# Convert the new movie keywords details to a DataFrame
new_keywords_df = pd.DataFrame(new_movie_keywords)

# Append the new movie keywords to the existing movie_keywords.csv
new_keywords_df.to_csv('movie_keywords.csv', mode='a', header=False, index=False)

print(f"Successfully fetched and appended keywords for {len(new_movie_keywords)} movies.")

Successfully fetched and appended keywords for 10 movies.


### Merging the keywords back to the complete dataset

In [10]:
# Read the complete_movie_dataset and movie_keywords.csv
complete_movie_dataset = pd.read_csv('../data/complete_movie_dataset.csv')
movie_keywords = pd.read_csv('movie_keywords.csv')

# Merge the datasets based on the 'movie_ID' column
updated_movie_dataset = complete_movie_dataset.merge(movie_keywords, on='movie_ID', how='left')

# Save the updated dataset back to complete_movies_dataset.csv
updated_movie_dataset.to_csv('../data/complete_movie_dataset.csv', index=False)

print("Datasets successfully merged and saved back to complete_movie_dataset.csv")

Datasets successfully merged and saved back to complete_movie_dataset.csv
