In [1]:
# Import necessary libraries
import pandas as pd
import requests
import os
from google.colab import files

# Upload the .env file manually via Colab's file selector
uploaded = files.upload()

# Colab does not overwrite an existing file: a repeated upload is stored under
# a new name (e.g. ".env (2)"), which would leave a stale ".env" on disk.
# Persist the bytes that were just uploaded to ".env" so the key loader below
# always reads the fresh copy. If several files are selected, the last one wins.
for uploaded_bytes in uploaded.values():
    with open(".env", "wb") as env_file:
        env_file.write(uploaded_bytes)

# Manual .env loading for Colab
def load_api_key_from_env(file_path=".env"):
    """Read the TMDB API key from a simple ``KEY=VALUE`` env file.

    Args:
        file_path: Path of the env file to read. Defaults to ".env".

    Returns:
        The value assigned to ``TMDB_API_KEY`` with surrounding whitespace and
        one pair of matching quotes removed, or ``None`` when the file has no
        non-empty entry for that exact key.

    Raises:
        OSError: If ``file_path`` cannot be opened (for example, it is missing).
    """
    with open(file_path) as f:
        for line in f:
            # partition() cuts at the FIRST "=" only, so a value that itself
            # contains "=" is returned intact.
            name, separator, value = line.partition("=")
            # Require an exact key match; a prefix test would also accept
            # names such as TMDB_API_KEY_OLD. Lines without "=" are skipped.
            if not separator or name.strip() != "TMDB_API_KEY":
                continue
            value = value.strip()
            # Drop one pair of matching quotes: KEY="abc" or KEY='abc'.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            return value or None
    return None

api_key = load_api_key_from_env()

# Fail fast: the poster lookups pass api_key on every request, so continuing
# without one would only produce a long run of failed calls.
if not api_key:
    raise RuntimeError("TMDB_API_KEY was not found in .env")


Saving .env to .env (2)


In [2]:
# Read the cleaned Netflix catalogue, keep just the identifier, title and
# release year, and drop rows that repeat across those three columns.
df = (
    pd.read_csv("netflix_titles_cleaned.csv")
    .loc[:, ['show_id', 'title', 'release_year']]
    .drop_duplicates()
)

# Preview the top of the frame as a sanity check
df.head()


Unnamed: 0,show_id,title,release_year
0,s1,Dick Johnson Is Dead,2020
1,s2,Blood & Water,2021
2,s3,Ganglands,2021
3,s4,Jailbirds New Orleans,2021
4,s5,Kota Factory,2021


In [3]:
from re import search

# Prefix that is prepended to a result's "poster_path" to form the image URL
poster_base_url = "https://image.tmdb.org/t/p/w500"

# TMDB search endpoints: one for movies, one for TV shows
search_url_tv_show = "https://api.themoviedb.org/3/search/tv"
search_url_movie = "https://api.themoviedb.org/3/search/movie"

def _first_poster_path(search_url, params, timeout=10):
    """Return the ``poster_path`` of the top hit from one search endpoint, or None."""
    # Without a timeout a single stalled connection would block the whole
    # row-by-row run indefinitely.
    response = requests.get(search_url, params=params, timeout=timeout)
    # Surface HTTP failures (4xx/5xx) as exceptions instead of reading the
    # error body as "no results".
    response.raise_for_status()
    results = response.json().get("results", [])
    if results:
        return results[0].get("poster_path")
    return None


def get_poster_url(title, year):
    """Look up a poster image URL for a title, trying movies first, then TV.

    Args:
        title: Title text sent as the search query.
        year: Year sent as both the ``year`` and ``first_air_date_year`` filters.

    Returns:
        ``poster_base_url`` joined with the poster path of the first search
        result, or ``None`` when neither search yields a top result with a
        poster or when a request fails.

    Only the first result of each search is considered. Errors are printed
    and converted to ``None`` rather than raised, so the function can be
    applied over a whole DataFrame without aborting on one bad row.
    """
    try:
        # Common parameters, sent unchanged to both endpoints
        params = {
            "api_key": api_key,
            "query": title,
            "first_air_date_year": year,  # For TV show
            "year": year                  # For Movies
        }

        # 1. Movie search; 2. fall back to TV search if no poster was found
        for search_url in (search_url_movie, search_url_tv_show):
            poster_path = _first_poster_path(search_url, params)
            if poster_path:
                return poster_base_url + poster_path

        return None  # Not found

    except Exception as e:
        # Deliberately broad: a single failed lookup must not stop the batch.
        print(f"Error for {title} ({year}): {e}")
        return None


In [4]:
# Trial run: fetch posters for the first 20 rows only, leaving df untouched
test_sample = df.head(20).assign(
    poster_url=lambda sample: sample.apply(
        lambda rec: get_poster_url(rec['title'], rec['release_year']),
        axis=1,
    )
)

# Show the title, year and the poster URL that was found (if any)
test_sample[['title', 'release_year', 'poster_url']]


Unnamed: 0,title,release_year,poster_url
0,Dick Johnson Is Dead,2020,https://image.tmdb.org/t/p/w500/vnfI34kmi3fcdg...
1,Blood & Water,2021,https://image.tmdb.org/t/p/w500/1xdJWTn5Tuqy7y...
2,Ganglands,2021,https://image.tmdb.org/t/p/w500/3E6IPkHH541ii4...
3,Jailbirds New Orleans,2021,https://image.tmdb.org/t/p/w500/pA7urHEBVDSJR7...
4,Kota Factory,2021,
5,Midnight Mass,2021,https://image.tmdb.org/t/p/w500/gqswuTpiNnFWxK...
6,My Little Pony: A New Generation,2021,https://image.tmdb.org/t/p/w500/uvbj8puRj37Pc8...
7,Sankofa,1993,https://image.tmdb.org/t/p/w500/t5PTcnhZgHzL7H...
8,The Great British Baking Show,2021,
9,The Starling,2021,https://image.tmdb.org/t/p/w500/gPkaPGNbjZCeRu...


In [5]:
# Full run: look up a poster for every row of the catalogue
df['poster_url'] = df.apply(
    lambda rec: get_poster_url(rec['title'], rec['release_year']),
    axis=1,
)

# Write only show_id, title and poster_url to the final CSV (no index column)
df.to_csv(
    "netflix_posters.csv",
    columns=['show_id', 'title', 'poster_url'],
    index=False,
)
print("Done! File saved as netflix_posters.csv")


Done! File saved as netflix_posters.csv
