In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer



In [None]:

# TMDb API settings
# The read-access token is loaded from a local file so it is not hardcoded in the notebook.
with open("readaccess.txt", "r") as f:
    ACCESS_TOKEN = f.read().strip()
BASE_URL = "https://api.themoviedb.org/3/discover/movie"
HEADERS = {
    "Authorization": f"Bearer {ACCESS_TOKEN}",
    "accept": "application/json"
}

# Parameters to fetch without restrictions
PARAMS = {
    "include_adult": "true",
    "include_video": "false",
    "language": "en-US",
    "sort_by": "popularity.desc",  # You could also try vote_average.desc
    "page": 1  # Overwritten on every iteration of the loop below
}

all_movies = []

for page in range(1, 51):  # 50 pages * 20 movies/page = 1000 movies
    print(f"Fetching page {page}...")
    PARAMS["page"] = page
    try:
        # The timeout keeps a stalled connection from hanging the cell indefinitely.
        response = requests.get(BASE_URL, headers=HEADERS, params=PARAMS, timeout=10)
    except requests.RequestException as exc:
        # Network-level failure: stop paging but keep the pages collected so far.
        print(f"Failed on page {page}: {exc}")
        break
    if response.status_code == 200:
        data = response.json()
        # Tolerate a payload without "results" instead of raising KeyError.
        all_movies.extend(data.get("results", []))
    else:
        print(f"Failed on page {page}: {response.status_code}")
        break
    time.sleep(0.25)  # Avoid hitting rate limits

# With zero rows the frame has no "id" column and drop_duplicates would raise an
# opaque KeyError, so fail early with an actionable message instead.
if not all_movies:
    raise RuntimeError(
        "No movies were retrieved from TMDb; check the access token and the network connection."
    )

# Convert to DataFrame
df_movies = pd.DataFrame(all_movies)
# The same movie can show up on more than one page; keep the first occurrence of each id.
df_movies = df_movies.drop_duplicates(subset="id").reset_index(drop=True)

print(f"Retrieved {len(df_movies)} movies.")
df_movies.head()


In [None]:
# Fields kept for the rest of the notebook; every other column returned by the API is discarded.
desired_columns = [
    "adult",
    "genre_ids",
    "id",
    "original_language",
    "original_title",
    "overview",
    "popularity",
    "release_date",
    "title",
    "video",
    "vote_average",
    "vote_count",
]

# Label-based selection: all rows, only the listed columns, in the listed order.
df_movies = df_movies.loc[:, desired_columns]
df_movies.head()


In [None]:
# 1. Get genre ID-to-name mapping
genre_url = "https://api.themoviedb.org/3/genre/movie/list"
# Time-box the request and fail on an HTTP error right here; otherwise an error
# payload would only surface below as a confusing KeyError on 'genres'.
genre_response = requests.get(genre_url, headers=HEADERS, timeout=10)
genre_response.raise_for_status()
genre_data = genre_response.json()

# Lookup table: numeric genre id -> genre name
genre_map = {genre['id']: genre['name'] for genre in genre_data['genres']}

# 2. Convert genre_ids to names
def map_genre_ids_to_names(id_list):
    """Translate a collection of genre ids into genre names.

    Args:
        id_list: Iterable of genre ids. A non-iterable value (for example None
            or NaN from a row with no genre information) is treated as empty.

    Returns:
        A list with one name per id, looked up in the module-level ``genre_map``;
        ids that are not in the map become "Unknown".
    """
    try:
        ids = iter(id_list)
    except TypeError:
        # Missing value rather than a list of ids: the movie has no genres.
        return []
    return [genre_map.get(gid, "Unknown") for gid in ids]

# Build the updated frame in one chained expression instead of mutating df_movies
# in place: add the readable 'genres' column, then drop the raw 'genre_ids' column
# it was derived from.
df_movies = (
    df_movies
    .assign(genres=df_movies['genre_ids'].apply(map_genre_ids_to_names))
    .drop(columns=['genre_ids'])
)

df_movies[['title', 'genres']].head()



In [None]:

# Initialize the encoder
mlb = MultiLabelBinarizer()

# Fit and transform the 'genres' column (one list of genre names per movie)
genre_encoded = mlb.fit_transform(df_movies['genres'])

# Create a DataFrame with the genre columns.
# It reuses df_movies' index: concat(axis=1) aligns rows on index labels, so a
# default 0..n-1 index would misalign rows whenever df_movies is indexed differently.
df_genres = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=df_movies.index)

# Concatenate with the original DataFrame, replacing the list-valued 'genres' column
df_movies = pd.concat([df_movies.drop(columns=['genres']), df_genres], axis=1)

# Display updated DataFrame
df_movies.head()


In [None]:


# Accumulators for the credits lookups: one entry is appended per movie
directors_list, cast_list = [], []

def get_crew_and_cast(movie_id, max_retries=5, timeout=10):
    """Fetch the directors and the top-billed cast of one movie from TMDb.

    Args:
        movie_id: TMDb movie id, interpolated into the /movie/{id}/credits URL.
        max_retries: Maximum number of additional attempts after an HTTP 429
            (rate limited) response. Values below zero are treated as zero.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(directors, top_cast)`` tuple of name lists. ``directors`` holds every
        crew member whose job is "Director"; ``top_cast`` holds the names of the
        first three cast entries. Both lists are empty when the request fails,
        keeps being rate limited, or ends with a status other than 200.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits"

    # Bounded retry: the loop always terminates, even if the API keeps answering 429.
    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        try:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
        except requests.RequestException as exc:
            # Best effort, like the non-200 path below: one bad lookup must not
            # abort a long-running loop over all movies.
            print(f"Request failed for movie {movie_id}: {exc}")
            return [], []
        if response.status_code != 429:
            break
        if attempt < attempts - 1:
            print("Rate limited! Sleeping...")
            time.sleep(1)

    if response.status_code != 200:
        return [], []

    data = response.json()
    crew = data.get("crew", [])
    cast = data.get("cast", [])

    # Filter for Director; .get avoids a KeyError on a crew entry without a "job" field
    directors = [person["name"] for person in crew if person.get("job") == "Director"]

    # Top 3 cast members, in the order the API returns them
    top_cast = [person["name"] for person in cast[:3]]

    return directors, top_cast

# Look up credits movie by movie, in row order, so that entry i of each list
# belongs to row i of df_movies
id_progress = tqdm(df_movies['id'], desc="Fetching crew & cast")
for movie_id in id_progress:
    directors, top_cast = get_crew_and_cast(movie_id)
    directors_list.append(directors)
    cast_list.append(top_cast)
    # Fixed pause between calls: be nice to TMDb (4 req/sec)
    time.sleep(0.25)

# Attach the collected lists as two new list-valued columns
df_movies['directors'] = directors_list
df_movies['top_cast'] = cast_list


In [None]:

# Rebuild a per-movie 'genres' column from the one-hot genre indicator columns.

# Step 1: pick the indicator columns (named after the encoder's classes) and cast to int
genre_columns = list(mlb.classes_)
df_one_hot = df_movies[genre_columns].astype(int)

# Step 2: hand the encoder a plain NumPy matrix
one_hot_array = df_one_hot.to_numpy()

# Step 3: undo the encoding, turning each indicator row back into its genre labels
reconstructed_genres = mlb.inverse_transform(one_hot_array)

# Step 4: store the labels next to the indicator columns
df_movies['genres'] = reconstructed_genres


In [None]:
# Preview the first five rows of the enriched DataFrame
df_movies.head(n=5)


In [None]:
# Persist the final table; index=False keeps the row index out of the file
output_csv = "tmdb_movie_data.csv"
df_movies.to_csv(output_csv, index=False)