In [2]:
import requests
import time
import requests.exceptions as RequestException
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.getenv("TMDB_API_KEY")
BASE_URL = "https://api.themoviedb.org/3"

# getting movie ids from tmdb api and saving them to a csv file
def get_movie_ids(start_page=500, end_page=501):

  all_ids = []

  for page in range(start_page, end_page):
    try:
      url = f"{BASE_URL}/discover/movie"
      params = {
        "api_key": API_KEY,
        "language": "en-US",
        "sort": "popularity.desc",
        "page": page
      }

      response = requests.get(url, params=params)

      if response.status_code != 200:
        print(f"Page {page} returned {response.status_code}. Stopping.")
        break
      data = response.json()

      if "results" not in data:
        print(f"No results key on page {page}")
        print(data)
        break

      ids = [movie["id"] for movie in data["results"]]
      all_ids.extend(ids)

      print(f"Collected page {page}")
      time.sleep(0.75)  # To respect API rate limits

      id_series = pd.DataFrame({"id": list(range(len(all_ids))), "Movie_id": all_ids})
      id_series.to_csv("../data/movie_ids_1.csv", index=False)

      print(f"Total IDs collected so far: {len(all_ids)}")
    
    except RequestException as e:
      print(f"Error fetching page {page}: {e}")
      print("Stopping program")
      break

  return len(all_ids)



In [2]:
if __name__ == "__main__":
    total = get_movie_ids()
    print(f"Final total IDs collected: {total}")

Collected page 1
Total IDs collected so far: 20
Collected page 2
Total IDs collected so far: 40
Collected page 3
Total IDs collected so far: 60
Collected page 4
Total IDs collected so far: 80
Collected page 5
Total IDs collected so far: 100
Collected page 6
Total IDs collected so far: 120
Collected page 7
Total IDs collected so far: 140
Collected page 8
Total IDs collected so far: 160
Collected page 9
Total IDs collected so far: 180
Collected page 10
Total IDs collected so far: 200
Collected page 11
Total IDs collected so far: 220
Collected page 12
Total IDs collected so far: 240
Collected page 13
Total IDs collected so far: 260
Collected page 14
Total IDs collected so far: 280
Collected page 15
Total IDs collected so far: 300
Collected page 16
Total IDs collected so far: 320
Collected page 17
Total IDs collected so far: 340
Collected page 18
Total IDs collected so far: 360
Collected page 19
Total IDs collected so far: 380
Collected page 20
Total IDs collected so far: 400
Collected pag

In [3]:
# getting movie genres from tmdb api and saving them to a csv file
def get_movie_genres():
  url = f"{BASE_URL}/genre/movie/list"
  params = {
    "api_key": API_KEY,
    "language": "en-US"
  }

  response = requests.get(url, params=params)
  data = response.json()

  genres = data.get("genres", [])
  genre_dict = {genre["id"]: genre["name"] for genre in genres}

  genre_df = pd.DataFrame(list(genre_dict.items()), columns=["Genre_id", "Genre_name"])
  genre_df.to_csv("../data/movie_genres.csv", index=False)

In [4]:
get_movie_genres()

In [3]:
def get_movie_data(movie_id):
  url = f"{BASE_URL}/movie/{movie_id}"
  params = {
    "api_key": API_KEY,
    "language": "en-US"
  }

  response = requests.get(url, params=params)
  if response.status_code != 200:
    print(f"Movie ID {movie_id} returned {response.status_code}. Skipping.")
    return None
  data = response.json()

  movie_title = data['title']
  movie_genres = [genre['id'] for genre in data['genres']]
  movie_release_year = data['release_date'].split("-")[0] if data['release_date'] else None
  movie_runtime = data['runtime']
  movie_language = data['original_language']
  movie_adult = data['adult']
  movie_poster_path = data['poster_path']
  movie_backdrop_path = data['backdrop_path']
  movie_vote_average = data['vote_average']
  movie_vote_count = data['vote_count']

  print(f"Fetched data for movie ID {movie_id}: {movie_title}")

  return [movie_id, movie_title, movie_genres, movie_release_year, movie_runtime, movie_language, movie_adult, movie_poster_path, movie_backdrop_path, movie_vote_average, movie_vote_count]


In [9]:
import numpy as np

movie_ids = pd.read_csv("../data/movie_ids.csv")["Movie_id"].tolist()
movie_data = []
movie_link=[]
movie_rates = []

for idx, movie_id in enumerate(movie_ids[6561:][:]):
  data = get_movie_data(movie_id)
  time.sleep(0.25)  # To respect API rate limits
  if data:
    movie_data.append(data[:7])  # Movie_id, Title, Genres, Release_year, Runtime, Language, Adult
    movie_link.append([data[0], data[7], data[8]])  # Movie_id, Poster_path, Backdrop_path
    movie_rates.append([data[0], data[9], data[10]])  # Movie_id, Vote_average, Vote_count

  if (idx + 1) % 100 == 0:
    print(f"Processed {idx + 1} movies")

  movie_df = pd.DataFrame(movie_data, columns=["Movie_id", "Title", "Genres", "Release_year", "Runtime", "Language", "Adult"])
  movie_df.to_csv("../data/movie_data_4.csv", index=False)

  movie_link_df = pd.DataFrame(movie_link, columns=["Movie_id", "Poster_path", "Backdrop_path"])
  movie_link_df.to_csv("../data/movie_links_4.csv", index=False)

  movie_rating_df = pd.DataFrame(movie_rates, columns=["Movie_id", "Vote_average", "Vote_count"])
  movie_rating_df.to_csv("../data/movie_ratings_4.csv", index=False)

  print(f"Saved data for {idx + 1} movies so far")



Fetched data for movie ID 379648: Shajarur Kanta
Saved data for 1 movies so far
Fetched data for movie ID 12410: Good
Saved data for 2 movies so far
Fetched data for movie ID 11855: Pecker
Saved data for 3 movies so far
Fetched data for movie ID 901383: Mirage
Saved data for 4 movies so far
Fetched data for movie ID 2405: Joseph
Saved data for 5 movies so far
Fetched data for movie ID 98894: Jessica
Saved data for 6 movies so far
Fetched data for movie ID 1128692: Crack
Saved data for 7 movies so far
Fetched data for movie ID 845404: Tyler Perry's A Madea Homecoming
Saved data for 8 movies so far
Fetched data for movie ID 408033: Tamara
Saved data for 9 movies so far
Fetched data for movie ID 347969: The Ridiculous 6
Saved data for 10 movies so far
Fetched data for movie ID 214643: Co/Ma
Saved data for 11 movies so far
Fetched data for movie ID 1167271: Weekend in Taipei
Saved data for 12 movies so far
Fetched data for movie ID 630594: Limelight
Saved data for 13 movies so far
Fetched 

In [3]:
movie_ids_ids = pd.read_csv("../data/movie_ids.csv")["Movie_id"].tolist()
movie_data_ids = pd.read_csv("../data/movie_data.csv")["Movie_id"].tolist()
missing_ids = set(movie_ids_ids) - set(movie_data_ids)  
print(f"Missing movie IDs: {missing_ids}")

Missing movie IDs: set()


In [4]:
movie_links_ids = pd.read_csv("../data/movie_links.csv")["Movie_id"].tolist()
missing_link_ids = set(movie_ids_ids) - set(movie_links_ids)
print(f"Missing link IDs: {missing_link_ids}")

Missing link IDs: set()


In [5]:
movie_link_ids = pd.read_csv("../data/movie_links.csv")["Movie_id"].tolist()
missing_link_ids = set(movie_ids_ids) - set(movie_link_ids)
print(f"Missing link IDs: {missing_link_ids}")

Missing link IDs: set()


In [3]:
def get_movie_popularity(movie_id):
  url = f"{BASE_URL}/movie/{movie_id}"
  params = {
    "api_key": API_KEY,
    "language": "en-US"
  }

  response = requests.get(url, params=params)
  if response.status_code != 200:
    print(f"Movie ID {movie_id} returned {response.status_code}. Skipping.")
    return None
  data = response.json()
  print(f"Fetched popularity for movie ID {movie_id}: {data['popularity']}")

  return data['popularity']

movie_ids = pd.read_csv("../data/movie_ids.csv")["Movie_id"].tolist()
movie_popularity = []

for idx, movie_id in enumerate(movie_ids[3738:]):
  popularity = get_movie_popularity(movie_id)
  time.sleep(0.25)  # To respect API rate limits
  if popularity is not None:
    movie_popularity.append([movie_id, popularity])  # Movie_id, Popularity

  if (idx + 1) % 100 == 0:
    print(f"Processed {idx + 1} movies")

  popularity_df = pd.DataFrame(movie_popularity, columns=["Movie_id", "Popularity"])
  popularity_df.to_csv("../data/movie_popularity_6.csv", index=False)

  print(f"Saved popularity data for {idx + 1} movies so far")


Fetched popularity for movie ID 13648: 4.7174
Saved popularity data for 1 movies so far
Fetched popularity for movie ID 1078600: 4.3822
Saved popularity data for 2 movies so far
Fetched popularity for movie ID 316727: 4.8538
Saved popularity data for 3 movies so far
Fetched popularity for movie ID 39108: 4.0498
Saved popularity data for 4 movies so far
Fetched popularity for movie ID 1282138: 4.4958
Saved popularity data for 5 movies so far
Fetched popularity for movie ID 13492: 4.443
Saved popularity data for 6 movies so far
Fetched popularity for movie ID 872170: 3.6418
Saved popularity data for 7 movies so far
Fetched popularity for movie ID 614930: 4.3496
Saved popularity data for 8 movies so far
Fetched popularity for movie ID 340601: 3.8408
Saved popularity data for 9 movies so far
Fetched popularity for movie ID 612: 4.6984
Saved popularity data for 10 movies so far
Fetched popularity for movie ID 215: 4.4568
Saved popularity data for 11 movies so far
Fetched popularity for movi

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))