In [None]:
import requests
import numpy as np
import pandas as pd
import json
from dotenv import load_dotenv
load_dotenv() 
import os
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, instantiated once here; get_film() calls
# model.encode() on each film's overview (or title) to build its embedding.
model = SentenceTransformer("sentence-transformers/sentence-t5-base")

def get_director(id, timeout=10):
    """Look up the director of a TMDB movie.

    Queries the TMDB ``/movie/{id}/credits`` endpoint and scans the returned
    crew list for the first entry whose job is "Director".

    Args:
        id: TMDB movie id; interpolated into the request URL.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The director's name, or None when the crew list contains no director.

    Raises:
        TypeError: If the TMDB_API_KEY environment variable is not set
            (``os.getenv`` returns None, which cannot be concatenated).
        requests.HTTPError: If TMDB answers with an error status code.
        requests.Timeout: If the request exceeds ``timeout`` seconds.
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer " + os.getenv("TMDB_API_KEY"),
    }
    url_credits = f"https://api.themoviedb.org/3/movie/{id}/credits?language=en-US"
    # Without an explicit timeout, requests may block forever on a stalled connection.
    response = requests.get(url_credits, headers=headers, timeout=timeout)
    # Surface HTTP failures (bad key, unknown id, rate limit) as such instead of
    # letting them show up later as an opaque KeyError on the missing "crew" key.
    response.raise_for_status()
    data_credits = response.json()

    for member in data_credits.get("crew", []):
        if member.get("job") == "Director":
            return member.get("name")
    # No crew entry is a director.
    return None
        
def _find_director(credits):
    """Return the name of the first crew member whose job is "Director", else None."""
    for member in credits.get("crew", []):
        if member.get("job") == "Director":
            return member.get("name")
    return None


def get_film(id, timeout=10):
    """Fetch a TMDB movie and enrich it with cast, director and an embedding.

    The movie details and its credits are each requested once from the TMDB
    API. Presentation-only fields are stripped from the movie record, the
    first two cast entries are kept (with bookkeeping fields removed), and
    the overview text is encoded with the module-level ``model``.

    Args:
        id: TMDB movie id; interpolated into the request URLs.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The movie dict returned by TMDB, minus the removed keys, plus:
        ``"actors"`` (list of up to two trimmed cast dicts),
        ``"overview_embedding"`` (the embedding converted via ``.tolist()``)
        and ``"director"`` (name, or None if no director is listed).

    Raises:
        TypeError: If the TMDB_API_KEY environment variable is not set.
        requests.HTTPError: If either TMDB request returns an error status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
        KeyError: If the movie has no usable overview and no ``"title"`` key.
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer " + os.getenv("TMDB_API_KEY"),
    }

    url_movie = f"https://api.themoviedb.org/3/movie/{id}?language=en-US"
    # Without an explicit timeout, requests may block forever on a stalled connection.
    response = requests.get(url_movie, headers=headers, timeout=timeout)
    # Fail with the HTTP status instead of a later KeyError on "overview".
    response.raise_for_status()
    data = response.json()

    movies_keys_to_remove = (
        "backdrop_path",
        "belongs_to_collection",
        "homepage",
        "poster_path",
        "production_companies",
        "production_countries",
        "spoken_languages",
        "status",
        "tagline",
        "video",
    )
    for key in movies_keys_to_remove:
        data.pop(key, None)

    url_credits = f"https://api.themoviedb.org/3/movie/{id}/credits?language=en-US"
    response = requests.get(url_credits, headers=headers, timeout=timeout)
    response.raise_for_status()
    data_credits = response.json()

    # Keep only the first two cast entries.
    top_actors = data_credits.get("cast", [])[:2]
    actors_keys_to_remove = (
        "adult",
        "known_for_department",
        "original_name",
        "cast_id",
        "credit_id",
        "order",
        "profile_path",
    )
    for actor in top_actors:
        for key in actors_keys_to_remove:
            actor.pop(key, None)

    data["actors"] = top_actors

    # Fall back to the title when the overview is empty or null.
    text_to_embed = data.get("overview") or data["title"]
    data["overview_embedding"] = model.encode(text_to_embed).tolist()
    # Reuse the credits already fetched above rather than requesting the same
    # endpoint a second time just to read the crew list.
    data["director"] = _find_director(data_credits)
    return data

import tqdm
def sample(ids, file_path):
    """Fetch every film in ``ids`` and dump the results to a JSON file.

    Each id is passed to ``get_film``; successful results are collected in a
    dict keyed by id. A film whose fetch raises is reported on stdout and
    skipped, so one bad id does not abort the whole run.

    Args:
        ids: Iterable of TMDB movie ids.
        file_path: Destination path of the JSON file (overwritten).

    Returns:
        None. The collected films are written to ``file_path``.
    """
    films = {}
    for film_id in tqdm.tqdm(ids):
        try:
            films[film_id] = get_film(film_id)
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running crawl.
        except Exception as exc:
            # Include the cause; otherwise every failure looks identical.
            print(f"Error with {film_id}: {exc!r}")
    with open(file_path, "w") as outfile:
        json.dump(films, outfile)

In [None]:
# TMDB movie ids, one list per saga. The ids are later passed to get_film(),
# which requests https://api.themoviedb.org/3/movie/{id}.
james_bond = [370172, 206647, 37724]
mission_impossible = [575265, 575264, 353081]
avengers = [299536, 299534, 24428, 99861]
star_wars = [140607, 181808, 181812, 1891, 1892, 1893, 1894, 11, 1895]
harry_potter = [671, 672, 673, 674, 675, 767, 12444, 12445]
the_lord_of_the_rings = [120, 121, 122]
fast_and_furious = [9799, 584, 9615, 13804, 51497, 82992, 168259]
the_matrix = [603, 604, 605]
jurassic_park = [329, 330, 331, 135397, 351286]
indiana_jones = [85, 87, 89]
pirates_of_the_caribbean = [22, 58, 285, 1865, 166426]
toy_story = [862, 863, 10193, 301528]
after = [537915, 613504, 744275, 744276]
fifty_shades = [216015, 341174, 337167]
before = [76, 80, 132344]
to_all_the_boys = [466282, 565426, 614409]
a_nightmare_on_elm_street = [377, 10014, 10072, 10131, 10160, 11284, 11596]
friday_the_13th = [4488, 9725, 9728, 9730]
saw = [176, 215, 214, 663]
halloween = [948, 11281, 10676, 11357]
paranormal_activity = [23827, 41436, 72571, 82990, 227348]
american_pie = [2105, 2770, 8273, 71552]

# Sagas in a fixed order; this nested list is what gets written to
# movie_sagas_group.json.
all_films_grouped = [
    james_bond,
    mission_impossible,
    avengers,
    star_wars,
    harry_potter,
    the_lord_of_the_rings,
    fast_and_furious,
    the_matrix,
    jurassic_park,
    indiana_jones,
    pirates_of_the_caribbean,
    toy_story,
    after,
    fifty_shades,
    before,
    to_all_the_boys,
    a_nightmare_on_elm_street,
    friday_the_13th,
    saw,
    halloween,
    paranormal_activity,
    american_pie,
]
# Flatten with a plain comprehension rather than np.concatenate(...).tolist():
# numpy promotes the result to float64 as soon as one saga list is empty
# (ids would become 603.0, ...) and raises when the outer list is empty.
all_films = [film_id for saga in all_films_grouped for film_id in saga]

In [None]:
# Fetch every film id in all_films via get_film() and write the collected
# records to ./movie_sagas_data.json (ids that fail are reported and skipped).
sample(all_films, "./movie_sagas_data.json")

In [None]:
# Persist the saga grouping (a list of id lists) next to the film data.
with open("./movie_sagas_group.json", "w") as groups_file:
    json.dump(all_films_grouped, groups_file)