In [1]:
import requests
import numpy as np
import pandas as pd
import json
from dotenv import load_dotenv
load_dotenv() 
import os
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/sentence-t5-base")

def get_director(id):
    """Return the name of the first crew member credited as "Director".

    Queries the TMDB ``/movie/{id}/credits`` endpoint using the bearer token
    stored in the ``TMDB_API_KEY`` environment variable.

    Args:
        id: TMDB movie id, interpolated into the request URL.

    Returns:
        The ``name`` of the first crew entry whose ``job`` is ``"Director"``,
        or ``None`` when the response lists no such entry (including
        responses that carry no ``"crew"`` key at all).

    Raises:
        RuntimeError: if ``TMDB_API_KEY`` is unset or empty.
    """
    token = os.getenv("TMDB_API_KEY")
    if not token:
        # Fail with a readable message instead of the TypeError that
        # `"Bearer " + None` would raise.
        raise RuntimeError("TMDB_API_KEY environment variable is not set")

    headers = {
        "accept": "application/json",
        "Authorization": "Bearer " + token,
    }
    url_credits = f"https://api.themoviedb.org/3/movie/{id}/credits?language=en-US"
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url_credits, headers=headers, timeout=30)
    data_credits = response.json()

    # Tolerate a payload without "crew", mirroring how get_film treats a
    # payload without "cast".
    for member in data_credits.get("crew", []):
        if member.get("job") == "Director":
            return member.get("name")
    return None
        
def get_film(id):
    """Fetch one movie from TMDB and return a trimmed, enriched record.

    Two requests are made: movie details and movie credits. The details
    payload is stripped of presentation-only keys, then extended with:

    * ``actors``: the first two cast entries, with bookkeeping keys removed
      (empty list when the credits payload has no ``"cast"`` key);
    * ``overview_embedding``: the module-level ``model`` encoding of the
      overview text, falling back to the title when the overview is empty
      or null, stored as a plain list;
    * ``director``: name of the first crew entry whose job is ``"Director"``,
      or ``None`` when there is none.

    Args:
        id: TMDB movie id, interpolated into the request URLs.

    Returns:
        The details dict with the keys described above added.

    Raises:
        RuntimeError: if ``TMDB_API_KEY`` is unset or empty.
        KeyError: if the details payload has neither a usable ``overview``
            nor a ``title`` (for example an API error payload).
    """
    token = os.getenv("TMDB_API_KEY")
    if not token:
        # Fail with a readable message instead of the TypeError that
        # `"Bearer " + None` would raise.
        raise RuntimeError("TMDB_API_KEY environment variable is not set")

    headers = {
        "accept": "application/json",
        "Authorization": "Bearer " + token,
    }

    url_movie = f"https://api.themoviedb.org/3/movie/{id}?language=en-US"
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url_movie, headers=headers, timeout=30)
    data = response.json()
    movies_keys_to_remove = [
        "backdrop_path",
        "belongs_to_collection",
        "homepage",
        "poster_path",
        "production_companies",
        "production_countries",
        "spoken_languages",
        "status",
        "tagline",
        "video",
    ]
    for key in movies_keys_to_remove:
        data.pop(key, None)

    url_credits = f"https://api.themoviedb.org/3/movie/{id}/credits?language=en-US"
    response = requests.get(url_credits, headers=headers, timeout=30)
    data_credits = response.json()

    top_actors = data_credits.get("cast", [])[0:2]
    actors_keys_to_remove = [
        "adult",
        "known_for_department",
        "original_name",
        "cast_id",
        "credit_id",
        "order",
        "profile_path",
    ]
    for actor in top_actors:
        for key in actors_keys_to_remove:
            actor.pop(key, None)
    data["actors"] = top_actors

    # `or` covers both an empty string and a null overview; a payload with
    # neither overview nor title still raises KeyError on data["title"].
    text_to_embed = data.get("overview") or data["title"]
    data["overview_embedding"] = model.encode(text_to_embed).tolist()

    # Read the director from the credits payload fetched above rather than
    # issuing a second, identical credits request per film.
    data["director"] = next(
        (
            member.get("name")
            for member in data_credits.get("crew", [])
            if member.get("job") == "Director"
        ),
        None,
    )
    return data

import tqdm
def sample(ids, file_path):
    """Fetch every film in ``ids`` and dump the results to a JSON file.

    Films are fetched one at a time with ``get_film`` under a tqdm progress
    bar. A film whose fetch raises is reported on stdout and skipped, so one
    bad id does not abort the batch.

    Args:
        ids: iterable of TMDB movie ids.
        file_path: destination path; the file is overwritten.

    The JSON document maps each id to its film record. Duplicate ids
    collapse to a single entry because the results are keyed by id.
    """
    films = {}
    for film_id in tqdm.tqdm(ids):
        try:
            films[film_id] = get_film(film_id)
        except Exception as exc:
            # `Exception` rather than a bare except, so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the run.
            # The reason is printed so failures can be diagnosed.
            print(f"Error with {film_id}: {exc!r}")
    with open(file_path, "w") as outfile:
        json.dump(films, outfile)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hand-picked TMDB movie ids, 20 per genre list.
action = [
    955555, 1085544, 926680, 649392, 706904,
    991953, 687972, 822110, 687779, 461053,
    423988, 443700, 337339, 249070, 370131,
    77555, 36668, 435, 5146, 676,
]
animation = [
    85151, 808, 209511, 273137, 273147,
    23154, 45662, 387508, 174121, 116711,
    952073, 359549, 1106741, 593994, 618159,
    385103, 708352, 704953, 569094, 1134407,
]
family = [
    1136767, 1136397, 1101118, 1008102, 620705,
    681781, 961262, 979296, 756187, 655424,
    678585, 467181, 432383, 508834, 399268,
    385722, 342562, 330715, 14165, 10025,
]
# 1008102 is listed here and in `family`.
comedy = [
    128375, 9767, 18154, 108842, 24978,
    271490, 141457, 180794, 150709, 365756,
    430162, 497515, 505270, 775738, 597856,
    693285, 710221, 1137919, 1008102, 1016084,
]
documentary = [
    1123309, 1123031, 1119679, 817127, 879977,
    935543, 620051, 534369, 448711, 233863,
    426637, 418293, 390553, 218934, 223221,
    456989, 938055, 493324, 220363, 54943,
]
fantasy = [
    150421, 641415, 559, 9785, 13128,
    52634, 253574, 17046, 71315, 137936,
    281778, 105001, 336149, 293863, 444135,
    595503, 633811, 793100, 997776, 455476,
]
romance = [
    1121388, 851830, 976125, 723968, 961520,
    657412, 776586, 775326, 525454, 488113,
    643274, 590102, 301372, 1117755, 351800,
    153133, 618176, 10761, 17021, 16643,
]
horror = [
    97889, 19085, 71147, 1872, 25579,
    49706, 479755, 31541, 86703, 282070,
    445218, 446360, 784630, 460378, 660175,
    546724, 773982, 552322, 749645, 1079837,
]

In [3]:
# Fetch each genre list and write it to its own JSON file, one after another.
for _genre_ids, _out_path in (
    (action, './action.json'),
    (animation, './animation.json'),
    (family, './family.json'),
    (comedy, './comedy.json'),
    (documentary, './documentary.json'),
    (fantasy, './fantasy.json'),
    (romance, './romance.json'),
    (horror, './horror.json'),
):
    sample(_genre_ids, _out_path)



100%|██████████| 20/20 [00:03<00:00,  5.55it/s]
100%|██████████| 20/20 [00:01<00:00, 14.29it/s]
100%|██████████| 20/20 [00:01<00:00, 15.71it/s]
100%|██████████| 20/20 [00:01<00:00, 10.44it/s]
100%|██████████| 20/20 [00:01<00:00, 10.28it/s]
100%|██████████| 20/20 [00:01<00:00, 10.33it/s]
100%|██████████| 20/20 [00:01<00:00, 10.55it/s]
100%|██████████| 20/20 [00:01<00:00, 10.45it/s]


In [3]:
# Every genre list concatenated in order.
# NOTE(review): this rebinds the builtin `all` for the rest of the notebook,
# and 1008102 appears in both `family` and `comedy`, so the combined list
# contains that id twice.
all = [
    *action,
    *animation,
    *family,
    *comedy,
    *documentary,
    *fantasy,
    *romance,
    *horror,
]

In [None]:

# Writes ./all.json, the file the last cell of this notebook opens as "all.json".
sample(ids=all, file_path='./all.json')

In [4]:
# Seed the global NumPy RNG so the draw below is repeatable, then pick 20
# entries without replacement and convert the result to a plain Python list.
np.random.seed(42)
all_subset_20 = np.random.choice(all, 20, replace=False).tolist()

In [5]:

# Fetch the 20 randomly drawn films and save them to their own file.
sample(ids=all_subset_20, file_path='./all_subset_20.json')

100%|██████████| 20/20 [00:05<00:00,  3.51it/s]


In [5]:
import json 
import numpy as np
# Load the combined film dump (film id -> film record).
with open("all.json") as json_file:
    data = json.load(json_file)

genres = ['action', 'animation', 'comedy', 'documentary', 'family', 'fantasy', 'horror', 'romance']

# For each genre, write no_<genre>.json holding 20 films that are NOT tagged
# with that genre.
for genre in genres:
    excluded_name = genre.capitalize()

    # Keep films whose genre entries do not include the excluded name.
    # A `not in` test over a list is used instead of the builtin all(),
    # because this notebook rebinds the name `all` to a list of ids.
    without_genre = [
        film
        for film in data.values()
        if excluded_name not in [entry['name'] for entry in film['genres']]
    ]

    # The seed is reset on every iteration, so each genre's draw starts from
    # the same RNG state.
    np.random.seed(42)
    c = np.random.choice(without_genre, 20, replace=False)

    new_data = {film['id']: film for film in c}

    with open(f"no_{genre}.json", 'w') as outfile:
        json.dump(new_data, outfile)

: 