In [2]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

notebook_dir = Path().resolve()
base_path = os.path.abspath(notebook_dir.parent.parent)
sys.path.append(base_path)
from src.data_utils import *

In [28]:
embeddings, movie_ids  = load_movie_embeddings(os.path.join(base_path, "data", 'data_final'))

In [29]:
embeddings.shape

(87718, 1024)

In [4]:
movie_df = pd.read_csv(os.path.join(base_path, "data", "data_final", "cleaned_movie_data.csv"), dtype=str, low_memory=False)

In [5]:
movie_df.shape

(98494, 32)

In [7]:
movie_df = movie_df[movie_df.genre.notna()].reset_index(drop=True)

In [8]:
movie_df["new_genre"] = movie_df["genre"].apply(preprocess_genres)

In [10]:
import json

In [13]:
with open("../cluster_to_genre_mapping_01122025.json", "r") as f:
    genre_fix_mapping = json.load(f)

In [20]:
movie_df = movie_df[movie_df["new_genre"] != "Unknown"].reset_index(drop=True)

In [22]:
movie_df["new_genre"] = movie_df["new_genre"].apply(lambda x: [genre_fix_mapping[i] for i in x.split("|")])

In [37]:
embeddings.shape, movie_ids.shape

((87718, 1024), (87718,))

In [34]:
movie_id_to_embedding_map = {
    movie_id: embeddings[idx]
    for idx, movie_id in enumerate(movie_df.movie_id)
}

In [38]:
movie_df = movie_df[movie_df.movie_id.isin(movie_ids)].reset_index(drop=True)

In [None]:
# For each year, I want to see what are the "nearest" genres to "fantasy and scifi" (the query)
# I will have first compute the mean embedding of each genre per year
# Then for each year, I will compute the nearest genres to "fantasy and scifi"

In [71]:
from scipy.spatial.distance import cdist

In [99]:
# Year 1950
year_1950_df = movie_df[(movie_df.year.astype(int) >= 1950) & (movie_df.year.astype(int) <= 1960)].reset_index(drop=True)

In [100]:
def contains_old_genre(genre_raw_string: str, query_genre: str) -> bool:
    genre_raw_string_lowered = genre_raw_string.lower()
    query_genre_lowered = query_genre.lower()
    return query_genre_lowered in genre_raw_string_lowered

In [101]:
year_1950_df.shape

(6769, 33)

In [102]:
# Calculate pairwise cosine distance for movies in 1950 first (calculate once and just use index to retrieve later)

# Get all embeddings for movies in 1950
year_1950_embeddings =np.array([movie_id_to_embedding_map[movie_id] for movie_id in year_1950_df["movie_id"].tolist()])

# Calculate pairwise cosine distance matrix
distance_matrix_1950 = cdist(year_1950_embeddings, year_1950_embeddings, metric='cosine')

In [110]:
# Most average movie in 1950 (the most similar in terms of cosine distance)
year_1950_most_ave_movie = year_1950_df.iloc[np.argmin(distance_matrix_1950.sum(axis=0))]
print(year_1950_most_ave_movie.title)
print(year_1950_most_ave_movie.genre)
print(year_1950_most_ave_movie.new_genre)
print(year_1950_most_ave_movie["plot"])

The Beautiful Galatea
romantic comedy
['romance']
During the first 18 years of the 20th century, in a small town, two men, the sculptor Viktor Kolin and the Kapellmeister Marcel Thomas work on the Galathée theme, each in his own way. While one tries to approach the Galathée in a musical way, the other plans to carton the nymph in stone. Viktor has already chosen a young woman to model for the statue: it is the young Leni, a simple girl from the people who works as a temp at the vegetable market. Leni feels very flattered and promptly falls in love with Viktor, making her a competitor of the singer Victoria Mertens, the sculptor's girlfriend. Viktor on the other hand, also shows interest in Victoria, which gives the erotic round additional piquancy. Upon completing his masterpiece, Marcel suddenly loses interest in Leni, who had hoped that the sculpture would attract him more to her. Full of anger, she goes to his studio and smashes the artwork. A court case is scheduled, and only then 

In [111]:
# Query for action movies in 1950
year_1950_action_df = year_1950_df[year_1950_df["genre"].apply(lambda x: contains_old_genre(x, "action film"))]

# Get the pairwise cosine distance of the action_df
year_1950_most_ave_action_movie = year_1950_action_df.iloc[
    np.argmin(
        distance_matrix_1950[year_1950_action_df.index.values, :][:, year_1950_action_df.index.values].sum(axis=0)
        )]

In [113]:
print(year_1950_most_ave_action_movie.title)
print(year_1950_most_ave_action_movie.genre)
print(year_1950_most_ave_action_movie.new_genre)
print(year_1950_most_ave_action_movie["plot"])

Hot Rod Girl
action film
['action western and international']
To combat the problem of teenagers drag-racing their hot rods on city streets, sympathetic Lt. Ben Merril (Connors) has set up a dragstrip for them where they can race under controlled, safe conditions. But after a race meet, Steve Northrup (Del Erickson) is goaded into a street race, while his brother Jeff (Smith) is a passenger. Steve ignores Jeff's entreaties to not race. Steve is killed in the ensuing crash and a heartbroken Jeff breaks off all contact with the other drag-racing kids. He also avoids Lisa Vernon (Nelson), his girlfriend, who is also a drag racer. A biker jacket-wearing bully named Bronc Talbott (Mark Andrews) arrives in town and after terrorizing the teens, appoints himself leader of them, replacing Jeff in that role. Following an argument at the teens' hangout, a diner run by Yo-Yo (Fred Essler), Bronc challenges Flat Top (Frank Gorshin) to a chicken race, in which they'll accelerate their cars straight 

In [115]:
# Try using Niklas' "Find K nearest neighbours" method
from src.analysis.find_n_closest_neighbours import find_n_closest_neighbours as knn_movies

test_results = knn_movies(year_1950_most_ave_action_movie.movie_id, n=10, start_year=1950, end_year=1960, data_dir=os.path.join(base_path, "data", "data_final"))

2025-12-01 15:38:44,329 - INFO - Auto-detected chunking suffix: '_cls_token'
2025-12-01 15:38:44,330 - INFO - Loading embeddings...
2025-12-01 15:38:44,336 - INFO - Total movies with embeddings: 7539
2025-12-01 15:38:44,337 - INFO - Embedding shape: (7539, 1024)
2025-12-01 15:38:44,337 - INFO - Found query movie at index 4624
2025-12-01 15:38:44,338 - INFO - Loading movie metadata...
2025-12-01 15:38:45,917 - INFO - Loaded 141119 movies from metadata files
2025-12-01 15:38:45,920 - INFO - Query movie: Hot Rod Girl (QID: Q12060548)
2025-12-01 15:38:45,921 - INFO - Calculating cosine similarities...
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [117]:
for res in test_results:
    print(res[1])

Hot Rod Rumble
Daddy-O
Destry
One Cab's Family
April Love
Dragstrip Riot
The Strip
Colt .45
Saddle the Wind
Stakeout on Dope Street


In [122]:
# Now I need to extend this and repeat this for all genres, basically get a dataframe of the most average movie per genre for 1950
# Pseudo code:
# For every decade:
#   For every genre:
#       Filter movies in that decade and genre
#       Calculate pairwise cosine distance matrix
#       Find most average movie in that genre and decade

# implementation
decade_start_years = [1950, 1980, 2000, 2020]
decades = []
for start_year in decade_start_years:
    end_year = start_year + 9
    decade_df = movie_df[(movie_df.year.astype(int) >= start_year) & (movie_df.year.astype(int) <= end_year)].reset_index(drop=True)
    unique_genres = set()
    for genres in decade_df["genre"]:
        for genre in genres:
            unique_genres.add(genre)
    for genre in unique_genres:
        genre_decade_df = decade_df[decade_df["genre"].apply(lambda x: genre in x)].reset_index(drop=True)
        if genre_decade_df.shape[0] == 0:
            continue
        genre_decade_embeddings = np.array([movie_id_to_embedding_map[movie_id] for movie_id in genre_decade_df["movie_id"].tolist()])
        distance_matrix_genre_decade = cdist(genre_decade_embeddings, genre_decade_embeddings, metric='cosine')
        most_ave_movie = genre_decade_df.iloc[np.argmin(distance_matrix_genre_decade.sum(axis=0))]
        decades.append({
            "start_year": start_year,
            "end_year": end_year,
            "genre": genre,
            "most_ave_movie_id": most_ave_movie.movie_id,
            "most_ave_movie_title": most_ave_movie.title,
            "most_ave_movie_plot": most_ave_movie["plot"]
        })

In [123]:
len(decades)

183

In [129]:
decades[3]

{'start_year': 1950,
 'end_year': 1959,
 'genre': 'S',
 'most_ave_movie_id': 'Q5267192',
 'most_ave_movie_title': "Devil's Partner",
 'most_ave_movie_plot': "Set in rural Furnace Flats, New Mexico, the film opens with a hunched old man, Pete Jensen, slaughtering a goat and daubing its blood within a hexagon drawn on the floor of his shack. Days later, a young man, Nick Richards, arrives in town, asking about Pete, claiming he was his uncle. The town's sheriff informs Nick that Pete is dead. Nick decides to set up residence in Pete's shack. While there, he engages in a series of demonic rituals designed to drive a wedge between pretty Nell Lucas and her fiancé, auto-mechanic David Simpson. One evening, after a date with Nell, David is mysteriously attacked and disfigured by his pet dog. Nick offers to substitute for David at his gas station while he recovers. Those alerted to Nick's presence notice that even though it's incredibly hot, the immaculately dressed Nick fails to perspire. Ye