In [1]:
"""
Visualizations for Similar Movies

This notebook creates UMAP visualizations of movie embeddings with interactive features.
"""

import os
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Configure the default plotly renderer.
# The inline 'notebook' renderer is selected only when nbformat can be imported
# and exposes a __version__ attribute; in every other case figures are opened
# in the browser instead.
try:
    import nbformat
    if hasattr(nbformat, '__version__'):
        pio.renderers.default = 'notebook'
    else:
        pio.renderers.default = 'browser'
except ImportError:
    # nbformat not installed - use browser renderer
    print("Note: nbformat not installed. Plotly will open plots in browser.")
    # The requirement is quoted on purpose: unquoted, a shell parses ">=4.2.0"
    # as an output redirection and installs an unpinned nbformat.
    print('To show plots inline in notebook, run: conda install nbformat or pip install "nbformat>=4.2.0"')
    pio.renderers.default = 'browser'

# Locate the project root (BASE_DIR).
# When executed as a script, __file__ exists and the root is taken to be the
# third dirname() of the file's absolute path. In a notebook kernel __file__ is
# undefined (NameError), so the root is taken to be two levels above the
# current working directory instead.
try:
    _this_file = os.path.abspath(__file__)
except NameError:
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
else:
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(_this_file)))

# Make both the project root and its src/ directory importable.
# Each missing entry is pushed to the front of sys.path, root first and src/
# second, so src/ ends up ahead of the root when both are added.
SRC_DIR = os.path.join(BASE_DIR, 'src')
for _import_root in (BASE_DIR, SRC_DIR):
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)

# Import functions from data_utils and data_cleaning
from src.data_utils import (
    load_movie_embeddings, 
    load_movie_data, 
    cluster_genres,
    search_movies_by_keywords
)
from src.data_cleaning import clean_dataset

# Notebook-wide configuration.
# START_YEAR / END_YEAR are passed to the embedding loader as the year range.
# CHUNKING_SUFFIX = None means the suffix is auto-detected in a later cell.
START_YEAR, END_YEAR = 1930, 2024
CHUNKING_SUFFIX = None
DATA_DIR = os.path.join(BASE_DIR, 'data', 'data_final')

# Echo the resolved configuration so it is recorded in the cell output.
print(f"Base directory: {BASE_DIR}")
print(f"Data directory: {DATA_DIR}")
print(f"Year range: {START_YEAR} to {END_YEAR}")

# Seed NumPy's legacy global RNG.
np.random.seed(42)


  from .autonotebook import tqdm as notebook_tqdm


Base directory: /home/nab/Niklas/GroupDataLiteracy
Data directory: /home/nab/Niklas/GroupDataLiteracy/data/data_final
Year range: 1930 to 2024


In [2]:
# Decide which embedding-file variant to load.
# An explicitly configured suffix is used as-is. Otherwise the candidates are
# probed in priority order against the START_YEAR file, and the first one that
# exists on disk wins; if none exists, the empty suffix is used.
if CHUNKING_SUFFIX is not None:
    print(f"Using chunking suffix: '{CHUNKING_SUFFIX}'")
else:
    test_year = START_YEAR
    found_suffix = next(
        (
            suffix
            for suffix in ['_cls_token', '_mean_pooling', '']
            if os.path.exists(
                os.path.join(DATA_DIR, f'movie_embeddings_{test_year}{suffix}.npy')
            )
        ),
        None,
    )

    if found_suffix is None:
        CHUNKING_SUFFIX = ''
        print("No chunking suffix detected, using default (no suffix)")
    else:
        CHUNKING_SUFFIX = found_suffix
        print(f"Auto-detected chunking suffix: '{CHUNKING_SUFFIX}'")

# Load the embeddings and their movie IDs for the configured year range.
print("\nLoading embeddings...")
all_embeddings, all_movie_ids = load_movie_embeddings(
    DATA_DIR,
    chunking_suffix=CHUNKING_SUFFIX,
    start_year=START_YEAR,
    end_year=END_YEAR,
    verbose=True,
)

# Nothing downstream can work without embeddings, so stop here.
if not len(all_movie_ids):
    raise ValueError(f"No embeddings found in {DATA_DIR}")

print(f"\nTotal movies with embeddings: {len(all_movie_ids)}")
print(f"Embedding shape: {all_embeddings.shape}")


Auto-detected chunking suffix: '_cls_token'

Loading embeddings...
Data directory: /home/nab/Niklas/GroupDataLiteracy/data/data_final
Year range: 1930 to 2024
Chunking suffix: '_cls_token'
Year 1930: Loaded 441 embeddings (shape: (441, 1024))
Year 1931: Loaded 540 embeddings (shape: (540, 1024))
Year 1932: Loaded 577 embeddings (shape: (577, 1024))
Year 1933: Loaded 570 embeddings (shape: (570, 1024))
Year 1934: Loaded 618 embeddings (shape: (618, 1024))
Year 1935: Loaded 661 embeddings (shape: (661, 1024))
Year 1936: Loaded 683 embeddings (shape: (683, 1024))
Year 1937: Loaded 727 embeddings (shape: (727, 1024))
Year 1938: Loaded 668 embeddings (shape: (668, 1024))
Year 1939: Loaded 624 embeddings (shape: (624, 1024))
Year 1940: Loaded 629 embeddings (shape: (629, 1024))
Year 1941: Loaded 657 embeddings (shape: (657, 1024))
Year 1942: Loaded 636 embeddings (shape: (636, 1024))
Year 1943: Loaded 571 embeddings (shape: (571, 1024))
Year 1944: Loaded 482 embeddings (shape: (482, 1024))
Y

In [3]:
# Load movie metadata
print("Loading movie metadata...")
movie_data = load_movie_data(DATA_DIR, verbose=True)

# Every later cell needs metadata, so stop immediately if none was found.
if movie_data.empty:
    raise ValueError(f"No movie data found in {DATA_DIR}")

print(f"Loaded {len(movie_data)} movies from metadata files")

# Apply data cleaning
print("\nApplying data cleaning...")
movie_data = clean_dataset(movie_data, filter_single_genres=True)
print(f"Movies after cleaning: {len(movie_data)}")

# Replace missing genres with an empty string before genre clustering.
if 'genre' in movie_data.columns:
    movie_data['genre'] = movie_data['genre'].fillna('')

# Apply genre clustering.
# The working directory is switched to src/ for the duration of the call and
# always restored afterwards.
# NOTE(review): presumably cluster_genres reads files relative to src/ - confirm.
original_cwd = os.getcwd()
try:
    os.chdir(SRC_DIR)
    print("\nProcessing genres...")
    movie_data = cluster_genres(movie_data)
finally:
    os.chdir(original_cwd)

# Create the movie_id -> title mapping in one vectorised pass instead of
# iterating row by row. Missing titles (NaN), as well as a missing 'title'
# column, map to 'Unknown' so that hover labels never display "nan".
if 'title' in movie_data.columns:
    _title_column = movie_data['title'].astype(object)
    _titles = _title_column.where(_title_column.notna(), 'Unknown')
else:
    _titles = pd.Series('Unknown', index=movie_data.index, dtype=object)
movie_to_title = dict(zip(movie_data['movie_id'], _titles))

print(f"Created title mappings for {len(movie_to_title)} movies")


Loading movie metadata...
Found 95 year files
Year 1930: Loaded 758 movies from wikidata_movies_1930.csv
Year 1931: Loaded 908 movies from wikidata_movies_1931.csv
Year 1932: Loaded 952 movies from wikidata_movies_1932.csv
Year 1933: Loaded 944 movies from wikidata_movies_1933.csv
Year 1934: Loaded 1015 movies from wikidata_movies_1934.csv
Year 1935: Loaded 1050 movies from wikidata_movies_1935.csv
Year 1936: Loaded 1167 movies from wikidata_movies_1936.csv
Year 1937: Loaded 1141 movies from wikidata_movies_1937.csv
Year 1938: Loaded 1000 movies from wikidata_movies_1938.csv
Year 1939: Loaded 968 movies from wikidata_movies_1939.csv
Year 1940: Loaded 999 movies from wikidata_movies_1940.csv
Year 1941: Loaded 935 movies from wikidata_movies_1941.csv
Year 1942: Loaded 1017 movies from wikidata_movies_1942.csv
Year 1943: Loaded 928 movies from wikidata_movies_1943.csv
Year 1944: Loaded 777 movies from wikidata_movies_1944.csv
Year 1945: Loaded 695 movies from wikidata_movies_1945.csv
Year

In [20]:
# Filter embeddings to only include movies that are in the cleaned metadata.
# Re-running this cell is safe: filtering already-filtered arrays is a no-op.
print("\nFiltering embeddings to match cleaned metadata...")
cleaned_movie_ids_set = set(movie_data['movie_id'].values)
mask = np.array([mid in cleaned_movie_ids_set for mid in all_movie_ids], dtype=bool)
all_embeddings = all_embeddings[mask]
all_movie_ids = all_movie_ids[mask]

print(f"After filtering: {len(all_movie_ids)} movies with both embeddings and cleaned metadata")
print(f"Embedding shape: {all_embeddings.shape}")

# Fail with a clear message instead of an obscure error inside UMAP.
if len(all_movie_ids) == 0:
    raise ValueError("No movies have both embeddings and cleaned metadata")

# Sample random movies for UMAP visualization
n_samples_umap = 30_000
if len(all_movie_ids) < n_samples_umap:
    n_samples_umap = len(all_movie_ids)
    print(f"Only {n_samples_umap} movies available, using all of them")

# Use a cell-local, seeded generator rather than the global NumPy RNG.
# The global state is advanced by every earlier draw, so re-running this cell
# used to select a different sample (and produce a different UMAP) each time.
sampling_rng = np.random.default_rng(42)
sample_indices_umap = sampling_rng.choice(len(all_movie_ids), size=n_samples_umap, replace=False)
sampled_embeddings_umap = all_embeddings[sample_indices_umap]
sampled_movie_ids_umap = all_movie_ids[sample_indices_umap]

print(f"\nSampled {n_samples_umap} movies for UMAP visualization")

# Compute the 2-D UMAP reduction of the sampled embeddings.
# The fitted reducer is kept so that other embeddings can be projected later.
print("\nComputing UMAP reduction...")
reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
umap_embedding = reducer.fit_transform(sampled_embeddings_umap)

print(f"UMAP embedding shape: {umap_embedding.shape}")



Filtering embeddings to match cleaned metadata...
After filtering: 98494 movies with both embeddings and cleaned metadata
Embedding shape: (98494, 1024)

Sampled 30000 movies for UMAP visualization

Computing UMAP reduction...



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP embedding shape: (30000, 2)


In [21]:
# Sub-sample the UMAP-projected movies for the interactive plot.
# At most 8000 points are kept; if fewer were projected, all of them are used.
n_samples_interactive = 8000
if n_samples_interactive > len(sampled_movie_ids_umap):
    n_samples_interactive = len(sampled_movie_ids_umap)

# A cell-local, seeded generator keeps the selection identical on every re-run,
# independent of how much global NumPy RNG state earlier cells have consumed.
interactive_rng = np.random.default_rng(42)
sample_indices_interactive = interactive_rng.choice(
    len(sampled_movie_ids_umap),
    size=n_samples_interactive,
    replace=False
)

# Movie IDs and 2-D coordinates of the selected points (same row order).
interactive_movie_ids = sampled_movie_ids_umap[sample_indices_interactive]
interactive_umap = umap_embedding[sample_indices_interactive]

# Get titles for interactive plot
interactive_titles = [movie_to_title.get(mid, 'Unknown') for mid in interactive_movie_ids]

# Create DataFrame for plotly
plot_df = pd.DataFrame({
    'x': interactive_umap[:, 0],
    'y': interactive_umap[:, 1],
    'movie_id': interactive_movie_ids,
    'title': interactive_titles
})

# Hover labels with title and QID, built straight from the two columns
# rather than by iterating over DataFrame rows.
hover_texts = [
    f"<b>Title:</b> {title}<br><b>QID:</b> {movie_id}"
    for title, movie_id in zip(plot_df['title'], plot_df['movie_id'])
]

In [27]:
# Keyword search over the cleaned metadata.
# Edit `keywords` / `search_columns` to look for other movies; several columns
# may be listed, e.g. ['title', 'director'].
keywords = ['Harry Potter']
search_columns = ['title']

matching_qids = search_movies_by_keywords(
    movie_data,
    keywords=keywords,
    search_columns=search_columns,
    case_sensitive=False,
)

print(f"Found {len(matching_qids)} movies matching keywords: {keywords}")
print(f"Matching QIDs: {matching_qids[:10]}")  # Show first 10

# Pair each of the first ten hits with its title, then list them.
matching_titles = [
    (qid, movie_to_title.get(qid, 'Unknown')) for qid in matching_qids[:10]
]
for qid, title in matching_titles:
    print(f"  {qid}: {title}")

# Count how many hits are among the movies sampled for the UMAP fit.
highlight_qids_set = set(matching_qids)
highlight_mask = np.array(
    [mid in highlight_qids_set for mid in sampled_movie_ids_umap]
)
n_highlighted = highlight_mask.sum()

print(f"\n{n_highlighted} of the matching movies are in the sampled UMAP set")


Found 8 movies matching keywords: ['Harry Potter']
Matching QIDs: ['Q102438', 'Q102244', 'Q102448', 'Q102225', 'Q102235', 'Q161687', 'Q161678', 'Q232009']
  Q102438: Harry Potter and the Philosopher's Stone
  Q102244: Harry Potter and the Chamber of Secrets
  Q102448: Harry Potter and the Prisoner of Azkaban
  Q102225: Harry Potter and the Goblet of Fire
  Q102235: Harry Potter and the Order of the Phoenix
  Q161687: Harry Potter and the Half-Blood Prince
  Q161678: Harry Potter and the Deathly Hallows – Part 1
  Q232009: Harry Potter and the Deathly Hallows – Part 2

3 of the matching movies are in the sampled UMAP set


In [28]:
# Create UMAP plot with highlighted search results
# You can update the keywords above and re-run to highlight different movies

# Search hits are looked up in the full filtered dataset, not only in the
# random sample the UMAP model was fitted on.
movie_id_to_index = {mid: idx for idx, mid in enumerate(all_movie_ids)}
# Row of each sampled movie inside umap_embedding.
sampled_id_to_row = {mid: row for row, mid in enumerate(sampled_movie_ids_umap)}

# Keep only the hits that actually have an embedding, preserving search order.
matching_movie_ids_full = [qid for qid in matching_qids if qid in movie_id_to_index]

if len(matching_movie_ids_full) > 0:
    matching_embeddings = all_embeddings[
        [movie_id_to_index[qid] for qid in matching_movie_ids_full]
    ]
    matching_umap = np.empty((len(matching_movie_ids_full), 2), dtype=umap_embedding.dtype)
    already_fitted = np.array(
        [qid in sampled_id_to_row for qid in matching_movie_ids_full], dtype=bool
    )

    # Hits that were part of the fit reuse their fitted coordinates.
    # reducer.transform() only approximates the position of training points,
    # so re-projecting them could draw the star away from the movie's own dot.
    if already_fitted.any():
        fitted_rows = [
            sampled_id_to_row[qid]
            for qid in matching_movie_ids_full
            if qid in sampled_id_to_row
        ]
        matching_umap[already_fitted] = umap_embedding[fitted_rows]

    # Hits outside the fitted sample are projected with the fitted reducer.
    if not already_fitted.all():
        matching_umap[~already_fitted] = reducer.transform(matching_embeddings[~already_fitted])

    print(f"Found {len(matching_movie_ids_full)} matching movies in full dataset with embeddings")
else:
    matching_embeddings = []
    matching_umap = np.array([]).reshape(0, 2)
    print("No matching movies found in full dataset with embeddings")

# Prepare data for plotting (sampled movies)
plot_data = pd.DataFrame({
    'x': umap_embedding[:, 0],
    'y': umap_embedding[:, 1],
    'movie_id': sampled_movie_ids_umap,
    'title': [movie_to_title.get(mid, 'Unknown') for mid in sampled_movie_ids_umap]
})

# Create interactive plot with highlighted movies
fig = go.Figure()

# Background layer: every sampled movie as a small light-blue marker.
fig.add_trace(go.Scatter(
    x=plot_data['x'],
    y=plot_data['y'],
    mode='markers',
    marker=dict(size=5, color='lightblue', opacity=0.7, line=dict(width=0)),
    text=plot_data['title'],
    customdata=plot_data['movie_id'],
    hovertemplate='<b>%{text}</b><br>QID: %{customdata}<extra></extra>',
    name='Other movies',
    showlegend=False
))

# Foreground layer: search hits as large red stars (added only if there are any).
if len(matching_umap) > 0:
    matching_titles_full = [movie_to_title.get(mid, 'Unknown') for mid in matching_movie_ids_full]
    fig.add_trace(go.Scatter(
        x=matching_umap[:, 0],
        y=matching_umap[:, 1],
        mode='markers',
        marker=dict(size=15, color='red', opacity=0.8, symbol='star', line=dict(width=2, color='darkred')),
        text=matching_titles_full,
        customdata=matching_movie_ids_full,
        hovertemplate='<b>%{text}</b><br>QID: %{customdata}<extra></extra>',
        name='Search results',
        showlegend=True
    ))

fig.update_layout(
    title=f'UMAP Visualization with Highlighted Search Results<br><sub>Keywords: {keywords} | Highlighted: {len(matching_movie_ids_full)} movies (from full dataset)</sub>',
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2',
    width=1200,
    height=900,
    hovermode='closest'
)

fig.show()

print(f"\nHighlighted {len(matching_movie_ids_full)} movies in red (star markers) from the full dataset")


Found 8 matching movies in full dataset with embeddings



Highlighted 8 movies in red (star markers) from the full dataset
