# 09 · TMDB Feature Enrichment

Enrich every movie with signals from **The Movie Database (TMDB)**:

| Feature | Description | How used |
|---------|-------------|----------|
| `overview` | Plot summary (1-2 paragraphs) | TF-IDF, merged into CB tag string |
| `director` | Primary director name | Binary feature per director |
| `top_cast` | Top 5 billed cast members | Binary features |
| `keywords` | TMDB keyword tags | Merged into tag string |
| `runtime`, `vote_average` | Runtime & global rating | Numeric features |

The fetcher caches every response to `data/external/tmdb_cache.json`  

In [7]:
import os
import json
import time
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as sp
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
import requests

# find_dotenv() walks up from the notebook directory until it finds .env
load_dotenv(find_dotenv())

ROOT    = Path.cwd().parents[1]
SRC     = ROOT / 'src'
FEAT    = SRC / 'data' / 'features'
PROC    = SRC / 'data' / 'processed'
RAW     = SRC / 'data' / 'raw' / 'ml-25m'
EXT     = SRC / 'data' / 'external'
MODELS  = SRC / 'models'
EXT.mkdir(parents=True, exist_ok=True)

TMDB_KEY      = os.environ.get('TMDB_API_KEY', '')
TMDB_BASE     = 'https://api.themoviedb.org/3'
CACHE_FILE    = EXT / 'tmdb_cache.json'
RATE_LIMIT    = 40   # requests per 10 seconds (TMDB free tier)

if not TMDB_KEY:
    print('⚠  TMDB_API_KEY not set — API calls will be skipped.')
    print('   Set the env var and re-run, or use the cached data if available.')
else:
    print(f'API key found ({TMDB_KEY[:4]}…)')

print('Paths OK:', FEAT.exists())

API key found (6e8d…)
Paths OK: True


## 1 · Load MovieLens Links (TMDB IDs)

In [None]:
links_df   = pd.read_parquet(PROC / 'links_cleaned.parquet')
movies_df  = pd.read_parquet(FEAT / 'movie_features.parquet')

# Only fetch for movies that are in the processed feature set
valid_ids  = set(movies_df['movieId'])
links_df   = links_df[links_df['movieId'].isin(valid_ids)].copy()
links_df   = links_df.dropna(subset=['tmdbId'])
links_df['tmdbId'] = links_df['tmdbId'].astype(int)

print(f'Movies with TMDB IDs: {len(links_df):,} / {len(movies_df):,}')
links_df.head(3)

Movies with TMDB IDs: 32,388 / 32,424


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602


## 2 · TMDB API Fetcher (with Caching)

In [9]:
# Load existing cache
if CACHE_FILE.exists():
    with open(CACHE_FILE) as f:
        cache = json.load(f)
    print(f'Loaded cache: {len(cache):,} entries')
else:
    cache = {}
    print('Starting fresh cache.')


def save_cache():
    with open(CACHE_FILE, 'w') as f:
        json.dump(cache, f)


_request_times = []

def tmdb_get(endpoint, params=None):
    """Rate-limited TMDB GET with caching. Returns parsed JSON or None."""
    key = endpoint + str(sorted((params or {}).items()))
    if key in cache:
        return cache[key]

    if not TMDB_KEY:
        return None

    # Enforce rate limit: ≤ RATE_LIMIT requests per 10 seconds
    now = time.time()
    _request_times[:] = [t for t in _request_times if now - t < 10]
    if len(_request_times) >= RATE_LIMIT:
        sleep_for = 10 - (now - _request_times[0]) + 0.1
        time.sleep(max(sleep_for, 0))
    _request_times.append(time.time())

    url  = f'{TMDB_BASE}/{endpoint}'
    resp = requests.get(url, params={'api_key': TMDB_KEY, **(params or {})})

    if resp.status_code == 200:
        data = resp.json()
        cache[key] = data
        return data
    elif resp.status_code == 429:
        time.sleep(10)
        return tmdb_get(endpoint, params)
    else:
        cache[key] = None
        return None


def fetch_movie_details(tmdb_id):
    """Fetch movie details + credits + keywords in one call."""
    return tmdb_get(
        f'movie/{tmdb_id}',
        params={'append_to_response': 'credits,keywords'}
    )


print('Fetcher ready.')

Starting fresh cache.
Fetcher ready.


## 3 · Fetch Data for Top Movies

Fetching all 32 k movies takes might take to long. fetches top 500 movies for now

In [10]:
TOP_N = 5000   # set to None to fetch all

# Sort by popularity so most-impactful movies are fetched first
fetch_queue = (
    movies_df[['movieId', 'num_ratings']]
    .merge(links_df[['movieId', 'tmdbId']], on='movieId')
    .sort_values('num_ratings', ascending=False)
)
if TOP_N:
    fetch_queue = fetch_queue.head(TOP_N)

print(f'Will fetch {len(fetch_queue):,} movies')

# Actually fetch (skip if API key missing)
if TMDB_KEY:
    SAVE_EVERY = 500
    fetched = 0
    for _, row in fetch_queue.iterrows():
        tmdb_id = int(row['tmdbId'])
        fetch_movie_details(tmdb_id)   # result stored in cache automatically
        fetched += 1
        if fetched % SAVE_EVERY == 0:
            save_cache()
            print(f'  {fetched:,}/{len(fetch_queue):,} fetched, cache saved')
    save_cache()
    print(f'Done — {fetched:,} movies fetched, cache saved to {CACHE_FILE}')
else:
    print('Skipping API fetch (no key).  Run with TMDB_API_KEY set to populate cache.')

Will fetch 5,000 movies
  500/5,000 fetched, cache saved
  1,000/5,000 fetched, cache saved
  1,500/5,000 fetched, cache saved
  2,000/5,000 fetched, cache saved
  2,500/5,000 fetched, cache saved
  3,000/5,000 fetched, cache saved
  3,500/5,000 fetched, cache saved
  4,000/5,000 fetched, cache saved
  4,500/5,000 fetched, cache saved
  5,000/5,000 fetched, cache saved
Done — 5,000 movies fetched, cache saved to c:\Users\ololi\StudioProjects\movie-recommender\src\data\external\tmdb_cache.json


## 4 · Parse Cache into a Feature DataFrame

In [11]:
def extract_features(tmdb_id):
    """Extract structured features from a cached TMDB response."""
    key  = f'movie/{tmdb_id}' + str(sorted({'append_to_response': 'credits,keywords'}.items()))
    data = cache.get(key)
    if not data:
        return None

    # Overview / plot summary
    overview = (data.get('overview') or '').strip()

    # Runtime (minutes)
    runtime = data.get('runtime')

    # TMDB vote average (independent of MovieLens ratings)
    vote_avg = data.get('vote_average')

    # Director (take first director from crew)
    credits  = data.get('credits', {})
    crew     = credits.get('crew', [])
    director = next(
        (c['name'] for c in crew if c.get('job') == 'Director'), None
    )

    # Top-5 cast
    cast_list = credits.get('cast', [])
    top_cast  = [c['name'] for c in sorted(cast_list,
                    key=lambda x: x.get('order', 999))[:5]]

    # Keywords
    kws = data.get('keywords', {}).get('keywords', [])
    keywords = [k['name'] for k in kws]

    return {
        'overview': overview,
        'runtime':  runtime,
        'vote_avg': vote_avg,
        'director': director,
        'top_cast': top_cast,
        'keywords': keywords,
    }


# Build TMDB feature DataFrame
tmdb_rows = []
for _, row in fetch_queue.iterrows():
    feats = extract_features(int(row['tmdbId']))
    if feats:
        feats['movieId'] = int(row['movieId'])
        tmdb_rows.append(feats)

tmdb_df = pd.DataFrame(tmdb_rows)
print(f'Parsed {len(tmdb_df):,} TMDB records')
tmdb_df.head(2)

Parsed 4,986 TMDB records


Unnamed: 0,overview,runtime,vote_avg,director,top_cast,keywords,movieId
0,A man with a low IQ has accomplished great thi...,142,8.462,Robert Zemeckis,"[Tom Hanks, Robin Wright, Gary Sinise, Sally F...","[new year's eve, vietnam war, vietnam veteran,...",356
1,Imprisoned in the 1940s for the double murder ...,142,8.715,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...","[prison, friendship, police brutality, corrupt...",318


## 5 · Merge with Movie Features

In [12]:
# Merge TMDB features into the main movies DataFrame
enriched_df = movies_df.merge(
    tmdb_df[['movieId', 'overview', 'runtime', 'vote_avg', 'director',
             'top_cast', 'keywords']],
    on='movieId', how='left'
)

n_with_overview = enriched_df['overview'].notna().sum()
n_with_director = enriched_df['director'].notna().sum()
print(f'Movies with overview:  {n_with_overview:,}')
print(f'Movies with director:  {n_with_director:,}')
print(f'Total movies:          {len(enriched_df):,}')

Movies with overview:  4,986
Movies with director:  4,986
Total movies:          32,424


## 6 · Build Enriched Tag String

Concatenate the existing `tag_string` with:
- Plot overview words
- Director name
- Cast names
- TMDB keywords

This single string is then fed into TF-IDF.

In [13]:
def build_enriched_text(row):
    parts = []

    # Existing user tags
    if pd.notna(row.get('tag_string')) and row['tag_string']:
        parts.append(str(row['tag_string']))

    # Director (repeated 3x for higher weight)
    if pd.notna(row.get('director')) and row['director']:
        director_tok = row['director'].lower().replace(' ', '_')
        parts.extend([director_tok] * 3)

    # Cast (repeated 2x)
    if isinstance(row.get('top_cast'), list):
        for name in row['top_cast']:
            cast_tok = name.lower().replace(' ', '_')
            parts.extend([cast_tok] * 2)

    # TMDB keywords
    if isinstance(row.get('keywords'), list):
        parts.extend([k.lower().replace(' ', '_') for k in row['keywords']])

    # Overview (plain text — TF-IDF will tokenise)
    if pd.notna(row.get('overview')) and row['overview']:
        parts.append(row['overview'].lower())

    return ' '.join(parts)


enriched_df['enriched_text'] = enriched_df.apply(build_enriched_text, axis=1)

# Preview
sample = enriched_df[enriched_df['enriched_text'].str.len() > 50].iloc[0]
print(f"Movie: {sample['title']}")
print(f"Text snippet: {sample['enriched_text'][:300]} …")

Movie: Toy Story (1995)
Text snippet: john_lasseter john_lasseter john_lasseter tom_hanks tom_hanks tim_allen tim_allen don_rickles don_rickles jim_varney jim_varney wallace_shawn wallace_shawn rescue friendship mission jealousy villain bullying elementary_school rivalry anthropomorphism friends computer_animation buddy walkie_talkie to …


## 7 · Re-train TF-IDF on Enriched Text

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Use positions aligned with movie_id_map
with open(FEAT / 'id_mappings.pkl', 'rb') as f:
    mappings = pickle.load(f)

movie_id_map = mappings['movie_id_map']
n_movies     = len(movie_id_map)

enriched_indexed = enriched_df.set_index('movieId')

# Build text corpus in movie_idx order
corpus = []
valid_movie_ids = []
for movie_id in sorted(movie_id_map, key=lambda m: movie_id_map[m]):
    if movie_id in enriched_indexed.index:
        text = enriched_indexed.loc[movie_id, 'enriched_text']
        corpus.append(text if isinstance(text, str) else '')
        valid_movie_ids.append(movie_id)
    else:
        corpus.append('')
        valid_movie_ids.append(movie_id)

print(f'Corpus size: {len(corpus):,} documents')
print(f'Non-empty: {sum(1 for t in corpus if t):,}')

Corpus size: 32,424 documents
Non-empty: 4,986


In [15]:
tfidf = TfidfVectorizer(
    max_features = 3000,      # more features than original (was 1000)
    min_df       = 3,         # lower threshold catches rare directors / actors
    ngram_range  = (1, 2),
    sublinear_tf = True,
    strip_accents = 'unicode',
)

tfidf_mat = tfidf.fit_transform(corpus)   # (n_movies, 3000)
print(f'TF-IDF matrix: {tfidf_mat.shape}')

# Top distinctive terms
feature_names = tfidf.get_feature_names_out()
mean_tfidf    = np.asarray(tfidf_mat.mean(axis=0)).ravel()
top_idx       = mean_tfidf.argsort()[::-1][:20]
print('Top 20 features:', list(feature_names[top_idx]))

TF-IDF matrix: (32424, 3000)
Top 20 features: ['the', 'to', 'and', 'of', 'in', 'his', 'is', 'with', 'an', 'he', 'for', 'on', 'her', 'their', 'when', 'that', 'by', 'as', 'who', 'from']


## 8 · Combine TF-IDF with Genre Features → New CB Matrix

In [16]:
genre_cols   = [c for c in enriched_df.columns if c.startswith('genre_')]
n_genres     = len(genre_cols)

# Genre feature matrix in movie_idx order
genre_data   = np.zeros((n_movies, n_genres), dtype=np.float32)
for movie_id, midx in movie_id_map.items():
    if movie_id in enriched_indexed.index:
        genre_data[midx] = enriched_indexed.loc[movie_id, genre_cols].values.astype(np.float32)

genre_mat = sp.csr_matrix(genre_data)

# Weighted combination: TF-IDF (weight 2) + genres (weight 1)
combined  = sp.hstack([tfidf_mat * 2.0, genre_mat * 1.0], format='csr')

# L2-normalise each row so dot product = cosine similarity
combined_norm = normalize(combined, norm='l2')

print(f'Combined feature matrix: {combined_norm.shape}')

Combined feature matrix: (32424, 3020)


## 9 · Quick Sanity Check — Movie Similarity

In [17]:
def similar_movies(movie_id, n=10):
    if movie_id not in movie_id_map:
        print('Movie not found.')
        return
    midx = movie_id_map[movie_id]
    sims = (combined_norm @ combined_norm[midx].T).toarray().ravel()
    sims[midx] = -1
    top  = np.argsort(sims)[::-1][:n]
    rows = []
    for t in top:
        mid = mappings['idx_to_movie'].get(t)
        if mid and mid in enriched_indexed.index:
            r = enriched_indexed.loc[mid]
            rows.append({'title': r['title'], 'genres': r['genres'],
                         'director': r.get('director'), 'similarity': round(float(sims[t]), 4)})
    return pd.DataFrame(rows)


# Toy Story (movieId=1)
print('Similar to Toy Story (1995) — enriched CB:')
similar_movies(1, n=10)

Similar to Toy Story (1995) — enriched CB:


Unnamed: 0,title,genres,director,similarity
0,Frozen II (2019),Adventure|Animation|Children|Comedy|Fantasy,,0.7454
1,"Wild, The (2006)",Adventure|Animation|Children|Comedy|Fantasy,,0.7454
2,Aladdin (1992),Adventure|Animation|Children|Comedy|Fantasy,,0.7454
3,Penguin Highway (2018),Adventure|Animation|Children|Comedy|Fantasy,,0.7454
4,Brother Bear 2 (2006),Adventure|Animation|Children|Comedy|Fantasy,,0.7454
5,"Tale of Despereaux, The (2008)",Adventure|Animation|Children|Comedy|Fantasy,,0.7454
6,Wonder Park (2019),Adventure|Animation|Children|Comedy|Fantasy,,0.7454
7,Toy Story Toons: Small Fry (2011),Adventure|Animation|Children|Comedy|Fantasy,,0.7454
8,Toy Story Toons: Hawaiian Vacation (2011),Adventure|Animation|Children|Comedy|Fantasy,,0.7454
9,DuckTales: The Movie - Treasure of the Lost La...,Adventure|Animation|Children|Comedy|Fantasy,,0.7454


## 10 · Save Enriched CB Model

Save this as a drop-in replacement for `cb_model.pkl`.

In [18]:
# Build index maps matching the ACTUAL key names in cb_model.pkl:
#   movie_idx_lookup  – {movie_id → row in feature matrix}
#   idx_to_movie_id   – {row → movie_id}
# The corpus was built in movie_idx order, so row midx = movie at mappings['idx_to_movie'][midx]
idx_to_movie = mappings['idx_to_movie']   # {movie_idx → movie_id}

movie_idx_lookup = {idx_to_movie[midx]: midx for midx in range(n_movies)
                    if midx in idx_to_movie}
idx_to_movie_id  = {midx: idx_to_movie[midx] for midx in range(n_movies)
                    if midx in idx_to_movie}

cb_enriched = {
    # Keys match the original cb_model.pkl so all downstream code works unchanged
    'movie_feature_matrix': combined_norm,
    'movie_idx_lookup':     movie_idx_lookup,   # {movie_id → row}
    'idx_to_movie_id':      idx_to_movie_id,    # {row → movie_id}
    'tfidf':                tfidf,
    'genre_cols':           genre_cols,
    'weights':              {'tfidf': 2.0, 'genres': 1.0},
    'rating_midpoint':      3.0,
    # Metadata
    'enriched':             True,
    'n_tfidf_features':     tfidf_mat.shape[1],
    'n_genre_features':     n_genres,
    'tmdb_movies_count':    len(tmdb_df),
}

with open(MODELS / 'cb_enriched_model.pkl', 'wb') as f:
    pickle.dump(cb_enriched, f)

# Save enriched movie features for use in the app
enriched_df.drop(columns=['top_cast', 'keywords'], errors='ignore') \
           .to_parquet(FEAT / 'movie_features_enriched.parquet', index=False)

print('Saved:')
print('  src/models/cb_enriched_model.pkl')
print('  src/data/features/movie_features_enriched.parquet')
print()
print(f'Feature matrix: {combined_norm.shape}  (was {tfidf_mat.shape[0]} x 1020 before TMDB)')
print(f'Lookup entries: {len(movie_idx_lookup):,}')

Saved:
  src/models/cb_enriched_model.pkl
  src/data/features/movie_features_enriched.parquet

Feature matrix: (32424, 3020)  (was 32424 x 1020 before TMDB)
Lookup entries: 32,424
