In [99]:
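# Reel Hits
# Match movies from the cleaned TMDB export to their Spotify soundtrack albums and record album/track popularity metrics.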

In [39]:
import datetime
import os
import re
from typing import Union

import pandas as pd
import spotipy
from dotenv import load_dotenv
from rapidfuzz import fuzz
from spotipy.oauth2 import SpotifyClientCredentials

# Load environment variables via python-dotenv before the credentials are read below.
load_dotenv()

# Client-credentials auth built from SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET.
cred_manager = SpotifyClientCredentials(
    client_id=os.getenv("SPOTIFY_CLIENT_ID"),
    client_secret=os.getenv("SPOTIFY_CLIENT_SECRET"),
)

# Shared Spotify client used by the helper functions in this notebook.
sp = spotipy.Spotify(
    client_credentials_manager=cred_manager,
    requests_timeout=15,
    retries=3,
    status_forcelist=(429, 500, 502, 503, 504),
)

In [40]:
# Lower-case phrases that, when found in an album name, earn the soundtrack bonus in heuristic_score.
SOUNDTRACK_HINTS = (
    "soundtrack",
    "original motion picture",
    "music from the motion picture",
    "original score",
    "ost",
)


def candidate_queries(title: str, year: Union[int, None]):
    """Build the list of Spotify search strings to try for one movie.

    Args:
        title: Movie title, inserted verbatim into every query.
        year: Release year; when truthy, two extra year-restricted queries
            are appended.

    Returns:
        A list of query strings: five title-based queries, followed by the
        two year-restricted ones when a year is given.
    """
    album_suffixes = (
        "Original Motion Picture Soundtrack",
        "Original Score",
        "Music From the Motion Picture",
    )
    queries = [f'album:"{title}"', f'{title} soundtrack']
    queries.extend(f'album:"{title} {suffix}"' for suffix in album_suffixes)
    if year:
        queries.append(f'album:"{title}" year:{year}')
        queries.append(f'{title} year:{year} soundtrack')
    return queries

In [41]:
def album_release_year(album):
    """Return the release year of an album mapping, or None if it is unknown.

    Args:
        album: Mapping with an optional 'release_date' entry, which can be
            "YYYY" or "YYYY-MM-DD".

    Returns:
        The year as an int, or None when the date is missing, empty, or
        cannot be parsed.
    """
    rd = album.get('release_date')
    if not rd:
        return None
    try:
        return datetime.date.fromisoformat(rd).year
    except (TypeError, ValueError):
        # Not a full ISO date (e.g. year-only); fall back to the leading
        # four characters. Only parsing errors are caught, so interrupts
        # such as KeyboardInterrupt still propagate.
        pass
    try:
        return int(rd[:4])
    except (TypeError, ValueError):
        return None

In [42]:
def heuristic_score(album, movie_title, movie_year):
    """Score how well a Spotify album matches a movie's soundtrack.

    The score is the sum of three parts:
      * fuzzy title similarity between the movie title and the album name
        (rapidfuzz token_set_ratio, robust to parentheses),
      * +15 if the album name contains any SOUNDTRACK_HINTS phrase,
      * +12 if the album year is within one year of the movie year,
        or +6 if they are exactly two years apart.

    Args:
        album: Album mapping; 'name' and 'release_date' are read.
        movie_title: Movie title to compare against the album name.
        movie_year: Movie release year; the year bonus is skipped when falsy.

    Returns:
        The combined numeric score (higher is a better match).
    """
    # A missing or None name is treated as empty.
    name = album.get('name') or ''
    # Title similarity (robust to parentheses)
    title_sim = fuzz.token_set_ratio(movie_title, name)
    # Hints are matched as whole words/phrases: a plain substring test lets
    # the short hint "ost" fire on ordinary titles such as "Ghost" or "Lost".
    lowered = name.lower()
    has_hint = any(re.search(rf'\b{re.escape(h)}\b', lowered) for h in SOUNDTRACK_HINTS)
    hint_bonus = 15 if has_hint else 0
    # Year proximity
    ay = album_release_year(album)
    year_bonus = 0
    if movie_year and ay:
        diff = abs(ay - movie_year)
        year_bonus = 12 if diff <= 1 else 6 if diff == 2 else 0
    return title_sim + hint_bonus + year_bonus

In [43]:
def find_best_soundtrack_album(title, year):
    """Search Spotify for the album that best matches a movie's soundtrack.

    Every query from candidate_queries(title, year) is run as an album
    search (limit 10). Each returned album is scored with heuristic_score
    and the highest-scoring one is kept; on ties the earliest hit wins.

    Returns:
        A (album, score) tuple for the best hit, or (None, -1) when no
        search returned any album.
    """
    top_album = None
    top_score = -1

    # dict.fromkeys drops repeated query strings while keeping their order.
    for query in dict.fromkeys(candidate_queries(title, year)):
        response = sp.search(q=query, type='album', limit=10)
        hits = response.get('albums', {}).get('items', [])
        for candidate in hits:
            candidate_score = heuristic_score(candidate, title, year)
            if candidate_score > top_score:
                top_album = candidate
                top_score = candidate_score

    return top_album, top_score


In [44]:
def get_album_popularity_metrics(album_id: str):
    """Collect popularity and length metrics for one Spotify album.

    Fetches the album, pages through its full track listing, then looks up
    the tracks in batches of 50 to read each track's popularity.

    Args:
        album_id: Spotify album id.

    Returns:
        A dict with:
          * 'album_popularity': the album's 'popularity' field (may be None),
          * 'avg_track_popularity' / 'sum_track_popularity': mean and sum of
            the track popularities, or None when none were available,
          * 'n_tracks': number of tracks that have an id,
          * 'album_length_ms' / 'album_length_min': total track duration,
          * 'album_artists': set of artist names across all tracks.
    """
    album = sp.album(album_id)
    pop_album = album.get('popularity')  # 0–100 (may be None rarely)

    # Walk every page of the track listing (50 tracks per page).
    page = sp.album_tracks(album_id, limit=50)
    items = list(page.get('items', []))
    while page.get('next'):
        page = sp.next(page)
        items.extend(page.get('items', []))

    # Built only after pagination so artists on tracks beyond the first
    # page are included as well.
    album_artists = {artist['name'] for track in items for artist in track.get('artists', [])}

    album_length_ms = sum(t['duration_ms'] for t in items if t.get('duration_ms'))
    album_length_min = album_length_ms / 60000

    # Fetch track details in batches to get popularity
    ids = [t['id'] for t in items if t.get('id')]
    pop_list = []
    for i in range(0, len(ids), 50):
        batch = sp.tracks(ids[i:i + 50]).get('tracks', [])
        pop_list.extend(t['popularity'] for t in batch if t and t.get('popularity') is not None)

    if pop_list:
        sum_track_pop = sum(pop_list)
        avg_track_pop = sum_track_pop / len(pop_list)
    else:
        avg_track_pop = None
        sum_track_pop = None

    return {
        'album_popularity': pop_album,
        'avg_track_popularity': avg_track_pop,
        'sum_track_popularity': sum_track_pop,
        'n_tracks': len(ids),
        'album_length_ms': album_length_ms,
        'album_length_min': album_length_min,
        'album_artists': album_artists
    }


In [45]:
def process_movie(tconst: str, title: str, year: int, revenue: float):
    """Build one result record for a movie.

    Looks up the best-matching soundtrack album and, when one is found,
    adds the album's identity, release year, match score and the metrics
    from get_album_popularity_metrics.

    Args:
        tconst: IMDb id of the movie.
        title: Movie title used for the album search.
        year: Movie release year.
        revenue: Movie revenue, copied into the record unchanged.

    Returns:
        A dict with the movie fields plus either the album fields and
        metrics, or 'spotify_album_id' and 'match_score' set to None when
        no album was found.
    """
    record = {'tconst': tconst, 'title': title, 'year': year, 'revenue': revenue}

    album, score = find_best_soundtrack_album(title, year)
    if not album:
        # No candidate at all: return the short record.
        return {**record, 'spotify_album_id': None, 'match_score': None}

    metrics = get_album_popularity_metrics(album['id'])
    record['spotify_album_id'] = album['id']
    record['spotify_album_name'] = album['name']
    record['spotify_album_year'] = album_release_year(album)
    record['match_score'] = score
    record.update(metrics)
    return record


In [46]:
from tqdm import tqdm

# High-rated cohort: English-language movies from 1990 onward with revenue,
# vote_average >= 7.0 and at least 2000 votes.
df = pd.read_csv("..\\data\\clean_tmdb.csv")
print("Loaded clean_tmdb.csv with shape:", df.shape)

# Keep only rows that have revenue and every field needed downstream.
df = df[(df['revenue'] > 0) & (df['release_date'].notna()) & (df['imdb_id'].notna()) & (df['title'].notna())]
df = df[df['release_date'].apply(lambda x: datetime.date.fromisoformat(x).year if pd.notna(x) else None) >= 1990]
df = df[df['original_language'] == 'en']
df = df[(df['vote_average'] >= 7.0) & (df['vote_count'] >= 2000)]
df = df[['imdb_id', 'title', 'release_date', 'revenue']]

print("Filtered DataFrame shape:", df.shape)

# Look up a soundtrack album for the first 500 movies (one or more Spotify
# calls per movie, so this loop is slow).
results = []
tuples = list(df.itertuples(index=False))
for row in tqdm(tuples[:500]):
    tconst, title, year, revenue = row.imdb_id, row.title, datetime.date.fromisoformat(row.release_date).year, float(row.revenue)
    result = process_movie(tconst, title, year, revenue)
    results.append(result)

results_df = pd.DataFrame(results)
print(results_df.head())
# Written next to the other data files ("..\\data"), which is where the
# combine step reads reel_hits_big.csv from.
results_df.to_csv("..\\data\\reel_hits_big.csv", index=False)

Loaded clean_tmdb.csv with shape: (16013, 24)
Filtered DataFrame shape: (745, 4)


  0%|          | 0/500 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [123]:
# Spot-check the metrics helper on a single hard-coded album id.
metrics = get_album_popularity_metrics(album_id="37ddKI6C7HW9O1gX1gI0ei")
print(metrics)

{'album_popularity': 22, 'avg_track_popularity': 14.4, 'sum_track_popularity': 144, 'n_tracks': 10}


In [119]:
# Spot-check the full pipeline on one movie: position 539 of `tuples`,
# which must already exist from an earlier cell.
row = tuples[539]
tconst = row.imdb_id
title = row.title
year = datetime.date.fromisoformat(row.release_date).year
revenue = float(row.revenue)
result = process_movie(tconst, title, year, revenue)
print(result)

{'tconst': 'tt1398426', 'title': 'Straight Outta Compton', 'year': 2015, 'revenue': 201634991.0, 'spotify_album_id': '1rMnLDmzyEBRiCj7yoGK2n', 'spotify_album_name': 'Straight Outta Compton (Music From The Motion Picture)', 'spotify_album_year': 2016, 'match_score': 127.0, 'album_popularity': 41, 'avg_track_popularity': 23.705882352941178, 'sum_track_popularity': 403, 'n_tracks': 17}


In [47]:
from tqdm import tqdm

# Lower-rated cohort: English-language movies from 1990 onward with revenue,
# vote_average < 7.0 and at least 2000 votes.
df = pd.read_csv("..\\data\\clean_tmdb.csv")
print("Loaded clean_tmdb.csv with shape:", df.shape)

# Rows must have revenue and every field needed downstream.
df = df.loc[
    df['revenue'].gt(0)
    & df['release_date'].notna()
    & df['imdb_id'].notna()
    & df['title'].notna()
]
# Released in 1990 or later.
df = df.loc[
    df['release_date']
    .apply(lambda d: datetime.date.fromisoformat(d).year if pd.notna(d) else None)
    .ge(1990)
]
df = df.loc[df['original_language'].eq('en')]
df = df.loc[df['vote_average'].lt(7.0) & df['vote_count'].ge(2000)]
df = df.loc[:, ['imdb_id', 'title', 'release_date', 'revenue']]

print("Filtered DataFrame shape:", df.shape)

# Look up a soundtrack album for the first 500 movies.
results = []
tuples = list(df.itertuples(index=False))
for row in tqdm(tuples[:500]):
    tconst = row.imdb_id
    title = row.title
    year = datetime.date.fromisoformat(row.release_date).year
    revenue = float(row.revenue)
    result = process_movie(tconst, title, year, revenue)
    results.append(result)

results_df = pd.DataFrame(results)
print(results_df.head())
results_df.to_csv("..\\data\\reel_hits_small.csv", index=False)

Loaded clean_tmdb.csv with shape: (16013, 24)
Filtered DataFrame shape: (1057, 4)


100%|██████████| 500/500 [15:47<00:00,  1.90s/it]

      tconst                               title  year       revenue  \
0  tt1300854                          Iron Man 3  2013  1.215577e+09   
1  tt0458339  Captain America: The First Avenger  2011  3.705698e+08   
2  tt1386697                       Suicide Squad  2016  7.468469e+08   
3  tt0800369                                Thor  2011  4.493266e+08   
4  tt1228705                          Iron Man 2  2010  6.239333e+08   

         spotify_album_id                               spotify_album_name  \
0  7x6etI9jQ2ePxZUOQaPrbi  Iron Man 3 (Original Motion Picture Soundtrack)   
1  0La1Tskamjd6akysepE8r5               Captain America: The First Avenger   
2  0C8fz2LJQotVr0SZNta8H6    Suicide Squad (Original Motion Picture Score)   
3  3OCSaoAx2IdkeUtd4T5cuS                                         Thor OST   
4  6bnGdcfwPDRSkj6H8GcbAh         Original Motion Picture Score Iron Man 2   

   spotify_album_year  match_score  album_popularity  avg_track_popularity  \
0                201




In [None]:
# Rebuild `tuples` for the lower-rated cohort (vote_average < 7.0) without
# running the Spotify lookups, so single rows can be inspected afterwards.
df = pd.read_csv("..\\data\\clean_tmdb.csv")
print("Loaded clean_tmdb.csv with shape:", df.shape)

# Rows must have revenue and every field needed downstream.
df = df.loc[
    df['revenue'].gt(0)
    & df['release_date'].notna()
    & df['imdb_id'].notna()
    & df['title'].notna()
]
# Released in 1990 or later.
df = df.loc[
    df['release_date']
    .apply(lambda d: datetime.date.fromisoformat(d).year if pd.notna(d) else None)
    .ge(1990)
]
df = df.loc[df['original_language'].eq('en')]
df = df.loc[df['vote_average'].lt(7.0) & df['vote_count'].ge(2000)]
df = df.loc[:, ['imdb_id', 'title', 'release_date', 'revenue']]

print("Filtered DataFrame shape:", df.shape)

results = []
tuples = list(df.itertuples(index=False))

In [197]:
# Spot-check the full pipeline on one movie: position 546 of `tuples`,
# which must already exist from an earlier cell.
row = tuples[546]
tconst = row.imdb_id
title = row.title
year = datetime.date.fromisoformat(row.release_date).year
revenue = float(row.revenue)
result = process_movie(tconst, title, year, revenue)
print(result)

{'tconst': 'tt1071875', 'title': 'Ghost Rider: Spirit of Vengeance', 'year': 2011, 'revenue': 132563930.0, 'spotify_album_id': '2si4shX0mUKYLP3Radt20j', 'spotify_album_name': 'Ghost Rider: Spirit of Vengeance (Original Motion Picture Score)', 'spotify_album_year': 2012, 'match_score': 127.0, 'album_popularity': 20, 'avg_track_popularity': 4.7073170731707314, 'sum_track_popularity': 193, 'n_tracks': 41, 'album_length_ms': 4296772, 'album_length_min': 71.61286666666666, 'album_artists': {'David Sardy'}}


In [205]:
# Spot-check: compute the metrics for one album, then print the album's
# name followed by the metrics dict.
ALBUM_ID = "0yrK3jKRexjtP4CRdYizjs"
metrics = get_album_popularity_metrics(album_id=ALBUM_ID)

print(sp.album(ALBUM_ID)['name'], metrics, sep="\n")

Kill Bill Vol. 2 Original Soundtrack
{'album_popularity': 44, 'avg_track_popularity': 28.933333333333334, 'sum_track_popularity': 434, 'n_tracks': 15, 'album_length_ms': 2776501, 'album_length_min': 46.275016666666666, 'album_artists': {'Shivaree', 'Johnny Cash', 'David Carradine', 'Uma Thurman', 'Lole Y Manuel', 'Malcolm McLaren', 'Meiko Kaji', 'Ennio Morricone', 'Alan Reeves', 'Phil Steele', 'Luis Bacalov', 'Philip Brigham', 'Chingon', 'Charlie Feathers'}}


In [203]:
# Stack the two cohort exports (reel_hits_small + reel_hits_big) into reel_hits.csv.
df_small = pd.read_csv("..\\data\\reel_hits_small.csv")
print("Small DataFrame shape:", df_small.shape)

df_big = pd.read_csv("..\\data\\reel_hits_big.csv")
print("Big DataFrame shape:", df_big.shape)

# Row-wise concatenation with a fresh 0..n-1 index.
df_reel_hits = pd.concat((df_small, df_big), axis=0, ignore_index=True)
print("Combined DataFrame shape:", df_reel_hits.shape)

df_reel_hits.to_csv(path_or_buf="..\\data\\reel_hits.csv", index=False)


Small DataFrame shape: (500, 15)
Big DataFrame shape: (500, 15)
Combined DataFrame shape: (1000, 15)
