In [1]:
import pandas as pd

# Read the MovieLens movie catalogue and the user-supplied tags
# (paths are relative to the notebook; adjust them if needed).
movies = pd.read_csv('../data/movies.csv')
tags = pd.read_csv('../data/tags.csv')

# Preview the first rows of each frame under its own heading.
for heading, frame in (("Movies:", movies), ("\nTags:", tags)):
    print(heading)
    display(frame.head())


Movies:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy



Tags:


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [2]:
# Step 1: Group tags by movieId and join them into a single string.
# Rows whose tag is missing are dropped first: casting NaN with astype(str)
# would otherwise inject the literal word "nan" into that movie's text.
tags_clean = tags.dropna(subset=['tag'])
movie_tags = (
    tags_clean.assign(tag=tags_clean['tag'].astype(str))
    .groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index()
)

# Step 2: Merge with movies (left join keeps movies that have no tags at all)
movies_with_tags = pd.merge(movies, movie_tags, on='movieId', how='left')

# Step 3: Fill NaNs in tag column (some movies might not have tags)
movies_with_tags['tag'] = movies_with_tags['tag'].fillna('')

# Optional: clean genre formatting (remove "|").
# A missing genre is treated as empty text so that the string concatenation
# below cannot turn the whole soup into NaN.
movies_with_tags['genres'] = (
    movies_with_tags['genres'].fillna('').str.replace('|', ' ', regex=False)
)

# Step 4: Create the soup -- title, genres and tags as one whitespace-separated
# document per movie, which is what the TF-IDF vectorizer consumes.
movies_with_tags['soup'] = (
    movies_with_tags['title'] + ' ' + movies_with_tags['genres'] + ' ' + movies_with_tags['tag']
)

# Preview the new dataframe
movies_with_tags[['movieId', 'title', 'soup']].head(5)


Unnamed: 0,movieId,title,soup
0,1,Toy Story (1995),Toy Story (1995) Adventure Animation Children ...
1,2,Jumanji (1995),Jumanji (1995) Adventure Children Fantasy fant...
2,3,Grumpier Old Men (1995),Grumpier Old Men (1995) Comedy Romance moldy old
3,4,Waiting to Exhale (1995),Waiting to Exhale (1995) Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Father of the Bride Part II (1995) Comedy preg...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF to convert the soup into feature vectors.
# stop_words='english' makes the vectorizer ignore its built-in list of common
# English words, so they do not contribute to the similarity scores.
tfidf = TfidfVectorizer(stop_words='english')
# One row per movie in movies_with_tags, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(movies_with_tags['soup'])

# Sanity check: the row count should equal the number of movies.
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (9742, 9949)


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the TF-IDF matrix;
# cosine_sim[i][j] scores how alike movie i and movie j are.
# NOTE(review): with the 9742-row matrix printed above this presumably
# materialises a 9742 x 9742 result -- confirm the memory cost is acceptable.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [5]:
# Renumber rows 0..n-1 so that index labels equal the row positions used to
# address cosine_sim. drop=True discards the old index instead of inserting it
# as a new column, which keeps this cell idempotent when it is re-run.
movies_with_tags = movies_with_tags.reset_index(drop=True)

# Map title -> row position. Duplicates are removed on the *titles* (the index
# of this Series), keeping the first occurrence, so a lookup always yields a
# single position rather than a Series of positions.
indices = pd.Series(movies_with_tags.index, index=movies_with_tags['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title as it appears in ``movies_with_tags['title']``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows/columns follow the row order of
        ``movies_with_tags``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, best match first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions;
    # use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the query movie by position instead of assuming it sorts first:
    # ties at the top score (or an all-zero row) would otherwise drop the
    # wrong entry and could return the query movie itself.
    scored = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    scored.sort(key=lambda pair: pair[1], reverse=True)  # stable: ties keep row order

    movie_indices = [i for i, _ in scored[:top_n]]
    return movies_with_tags['title'].iloc[movie_indices]


In [6]:
# Example query: the ten titles whose soup is closest to Toy Story's.
get_recommendations('Toy Story (1995)')


2355                       Toy Story 2 (1999)
1757                     Bug's Life, A (1998)
7355                       Toy Story 3 (2010)
3595                          Toy, The (1982)
7039                                Up (2009)
3733                               Fun (1994)
2539    We're Back! A Dinosaur's Story (1993)
26                        Now and Then (1995)
4089                      Toy Soldiers (1991)
1617            NeverEnding Story, The (1984)
Name: title, dtype: object

In [5]:
import requests

import os
import warnings

# The TMDb key is read from the environment so the secret never lives in the
# notebook or its version history. Set it before starting Jupyter, e.g.
#   export TMDB_API_KEY=<your key>
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be revoked/rotated in the TMDb account settings.
api_key = os.environ.get('TMDB_API_KEY', '')
if not api_key:
    warnings.warn(
        "TMDB_API_KEY is not set; TMDb requests will be rejected until it is provided."
    )

base_url = 'https://api.themoviedb.org/3'


In [6]:
def _tmdb_get_json(path, timeout, **params):
    """GET ``base_url + path`` with the API key attached; return the decoded JSON body."""
    response = requests.get(
        f"{base_url}{path}",
        params={'api_key': api_key, **params},
        timeout=timeout,  # without a timeout a stalled connection blocks forever
    )
    # Fail loudly on 4xx/5xx instead of parsing an error payload as if it were data.
    response.raise_for_status()
    return response.json()


def get_movie_metadata_raw(title, year=None, timeout=10):
    """Look up a movie on TMDb and return a small metadata dictionary.

    The first search hit for ``title`` is used; its details, credits and
    keywords are then fetched with three further requests.

    Parameters
    ----------
    title : str
        Free-text title passed to TMDb's movie search.
    year : int or str, optional
        Release year forwarded to the search as its ``year`` filter, which
        helps to pick the right film among remakes or same-named titles.
        Omitted from the request when ``None`` (the default).
    timeout : float, default 10
        Per-request timeout in seconds.

    Returns
    -------
    dict or None
        ``{'overview': str, 'director': str, 'cast': list[str],
        'keywords': list[str]}`` on success; ``None`` when the search has no
        results or any request/parsing step fails (the error is printed).
    """
    try:
        print(f"Searching TMDb for: {title}")

        # 1. Search for the movie
        search_params = {'query': title}
        if year is not None:
            search_params['year'] = year
        results = _tmdb_get_json('/search/movie', timeout, **search_params).get('results')
        if not results:
            print("No results found.")
            return None

        movie_id = results[0]['id']
        print(f"Found movie ID: {movie_id}")

        # 2. Get movie details
        details = _tmdb_get_json(f"/movie/{movie_id}", timeout)
        overview = details.get('overview') or ''

        # 3. Get credits: first crew member whose job is "Director", top three cast
        credits_data = _tmdb_get_json(f"/movie/{movie_id}/credits", timeout)
        director = next(
            (
                member.get('name') or ''
                for member in credits_data.get('crew', [])
                if member.get('job') == 'Director'
            ),
            '',
        )
        # Missing names are skipped so callers can safely ' '.join the list.
        cast = [
            member['name']
            for member in credits_data.get('cast', [])[:3]
            if member.get('name')
        ]

        # 4. Get keywords
        keywords = _tmdb_get_json(f"/movie/{movie_id}/keywords", timeout)
        keyword_list = [kw['name'] for kw in keywords.get('keywords', []) if kw.get('name')]

        return {
            'overview': overview,
            'director': director,
            'cast': cast,
            'keywords': keyword_list
        }

    except Exception as e:
        # Deliberately best-effort: batch enrichment should continue past a
        # single failed lookup, so the error is reported and None is returned.
        print(f"Error: {e}")
        return None


In [7]:
# Smoke-test the TMDb lookup with a single well-known title and show the
# dictionary it returns (None would indicate a failed or empty search).
meta = get_movie_metadata_raw("The Matrix")
print(meta)


Searching TMDb for: The Matrix
Found movie ID: 603
{'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.', 'director': 'Lana Wachowski', 'cast': ['Keanu Reeves', 'Laurence Fishburne', 'Carrie-Anne Moss'], 'keywords': ['man vs machine', 'martial arts', 'dreams', 'artificial intelligence (a.i.)', 'saving the world', 'hacker', 'self sacrifice', 'virtual reality', 'fight', 'prophecy', 'truth', 'philosophy', 'dystopia', 'insurgence', 'simulated reality ', 'cyberpunk', 'dream world', 'messiah', 'action hero', 'gnosticism', 'awestruck', 'hopeful', 'allegory of the cave']}


In [1]:
import pandas as pd

# Reload the catalogue and tidy it in one pass: rows without a title are
# discarded, repeated titles keep only their first row, and the index is
# renumbered from zero.
movies = (
    pd.read_csv('../data/movies.csv')  # adjust path as needed
    .dropna(subset=['title'])
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
import re

def clean_title(title):
    """Return ``title`` without its trailing four-digit release year.

    Only a final ``(YYYY)`` group is removed, e.g. ``"Toy Story (1995)"`` ->
    ``"Toy Story"``; other parenthesised text such as ``"(L'enfant)"`` is kept.
    Whitespace after the year is tolerated, and the result is stripped of
    leading/trailing whitespace.

    Parameters
    ----------
    title : str
        Raw title, typically in the form ``"Name (Year)"``.

    Returns
    -------
    str
        The title without the year suffix.
    """
    # \s*$ lets the year match even when trailing whitespace follows it;
    # anchoring directly on $ left such titles uncleaned.
    return re.sub(r'\s*\(\d{4}\)\s*$', '', title).strip()

# Store the year-less form of every title in its own column.
movies['clean_title'] = movies['title'].map(clean_title)


In [3]:
# Work on a fixed-size random sample; the seed makes the draw repeatable
# between runs.
subset_size = 800  # or 1000
movie_subset = (
    movies
    .sample(n=subset_size, random_state=42)
    .reset_index(drop=True)
)
# Recompute the year-less titles on the sampled frame.
movie_subset['clean_title'] = movie_subset['title'].map(clean_title)


In [8]:
# Add metadata columns (only if missing, so re-running keeps existing values)
for col in ['overview_tmdb', 'director', 'cast', 'keywords']:
    if col not in movie_subset.columns:
        movie_subset[col] = ''

# Optional: Load or create a cache
import os, json

if os.path.exists('metadata_cache.json'):
    with open('metadata_cache.json', 'r') as f:
        metadata_cache = json.load(f)
else:
    metadata_cache = {}

# Enrich: one TMDb lookup per title, skipped when a result is already cached.
for i in range(len(movie_subset)):
    title = movie_subset.loc[i, 'clean_title']

    # A cached null (left behind by an earlier failed lookup) is treated as a
    # miss so the title is retried instead of staying empty forever.
    meta = metadata_cache.get(title)
    if meta is None:
        meta = get_movie_metadata_raw(title)
        if meta is not None:
            # Only successful lookups are persisted; a transient network or
            # rate-limit error must not be frozen into the cache.
            metadata_cache[title] = meta
            # Write to a temporary file and swap it in, so an interrupted run
            # cannot leave a truncated, unreadable cache behind.
            with open('metadata_cache.json.tmp', 'w') as f:
                json.dump(metadata_cache, f)
            os.replace('metadata_cache.json.tmp', 'metadata_cache.json')

    if meta:
        movie_subset.at[i, 'overview_tmdb'] = meta.get('overview') or ''
        movie_subset.at[i, 'director'] = meta.get('director') or ''
        # Skip empty/None names so the join cannot raise TypeError.
        movie_subset.at[i, 'cast'] = ' '.join(name for name in meta.get('cast') or [] if name)
        movie_subset.at[i, 'keywords'] = ' '.join(kw for kw in meta.get('keywords') or [] if kw)

    if i % 10 == 0:
        print(f"Processed {i}/{len(movie_subset)}")


Searching TMDb for: Teenage Mutant Ninja Turtles
Found movie ID: 98566
Processed 0/800
Searching TMDb for: America's Sweethearts
Found movie ID: 11467
Searching TMDb for: Cast Away
Found movie ID: 8358
Searching TMDb for: Persepolis
Found movie ID: 2011
Searching TMDb for: Anger Management
Found movie ID: 9506
Searching TMDb for: Eulogy
Found movie ID: 16358
Searching TMDb for: Beer League
Found movie ID: 14137
Searching TMDb for: Hudson Hawk
Found movie ID: 9292
Searching TMDb for: New Kids Nitro
Found movie ID: 79723
Searching TMDb for: City Slickers
Found movie ID: 1406
Searching TMDb for: Funny Face
Found movie ID: 13320
Processed 10/800
Searching TMDb for: Pusher III: I'm the Angel of Death
Found movie ID: 11330
Searching TMDb for: Suspect Zero
Found movie ID: 8080
Searching TMDb for: Last King of Scotland, The
Found movie ID: 1523
Searching TMDb for: Perfect Blue
Found movie ID: 10494
Searching TMDb for: Catwalk
Found movie ID: 89333
Searching TMDb for: Bean
Found movie ID: 1281


In [11]:
# Write the enriched subset to disk, leaving the row index out of the file.
movie_subset.to_csv(path_or_buf="../data/movie_subset_with_metadata.csv", index=False)


In [12]:
# Inspect the enriched subset with its new TMDb metadata columns.
movie_subset

Unnamed: 0,movieId,title,genres,clean_title,overview_tmdb,director,cast,keywords
0,113348,Teenage Mutant Ninja Turtles (2014),Action|Adventure|Comedy,Teenage Mutant Ninja Turtles,"When a kingpin threatens New York City, a grou...",Jonathan Liebesman,Pete Ploszek Alan Ritchson Jeremy Howard,new york city van martial arts hero experiment...
1,4639,America's Sweethearts (2001),Comedy|Romance,America's Sweethearts,In the midst of a nasty public breakup of marr...,Joe Roth,Julia Roberts John Cusack Catherine Zeta-Jones,husband wife relationship fictitious marriage ...
2,4022,Cast Away (2000),Drama,Cast Away,"Chuck Nolan, a top international manager for F...",Robert Zemeckis,Tom Hanks Helen Hunt Chris Noth,exotic island suicide attempt volleyball survi...
3,55442,Persepolis (2007),Animation|Drama,Persepolis,"In 1970s Iran, Marjane 'Marji' Satrapi watches...",Vincent Paronnaud,Chiara Mastroianni Danielle Darrieux Catherine...,puberty civil war parent child relationship 19...
4,6287,Anger Management (2003),Comedy,Anger Management,After a small misunderstanding aboard an airpl...,Peter Segal,Adam Sandler Jack Nicholson Marisa Tomei,penalty therapist psychology aggression rage a...
...,...,...,...,...,...,...,...,...
795,3269,Forever Young (1992),Drama|Romance|Sci-Fi,Forever Young,A 1939 test pilot asks his best friend to use ...,Steve Miner,Mel Gibson Jamie Lee Curtis Elijah Wood,airplane experiment test person u.s. air force...
796,6234,Equus (1977),Drama|Mystery,Equus,"A psychiatrist, Martin Dysart, investigates th...",Sidney Lumet,Richard Burton Peter Firth Joan Plowright,horse psychiatrist
797,6849,Scrooge (1970),Drama|Fantasy|Musical,Scrooge,"Ebenezer Scrooge, the ultimate Victorian miser...",Henry Edwards,Seymour Hicks Donald Calthrop Robert Cochran,clerk holiday redemption spirit miser ghost ch...
798,44937,"Child, The (L'enfant) (2005)",Crime|Drama,"Child, The (L'enfant)","After a Tibetan boy, the mystical Golden Child...",Michael Ritchie,Eddie Murphy Charles Dance Charlotte Lewis,buddhism ritual social worker monk anti hero m...


In [13]:
def create_soup(row):
    """Concatenate a movie's TMDb text fields into one document.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys ``'overview_tmdb'``, ``'director'``,
        ``'cast'`` and ``'keywords'`` (e.g. a DataFrame row).

    Returns
    -------
    str
        The four fields joined by single spaces, in that order. A field that
        is ``None`` or NaN contributes an empty string; any other non-string
        value is converted with ``str()``.
    """
    def _as_text(value):
        if isinstance(value, str):
            return value
        # NaN is truthy, so `value or ''` would pass it through to join() and
        # raise TypeError. NaN is the only value that is unequal to itself.
        if value is None or value != value:
            return ''
        return str(value)

    fields = ('overview_tmdb', 'director', 'cast', 'keywords')
    return ' '.join(_as_text(row[field]) for field in fields)

# Build the per-movie document row by row (axis=1 passes each row to create_soup).
movie_subset['soup'] = movie_subset.apply(create_soup, axis=1)


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the TMDb-based soup of the sampled movies. This rebinds `tfidf`
# and `tfidf_matrix`, replacing any values left by an earlier cell.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_subset['soup'])

# Sanity check: the row count should equal len(movie_subset).
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (800, 11743)


In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the sampled movies; rows and columns
# follow the row order of movie_subset.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [16]:
# Lookup table from lower-cased full title (year included) to the movie's
# index label in movie_subset; lower-casing makes lookups case-insensitive.
indices = pd.Series(movie_subset.index, index=movie_subset['title'].str.lower())


In [17]:
def recommend(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles in ``movie_subset`` most similar to ``title``.

    Parameters
    ----------
    title : str
        Full title including the year, e.g. ``"Ice Age (2002)"``; matching is
        case-insensitive.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows/columns follow the row order of
        ``movie_subset``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series or list
        Titles of the ``top_n`` most similar movies, best match first. When
        the title is unknown a message is printed and an empty list is
        returned.
    """
    title = title.lower()
    if title not in indices:
        print("Movie not found in dataset.")
        return []

    idx = indices[title]
    # Titles that collide after lower-casing yield a Series of positions;
    # use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the query movie by position instead of assuming it sorts first:
    # a movie with an empty soup has an all-zero row, and ties at the top
    # score would otherwise drop the wrong entry.
    scored = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    scored.sort(key=lambda pair: pair[1], reverse=True)  # stable: ties keep row order

    movie_indices = [i for i, _ in scored[:top_n]]
    return movie_subset['title'].iloc[movie_indices]


In [24]:
# Example query against the TMDb-enriched subset.
recommend("Ice Age (2002)")


140                              Year One (2009)
560                               Caveman (1981)
748    Journey to the Center of the Earth (2008)
639                 The Angry Birds Movie (2016)
742                       Doctor Dolittle (1967)
548                              Godzilla (1998)
150        Jurassic World: Fallen Kingdom (2018)
668        House at the End of the Street (2012)
656                           Last Orders (2001)
599                               Rampage (2018)
Name: title, dtype: object

In [25]:
# Inspect the subset again, now including the generated `soup` column.
movie_subset

Unnamed: 0,movieId,title,genres,clean_title,overview_tmdb,director,cast,keywords,soup
0,113348,Teenage Mutant Ninja Turtles (2014),Action|Adventure|Comedy,Teenage Mutant Ninja Turtles,"When a kingpin threatens New York City, a grou...",Jonathan Liebesman,Pete Ploszek Alan Ritchson Jeremy Howard,new york city van martial arts hero experiment...,"When a kingpin threatens New York City, a grou..."
1,4639,America's Sweethearts (2001),Comedy|Romance,America's Sweethearts,In the midst of a nasty public breakup of marr...,Joe Roth,Julia Roberts John Cusack Catherine Zeta-Jones,husband wife relationship fictitious marriage ...,In the midst of a nasty public breakup of marr...
2,4022,Cast Away (2000),Drama,Cast Away,"Chuck Nolan, a top international manager for F...",Robert Zemeckis,Tom Hanks Helen Hunt Chris Noth,exotic island suicide attempt volleyball survi...,"Chuck Nolan, a top international manager for F..."
3,55442,Persepolis (2007),Animation|Drama,Persepolis,"In 1970s Iran, Marjane 'Marji' Satrapi watches...",Vincent Paronnaud,Chiara Mastroianni Danielle Darrieux Catherine...,puberty civil war parent child relationship 19...,"In 1970s Iran, Marjane 'Marji' Satrapi watches..."
4,6287,Anger Management (2003),Comedy,Anger Management,After a small misunderstanding aboard an airpl...,Peter Segal,Adam Sandler Jack Nicholson Marisa Tomei,penalty therapist psychology aggression rage a...,After a small misunderstanding aboard an airpl...
...,...,...,...,...,...,...,...,...,...
795,3269,Forever Young (1992),Drama|Romance|Sci-Fi,Forever Young,A 1939 test pilot asks his best friend to use ...,Steve Miner,Mel Gibson Jamie Lee Curtis Elijah Wood,airplane experiment test person u.s. air force...,A 1939 test pilot asks his best friend to use ...
796,6234,Equus (1977),Drama|Mystery,Equus,"A psychiatrist, Martin Dysart, investigates th...",Sidney Lumet,Richard Burton Peter Firth Joan Plowright,horse psychiatrist,"A psychiatrist, Martin Dysart, investigates th..."
797,6849,Scrooge (1970),Drama|Fantasy|Musical,Scrooge,"Ebenezer Scrooge, the ultimate Victorian miser...",Henry Edwards,Seymour Hicks Donald Calthrop Robert Cochran,clerk holiday redemption spirit miser ghost ch...,"Ebenezer Scrooge, the ultimate Victorian miser..."
798,44937,"Child, The (L'enfant) (2005)",Crime|Drama,"Child, The (L'enfant)","After a Tibetan boy, the mystical Golden Child...",Michael Ritchie,Eddie Murphy Charles Dance Charlotte Lewis,buddhism ritual social worker monk anti hero m...,"After a Tibetan boy, the mystical Golden Child..."
