In [4]:
# 📚 Basic Libraries
import numpy as np 
import pandas as pd
import warnings
import os

# 📊 Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# 🤖 Machine Learning
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from scipy.sparse import hstack
from sklearn.metrics import silhouette_score

In [5]:
df= pd.read_csv("../data/all_movies_combined.csv")

In [6]:
df.head(5)

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,release_date,original_language,popularity,adult,runtime,budget,revenue,cast,director,keywords,poster_url
0,950387,A Minecraft Movie,Four misfits find themselves struggling with o...,"['Family', 'Comedy', 'Adventure', 'Fantasy']",6.1,295,2025-03-31,en,1022.7906,False,101,150000000,313453003,"['Jason Momoa', 'Jack Black', 'Sebastian Eugen...",Jared Hess,"['friendship', 'surrealism', 'exploration', 'p...",https://image.tmdb.org/t/p/w500/yFHHfHcUgGAxzi...
1,1125899,Cleaner,When a group of radical activists take over an...,"['Action', 'Thriller']",6.517,174,2025-02-19,en,343.3057,False,96,0,0,"['Daisy Ridley', 'Clive Owen', 'Taz Skylar', '...",Martin Campbell,[],https://image.tmdb.org/t/p/w500/mwzDApMZAGeYCE...
2,822119,Captain America: Brave New World,After meeting with newly elected U.S. Presiden...,"['Action', 'Thriller', 'Science Fiction']",6.091,1232,2025-02-12,en,339.0809,False,119,180000000,411409721,"['Anthony Mackie', 'Harrison Ford', 'Danny Ram...",Julius Onah,"['hero', 'superhero', 'revenge', 'aftercredits...",https://image.tmdb.org/t/p/w500/pzIddUEMWhWzfv...
3,1197306,A Working Man,Levon Cade left behind a decorated military ca...,"['Action', 'Crime', 'Thriller']",6.467,151,2025-03-26,en,293.7838,False,116,40000000,44417000,"['Jason Statham', 'David Harbour', 'Michael Pe...",David Ayer,"['based on novel or book', 'kidnapping', 'vigi...",https://image.tmdb.org/t/p/w500/6FRFIogh3zFnVW...
4,1252309,Ask Me What You Want,"After his father's death, Eric Zimmerman trave...","['Romance', 'Drama']",5.681,119,2024-11-29,es,286.593,False,0,0,0,"['Gabriela Andrada', 'Mario Ermito', 'Celia Fr...",Lucía Alemany,"['spain', 'based on novel or book', 'woman dir...",https://image.tmdb.org/t/p/w500/76qnVxU2rPdVvi...


In [7]:
df.shape

(5000, 17)

In [8]:
df['original_language'].value_counts()

original_language
en    4156
ja     234
fr     125
ko      90
es      83
zh      51
it      45
cn      39
de      35
hi      27
ru      20
sv      13
pt      10
no      10
th       9
da       8
pl       7
te       6
id       6
ta       3
tr       3
uk       3
nl       2
kn       2
ar       2
ca       1
ga       1
mn       1
sr       1
el       1
lt       1
bn       1
tl       1
fi       1
lv       1
fa       1
Name: count, dtype: int64

In [9]:
df['title'].value_counts()

title
The Killer                     4
Pinocchio                      3
Robin Hood                     3
Prey                           3
Close                          3
                              ..
The Sword in the Stone         1
Forever My Girl                1
Resident Evil: Death Island    1
Samaritan                      1
Unfriended: Dark Web           1
Name: count, Length: 4797, dtype: int64

In [10]:
df.columns

Index(['id', 'title', 'overview', 'genres', 'vote_average', 'vote_count',
       'release_date', 'original_language', 'popularity', 'adult', 'runtime',
       'budget', 'revenue', 'cast', 'director', 'keywords', 'poster_url'],
      dtype='object')

In [11]:
df.dtypes

id                     int64
title                 object
overview              object
genres                object
vote_average         float64
vote_count             int64
release_date          object
original_language     object
popularity           float64
adult                   bool
runtime                int64
budget                 int64
revenue                int64
cast                  object
director              object
keywords              object
poster_url            object
dtype: object

In [20]:
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

In [22]:
# selecting relevant columns for the recommender
movies = df[['id','title','overview','genres','keywords','cast','director','vote_average', 'vote_count','release_date','runtime','poster_url']]

In [24]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,director,vote_average,vote_count,release_date,runtime,poster_url
0,950387,A Minecraft Movie,Four misfits find themselves struggling with o...,"['Family', 'Comedy', 'Adventure', 'Fantasy']","['friendship', 'surrealism', 'exploration', 'p...","['Jason Momoa', 'Jack Black', 'Sebastian Eugen...",Jared Hess,6.1,295,2025-03-31,101,https://image.tmdb.org/t/p/w500/yFHHfHcUgGAxzi...
1,1125899,Cleaner,When a group of radical activists take over an...,"['Action', 'Thriller']",[],"['Daisy Ridley', 'Clive Owen', 'Taz Skylar', '...",Martin Campbell,6.517,174,2025-02-19,96,https://image.tmdb.org/t/p/w500/mwzDApMZAGeYCE...
2,822119,Captain America: Brave New World,After meeting with newly elected U.S. Presiden...,"['Action', 'Thriller', 'Science Fiction']","['hero', 'superhero', 'revenge', 'aftercredits...","['Anthony Mackie', 'Harrison Ford', 'Danny Ram...",Julius Onah,6.091,1232,2025-02-12,119,https://image.tmdb.org/t/p/w500/pzIddUEMWhWzfv...
3,1197306,A Working Man,Levon Cade left behind a decorated military ca...,"['Action', 'Crime', 'Thriller']","['based on novel or book', 'kidnapping', 'vigi...","['Jason Statham', 'David Harbour', 'Michael Pe...",David Ayer,6.467,151,2025-03-26,116,https://image.tmdb.org/t/p/w500/6FRFIogh3zFnVW...
4,1252309,Ask Me What You Want,"After his father's death, Eric Zimmerman trave...","['Romance', 'Drama']","['spain', 'based on novel or book', 'woman dir...","['Gabriela Andrada', 'Mario Ermito', 'Celia Fr...",Lucía Alemany,5.681,119,2024-11-29,0,https://image.tmdb.org/t/p/w500/76qnVxU2rPdVvi...


In [26]:
# preprocess the dataset
movies.isnull().sum()

id              0
title           0
overview        0
genres          0
keywords        0
cast            0
director        3
vote_average    0
vote_count      0
release_date    0
runtime         0
poster_url      0
dtype: int64

In [28]:
movies.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies.dropna(inplace=True)


In [30]:
movies.shape

(4997, 12)

In [32]:
movies.duplicated().sum()

54

In [34]:
dupes = movies[movies.duplicated(keep=False)]
print(f"{len(dupes)} duplicated rows found:\n")
dupes.sort_values(by=movies.columns.tolist()).head(10)

108 duplicated rows found:



Unnamed: 0,id,title,overview,genres,keywords,cast,director,vote_average,vote_count,release_date,runtime,poster_url
99,27205,Inception,"Cobb, a skilled thief who commits corporate es...","['Action', 'Science Fiction', 'Adventure']","['rescue', 'mission', 'dreams', 'airplane', 'p...","['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...",Christopher Nolan,8.4,37298,2010-07-15,148,https://image.tmdb.org/t/p/w500/ljsZTbVsrQSqZg...
102,27205,Inception,"Cobb, a skilled thief who commits corporate es...","['Action', 'Science Fiction', 'Adventure']","['rescue', 'mission', 'dreams', 'airplane', 'p...","['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...",Christopher Nolan,8.4,37298,2010-07-15,148,https://image.tmdb.org/t/p/w500/ljsZTbVsrQSqZg...
976,44833,Battleship,"When mankind beams a radio signal into space, ...","['Thriller', 'Action', 'Adventure', 'Science F...","['fight', 'u.s. navy', 'mind reading', 'hong k...","['Taylor Kitsch', 'Alexander Skarsgård', 'Riha...",Peter Berg,5.862,5588,2012-04-11,131,https://image.tmdb.org/t/p/w500/9b0Im7SfedHiaj...
1200,44833,Battleship,"When mankind beams a radio signal into space, ...","['Thriller', 'Action', 'Adventure', 'Science F...","['fight', 'u.s. navy', 'mind reading', 'hong k...","['Taylor Kitsch', 'Alexander Skarsgård', 'Riha...",Peter Berg,5.862,5588,2012-04-11,131,https://image.tmdb.org/t/p/w500/9b0Im7SfedHiaj...
975,62764,Mirror Mirror,"After she spends all her money, an evil enchan...","['Adventure', 'Comedy', 'Family', 'Fantasy']","['fairy tale', 'villainess', 'attempted murder...","['Lily Collins', 'Julia Roberts', 'Armie Hamme...",Tarsem Singh,5.924,3176,2012-03-15,106,https://image.tmdb.org/t/p/w500/pys6zIJN8Sxlk2...
1078,62764,Mirror Mirror,"After she spends all her money, an evil enchan...","['Adventure', 'Comedy', 'Family', 'Fantasy']","['fairy tale', 'villainess', 'attempted murder...","['Lily Collins', 'Julia Roberts', 'Armie Hamme...",Tarsem Singh,5.924,3176,2012-03-15,106,https://image.tmdb.org/t/p/w500/pys6zIJN8Sxlk2...
944,65754,The Girl with the Dragon Tattoo,Disgraced journalist Mikael Blomkvist investig...,"['Thriller', 'Crime', 'Mystery']","['journalist', 'island', 'rape', 'hacker', 'ba...","['Daniel Craig', 'Rooney Mara', 'Christopher P...",David Fincher,7.375,7114,2011-12-14,158,https://image.tmdb.org/t/p/w500/vbLedKc1BUF4FO...
1114,65754,The Girl with the Dragon Tattoo,Disgraced journalist Mikael Blomkvist investig...,"['Thriller', 'Crime', 'Mystery']","['journalist', 'island', 'rape', 'hacker', 'ba...","['Daniel Craig', 'Rooney Mara', 'Christopher P...",David Fincher,7.375,7114,2011-12-14,158,https://image.tmdb.org/t/p/w500/vbLedKc1BUF4FO...
911,72545,Journey 2: The Mysterious Island,Sean Anderson partners with his mom's boyfrien...,"['Adventure', 'Action', 'Science Fiction']","['mission', 'giant lizard', 'missing person', ...","['Dwayne Johnson', 'Josh Hutcherson', 'Vanessa...",Brad Peyton,6.147,4311,2012-01-19,94,https://image.tmdb.org/t/p/w500/8WbZOiplh0xuVN...
1187,72545,Journey 2: The Mysterious Island,Sean Anderson partners with his mom's boyfrien...,"['Adventure', 'Action', 'Science Fiction']","['mission', 'giant lizard', 'missing person', ...","['Dwayne Johnson', 'Josh Hutcherson', 'Vanessa...",Brad Peyton,6.147,4311,2012-01-19,94,https://image.tmdb.org/t/p/w500/8WbZOiplh0xuVN...


In [36]:
movies.drop_duplicates(inplace=True)
movies.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies.drop_duplicates(inplace=True)


(4943, 12)

In [38]:
movies.duplicated().sum()

0

In [40]:
movies.iloc[0]['genres']

"['Family', 'Comedy', 'Adventure', 'Fantasy']"

In [42]:
type(movies.iloc[0]['genres'])

str

In [44]:
# convert to list, get the value (literal_eval)
import ast

# Make a clean copy to avoid SettingWithCopyWarning
movies = movies.copy()

# Define the columns to convert from string to list
columns_to_convert = ['genres', 'keywords', 'cast']

# Apply ast.literal_eval to each specified column
for col in columns_to_convert:
    movies[col] = movies[col].apply(ast.literal_eval)

In [46]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,director,vote_average,vote_count,release_date,runtime,poster_url
0,950387,A Minecraft Movie,Four misfits find themselves struggling with o...,"[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ...","[Jason Momoa, Jack Black, Sebastian Eugene Han...",Jared Hess,6.1,295,2025-03-31,101,https://image.tmdb.org/t/p/w500/yFHHfHcUgGAxzi...
1,1125899,Cleaner,When a group of radical activists take over an...,"[Action, Thriller]",[],"[Daisy Ridley, Clive Owen, Taz Skylar, Flavia ...",Martin Campbell,6.517,174,2025-02-19,96,https://image.tmdb.org/t/p/w500/mwzDApMZAGeYCE...
2,822119,Captain America: Brave New World,After meeting with newly elected U.S. Presiden...,"[Action, Thriller, Science Fiction]","[hero, superhero, revenge, aftercreditsstinger...","[Anthony Mackie, Harrison Ford, Danny Ramirez,...",Julius Onah,6.091,1232,2025-02-12,119,https://image.tmdb.org/t/p/w500/pzIddUEMWhWzfv...
3,1197306,A Working Man,Levon Cade left behind a decorated military ca...,"[Action, Crime, Thriller]","[based on novel or book, kidnapping, vigilante...","[Jason Statham, David Harbour, Michael Peña, J...",David Ayer,6.467,151,2025-03-26,116,https://image.tmdb.org/t/p/w500/6FRFIogh3zFnVW...
4,1252309,Ask Me What You Want,"After his father's death, Eric Zimmerman trave...","[Romance, Drama]","[spain, based on novel or book, woman director...","[Gabriela Andrada, Mario Ermito, Celia Freijei...",Lucía Alemany,5.681,119,2024-11-29,0,https://image.tmdb.org/t/p/w500/76qnVxU2rPdVvi...


In [48]:
movies.iloc[0]['keywords']

['friendship',
 'surrealism',
 'exploration',
 'portal',
 'miner',
 'friends',
 'survival',
 'zombie',
 'based on video game',
 'aftercreditsstinger',
 'duringcreditsstinger',
 'journey',
 'imagination',
 'teamwork',
 'fantasy',
 'embarrassed',
 'building']

In [50]:
movies.iloc[0]['overview']

"Four misfits find themselves struggling with ordinary problems when they are suddenly pulled through a mysterious portal into the Overworld: a bizarre, cubic wonderland that thrives on imagination. To get back home, they'll have to master this world while embarking on a magical quest with an unexpected, expert crafter, Steve."

In [52]:
# vectorize the text — and splitting it into words
movies['overview_vector'] = movies['overview'].apply(lambda x: x.lower().split())
movies.iloc[0]['overview_vector']

['four',
 'misfits',
 'find',
 'themselves',
 'struggling',
 'with',
 'ordinary',
 'problems',
 'when',
 'they',
 'are',
 'suddenly',
 'pulled',
 'through',
 'a',
 'mysterious',
 'portal',
 'into',
 'the',
 'overworld:',
 'a',
 'bizarre,',
 'cubic',
 'wonderland',
 'that',
 'thrives',
 'on',
 'imagination.',
 'to',
 'get',
 'back',
 'home,',
 "they'll",
 'have',
 'to',
 'master',
 'this',
 'world',
 'while',
 'embarking',
 'on',
 'a',
 'magical',
 'quest',
 'with',
 'an',
 'unexpected,',
 'expert',
 'crafter,',
 'steve.']

In [54]:
# vectorize the text — and splitting it into words
movies['title_vector'] = movies['title'].apply(lambda x: x.lower().split())
movies.iloc[0]['title_vector']

['a', 'minecraft', 'movie']

In [56]:
# names stay as one token instead of being split into meaningless pieces 
movies['cast_slug'] = movies['cast'].apply(lambda x: [i.replace(" ", "_") for i in x])
movies['director_slug'] = movies['director'].apply(lambda x: x.replace(" ", "_"))

In [58]:
movies['genres_slug'] = movies['genres'].apply(lambda x: [i.replace(" ", "_") for i in x])
movies['keywords_slug'] = movies['keywords'].apply(lambda x: [i.replace(" ", "_") for i in x])

In [60]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,director,vote_average,vote_count,release_date,runtime,poster_url,overview_vector,title_vector,cast_slug,director_slug,genres_slug,keywords_slug
0,950387,A Minecraft Movie,Four misfits find themselves struggling with o...,"[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ...","[Jason Momoa, Jack Black, Sebastian Eugene Han...",Jared Hess,6.1,295,2025-03-31,101,https://image.tmdb.org/t/p/w500/yFHHfHcUgGAxzi...,"[four, misfits, find, themselves, struggling, ...","[a, minecraft, movie]","[Jason_Momoa, Jack_Black, Sebastian_Eugene_Han...",Jared_Hess,"[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ..."
1,1125899,Cleaner,When a group of radical activists take over an...,"[Action, Thriller]",[],"[Daisy Ridley, Clive Owen, Taz Skylar, Flavia ...",Martin Campbell,6.517,174,2025-02-19,96,https://image.tmdb.org/t/p/w500/mwzDApMZAGeYCE...,"[when, a, group, of, radical, activists, take,...",[cleaner],"[Daisy_Ridley, Clive_Owen, Taz_Skylar, Flavia_...",Martin_Campbell,"[Action, Thriller]",[]
2,822119,Captain America: Brave New World,After meeting with newly elected U.S. Presiden...,"[Action, Thriller, Science Fiction]","[hero, superhero, revenge, aftercreditsstinger...","[Anthony Mackie, Harrison Ford, Danny Ramirez,...",Julius Onah,6.091,1232,2025-02-12,119,https://image.tmdb.org/t/p/w500/pzIddUEMWhWzfv...,"[after, meeting, with, newly, elected, u.s., p...","[captain, america:, brave, new, world]","[Anthony_Mackie, Harrison_Ford, Danny_Ramirez,...",Julius_Onah,"[Action, Thriller, Science_Fiction]","[hero, superhero, revenge, aftercreditsstinger..."
3,1197306,A Working Man,Levon Cade left behind a decorated military ca...,"[Action, Crime, Thriller]","[based on novel or book, kidnapping, vigilante...","[Jason Statham, David Harbour, Michael Peña, J...",David Ayer,6.467,151,2025-03-26,116,https://image.tmdb.org/t/p/w500/6FRFIogh3zFnVW...,"[levon, cade, left, behind, a, decorated, mili...","[a, working, man]","[Jason_Statham, David_Harbour, Michael_Peña, J...",David_Ayer,"[Action, Crime, Thriller]","[based_on_novel_or_book, kidnapping, vigilante..."
4,1252309,Ask Me What You Want,"After his father's death, Eric Zimmerman trave...","[Romance, Drama]","[spain, based on novel or book, woman director...","[Gabriela Andrada, Mario Ermito, Celia Freijei...",Lucía Alemany,5.681,119,2024-11-29,0,https://image.tmdb.org/t/p/w500/76qnVxU2rPdVvi...,"[after, his, father's, death,, eric, zimmerman...","[ask, me, what, you, want]","[Gabriela_Andrada, Mario_Ermito, Celia_Freijei...",Lucía_Alemany,"[Romance, Drama]","[spain, based_on_novel_or_book, woman_director..."


In [62]:
# combine all relevant textual info into one single string -> tag
def combine_tags(row):
    return ' '.join(row['title_vector']) + ' ' + \
           ' '.join(row['overview_vector']) + ' ' + \
           ' '.join(row['genres_slug']) + ' ' + \
           ' '.join(row['keywords_slug']) + ' ' + \
           ' '.join(row['cast_slug']) + ' ' + \
           row['director_slug']

movies['tag'] = movies.apply(combine_tags, axis=1)
movies['tag'] = movies['tag'].apply(lambda x: x.lower())

In [64]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,director,vote_average,vote_count,release_date,runtime,poster_url,overview_vector,title_vector,cast_slug,director_slug,genres_slug,keywords_slug,tag
0,950387,A Minecraft Movie,Four misfits find themselves struggling with o...,"[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ...","[Jason Momoa, Jack Black, Sebastian Eugene Han...",Jared Hess,6.1,295,2025-03-31,101,https://image.tmdb.org/t/p/w500/yFHHfHcUgGAxzi...,"[four, misfits, find, themselves, struggling, ...","[a, minecraft, movie]","[Jason_Momoa, Jack_Black, Sebastian_Eugene_Han...",Jared_Hess,"[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ...",a minecraft movie four misfits find themselves...
1,1125899,Cleaner,When a group of radical activists take over an...,"[Action, Thriller]",[],"[Daisy Ridley, Clive Owen, Taz Skylar, Flavia ...",Martin Campbell,6.517,174,2025-02-19,96,https://image.tmdb.org/t/p/w500/mwzDApMZAGeYCE...,"[when, a, group, of, radical, activists, take,...",[cleaner],"[Daisy_Ridley, Clive_Owen, Taz_Skylar, Flavia_...",Martin_Campbell,"[Action, Thriller]",[],cleaner when a group of radical activists take...
2,822119,Captain America: Brave New World,After meeting with newly elected U.S. Presiden...,"[Action, Thriller, Science Fiction]","[hero, superhero, revenge, aftercreditsstinger...","[Anthony Mackie, Harrison Ford, Danny Ramirez,...",Julius Onah,6.091,1232,2025-02-12,119,https://image.tmdb.org/t/p/w500/pzIddUEMWhWzfv...,"[after, meeting, with, newly, elected, u.s., p...","[captain, america:, brave, new, world]","[Anthony_Mackie, Harrison_Ford, Danny_Ramirez,...",Julius_Onah,"[Action, Thriller, Science_Fiction]","[hero, superhero, revenge, aftercreditsstinger...",captain america: brave new world after meeting...
3,1197306,A Working Man,Levon Cade left behind a decorated military ca...,"[Action, Crime, Thriller]","[based on novel or book, kidnapping, vigilante...","[Jason Statham, David Harbour, Michael Peña, J...",David Ayer,6.467,151,2025-03-26,116,https://image.tmdb.org/t/p/w500/6FRFIogh3zFnVW...,"[levon, cade, left, behind, a, decorated, mili...","[a, working, man]","[Jason_Statham, David_Harbour, Michael_Peña, J...",David_Ayer,"[Action, Crime, Thriller]","[based_on_novel_or_book, kidnapping, vigilante...",a working man levon cade left behind a decorat...
4,1252309,Ask Me What You Want,"After his father's death, Eric Zimmerman trave...","[Romance, Drama]","[spain, based on novel or book, woman director...","[Gabriela Andrada, Mario Ermito, Celia Freijei...",Lucía Alemany,5.681,119,2024-11-29,0,https://image.tmdb.org/t/p/w500/76qnVxU2rPdVvi...,"[after, his, father's, death,, eric, zimmerman...","[ask, me, what, you, want]","[Gabriela_Andrada, Mario_Ermito, Celia_Freijei...",Lucía_Alemany,"[Romance, Drama]","[spain, based_on_novel_or_book, woman_director...","ask me what you want after his father's death,..."


In [66]:
movies.iloc[0]['tag']

"a minecraft movie four misfits find themselves struggling with ordinary problems when they are suddenly pulled through a mysterious portal into the overworld: a bizarre, cubic wonderland that thrives on imagination. to get back home, they'll have to master this world while embarking on a magical quest with an unexpected, expert crafter, steve. family comedy adventure fantasy friendship surrealism exploration portal miner friends survival zombie based_on_video_game aftercreditsstinger duringcreditsstinger journey imagination teamwork fantasy embarrassed building jason_momoa jack_black sebastian_eugene_hansen emma_myers danielle_brooks jared_hess"

In [68]:
new_df = movies[['id','title','tag','overview','genres','keywords','cast','director','vote_average', 'vote_count','release_date','runtime','poster_url']]

In [70]:
new_df.head()

Unnamed: 0,id,title,tag,overview,genres,keywords,cast,director,vote_average,vote_count,release_date,runtime,poster_url
0,950387,A Minecraft Movie,a minecraft movie four misfits find themselves...,Four misfits find themselves struggling with o...,"[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ...","[Jason Momoa, Jack Black, Sebastian Eugene Han...",Jared Hess,6.1,295,2025-03-31,101,https://image.tmdb.org/t/p/w500/yFHHfHcUgGAxzi...
1,1125899,Cleaner,cleaner when a group of radical activists take...,When a group of radical activists take over an...,"[Action, Thriller]",[],"[Daisy Ridley, Clive Owen, Taz Skylar, Flavia ...",Martin Campbell,6.517,174,2025-02-19,96,https://image.tmdb.org/t/p/w500/mwzDApMZAGeYCE...
2,822119,Captain America: Brave New World,captain america: brave new world after meeting...,After meeting with newly elected U.S. Presiden...,"[Action, Thriller, Science Fiction]","[hero, superhero, revenge, aftercreditsstinger...","[Anthony Mackie, Harrison Ford, Danny Ramirez,...",Julius Onah,6.091,1232,2025-02-12,119,https://image.tmdb.org/t/p/w500/pzIddUEMWhWzfv...
3,1197306,A Working Man,a working man levon cade left behind a decorat...,Levon Cade left behind a decorated military ca...,"[Action, Crime, Thriller]","[based on novel or book, kidnapping, vigilante...","[Jason Statham, David Harbour, Michael Peña, J...",David Ayer,6.467,151,2025-03-26,116,https://image.tmdb.org/t/p/w500/6FRFIogh3zFnVW...
4,1252309,Ask Me What You Want,"ask me what you want after his father's death,...","After his father's death, Eric Zimmerman trave...","[Romance, Drama]","[spain, based on novel or book, woman director...","[Gabriela Andrada, Mario Ermito, Celia Freijei...",Lucía Alemany,5.681,119,2024-11-29,0,https://image.tmdb.org/t/p/w500/76qnVxU2rPdVvi...


In [72]:
new_df.iloc[0]['tag']

"a minecraft movie four misfits find themselves struggling with ordinary problems when they are suddenly pulled through a mysterious portal into the overworld: a bizarre, cubic wonderland that thrives on imagination. to get back home, they'll have to master this world while embarking on a magical quest with an unexpected, expert crafter, steve. family comedy adventure fantasy friendship surrealism exploration portal miner friends survival zombie based_on_video_game aftercreditsstinger duringcreditsstinger journey imagination teamwork fantasy embarrassed building jason_momoa jack_black sebastian_eugene_hansen emma_myers danielle_brooks jared_hess"

In [74]:
# Make sure release_date is in datetime format
new_df['release_date'] = pd.to_datetime(new_df['release_date'], errors='coerce')

# Sort by release_date in descending order (latest first)
new_df = new_df.sort_values('release_date', ascending=False)

# Drop duplicates by title, keeping the first (i.e. the latest one)
new_df = new_df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['release_date'] = pd.to_datetime(new_df['release_date'], errors='coerce')


In [76]:
new_df['title'].value_counts().loc[lambda x: x > 1]

Series([], Name: count, dtype: int64)

In [78]:
frozen_movies = new_df[new_df['title'].str.contains("Frozen", case=False, na=False)]
frozen_movies

Unnamed: 0,id,title,tag,overview,genres,keywords,cast,director,vote_average,vote_count,release_date,runtime,poster_url
316,967847,Ghostbusters: Frozen Empire,ghostbusters: frozen empire when the discovery...,When the discovery of an ancient artifact unle...,"[Fantasy, Adventure, Comedy]","[new york city, supernatural, ghostbuster, seq...","[Mckenna Grace, Paul Rudd, Carrie Coon, Finn W...",Gil Kenan,6.502,1759,2024-03-20,115,https://image.tmdb.org/t/p/w500/e1J2oNzSBdou01...
667,573171,Little Eggs: A Frozen Rescue,little eggs: a frozen rescue in the final huev...,"In the final Huevos adventure, Toto and his fa...","[Animation, Adventure, Comedy, Family]",[],"[Bruno Bichir, Carlos Espejel, Angélica Vale, ...",Gabriel Riva Palacio Alatriste,7.7,351,2022-12-14,91,https://image.tmdb.org/t/p/w500/8xCO3IarklLD4t...
1221,330457,Frozen II,"frozen ii elsa, anna, kristoff and olaf head f...","Elsa, Anna, Kristoff and Olaf head far into th...","[Family, Animation, Adventure, Comedy, Fantasy]","[princess, magic, kingdom, winter, queen, cast...","[Kristen Bell, Idina Menzel, Josh Gad, Jonatha...",Jennifer Lee,7.25,9866,2019-11-20,103,https://image.tmdb.org/t/p/w500/mINJaa34MtknCY...
1607,460793,Olaf's Frozen Adventure,olaf's frozen adventure olaf is on a mission t...,Olaf is on a mission to harness the best holid...,"[Animation, Family, Adventure, Comedy, Fantasy]","[holiday, cartoon, snowman, christmas, short f...","[Josh Gad, Kristen Bell, Idina Menzel, Jonatha...",Stevie Wermers-Skelton,6.46,1443,2017-10-27,22,https://image.tmdb.org/t/p/w500/As8WTtxXs9e3cB...
2044,326359,Frozen Fever,"frozen fever on anna's birthday, elsa and kris...","On Anna's birthday, Elsa and Kristoff are dete...","[Animation, Family, Adventure, Comedy]","[princess, sibling relationship, birthday, car...","[Kristen Bell, Idina Menzel, Jonathan Groff, J...",Jennifer Lee,6.8,1837,2015-03-09,8,https://image.tmdb.org/t/p/w500/mPrDJ7puYzPLG5...
2242,109445,Frozen,frozen young princess anna of arendelle dreams...,Young princess Anna of Arendelle dreams about ...,"[Animation, Family, Adventure, Fantasy]","[princess, magic, mistake in person, queen, ca...","[Kristen Bell, Idina Menzel, Jonathan Groff, J...",Chris Buck,7.246,16777,2013-11-20,102,https://image.tmdb.org/t/p/w500/mmWheq3cFI4tYr...
2305,199373,The Frozen Ground,the frozen ground an alaska state trooper part...,An Alaska State Trooper partners with a young ...,"[Thriller, Crime]","[prostitute, escape, hitman, winter, gun, kidn...","[Nicolas Cage, Vanessa Hudgens, John Cusack, R...",Scott Walker,6.287,1485,2013-07-11,105,https://image.tmdb.org/t/p/w500/hHDj1h3lJvYd9C...


-- Remove punctuation / symbols
-- Lemmatization (It reduces words to their base form (lemma), using context and grammar.) e.g running -> run, flies -> fly
-- Custom stopwords e.g "the", "and", "in", "of", "this", "is", "he", "she"

In [81]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    words = text.lower().split()
    
    # Keep words with letters, numbers, and underscores
    words = [w for w in words if re.match(r'^[a-z0-9_]+$', w)]
    
    # Remove stopwords (but don't remove names like jason_momoa)
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    
    return ' '.join(words)

new_df['tag'] = new_df['tag'].apply(preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rebeccawoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rebeccawoo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [85]:
new_df.iloc[0]['tag']

'minecraft movie four misfit find struggling ordinary problem suddenly pulled mysterious portal cubic wonderland thrives get back master world embarking magical quest expert family comedy adventure fantasy friendship surrealism exploration portal miner friend survival zombie based_on_video_game aftercreditsstinger duringcreditsstinger journey imagination teamwork fantasy embarrassed building jason_momoa jack_black sebastian_eugene_hansen emma_myers danielle_brooks jared_hess'

#### TfidfVectorizer - Term Frequency–Inverse Document Frequency

-- It gives less weight to very common words (like "the", "movie", etc.) and more weight to unique words that distinguish documents.

-- It's often better for recommender systems and search engines.

In [87]:
# Initialize the vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Transform the lemmatized tag column into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(new_df['tag'])

In [89]:
print(tfidf_matrix.shape)

(4794, 35292)


In [91]:
# 4794 movies, 35292 unique words

fit_transform() = learns the vocabulary + converts each movie's tag into a vector of numbers

Each vector = the importance (TF-IDF weight) of each word in that movie, compared to all other movies

The result: a matrix of shape (num_movies × num_words)

✅ Now each movie is a vector, ready for similarity comparison!

#### Cosine similarity

To compare two movies, we ask: How “close” are these two vectors, based on angle between them?
The closer the arrows point in the same direction, the more similar the movies.

In [96]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [98]:
cosine_sim

array([[1.        , 0.01147206, 0.02742951, ..., 0.00612656, 0.        ,
        0.01270456],
       [0.01147206, 1.        , 0.01081013, ..., 0.01996458, 0.        ,
        0.01347813],
       [0.02742951, 0.01081013, 1.        , ..., 0.        , 0.00243809,
        0.00950377],
       ...,
       [0.00612656, 0.01996458, 0.        , ..., 1.        , 0.00900965,
        0.02148369],
       [0.        , 0.        , 0.00243809, ..., 0.00900965, 1.        ,
        0.07232607],
       [0.01270456, 0.01347813, 0.00950377, ..., 0.02148369, 0.07232607,
        1.        ]])

In [100]:
cosine_sim.shape

(4794, 4794)

In [102]:
new_df[new_df['title'] == 'Cleaner'].index[0]

15

Finds the index of the movie

Gets cosine similarity scores between that movie and all others

Sorts them by similarity (highest first)

Prints the top 5 similar movies (excluding itself)

In [105]:
def recommend(movie):
    if movie not in new_df['title'].values:
        print(f"'{movie}' not found in the dataset. Please check the spelling or try another title.")
        return

    index = new_df[new_df['title'] == movie].index[0]
    distances = list(enumerate(cosine_sim[index]))
    distances = sorted(distances, key=lambda x: x[1], reverse=True)

    print(f"\n🎬 Top 5 movies similar to '{movie}':\n")
    for i in distances[1:6]:  # Skip the first one (itself)
        title = new_df.iloc[i[0]]['title']
        score = i[1]
        print(f"{title}  (similarity: {score:.2f})")

In [107]:
recommend('Captain America: Civil War')


🎬 Top 5 movies similar to 'Captain America: Civil War':

Captain America: The Winter Soldier  (similarity: 0.25)
Captain America: The First Avenger  (similarity: 0.23)
Spider-Man: Homecoming  (similarity: 0.19)
Avengers: Endgame  (similarity: 0.14)
Iron Man 2  (similarity: 0.14)


In [109]:
import pickle

with open('new_df.pkl', 'wb') as f:
    pickle.dump(new_df, f)

with open('cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

print("✅ saved successfully!")


✅ saved successfully!


In [118]:
movie_title = "Harry Potter and the Philosopher's Stone"
overview = new_df[new_df['title'] == movie_title]['overview']
print(overview)

3682    Harry Potter has lived under the stairs at his...
Name: overview, dtype: object


In [135]:
new_df.describe()

Unnamed: 0,id,vote_average,vote_count,release_date,runtime
count,4794.0,4794.0,4794.0,4794,4794.0
mean,292905.6,6.779259,3223.09053,2009-04-04 23:48:53.166457856,109.21214
min,11.0,2.9,100.0,1920-02-27 00:00:00,0.0
25%,9889.25,6.275,731.0,2002-12-12 00:00:00,95.0
50%,82265.0,6.8,1799.0,2012-10-25 00:00:00,107.0
75%,514033.8,7.332,3918.75,2019-12-30 06:00:00,121.0
max,1418522.0,9.414,37298.0,2025-03-31 00:00:00,316.0
std,355111.9,0.769387,4152.483151,,23.574552
