In [1]:
# 📚 Basic Libraries
import numpy as np 
import pandas as pd
import warnings
import os

# 📊 Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# 🤖 Machine Learning
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.sparse import hstack
from sklearn.metrics import silhouette_score

In [2]:
df= pd.read_csv("../data/all_movies_combined.csv")

In [3]:
df.head(5)

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,release_date,original_language,popularity,adult,runtime,budget,revenue,cast,director,keywords,poster_url
0,950387,A Minecraft Movie,Four misfits find themselves struggling with o...,"['Family', 'Comedy', 'Adventure', 'Fantasy']",6.1,295,2025-03-31,en,1022.7906,False,101,150000000,313453003,"['Jason Momoa', 'Jack Black', 'Sebastian Eugen...",Jared Hess,"['friendship', 'surrealism', 'exploration', 'p...",https://image.tmdb.org/t/p/w500/yFHHfHcUgGAxzi...
1,1125899,Cleaner,When a group of radical activists take over an...,"['Action', 'Thriller']",6.517,174,2025-02-19,en,343.3057,False,96,0,0,"['Daisy Ridley', 'Clive Owen', 'Taz Skylar', '...",Martin Campbell,[],https://image.tmdb.org/t/p/w500/mwzDApMZAGeYCE...
2,822119,Captain America: Brave New World,After meeting with newly elected U.S. Presiden...,"['Action', 'Thriller', 'Science Fiction']",6.091,1232,2025-02-12,en,339.0809,False,119,180000000,411409721,"['Anthony Mackie', 'Harrison Ford', 'Danny Ram...",Julius Onah,"['hero', 'superhero', 'revenge', 'aftercredits...",https://image.tmdb.org/t/p/w500/pzIddUEMWhWzfv...
3,1197306,A Working Man,Levon Cade left behind a decorated military ca...,"['Action', 'Crime', 'Thriller']",6.467,151,2025-03-26,en,293.7838,False,116,40000000,44417000,"['Jason Statham', 'David Harbour', 'Michael Pe...",David Ayer,"['based on novel or book', 'kidnapping', 'vigi...",https://image.tmdb.org/t/p/w500/6FRFIogh3zFnVW...
4,1252309,Ask Me What You Want,"After his father's death, Eric Zimmerman trave...","['Romance', 'Drama']",5.681,119,2024-11-29,es,286.593,False,0,0,0,"['Gabriela Andrada', 'Mario Ermito', 'Celia Fr...",Lucía Alemany,"['spain', 'based on novel or book', 'woman dir...",https://image.tmdb.org/t/p/w500/76qnVxU2rPdVvi...


In [4]:
df.shape

(5000, 17)

In [5]:
df['original_language'].value_counts()

original_language
en    4156
ja     234
fr     125
ko      90
es      83
zh      51
it      45
cn      39
de      35
hi      27
ru      20
sv      13
pt      10
no      10
th       9
da       8
pl       7
te       6
id       6
ta       3
tr       3
uk       3
nl       2
kn       2
ar       2
ca       1
ga       1
mn       1
sr       1
el       1
lt       1
bn       1
tl       1
fi       1
lv       1
fa       1
Name: count, dtype: int64

In [6]:
df['title'].value_counts()

title
The Killer                     4
Pinocchio                      3
Robin Hood                     3
Prey                           3
Close                          3
                              ..
The Sword in the Stone         1
Forever My Girl                1
Resident Evil: Death Island    1
Samaritan                      1
Unfriended: Dark Web           1
Name: count, Length: 4797, dtype: int64

In [7]:
df.columns

Index(['id', 'title', 'overview', 'genres', 'vote_average', 'vote_count',
       'release_date', 'original_language', 'popularity', 'adult', 'runtime',
       'budget', 'revenue', 'cast', 'director', 'keywords', 'poster_url'],
      dtype='object')

In [8]:
# selecting relevant columns for the recommender
movies = df[['id','title','overview','genres','keywords','cast','director']]

In [9]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,director
0,950387,A Minecraft Movie,Four misfits find themselves struggling with o...,"['Family', 'Comedy', 'Adventure', 'Fantasy']","['friendship', 'surrealism', 'exploration', 'p...","['Jason Momoa', 'Jack Black', 'Sebastian Eugen...",Jared Hess
1,1125899,Cleaner,When a group of radical activists take over an...,"['Action', 'Thriller']",[],"['Daisy Ridley', 'Clive Owen', 'Taz Skylar', '...",Martin Campbell
2,822119,Captain America: Brave New World,After meeting with newly elected U.S. Presiden...,"['Action', 'Thriller', 'Science Fiction']","['hero', 'superhero', 'revenge', 'aftercredits...","['Anthony Mackie', 'Harrison Ford', 'Danny Ram...",Julius Onah
3,1197306,A Working Man,Levon Cade left behind a decorated military ca...,"['Action', 'Crime', 'Thriller']","['based on novel or book', 'kidnapping', 'vigi...","['Jason Statham', 'David Harbour', 'Michael Pe...",David Ayer
4,1252309,Ask Me What You Want,"After his father's death, Eric Zimmerman trave...","['Romance', 'Drama']","['spain', 'based on novel or book', 'woman dir...","['Gabriela Andrada', 'Mario Ermito', 'Celia Fr...",Lucía Alemany


In [10]:
# preprocess the dataset
movies.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
director    3
dtype: int64

In [11]:
movies.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies.dropna(inplace=True)


In [12]:
movies.shape

(4997, 7)

In [13]:
movies.duplicated().sum()

54

In [14]:
dupes = movies[movies.duplicated(keep=False)]
print(f"{len(dupes)} duplicated rows found:\n")
dupes.sort_values(by=movies.columns.tolist()).head(10)

108 duplicated rows found:



Unnamed: 0,id,title,overview,genres,keywords,cast,director
99,27205,Inception,"Cobb, a skilled thief who commits corporate es...","['Action', 'Science Fiction', 'Adventure']","['rescue', 'mission', 'dreams', 'airplane', 'p...","['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...",Christopher Nolan
102,27205,Inception,"Cobb, a skilled thief who commits corporate es...","['Action', 'Science Fiction', 'Adventure']","['rescue', 'mission', 'dreams', 'airplane', 'p...","['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...",Christopher Nolan
976,44833,Battleship,"When mankind beams a radio signal into space, ...","['Thriller', 'Action', 'Adventure', 'Science F...","['fight', 'u.s. navy', 'mind reading', 'hong k...","['Taylor Kitsch', 'Alexander Skarsgård', 'Riha...",Peter Berg
1200,44833,Battleship,"When mankind beams a radio signal into space, ...","['Thriller', 'Action', 'Adventure', 'Science F...","['fight', 'u.s. navy', 'mind reading', 'hong k...","['Taylor Kitsch', 'Alexander Skarsgård', 'Riha...",Peter Berg
975,62764,Mirror Mirror,"After she spends all her money, an evil enchan...","['Adventure', 'Comedy', 'Family', 'Fantasy']","['fairy tale', 'villainess', 'attempted murder...","['Lily Collins', 'Julia Roberts', 'Armie Hamme...",Tarsem Singh
1078,62764,Mirror Mirror,"After she spends all her money, an evil enchan...","['Adventure', 'Comedy', 'Family', 'Fantasy']","['fairy tale', 'villainess', 'attempted murder...","['Lily Collins', 'Julia Roberts', 'Armie Hamme...",Tarsem Singh
944,65754,The Girl with the Dragon Tattoo,Disgraced journalist Mikael Blomkvist investig...,"['Thriller', 'Crime', 'Mystery']","['journalist', 'island', 'rape', 'hacker', 'ba...","['Daniel Craig', 'Rooney Mara', 'Christopher P...",David Fincher
1114,65754,The Girl with the Dragon Tattoo,Disgraced journalist Mikael Blomkvist investig...,"['Thriller', 'Crime', 'Mystery']","['journalist', 'island', 'rape', 'hacker', 'ba...","['Daniel Craig', 'Rooney Mara', 'Christopher P...",David Fincher
911,72545,Journey 2: The Mysterious Island,Sean Anderson partners with his mom's boyfrien...,"['Adventure', 'Action', 'Science Fiction']","['mission', 'giant lizard', 'missing person', ...","['Dwayne Johnson', 'Josh Hutcherson', 'Vanessa...",Brad Peyton
1187,72545,Journey 2: The Mysterious Island,Sean Anderson partners with his mom's boyfrien...,"['Adventure', 'Action', 'Science Fiction']","['mission', 'giant lizard', 'missing person', ...","['Dwayne Johnson', 'Josh Hutcherson', 'Vanessa...",Brad Peyton


In [15]:
movies.drop_duplicates(inplace=True)
movies.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies.drop_duplicates(inplace=True)


(4943, 7)

In [16]:
movies.duplicated().sum()

0

In [17]:
movies.iloc[0]['genres']

"['Family', 'Comedy', 'Adventure', 'Fantasy']"

In [18]:
type(movies.iloc[0]['genres'])

str

In [19]:
# convert to list, get the value (literal_eval)
import ast

# Make a clean copy to avoid SettingWithCopyWarning
movies = movies.copy()

# Define the columns to convert from string to list
columns_to_convert = ['genres', 'keywords', 'cast']

# Apply ast.literal_eval to each specified column
for col in columns_to_convert:
    movies[col] = movies[col].apply(ast.literal_eval)

In [20]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,director
0,950387,A Minecraft Movie,Four misfits find themselves struggling with o...,"[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ...","[Jason Momoa, Jack Black, Sebastian Eugene Han...",Jared Hess
1,1125899,Cleaner,When a group of radical activists take over an...,"[Action, Thriller]",[],"[Daisy Ridley, Clive Owen, Taz Skylar, Flavia ...",Martin Campbell
2,822119,Captain America: Brave New World,After meeting with newly elected U.S. Presiden...,"[Action, Thriller, Science Fiction]","[hero, superhero, revenge, aftercreditsstinger...","[Anthony Mackie, Harrison Ford, Danny Ramirez,...",Julius Onah
3,1197306,A Working Man,Levon Cade left behind a decorated military ca...,"[Action, Crime, Thriller]","[based on novel or book, kidnapping, vigilante...","[Jason Statham, David Harbour, Michael Peña, J...",David Ayer
4,1252309,Ask Me What You Want,"After his father's death, Eric Zimmerman trave...","[Romance, Drama]","[spain, based on novel or book, woman director...","[Gabriela Andrada, Mario Ermito, Celia Freijei...",Lucía Alemany


In [44]:
movies.iloc[0]['keywords']

['friendship',
 'surrealism',
 'exploration',
 'portal',
 'miner',
 'friends',
 'survival',
 'zombie',
 'based on video game',
 'aftercreditsstinger',
 'duringcreditsstinger',
 'journey',
 'imagination',
 'teamwork',
 'fantasy',
 'embarrassed',
 'building']

In [46]:
movies.iloc[0]['overview']

"Four misfits find themselves struggling with ordinary problems when they are suddenly pulled through a mysterious portal into the Overworld: a bizarre, cubic wonderland that thrives on imagination. To get back home, they'll have to master this world while embarking on a magical quest with an unexpected, expert crafter, Steve."

In [48]:
# vectorize the text — and splitting it into words
movies['overview'] = movies['overview'].apply(lambda x: x.lower().split())
movies.iloc[0]['overview']

['four',
 'misfits',
 'find',
 'themselves',
 'struggling',
 'with',
 'ordinary',
 'problems',
 'when',
 'they',
 'are',
 'suddenly',
 'pulled',
 'through',
 'a',
 'mysterious',
 'portal',
 'into',
 'the',
 'overworld:',
 'a',
 'bizarre,',
 'cubic',
 'wonderland',
 'that',
 'thrives',
 'on',
 'imagination.',
 'to',
 'get',
 'back',
 'home,',
 "they'll",
 'have',
 'to',
 'master',
 'this',
 'world',
 'while',
 'embarking',
 'on',
 'a',
 'magical',
 'quest',
 'with',
 'an',
 'unexpected,',
 'expert',
 'crafter,',
 'steve.']

In [50]:
# names stay as one token instead of being split into meaningless pieces 
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "_") for i in x])
movies['director'] = movies['director'].apply(lambda x: x.replace(" ", "_"))

In [54]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "_") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "_") for i in x])

In [56]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,director
0,950387,A Minecraft Movie,"[four, misfits, find, themselves, struggling, ...","[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ...","[Jason_Momoa, Jack_Black, Sebastian_Eugene_Han...",Jared_Hess
1,1125899,Cleaner,"[when, a, group, of, radical, activists, take,...","[Action, Thriller]",[],"[Daisy_Ridley, Clive_Owen, Taz_Skylar, Flavia_...",Martin_Campbell
2,822119,Captain America: Brave New World,"[after, meeting, with, newly, elected, u.s., p...","[Action, Thriller, Science_Fiction]","[hero, superhero, revenge, aftercreditsstinger...","[Anthony_Mackie, Harrison_Ford, Danny_Ramirez,...",Julius_Onah
3,1197306,A Working Man,"[levon, cade, left, behind, a, decorated, mili...","[Action, Crime, Thriller]","[based_on_novel_or_book, kidnapping, vigilante...","[Jason_Statham, David_Harbour, Michael_Peña, J...",David_Ayer
4,1252309,Ask Me What You Want,"[after, his, father's, death,, eric, zimmerman...","[Romance, Drama]","[spain, based_on_novel_or_book, woman_director...","[Gabriela_Andrada, Mario_Ermito, Celia_Freijei...",Lucía_Alemany


In [58]:
# combine all relevant textual info into one single string -> tag
def combine_tags(row):
    return ' '.join(row['overview']) + ' ' + \
           ' '.join(row['genres']) + ' ' + \
           ' '.join(row['keywords']) + ' ' + \
           ' '.join(row['cast']) + ' ' + \
           row['director']

movies['tag'] = movies.apply(combine_tags, axis=1)
movies['tag'] = movies['tag'].apply(lambda x: x.lower())

In [60]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,director,tag
0,950387,A Minecraft Movie,"[four, misfits, find, themselves, struggling, ...","[Family, Comedy, Adventure, Fantasy]","[friendship, surrealism, exploration, portal, ...","[Jason_Momoa, Jack_Black, Sebastian_Eugene_Han...",Jared_Hess,four misfits find themselves struggling with o...
1,1125899,Cleaner,"[when, a, group, of, radical, activists, take,...","[Action, Thriller]",[],"[Daisy_Ridley, Clive_Owen, Taz_Skylar, Flavia_...",Martin_Campbell,when a group of radical activists take over an...
2,822119,Captain America: Brave New World,"[after, meeting, with, newly, elected, u.s., p...","[Action, Thriller, Science_Fiction]","[hero, superhero, revenge, aftercreditsstinger...","[Anthony_Mackie, Harrison_Ford, Danny_Ramirez,...",Julius_Onah,after meeting with newly elected u.s. presiden...
3,1197306,A Working Man,"[levon, cade, left, behind, a, decorated, mili...","[Action, Crime, Thriller]","[based_on_novel_or_book, kidnapping, vigilante...","[Jason_Statham, David_Harbour, Michael_Peña, J...",David_Ayer,levon cade left behind a decorated military ca...
4,1252309,Ask Me What You Want,"[after, his, father's, death,, eric, zimmerman...","[Romance, Drama]","[spain, based_on_novel_or_book, woman_director...","[Gabriela_Andrada, Mario_Ermito, Celia_Freijei...",Lucía_Alemany,"after his father's death, eric zimmerman trave..."


In [62]:
movies.iloc[0]['tag']

"four misfits find themselves struggling with ordinary problems when they are suddenly pulled through a mysterious portal into the overworld: a bizarre, cubic wonderland that thrives on imagination. to get back home, they'll have to master this world while embarking on a magical quest with an unexpected, expert crafter, steve. family comedy adventure fantasy friendship surrealism exploration portal miner friends survival zombie based_on_video_game aftercreditsstinger duringcreditsstinger journey imagination teamwork fantasy embarrassed building jason_momoa jack_black sebastian_eugene_hansen emma_myers danielle_brooks jared_hess"

In [64]:
new_df = movies[['id','title','tag']]

In [66]:
new_df.head()

Unnamed: 0,id,title,tag
0,950387,A Minecraft Movie,four misfits find themselves struggling with o...
1,1125899,Cleaner,when a group of radical activists take over an...
2,822119,Captain America: Brave New World,after meeting with newly elected u.s. presiden...
3,1197306,A Working Man,levon cade left behind a decorated military ca...
4,1252309,Ask Me What You Want,"after his father's death, eric zimmerman trave..."
