In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [5]:
df = movies.merge(credits, on="title")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [6]:
df = df[["movie_id", "genres", "keywords", "title", "overview", "cast", "crew"]] # only those cols which will imapct to recommended on the basis ofcontent
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   genres    4809 non-null   object
 2   keywords  4809 non-null   object
 3   title     4809 non-null   object
 4   overview  4806 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


# Preprocessing

In [7]:
df[df["overview"].isna()]

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew
2658,370980,"[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 717, ""name"": ""pope""}, {""id"": 5565, ""na...",Chiamatemi Francesco - Il Papa della gente,,"[{""cast_id"": 5, ""character"": ""Jorge Mario Berg...","[{""credit_id"": ""5660019ac3a36875f100252b"", ""de..."
4145,459488,"[{""id"": 99, ""name"": ""Documentary""}]","[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,...","To Be Frank, Sinatra at 100",,"[{""cast_id"": 0, ""character"": ""Narrator"", ""cred...","[{""credit_id"": ""592b25e4c3a368783e065a2f"", ""de..."
4437,292539,"[{""id"": 99, ""name"": ""Documentary""}]",[],Food Chains,,[],"[{""credit_id"": ""5470c3b1c3a368085e000abd"", ""de..."


In [8]:
df.dropna(inplace=True)

In [9]:
df.isna().sum()

movie_id    0
genres      0
keywords    0
title       0
overview    0
cast        0
crew        0
dtype: int64

In [10]:
df.duplicated().sum()

0

In [11]:
def convert(obj):
    L= []
    for i in ast.literal_eval(obj):
        L.append(i["name"])
    return L


In [12]:
convert(df["genres"][0])

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [13]:
df["genres"] = df["genres"].apply(convert)

In [14]:
df.sample(3)

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew
3735,34069,"[Thriller, Mystery, Science Fiction]","[{""id"": 3388, ""name"": ""space colony""}, {""id"": ...",Cargo,The story of CARGO takes place on rusty space-...,"[{""cast_id"": 4, ""character"": ""Samuel Decker"", ...","[{""credit_id"": ""52fe454f9251416c9102e051"", ""de..."
4168,49020,"[Drama, Comedy, Romance]","[{""id"": 966, ""name"": ""beach""}, {""id"": 1209, ""n...",Submarine,"15-year-old deep-thinking Welsh schoolboy, Oli...","[{""cast_id"": 6, ""character"": ""Oliver Tate"", ""c...","[{""credit_id"": ""52fe4780c3a36847f8139637"", ""de..."
498,2023,"[Western, Adventure]","[{""id"": 643, ""name"": ""horse race""}, {""id"": 267...",Hidalgo,"Set in 1890, this is the story of a Pony Expre...","[{""cast_id"": 11, ""character"": ""Frank Hopkins"",...","[{""credit_id"": ""52fe432dc3a36847f80403ef"", ""de..."


In [15]:
df["keywords"] = df["keywords"].apply(convert)

In [16]:
def convert3(obj):
    counter = 0
    L= []
    for i in ast.literal_eval(obj):
        if counter !=3:
            L.append(i["name"])
            counter +=1
        else:
            break
    return L


In [17]:
df["cast"] = df["cast"].apply(convert3)

In [18]:
def fetch_director(obj):
    directors = []
    for i in ast.literal_eval(obj):
        if i["job"] == "Director":
            directors.append(i["name"])
    return directors

In [19]:
df["crew"] = df["crew"].apply(fetch_director)

In [20]:
df.head()

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [21]:
df["overview"]  = df["overview"].apply(lambda x:x.split())

In [22]:
df.head()

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [23]:
# Change "Sam Worthington" -> "SamWorthington"
# Because Sam and Worthington are one single word, they will be 2 tags but both will be pointing to the same entity, now if therese another name called
# Sam Stones, the model will be confused is is sam worhington or sam stones, which to recommend which is why we remved space and create it as one entity

df["keywords"] = df["keywords"].apply(lambda x:[i.replace(" ", "") for i in x])
df["cast"] = df["cast"].apply(lambda x:[i.replace(" ", "") for i in x])
df["genres"] = df["genres"].apply(lambda x:[i.replace(" ", "") for i in x])
df["crew"] = df["crew"].apply(lambda x:[i.replace(" ", "") for i in x])

In [24]:
df["tags"] = df["crew"] + df["genres"] + df["cast"] + df["overview"] + df["keywords"]

In [25]:
df = df[["movie_id", "title", "tags"]]
df.sample()

Unnamed: 0,movie_id,title,tags
1145,10610,The Medallion,"[GordonChan, Thriller, Fantasy, Action, Comedy..."


In [26]:
df["tags"] = df["tags"].apply(lambda x:" ".join(x))
df["tags"] = df["tags"].str.lower()

In [27]:
df["tags"][0]

'jamescameron action adventure fantasy sciencefiction samworthington zoesaldana sigourneyweaver in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d'

# Vectorization

In [28]:
cv = CountVectorizer(max_features=5000, stop_words="english")
vectors = cv.fit_transform(df["tags"]).toarray()

In [29]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [30]:
cv.get_feature_names_out()[50:100]

array(['abducted', 'abigailbreslin', 'abilities', 'ability', 'able',
       'aboard', 'abuse', 'abusive', 'academic', 'academy', 'accept',
       'accepted', 'accepts', 'access', 'accident', 'accidental',
       'accidentally', 'accompanied', 'accomplish', 'account',
       'accountant', 'accused', 'ace', 'achieve', 'act', 'acting',
       'action', 'actionhero', 'actions', 'activist', 'activities',
       'activity', 'actor', 'actors', 'actress', 'acts', 'actual',
       'actually', 'adam', 'adams', 'adamsandler', 'adamshankman',
       'adaptation', 'adapted', 'addict', 'addicted', 'addiction',
       'adolescence', 'adopt', 'adopted'], dtype=object)

## Steming

In [31]:
ps = PorterStemmer()

def steam(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))
    return  " ".join(y)


df["tags"] = df["tags"].apply(steam)
df["tags"][0]

'jamescameron action adventur fantasi sciencefict samworthington zoesaldana sigourneyweav in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d'

In [32]:
df["tags"] = df["tags"].apply(steam)

cv = CountVectorizer(max_features=5000, stop_words="english")
vectors = cv.fit_transform(df["tags"]).toarray()

print(cv.get_feature_names_out()[50:100])

['500' '60' '70' '80' 'aaron' 'aaroneckhart' 'abandon' 'abduct'
 'abigailbreslin' 'abil' 'abl' 'aboard' 'abov' 'abu' 'academ' 'academi'
 'accept' 'access' 'accid' 'accident' 'acclaim' 'accompani' 'accomplish'
 'account' 'accu' 'ace' 'achiev' 'acquaint' 'act' 'action' 'actionhero'
 'activ' 'activist' 'activities' 'actor' 'actress' 'actual' 'ad' 'adam'
 'adamsandl' 'adamshankman' 'adapt' 'add' 'addict' 'adjust' 'admir'
 'admit' 'adolesc' 'adopt' 'ador']


### Why Euclidean Distance Isn’t a Good Measure in High-Dimensional Text Data

1. **Euclidean distance loses meaning in high dimensions**  
   - When you go to thousands of dimensions (like 5K features from bag-of-words), distances between points tend to become very similar.  
   - In low dimensions, close points are clearly distinguishable from far ones.  
   - In high dimensions, the ratio of the nearest distance to the farthest distance → **tends to 1**.  
   - (So the "nearest neighbor" is not much nearer than the "farthest neighbor".)  
   - This makes Euclidean distance **less discriminative**.  

2. **Sparsity problem**  
   - With text data (like your `CountVectorizer`), most vectors are **sparse** (lots of zeros).  
   - Two documents may share only a few words.  
   - Euclidean distance is heavily influenced by the **number of zero entries** and **absolute word counts**, not by the **relative overlap** of terms.  
   - That means two documents with slightly different lengths (word counts) may appear far apart, even if they talk about the same thing.  

3. **Scale sensitivity**  
   - Euclidean distance cares about **magnitude**.  
   - Longer documents (more words) will automatically have larger distances compared to shorter ones, even if they use the same vocabulary.  
   - That’s why it’s **not ideal for text data**.  


In [33]:
cosine_similarity(vectors)

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [34]:
cosine_similarity(vectors).shape

(4806, 4806)

In [35]:
similarity = cosine_similarity(vectors)
similarity[0] # distance of first movie with each movie

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [36]:
# i need to recommend top5 movies for that i need to sort, now while sorting the index will be shuffled i wont be able to tell whats the index of the 
# top 5 most closed movie which is why enumerate is used to caputre the index between the similarity of one movie with other
list(enumerate(similarity[0]))[:10]

[(0, 1.0000000000000002),
 (1, 0.08346223261119858),
 (2, 0.08603090020146065),
 (3, 0.0734718358370645),
 (4, 0.1892994097121204),
 (5, 0.10838874619051501),
 (6, 0.03993615319154359),
 (7, 0.14673479641335554),
 (8, 0.05923488777590923),
 (9, 0.0967301666813349)]

In [37]:
# so now we know which movies are the most similar. THe most similar movie to Avatar is at index 1214 with score 0.28
sorted(list(enumerate(similarity[0])), reverse=True, key= lambda x:x[1])[:5]

[(0, 1.0000000000000002),
 (1214, 0.28676966733820225),
 (2405, 0.26901379342448517),
 (3728, 0.2605130246476754),
 (507, 0.255608593705383)]

In [38]:
# i dont nedd the frist as its always 1, similarity with itself is always 1
sorted(list(enumerate(similarity[0])), reverse=True, key= lambda x:x[1])[1:6]

[(1214, 0.28676966733820225),
 (2405, 0.26901379342448517),
 (3728, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [39]:
def recommend(movie):
    movie_index = df[df["title"] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key= lambda x:x[1])[1:11]

    for i in movies_list:
        print(df.iloc[i[0]]["title"])


In [40]:
recommend("Inception")

12 Rounds
Abduction
RED
Krrish
The Animal
Timecop
Copying Beethoven
Big Trouble in Little China
The Truman Show
Commando


In [41]:
import pickle

In [42]:
pickle.dump(df,open("movies.pkl", "wb"))

In [48]:
pickle.dump(similarity,open("similarity.pkl", "wb"))

# with TF-IDF

In [39]:
# Steming
ps = PorterStemmer()

def steam(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))
    return  " ".join(y)


df["tags"] = df["tags"].apply(steam)
df["tags"][0]

'jamescameron action adventur fantasi sciencefict samworthington zoesaldana sigourneyweav in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d'

In [40]:
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
vectors = tfidf.fit_transform(df["tags"]).toarray()

In [41]:
vectors[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [42]:
df["tags"] = df["tags"].apply(steam)

tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
vectors = tfidf.fit_transform(df["tags"]).toarray()

print(tfidf.get_feature_names_out()[50:100])

['500' '60' '70' '80' 'aaron' 'aaroneckhart' 'abandon' 'abduct'
 'abigailbreslin' 'abil' 'abl' 'aboard' 'abov' 'abu' 'academ' 'academi'
 'accept' 'access' 'accid' 'accident' 'acclaim' 'accompani' 'accomplish'
 'account' 'accu' 'ace' 'achiev' 'acquaint' 'act' 'action' 'actionhero'
 'activ' 'activist' 'activities' 'actor' 'actress' 'actual' 'ad' 'adam'
 'adamsandl' 'adamshankman' 'adapt' 'add' 'addict' 'adjust' 'admir'
 'admit' 'adolesc' 'adopt' 'ador']


In [43]:
cosine_similarity(vectors)

array([[1.        , 0.02202856, 0.03038566, ..., 0.02320005, 0.        ,
        0.        ],
       [0.02202856, 1.        , 0.01291308, ..., 0.01701124, 0.        ,
        0.00644367],
       [0.03038566, 0.01291308, 1.        , ..., 0.0162864 , 0.        ,
        0.        ],
       ...,
       [0.02320005, 0.01701124, 0.0162864 , ..., 1.        , 0.01861174,
        0.02858086],
       [0.        , 0.        , 0.        , ..., 0.01861174, 1.        ,
        0.01975975],
       [0.        , 0.00644367, 0.        , ..., 0.02858086, 0.01975975,
        1.        ]])

In [44]:
similarity = cosine_similarity(vectors)
similarity[0]

array([1.        , 0.02202856, 0.03038566, ..., 0.02320005, 0.        ,
       0.        ])

In [45]:
similarity[0]

array([1.        , 0.02202856, 0.03038566, ..., 0.02320005, 0.        ,
       0.        ])

In [46]:
def recommend(movie):
    movie_index = df[df["title"] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key= lambda x:x[1])[1:11]

    for i in movies_list:
        print(df.iloc[i[0]]["title"])


In [47]:
recommend("Batman Begins")

The Dark Knight
The Dark Knight Rises
Batman
Batman v Superman: Dawn of Justice
Batman Returns
Batman
Batman & Robin
Batman Forever
Defendor
Batman: The Dark Knight Returns, Part 2
