In [None]:
# IMPORTS
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import ast
from google.colab import drive

In [None]:
# Mount Google Drive
drive.mount('/content/drive')
nltk.download('stopwords')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Define paths
MOVIE_DS_PATH = "/content/drive/MyDrive/TMDB_MOVIE_DS/tmdb_5000_movies.csv"
CREDITS_DS_PATH = "/content/drive/MyDrive/TMDB_MOVIE_DS/tmdb_5000_credits.csv"

# DATASET LINK : https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata/

# Load data
movie_data = pd.read_csv(MOVIE_DS_PATH)
creds_data = pd.read_csv(CREDITS_DS_PATH)

In [None]:
# movie_data.head() # this shall give the entire dataframe struct
movie_data.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [None]:
creds_data.head(1)
# creds_data.head(1)['cast'].values

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
# Now we can merge the dataframes based on id or title. (let's use title here)
print("Before merge :", movie_data.shape)
movie_data.rename(columns={'id':'movie_id'}, inplace=True)
movie_data = movie_data.merge(creds_data, on='movie_id')
print("After merge :", movie_data.shape)

Before merge : (4803, 20)
After merge : (4803, 23)


In [None]:
columns_to_keep = ['genres', 'movie_id', 'keywords', 'title_x', 'overview', 'vote_average', 'cast']
# these will contribute in generating atleast some similarities between the movies, the rest are almost irrelevant
movie_data = movie_data[columns_to_keep]
movie_data.rename(columns={'title_x':'title'}, inplace=True)
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        4803 non-null   object 
 1   movie_id      4803 non-null   int64  
 2   keywords      4803 non-null   object 
 3   title         4803 non-null   object 
 4   overview      4800 non-null   object 
 5   vote_average  4803 non-null   float64
 6   cast          4803 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 262.8+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data.rename(columns={'title_x':'title'}, inplace=True)


In [None]:
# our new dataframe shall contain the columns : movie_id, title, tags, score
# we need to somehow efficiently merge overview, genres, keywords and cast (for cast, join only the top 3-5 casts)

# let's convert the vote_average to score
movie_data.rename(columns={'vote_average':'score'}, inplace=True)

# but before that we need to remove any missing data, and remove duplicate data
print(movie_data.isnull().sum()) # check missing data (check which columns has missing data)
movie_data.dropna(inplace=True) # remove missing or null data

# now lets remove the duplicate datas
movie_data.duplicated().sum() # no duplicates present so we are good to go !

genres      0
movie_id    0
keywords    0
title       0
overview    3
score       0
cast        0
dtype: int64


0

In [None]:
# Finally we now want to bring these columns to correct format. (let's start with the genres)
print(movie_data.iloc[0].genres) # this is in some weird format, we need only the 'name' of the genre

def genKey_converter(genres):
  obj = ast.literal_eval(genres) # the genres is in string format, so convert it to object first
  genre_lst = []
  for _dict in obj:
    genre_lst.append(_dict['name'])
  return genre_lst

genKey_converter(movie_data.iloc[0].genres) # checking the function
# note that the same function can be done for the keywords column as well. (check it's format)

# So, lets convert the genres and the keywords column
movie_data['genres'] = movie_data['genres'].apply(genKey_converter)
movie_data['keywords'] = movie_data['keywords'].apply(genKey_converter)

movie_data.head()

[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]


Unnamed: 0,genres,movie_id,keywords,title,overview,score,cast
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...",7.2,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""..."
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",6.9,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa..."
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,6.3,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr..."
3,"[Action, Crime, Drama, Thriller]",49026,"[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,7.6,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba..."
4,"[Action, Adventure, Science Fiction]",49529,"[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...",6.1,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c..."


In [None]:
# Now, from cast, we need to takeout the top 3 columns, ie, the 1st 3 dictionaries
print(movie_data['cast'][0])

def cast_converter(casts):
  obj = ast.literal_eval(casts)
  cast_lst = []
  for _dict in obj[:3]:
    cast_lst.append(_dict['name'])
  return cast_lst

cast_converter(movie_data['cast'][0]) # It works !
# So, let's apply the fuction to our casts column
movie_data['cast'] = movie_data['cast'].apply(cast_converter)

movie_data.head()

[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 2

Unnamed: 0,genres,movie_id,keywords,title,overview,score,cast
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...",7.2,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]"
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",6.9,"[Johnny Depp, Orlando Bloom, Keira Knightley]"
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,6.3,"[Daniel Craig, Christoph Waltz, Léa Seydoux]"
3,"[Action, Crime, Drama, Thriller]",49026,"[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,7.6,"[Christian Bale, Michael Caine, Gary Oldman]"
4,"[Action, Adventure, Science Fiction]",49529,"[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...",6.1,"[Taylor Kitsch, Lynn Collins, Samantha Morton]"


In [None]:
# Now the overview column is a string, so we shall try to convert it to a list aswell.
# The best thing is to remove the stopwords like 'a', 'an', 'the', and take the important aspects only

# But before that, let's store the initial overview as a summary of the movie
movie_data['summary'] = movie_data['overview']

def remove_stop_words(string):
  stop_words = set(stopwords.words('english'))
  words = string.split()
  filtered_words = [word for word in words if word.lower() not in stop_words]
  return filtered_words

remove_stop_words(movie_data['overview'][0]) # It is working !
# So, let's apply the fuction to our casts column
movie_data['overview'] = movie_data['overview'].apply(remove_stop_words)

movie_data.head()

Unnamed: 0,genres,movie_id,keywords,title,overview,score,cast,summary
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...",Avatar,"[22nd, century,, paraplegic, Marine, dispatche...",7.2,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","In the 22nd century, a paraplegic Marine is di..."
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, dead,, co...",6.9,"[Johnny Depp, Orlando Bloom, Keira Knightley]","Captain Barbossa, long believed to be dead, ha..."
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent, sequel, mi...",Spectre,"[cryptic, message, Bond’s, past, sends, trail,...",6.3,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",A cryptic message from Bond’s past sends him o...
3,"[Action, Crime, Drama, Thriller]",49026,"[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,"[Following, death, District, Attorney, Harvey,...",7.6,"[Christian Bale, Michael Caine, Gary Oldman]",Following the death of District Attorney Harve...
4,"[Action, Adventure, Science Fiction]",49529,"[based on novel, mars, medallion, space travel...",John Carter,"[John, Carter, war-weary,, former, military, c...",6.1,"[Taylor Kitsch, Lynn Collins, Samantha Morton]","John Carter is a war-weary, former military ca..."


In [None]:
# Finally, let's remove any spaces in the tags (to avoid confusion) [change "Sam Walker" to "SamWalker"]
movie_data['genres2'] = movie_data['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movie_data['cast'] = movie_data['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movie_data['keywords'] = movie_data['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])

movie_data.head()

Unnamed: 0,genres,movie_id,keywords,title,overview,score,cast,summary,genres2
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[22nd, century,, paraplegic, Marine, dispatche...",7.2,"[SamWorthington, ZoeSaldana, SigourneyWeaver]","In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]"
1,"[Adventure, Fantasy, Action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, dead,, co...",6.9,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]","Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]"
2,"[Action, Adventure, Crime]",206647,"[spy, basedonnovel, secretagent, sequel, mi6, ...",Spectre,"[cryptic, message, Bond’s, past, sends, trail,...",6.3,"[DanielCraig, ChristophWaltz, LéaSeydoux]",A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]"
3,"[Action, Crime, Drama, Thriller]",49026,"[dccomics, crimefighter, terrorist, secretiden...",The Dark Knight Rises,"[Following, death, District, Attorney, Harvey,...",7.6,"[ChristianBale, MichaelCaine, GaryOldman]",Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]"
4,"[Action, Adventure, Science Fiction]",49529,"[basedonnovel, mars, medallion, spacetravel, p...",John Carter,"[John, Carter, war-weary,, former, military, c...",6.1,"[TaylorKitsch, LynnCollins, SamanthaMorton]","John Carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]"


In [None]:
# Finally let's create the tags column (concatenate the genres, cast, keywords and overview)
movie_data['tags'] = movie_data['genres2'] + movie_data['cast'] + movie_data['keywords'] + movie_data['overview']
# And also, let's convert everything (the tags) to lowercase
movie_data['tags'] = movie_data['tags'].apply(lambda x:[i.lower() for i in x])
movie_data.head()

# Finally, we shall remove the overview, cast and keywords columns
movie_data = movie_data[['movie_id', 'title', 'genres', 'score', 'tags']]
print(movie_data.info())
movie_data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 4800 entries, 0 to 4802
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   movie_id  4800 non-null   int64  
 1   title     4800 non-null   object 
 2   genres    4800 non-null   object 
 3   score     4800 non-null   float64
 4   tags      4800 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 354.0+ KB
None


Unnamed: 0,movie_id,title,genres,score,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,"[action, adventure, fantasy, sciencefiction, s..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9,"[adventure, fantasy, action, johnnydepp, orlan..."
2,206647,Spectre,"[Action, Adventure, Crime]",6.3,"[action, adventure, crime, danielcraig, christ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",7.6,"[action, crime, drama, thriller, christianbale..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]",6.1,"[action, adventure, sciencefiction, taylorkits..."


In [None]:
# Now finally we need to calculate the similarity between any 2 tags somehow using some algorithms
# Combine the tags list into a single string for each movie
movie_data['tags'] = movie_data['tags'].apply(lambda x: " ".join(x))

# Use TF-IDF to vectorize the tags
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(movie_data['tags']).toarray()

# Check the shape of the TF-IDF matrix
print(tfidf_matrix.shape)  # Should be (number of movies, 5000)


(4800, 5000)


In [None]:
# Compute the cosine similarity between all movies
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim.shape)  # Should be (number of movies, number of movies)


(4800, 4800)


In [None]:
# Function to find movie index by title
def find_movie_index(title, data):
    title = title.lower().replace(" ", "")
    for idx, row in data.iterrows():
        if row['title'].replace(" ", "").lower() == title:
            return idx
    return None

# Function to get top N similar movies
def get_similar_movies(title, data, similarity_matrix, top_n=10):
    movie_idx = find_movie_index(title, data)
    if movie_idx is None:
        return None

    # Get similarity scores for the given movie
    sim_scores = list(enumerate(similarity_matrix[movie_idx]))

    # Sort the movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top N similar movies (excluding the first one as it is the same movie)
    top_similar_movies = sim_scores[1:top_n+1]

    # Get the details of the top similar movies
    similar_movies = []
    for i in top_similar_movies:
        idx = i[0]
        movie_details = {
            'title': data.iloc[idx]['title'],
            'movie_id': data.iloc[idx]['movie_id'],
            'genres': data.iloc[idx]['genres'],
            'score': data.iloc[idx]['score']
        }
        similar_movies.append(movie_details)

    return similar_movies

# Example usage
similar_movies = get_similar_movies('The Godfather', movie_data, cosine_sim, top_n=10)
for movie in similar_movies:
    print(movie)


{'title': 'Desert Dancer', 'movie_id': 266102, 'genres': ['Drama'], 'score': 6.5}
{'title': 'Take the Lead', 'movie_id': 12763, 'genres': ['Music'], 'score': 6.6}
{'title': 'Center Stage', 'movie_id': 10560, 'genres': ['Drama', 'Music'], 'score': 6.8}
{'title': 'Footloose', 'movie_id': 1788, 'genres': ['Drama', 'Family', 'Music', 'Romance'], 'score': 6.4}
{'title': 'Step Up 2: The Streets', 'movie_id': 8328, 'genres': ['Music', 'Drama', 'Romance'], 'score': 6.5}
{'title': 'Step Up', 'movie_id': 9762, 'genres': ['Music', 'Drama', 'Romance', 'Crime'], 'score': 6.7}
{'title': 'Black Swan', 'movie_id': 44214, 'genres': ['Drama', 'Thriller'], 'score': 7.3}
{'title': 'ABCD (Any Body Can Dance)', 'movie_id': 157293, 'genres': ['Drama', 'Music'], 'score': 5.6}
{'title': 'Tango', 'movie_id': 65749, 'genres': ['Drama', 'Foreign', 'Romance'], 'score': 6.8}
{'title': 'Step Up Revolution', 'movie_id': 85446, 'genres': ['Music', 'Drama', 'Romance'], 'score': 6.7}


In [None]:
# Function to handle movie title query
def recommend_movies(query, data, similarity_matrix, top_n=10):
    movie_idx = find_movie_index(query, data)
    if movie_idx is not None:
        # Movie found, return similar movies
        return get_similar_movies(query, data, similarity_matrix, top_n)
    else:
        # Movie not found, return top N movies by score
        top_movies = data.nlargest(top_n, 'score')[['title', 'movie_id', 'genres', 'score']]
        return top_movies.to_dict('records')

# Example usage
recommendations = recommend_movies('Avatar', movie_data, cosine_sim, top_n=40)
for movie in recommendations:
    print(movie)


{'title': 'Falcon Rising', 'movie_id': 270938, 'genres': ['Adventure', 'Action'], 'score': 5.5}
{'title': 'Battle: Los Angeles', 'movie_id': 44943, 'genres': ['Action', 'Science Fiction'], 'score': 5.5}
{'title': 'Apollo 18', 'movie_id': 50357, 'genres': ['Horror', 'Thriller', 'Science Fiction'], 'score': 5.0}
{'title': 'Titan A.E.', 'movie_id': 7450, 'genres': ['Animation', 'Action', 'Science Fiction', 'Family', 'Adventure'], 'score': 6.3}
{'title': 'Star Trek Into Darkness', 'movie_id': 54138, 'genres': ['Action', 'Adventure', 'Science Fiction'], 'score': 7.4}
{'title': 'The Book of Life', 'movie_id': 228326, 'genres': ['Romance', 'Animation', 'Adventure', 'Comedy', 'Family', 'Fantasy'], 'score': 7.3}
{'title': 'Aliens', 'movie_id': 679, 'genres': ['Horror', 'Action', 'Thriller', 'Science Fiction'], 'score': 7.7}
{'title': "Ender's Game", 'movie_id': 80274, 'genres': ['Science Fiction', 'Action', 'Adventure'], 'score': 6.6}
{'title': 'Aliens vs Predator: Requiem', 'movie_id': 440, 'g