# (version-03) Content based filltering movie recommendation system
### SentenceTransformer
### Embeddings(384) - separate embedding for each feature/column, combine them into a single 384-dimensional vector per movie
### Cosing similarity
### Each feature assigned a custom weight

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import ast # to convert string into list

from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches
import joblib

from sentence_transformers import SentenceTransformer

In [50]:
movies = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_movies.csv') # 'r' for raw string, o/w use / or \\
credits = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_credits.csv')

In [51]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [52]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## Merge movies table with credits table according to the titles

In [53]:
movies = movies.merge(credits, on='title')
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [54]:
movies.shape

(4809, 23)

In [55]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

## Create new dataframe with important columns

In [56]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']] # [[ ]]  for multiple cvolumns together
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [57]:
movies.shape

(4809, 7)

## Data cleaning

In [58]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [59]:
movies.dropna(inplace=True)

In [60]:
movies.isnull().sum()
movies.shape

(4806, 7)

In [61]:
movies.duplicated().sum()

np.int64(0)

In [62]:
print(movies.iloc[0])

movie_id                                                19995
title                                                  Avatar
overview    In the 22nd century, a paraplegic Marine is di...
genres      [{"id": 28, "name": "Action"}, {"id": 12, "nam...
keywords    [{"id": 1463, "name": "culture clash"}, {"id":...
cast        [{"cast_id": 242, "character": "Jake Sully", "...
crew        [{"credit_id": "52fe48009251416c750aca23", "de...
Name: 0, dtype: object


## Data Cleaning Functions

In [63]:
# we have some column records as list but here they are consdidered as string so let convert them into original list

# convert string list into list for fetchign specific df, always pass a string
def string_to_list_convertor(text, no_of_items=None):

    # text is null return []
    if isinstance(text, float) and pd.isna(text):
        return []
        
    name_list = []
    count=1
    
    for i in ast.literal_eval(text):
        name_list.append(i['name'])
        if no_of_items is not None:
            if count >= no_of_items:
                break
            count += 1
            
    return name_list

# fetch director name, always pass a string
def fetch_director(text):
    
    # text is null return []
    if pd.isna(text):
        return []
  
    director_list = []

    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            director_list.append(i['name'])
            break
    return director_list

# Reomove spaces betweeen names, always pass a list
def remove_spaces(text):
    names = []

    for i in text:
        names.append(i.replace(" ", ""))
    return names

# convert words in text list into lower
def lowercase_words(text):
   return [i.lower() for i in text]

## Each column -> convert into list, remove spaces, lowercasing

In [64]:
movies['genres'] = movies['genres'].apply(string_to_list_convertor).apply(remove_spaces).apply(lowercase_words)
movies['cast'] = movies['cast'].apply(lambda x :  string_to_list_convertor(x, 3)).apply(remove_spaces).apply(lowercase_words)
movies['keywords'] = movies['keywords'].apply(string_to_list_convertor).apply(remove_spaces).apply(lowercase_words)
movies['crew'] = movies['crew'].apply(fetch_director).apply(remove_spaces).apply(lowercase_words)
movies['overview'] = movies['overview'].apply(lambda x : x.split()).apply(lowercase_words)

movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[in, the, 22nd, century,, a, paraplegic, marin...","[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","[samworthington, zoesaldana, sigourneyweaver]",[jamescameron]
1,285,Pirates of the Caribbean: At World's End,"[captain, barbossa,, long, believed, to, be, d...","[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[johnnydepp, orlandobloom, keiraknightley]",[goreverbinski]


 ## Load embedding model

In [65]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define weights
weights = {
    'overview': 1.0,
    'genres': 3.0,
    'keywords': 2.0,
    'cast': 1.5,
    'crew': 3.0
}

# Function to create weighted embeddings
def get_weighted_embedding(row, model, weights):

     # convert lists to strings
    parts = {
        'overview': " ".join(row['overview']),
        'genres': " ".join(row['genres']),
        'keywords': " ".join(row['keywords']),
        'cast': " ".join(row['cast']),
        'crew': " ".join(row['crew'])
    }

    final_embedding = np.zeros(model.get_sentence_embedding_dimension())

    for key, text in parts.items():
        if text.strip(): # skip empty
            emb = model.encode(text)
            final_embedding += weights[key] * emb
    return final_embedding

In [66]:
# Apply to all movies (no perogress bar)

# embeddings = np.array([get_weighted_embedding(row, model, weights) for _, row in movies.iterrows()])

## Compute weighted embeddings for all movies with a progress bar
    Each movie's embedding is calculated based on overview, genres, keywords, cast, and crew

In [67]:
from tqdm import tqdm

# Wrap the iterator with tqdm
embeddings = np.array([
    get_weighted_embedding(row, model, weights)
    for _, row in tqdm(movies.iterrows(), total=len(movies), desc="Embedding movies")
])


Embedding movies: 100%|████████████████████████████████████████████████████████████| 4806/4806 [16:27<00:00,  4.87it/s]


In [71]:
embeddings.shape

(4806, 384)

## Cosine similarity, Reccomendation function

In [72]:
similarity_matrix = cosine_similarity(embeddings)

In [73]:
def recommend(movie, top_k=5):
    # all titles
    titles = movies['title'].tolist()
    
    # Find the closest match (allowing typos / case differences)
    matches = get_close_matches(movie, titles, n=1, cutoff=0.6)

    # Checks if the matches list is empty
    if not matches:
        print(f"No close match found for '{movie}'")
        return

    best_match = matches[0]
    # Finds the row number of the movie in new_df whose title exactly matches best_match
    index = movies[movies['title'] == best_match].index[0]

    distance = sorted(list(enumerate(similarity_matrix[index])), reverse=True, key= lambda x : x[1])

    # ignore first value (1 is itself same movie)
    results = []
    print(f"\nResults for: {best_match}\n")
    for i in distance[1:top_k+1]:
        movie_title = movies.iloc[i[0]]['title'] # movie_index in the DataFrame, access row by integer position, fetch the title column from that row
        score = i[1]
        results.append((movie_title, score))
        print(f"{movie_title} (Similarity: {score:.3f})")
    return results

In [74]:
recommend('batman', 10)


Results for: Batman

Batman Returns (Similarity: 0.918)
Batman (Similarity: 0.823)
Batman Forever (Similarity: 0.797)
Batman & Robin (Similarity: 0.782)
Hancock (Similarity: 0.771)
Batman (Similarity: 0.771)
Dark Shadows (Similarity: 0.765)
Beetlejuice (Similarity: 0.763)
Big Fish (Similarity: 0.759)
Edward Scissorhands (Similarity: 0.750)


[('Batman Returns', np.float64(0.9179363103776909)),
 ('Batman', np.float64(0.8228711171077796)),
 ('Batman Forever', np.float64(0.7974305323836386)),
 ('Batman & Robin', np.float64(0.782053671422448)),
 ('Hancock', np.float64(0.7714462047829034)),
 ('Batman', np.float64(0.7709384499364034)),
 ('Dark Shadows', np.float64(0.7650128630059615)),
 ('Beetlejuice', np.float64(0.7627691925171494)),
 ('Big Fish', np.float64(0.7591604843432245)),
 ('Edward Scissorhands', np.float64(0.7499833593437009))]

In [76]:
os.makedirs('artifacts_v3', exist_ok=True)

joblib.dump(movies, 'artifacts_v3/movies.pkl')
joblib.dump(similarity_matrix, 'artifacts_v3/similarity_matrix.pkl')

['artifacts_v3/similarity_matrix.pkl']