# Simple SBERT Recommendation System #

In [6]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [7]:
# Read the TMDB movie table and keep only the 500 rows with the highest
# 'popularity' value, ordered from most to least popular.
movies_df = (
    pd.read_csv("tmdb_5000_movies.csv")
    .sort_values(by='popularity', ascending=False)
    .head(500)
)

In [8]:
# build the vectors
model = SentenceTransformer('all-MiniLM-L6-v2')


def _embed_column(texts):
    """Encode every non-missing cell of a text column in one batched call.

    Args:
        texts: pandas Series of text values; missing cells (NaN/None) allowed.

    Returns:
        An object-dtype Series aligned with ``texts.index`` holding one
        embedding array per row. Rows whose text is missing get an empty
        array, which ``compute_similarity`` scores as 0.
    """
    has_text = texts.notna()
    # A single encode() call lets the model batch its inputs instead of
    # running one forward pass per row.
    encoded = iter(
        model.encode(texts[has_text].astype(str).tolist(), convert_to_numpy=True)
    )
    no_vector = np.empty(0, dtype=np.float32)
    return pd.Series(
        [next(encoded) if present else no_vector for present in has_text],
        index=texts.index,
        dtype=object,
    )


# compute the vectors, one embedding column per text field
# NOTE(review): in the public TMDB 5000 CSV the 'genres' and 'keywords'
# columns are presumably JSON-encoded lists; confirm that raw JSON text is
# the intended model input.
movies_df['genres_embedding'] = _embed_column(movies_df['genres'])
movies_df['keywords_embedding'] = _embed_column(movies_df['keywords'])
movies_df['overview_embedding'] = _embed_column(movies_df['overview'])
movies_df['title_embedding'] = _embed_column(movies_df['original_title'])

# helper function: cosine similarity between the query embedding and one stored embedding (0 when the stored embedding is empty or missing)
def compute_similarity(user_embedding, embedding):
    """Return the cosine similarity between the query vector and one stored vector.

    Args:
        user_embedding: NumPy array holding the query embedding.
        embedding: Array-like embedding of one movie field, or a placeholder
            for a missing value (``None``, an empty sequence, or a scalar
            such as NaN).

    Returns:
        The cosine similarity as a scalar, or ``0.0`` when ``embedding``
        carries no vector.
    """
    if embedding is None:
        return 0.0
    candidate = np.asarray(embedding)
    # ndim == 0 catches scalars such as the NaN pandas stores for a missing
    # cell; calling len() on those would raise TypeError.
    if candidate.ndim == 0 or candidate.size == 0:
        return 0.0
    return cosine_similarity(user_embedding.reshape(1, -1), candidate.reshape(1, -1))[0][0]

In [9]:
# compute similarity: rank movies by weighted similarity to the user's query
def movie_recommender(user_input, top_n = 5, weight_genres = 0.3, weight_keywords = 0.4, weight_overview = 0.15, weight_title = 0.15):
    """Return the titles of the ``top_n`` movies most similar to ``user_input``.

    The query is embedded with the module-level ``model`` and compared with
    each movie's genres, keywords, overview and title embeddings; the four
    scores are combined using the given weights.

    Side effect: the per-field scores and the combined score are written
    into the module-level ``movies_df`` as the ``*_similarity`` and
    ``similarity`` columns, overwriting any values from a previous call.

    Returns:
        A one-column DataFrame (``original_title``), best match first.
    """
    query_vector = model.encode(user_input, convert_to_numpy=True)

    def score_against_query(stored_vector):
        return compute_similarity(query_vector, stored_vector)

    # (embedding column, similarity column, weight) for each text field
    scored_fields = (
        ('genres_embedding', 'genres_similarity', weight_genres),
        ('keywords_embedding', 'keywords_similarity', weight_keywords),
        ('overview_embedding', 'overview_similarity', weight_overview),
        ('title_embedding', 'title_similarity', weight_title),
    )

    # per-field similarity of every movie to the query
    for embedding_col, similarity_col, _ in scored_fields:
        movies_df[similarity_col] = movies_df[embedding_col].apply(score_against_query)

    # weighted sum of the per-field scores, accumulated left to right
    weighted_terms = [
        weight * movies_df[similarity_col]
        for _, similarity_col, weight in scored_fields
    ]
    combined_score = weighted_terms[0]
    for term in weighted_terms[1:]:
        combined_score = combined_score + term
    movies_df['similarity'] = combined_score

    # highest combined score first, truncated to the requested count
    ranked = movies_df.sort_values(by='similarity', ascending=False)
    best_matches = ranked.head(top_n)
    return best_matches[['original_title']]

In [10]:
# Example: request recommendations for a free-text query and show the
# resulting table of titles in the notebook.
query = "I like action movies set in space"
movies = movie_recommender(user_input=query)
display(movies)

# Salary expectation: ~$16-$30 per hour, or ~$1,500-$4,000 per month (assuming 30 hr/week)



Unnamed: 0,original_title
158,Star Trek
95,Interstellar
2912,Star Wars
56,Star Trek Beyond
300,Starship Troopers
