In [16]:
# Step 1: Imports
# ===============================
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from difflib import get_close_matches

In [17]:
# Step 2: Load Datasets
# ===============================
# Read the four CSV tables in one go:
#   movies  - the catalogue that gets embedded and indexed
#   ratings - optional, used to show average ratings next to recommendations
#   tags    - optional for now
#   links   - optional for now
movies, ratings, tags, links = (
    pd.read_csv("../data/movies.csv"),
    pd.read_csv("../data/ratings.csv"),
    pd.read_csv("../data/tags.csv"),
    pd.read_csv("../data/links.csv"),
)

In [18]:
# Sanity check: report the dimensions of every loaded table, then preview
# the first rows of the movie catalogue.
for label, frame in (
    ("Movies:", movies),
    ("Ratings:", ratings),
    ("Tags:", tags),
    ("Links:", links),
):
    print(label, frame.shape)
print(movies.head())

Movies: (27278, 3)
Ratings: (20000263, 4)
Tags: (465564, 4)
Links: (27278, 3)
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [19]:
# Step 3: Preprocess Movie Text
# ===============================
# The text that gets embedded is "<title> <genres>", joined by a single space.
movies['text'] = movies['title'].str.cat(movies['genres'], sep=" ")

In [20]:
# Step 4: Sentence-BERT Embeddings
# ===============================
# Load the sentence-embedding model. `embedder` is kept as a notebook-level
# name because recommend_movies_by_typing() reads it to embed the user's query.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Encode the combined "title genres" text of every movie. Later cells index
# into `movie_embeddings` with row positions of `movies`, so the two are
# assumed to stay in the same order.
# NOTE: this is the slow cell of the notebook (about 3 minutes in the recorded run).
movie_embeddings = embedder.encode(movies['text'].tolist(), show_progress_bar=True)

print("Embeddings shape:", movie_embeddings.shape)  # (num_movies, 384)

Batches: 100%|██████████| 853/853 [03:05<00:00,  4.59it/s]


Embeddings shape: (27278, 384)


In [21]:
# Step 5: Build FAISS Index
# ===============================
# The index dimensionality is taken from the width of the embedding matrix.
embedding_dim = movie_embeddings.shape[1]
# Flat index using L2 distance over the movie embeddings.
faiss_index = faiss.IndexFlatL2(embedding_dim)
# Vectors are added in the row order of `movie_embeddings`; the positions
# returned by faiss_index.search() are later used with movies.iloc[...].
faiss_index.add(movie_embeddings.astype('float32'))

# ntotal is the number of vectors currently stored in the index.
print("FAISS index built with", faiss_index.ntotal, "movies")

FAISS index built with 27278 movies


In [37]:
def find_closest_movie(title, movies, cutoff=0.4):
    """
    Fuzzy-match a typed title against the titles in ``movies``.

    Every catalogue title has its "(YYYY)" year tag removed, is stripped of
    surrounding whitespace and lower-cased; the lower-cased input is then
    compared against those strings with ``difflib.get_close_matches``.

    Parameters
    ----------
    title : str
        Text typed by the user (may contain typos or omit the year).
    movies : pandas.DataFrame
        Frame with a 'title' column. It is only read, never modified.
    cutoff : float, default 0.4
        Minimum similarity passed through to ``get_close_matches``.

    Returns
    -------
    str or None
        The original (un-cleaned) title of the best match, or None when no
        title reaches ``cutoff``.
    """
    # Build the cleaned titles as a local Series instead of writing a
    # 'title_clean' column onto the caller's DataFrame: the lookup should not
    # leave side effects on shared notebook state.
    cleaned = movies['title'].str.replace(r"\(\d{4}\)", "", regex=True).str.strip()
    candidates = [t.lower() for t in cleaned.tolist()]

    matches = get_close_matches(title.lower(), candidates, n=1, cutoff=cutoff)
    if not matches:
        return None

    # `candidates` is in row order, so the list position of the match is also
    # the positional index into `movies` (first occurrence on duplicates).
    matched_pos = candidates.index(matches[0])
    return movies.iloc[matched_pos]['title']


In [40]:
def recommend_movies_by_typing(input_title, movies, embeddings, faiss_index, ratings=None, top_k=5, model=None):
    """
    Recommend movies given any typed input using vector similarity.

    The typed text is embedded, the nearest catalogue movie to that embedding
    is taken as the "closest match", and the neighbours of that movie's own
    embedding are returned as recommendations.

    Parameters
    ----------
    input_title : str
        Free text typed by the user.
    movies : pandas.DataFrame
        Catalogue with 'movieId', 'title' and 'genres' columns, in the same
        row order as ``embeddings`` and the vectors stored in ``faiss_index``.
    embeddings : array-like
        Movie embedding matrix; row ``i`` belongs to ``movies.iloc[i]``.
    faiss_index : object
        Index exposing ``search(vectors, k) -> (distances, positions)``.
    ratings : pandas.DataFrame, optional
        Frame with 'movieId' and 'rating' columns. When given, an
        'avg_rating' column (mean rating rounded to 2 decimals) is added.
    top_k : int, default 5
        Maximum number of recommendations to return.
    model : object, optional
        Encoder exposing ``encode(list_of_str)``. Defaults to the
        notebook-level ``embedder`` when omitted.

    Returns
    -------
    (closest_title, recs) : tuple
        ``closest_title`` is the title of the catalogue movie nearest to the
        input, or None if the index returned no result. ``recs`` is a
        DataFrame of at most ``top_k`` rows with 'movieId', 'title',
        'genres' (and 'avg_rating' when ``ratings`` is given).
    """
    rec_columns = ['movieId', 'title', 'genres']
    # Fall back to the notebook-level model only when none is injected.
    encoder = embedder if model is None else model

    # 1. Embed the input text
    input_vector = encoder.encode([input_title]).astype('float32')

    # 2. Find closest movie in dataset (distances are not needed)
    _, nearest = faiss_index.search(input_vector, 1)
    closest_idx = int(nearest[0][0])
    if closest_idx < 0:
        # FAISS reports -1 when it has no result (e.g. empty index);
        # movies.iloc[-1] would silently return the last movie instead.
        return None, movies.iloc[0:0][rec_columns].copy()
    closest_title = movies.iloc[closest_idx]['title']

    # 3. Find top-K similar movies using FAISS; ask for one extra because the
    #    seed movie is normally its own nearest neighbour.
    seed_vector = embeddings[closest_idx].reshape(1, -1).astype('float32')
    _, neighbours = faiss_index.search(seed_vector, top_k + 1)
    neighbour_pos = neighbours[0]
    # Drop the -1 padding FAISS emits when the index holds fewer than k vectors.
    neighbour_pos = neighbour_pos[neighbour_pos >= 0]
    recs = movies.iloc[neighbour_pos][rec_columns].copy()
    recs = recs[recs['title'] != closest_title]  # remove the original movie

    # 4. Optional: add average rating. Only the ratings of the recommended
    #    movies are aggregated, rather than grouping the whole ratings table
    #    on every call.
    if ratings is not None:
        relevant = ratings[ratings['movieId'].isin(recs['movieId'])]
        avg_ratings = relevant.groupby("movieId")['rating'].mean().round(2)
        recs = recs.assign(avg_rating=recs['movieId'].map(avg_ratings))

    return closest_title, recs.reset_index(drop=True).head(top_k)


In [46]:
# Demo: run the recommender on a free-text query.
# The query is lower-case and has no year, but it is not misspelled.
# In the recorded run below, the nearest embedding was
# "City of Your Final Destination, The (2009)", not "Final Destination (2000)".
input_movie = "final destination"  # free-text query (no year, lower-case)
closest_title, recommendations = recommend_movies_by_typing(
    input_movie, movies, movie_embeddings, faiss_index, ratings=ratings, top_k=5
)

# Show which catalogue title the query resolved to, then the neighbours.
print(f"Closest match found: {closest_title}")
print("Recommended Movies:")
print(recommendations)


Closest match found: City of Your Final Destination, The (2009)
Recommended Movies:
   movieId                                              title  \
0     3409                           Final Destination (2000)   
1    71252  Final Destination, The (Final Destination 4) (...   
2     6058                         Final Destination 2 (2003)   
3    78959                                     Endgame (2009)   
4    88932                         Final Destination 5 (2011)   

                 genres  avg_rating  
0        Drama|Thriller        3.12  
1       Horror|Thriller        2.59  
2       Horror|Thriller        2.90  
3                 Drama        4.00  
4  Horror|Thriller|IMAX        2.69  
