In [None]:
# ======================
# MOVIE RECOMMENDATION SYSTEM
# ======================
# This system recommends movies based on content similarity using:
# - TF-IDF vectorization
# - Cosine similarity
# - Fuzzy string matching for title recognition
# ======================

In [None]:
# ----------
# 1. IMPORT LIBRARIES
# ----------

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from fuzzywuzzy import process, fuzz
import ast
import re

In [None]:
# ----------
# 2. DATA LOADING AND PREPROCESSING
# ----------

# Load the movie-metadata table and the credits table.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where these files exist at exactly these locations.
movie = pd.read_csv("C:/Users/Vivek/Downloads/tmdb_m.csv")
credit = pd.read_csv("C:/Users/Vivek/Downloads/tmdb_c.csv")

# Join the credits onto the metadata by movie title.
# NOTE(review): if titles are not unique in both tables this join produces
# duplicate rows for the repeated titles — confirm against the data.
movie = movie.merge(credit, on='title')

# Keep only the columns used by the recommender, drop rows with any missing
# value, and renumber the index. Renumbering matters: dropna() leaves gaps in
# the index, while the similarity matrix built later is purely positional, so
# row labels must coincide with row positions.
movie = (
    movie[['genres', 'id', 'keywords', 'title', 'overview', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)


In [None]:
# ----------
# 3. HELPER FUNCTIONS
# ----------

# Convert stringified JSON to Python objects and extract names
def extract_name(obj):
    """Return the 'name' value of every entry in a stringified collection.

    Parameters
    ----------
    obj : str
        Text holding a Python literal (parsed with ``ast.literal_eval``)
        whose entries can each be indexed with the key ``'name'``.

    Returns
    -------
    list
        The extracted names, in the order the entries appear.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]


def extract_cast_name(obj, top_n=5):
    """Return the names of the first ``top_n`` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        Text holding a Python literal (parsed with ``ast.literal_eval``)
        whose entries can each be indexed with the key ``'name'``.
    top_n : int or None, optional
        Maximum number of names to return, counted from the start of the
        collection. Defaults to 5. ``None`` returns every name.

    Returns
    -------
    list
        At most ``top_n`` names, in the order the entries appear.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")

    # Materialise first so slicing works for any iterable literal; only the
    # kept entries are indexed, so later entries never need a 'name' key.
    members = list(ast.literal_eval(obj))
    return [member['name'] for member in members[:top_n]]


def extract_crew_name(obj):
    """Return the names of all crew entries whose job is ``'Director'``.

    Parameters
    ----------
    obj : str
        Text holding a Python literal (parsed with ``ast.literal_eval``)
        whose entries can each be indexed with the keys ``'job'`` and
        ``'name'``.

    Returns
    -------
    list
        Names of the entries with ``job == 'Director'``, in the order they
        appear; empty when there is none.
    """
    return [
        member['name']
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    ]


#Normalize and clean text data
def clean_text_items(text_list):
    """Normalise every item to a single lowercase ASCII-alphanumeric token.

    Each item is converted with ``str()``, lowercased, stripped of spaces,
    and then stripped of every character outside ``a-z``, ``A-Z``, ``0-9``
    (so punctuation and accented letters are removed as well).

    Parameters
    ----------
    text_list : iterable
        Items to normalise.

    Returns
    -------
    list of str
        One cleaned token per input item, in the same order.
    """
    cleaned = []
    for entry in text_list:
        collapsed = str(entry).lower().replace(" ", "")
        cleaned.append(re.sub(r'[^a-zA-Z0-9]', '', collapsed))
    return cleaned


#Apply Porter Stemmer to text
# One shared stemmer instance, reused by every call to stemming().
ps = PorterStemmer()
def stemming(obj):
    """Stem each whitespace-separated word of ``obj`` with the Porter stemmer.

    Parameters
    ----------
    obj : str
        Text to process; it is split on whitespace with ``str.split()``.

    Returns
    -------
    str
        The stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in obj.split())

In [None]:
# ----------
# 4. FEATURE ENGINEERING
# ----------


# Parse each stringified column into a plain list of names:
#   genres / keywords -> every name
#   crew              -> directors only
#   cast              -> the leading cast entries (see extract_cast_name)
# NOTE: re-running this cell without reloading the data will fail, because
# the extractors parse strings with ast.literal_eval and the columns hold
# lists after the first run.
for col, extractor in (
    ('genres', extract_name),
    ('keywords', extract_name),
    ('crew', extract_crew_name),
    ('cast', extract_cast_name),
):
    movie[col] = movie[col].apply(extractor)

# Turn the overview text into a list of whitespace-separated words
movie['overview'] = movie['overview'].apply(lambda text: text.split())

# Collapse every name into one lowercase alphanumeric token
# (e.g. a two-word name becomes a single token)
for col in ('genres', 'keywords', 'cast', 'crew'):
    movie[col] = movie[col].apply(clean_text_items)

In [None]:
# Build the 'tags' text for each movie: concatenate the token lists
# (keywords, genres, directors, cast, overview words), join them with
# spaces, lowercase the result and stem every word.
movie['tags'] = (
    movie['keywords'] + movie['genres'] + movie['crew'] + movie['cast'] + movie['overview']
).apply(lambda tokens: stemming(' '.join(tokens).lower()))

# From here on only the identifier, the title and the tags are needed
movie = movie[['id', 'title', 'tags']]

In [None]:
# TF-IDF vectorizer configuration:
#   - English stop words are removed
#   - unigrams and bigrams are both used as features
#   - the vocabulary is capped at 5000 features
tfdif = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)

# One TF-IDF row per movie, built from its 'tags' text
tfdif_matrix = tfdif.fit_transform(movie['tags'])

# Pairwise cosine similarity between every pair of movies; row/column i
# corresponds to the i-th row (by position) of `movie`
cosine_similar = cosine_similarity(tfdif_matrix)

In [None]:
#Recommends movies based on fuzzy-matched input title
def movie_recommender(user_input, top_n=10, min_confidence=60):
    """Print the movies most similar to the title that best matches ``user_input``.

    The input is fuzzy-matched against ``movie['title']`` (token-set ratio).
    If the best match is confident enough, the rows of ``cosine_similar`` are
    used to rank all other movies and the top ones are printed.

    Parameters
    ----------
    user_input : str
        Free-text movie title typed by the user.
    top_n : int, optional
        Number of recommendations to print. Defaults to 10.
    min_confidence : int or float, optional
        Minimum fuzzy-match score required to accept the matched title.
        Defaults to 60.

    Returns
    -------
    None
        Results and diagnostics are printed, nothing is returned.
    """
    # Blank input cannot be matched meaningfully; report it up front instead
    # of passing an empty query to the fuzzy matcher.
    if user_input is None or not str(user_input).strip():
        print("No matching movie found in our database")
        return

    # Fuzzy match with movie titles
    titles = movie['title'].to_list()
    result = process.extractOne(
        user_input,
        titles,
        scorer=fuzz.token_set_ratio
    )

    # Handle no match case (e.g. an empty title list)
    if result is None:
        print("No matching movie found in our database")
        return

    # Unpack match results
    match, confidence = result

    # Check confidence threshold
    if confidence < min_confidence:
        print(f"No strong match found. Did you mean: '{match}'? (Confidence: {confidence}%)")
        return

    print(f"🎬 Showing results for: '{match}' (Confidence: {confidence}%)")

    # Row POSITION (not index label) of the first movie carrying the matched
    # title. The similarity matrix is positional, so using index labels here
    # would pick the wrong row whenever the DataFrame index has gaps.
    indx = int(np.flatnonzero((movie['title'] == match).to_numpy())[0])

    # Rank every movie by descending similarity. A stable sort keeps ties in
    # their original row order.
    scores = np.asarray(cosine_similar[indx])
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly rather than assuming it sorts first:
    # another row with identical tags ties with it at the top.
    ranked = ranked[ranked != indx][:top_n]

    # Display results
    print("\nTop Recommendations:")
    for rank, position in enumerate(ranked, 1):
        print(f"{rank}. {movie.iloc[position]['title']} (Similarity: {scores[position]:.2f})")

In [82]:
# Run this cell to test the recommender: it reads a movie title from an
# interactive prompt and passes it to movie_recommender, which prints its
# result. Requires all cells above to have been executed first.
user_movie = input("🔍 Enter a movie title: ")
movie_recommender(user_movie)

🎬 Showing results for: 'The Shawshank Redemption' (Confidence: 74%)

🍿 Top Recommendations:
1. Civil Brand (Similarity: 0.28)
2. Prison (Similarity: 0.28)
3. Escape Plan (Similarity: 0.25)
4. Mean Machine (Similarity: 0.25)
5. The Chorus (Similarity: 0.23)
6. The Longest Yard (Similarity: 0.23)
7. Penitentiary (Similarity: 0.23)
8. Standard Operating Procedure (Similarity: 0.23)
9. Fortress (Similarity: 0.22)
10. Get Hard (Similarity: 0.20)
