# Content-Based Movie Recommendation System

This notebook implements a content-based movie recommendation system using TF-IDF vectorization and cosine similarity.

## Features:
- Efficient memory management for large datasets
- Fuzzy string matching for movie titles
- Weighted feature importance
- Optimized TF-IDF parameters
- Clean, object-oriented design

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import warnings
import gc
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
class MovieRecommendationSystem:
    """
    A content-based movie recommendation system using TF-IDF and cosine similarity.
    """
    
    def __init__(self, data_path='../data/movies.csv', max_features=5000):
        """
        Initialize the recommendation system
        
        Args:
            data_path (str): Path to the movies CSV file
            max_features (int): Maximum number of features for TF-IDF
        """
        self.data_path = data_path
        self.max_features = max_features
        self.movie_data = None
        self.tfidf_vectorizer = None
        self.feature_vectors = None
        self.similarity_matrix = None
        
    def load_data(self):
        """Load and explore the movie dataset"""
        print("Loading movie data...")
        self.movie_data = pd.read_csv(self.data_path)
        
        print(f"Dataset shape: {self.movie_data.shape}")
        print(f"Columns: {list(self.movie_data.columns)}")
        print("\nFirst 5 rows:")
        display(self.movie_data.head())
        
        # Check for missing values in key columns
        key_columns = ['title', 'genres', 'description', 'directors', 'stars']
        missing_data = self.movie_data[key_columns].isnull().sum()
        print("\nMissing values in key columns:")
        print(missing_data)
        
        return self.movie_data
    
    def preprocess_features(self, selected_features=['genres', 'description', 'directors', 'stars']):
        """
        Preprocess and combine movie features with proper weighting
        
        Args:
            selected_features (list): List of features to combine
        """
        print(f"Preprocessing features: {selected_features}")
        
        # Handle missing values
        for feature in selected_features:
            if feature in self.movie_data.columns:
                self.movie_data[feature] = self.movie_data[feature].fillna('')
            else:
                print(f"Warning: Feature '{feature}' not found in dataset")
        
        # Clean and process text data
        combined_features = []
        for idx, row in self.movie_data.iterrows():
            feature_text = ""
            
            for feature in selected_features:
                if feature in self.movie_data.columns:
                    text = str(row[feature]).lower()
                    # Remove brackets and quotes from list-like strings
                    text = text.replace('[', '').replace(']', '').replace("'", '').replace('"', '')
                    
                    # Add feature multiple times based on importance
                    weight = self._get_feature_weight(feature)
                    feature_text += (text + " ") * weight
            
            combined_features.append(feature_text.strip())
        
        print(f"Combined features created for {len(combined_features)} movies")
        return combined_features
    
    def _get_feature_weight(self, feature):
        """Get weight for different features based on importance"""
        weights = {
            'genres': 3,      # Most important for content similarity
            'directors': 2,   # Important for style similarity
            'stars': 2,       # Important for cast similarity
            'description': 1  # Provides context but can be noisy
        }
        return weights.get(feature, 1)
    
    def build_similarity_matrix(self, combined_features):
        """
        Build TF-IDF vectors and similarity matrix efficiently
        
        Args:
            combined_features (list): Combined feature text for each movie
        """
        print("Building TF-IDF vectors...")
        
        # Initialize TF-IDF vectorizer with optimized parameters
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=self.max_features,
            stop_words='english',
            ngram_range=(1, 2),  # Include unigrams and bigrams
            min_df=2,            # Ignore terms in less than 2 documents
            max_df=0.8,          # Ignore terms in more than 80% of documents
            lowercase=True,
            strip_accents='unicode'
        )
        
        # Fit and transform the features
        self.feature_vectors = self.tfidf_vectorizer.fit_transform(combined_features)
        
        print(f"Feature vector shape: {self.feature_vectors.shape}")
        print(f"Sparsity: {(1.0 - self.feature_vectors.nnz / (self.feature_vectors.shape[0] * self.feature_vectors.shape[1])):.4f}")
        
        # For large datasets, we'll compute similarity on-demand rather than storing the full matrix
        if self.feature_vectors.shape[0] > 10000:
            print("Large dataset detected. Similarity will be computed on-demand.")
            self.similarity_matrix = None
        else:
            print("Computing similarity matrix...")
            self.similarity_matrix = cosine_similarity(self.feature_vectors)
        
        return self.feature_vectors
    
    def get_movie_recommendations(self, movie_title, num_recommendations=10):
        """
        Get movie recommendations based on similarity
        
        Args:
            movie_title (str): Title of the movie to get recommendations for
            num_recommendations (int): Number of recommendations to return
        """
        # Find the movie in the dataset using fuzzy matching
        movie_matches = difflib.get_close_matches(
            movie_title.lower(), 
            [title.lower() for title in self.movie_data['title'].tolist()], 
            n=3, 
            cutoff=0.6
        )
        
        if not movie_matches:
            return {
                'error': f"Movie '{movie_title}' not found in the dataset",
                'suggestions': self._get_movie_suggestions(movie_title)
            }
        
        # Find the exact match in the original data
        matched_movie = None
        movie_index = None
        
        for match in movie_matches:
            for idx, title in enumerate(self.movie_data['title']):
                if title.lower() == match:
                    matched_movie = title
                    movie_index = idx
                    break
            if matched_movie:
                break
        
        if movie_index is None:
            return {'error': 'Movie index not found'}
        
        # Compute similarity scores
        if self.similarity_matrix is not None:
            # Use pre-computed similarity matrix
            similarity_scores = list(enumerate(self.similarity_matrix[movie_index]))
        else:
            # Compute similarity on-demand for memory efficiency
            movie_vector = self.feature_vectors[movie_index:movie_index+1]
            similarity_scores = cosine_similarity(movie_vector, self.feature_vectors).flatten()
            similarity_scores = list(enumerate(similarity_scores))
        
        # Sort by similarity score (excluding the movie itself)
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:]
        
        # Get top recommendations
        recommendations = []
        for i in range(min(num_recommendations, len(similarity_scores))):
            movie_idx = similarity_scores[i][0]
            similarity_score = similarity_scores[i][1]
            
            movie_info = {
                'title': self.movie_data.iloc[movie_idx]['title'],
                'similarity_score': round(float(similarity_score), 4)
            }
            
            # Add additional info if available
            for col in ['genres', 'year', 'rating', 'description']:
                if col in self.movie_data.columns:
                    value = self.movie_data.iloc[movie_idx][col]
                    if col == 'description' and len(str(value)) > 100:
                        value = str(value)[:100] + "..."
                    movie_info[col] = value
            
            recommendations.append(movie_info)
        
        return {
            'input_movie': matched_movie,
            'input_movie_info': {
                'genres': self.movie_data.iloc[movie_index]['genres'],
                'year': self.movie_data.iloc[movie_index]['year'],
                'rating': self.movie_data.iloc[movie_index]['rating']
            },
            'recommendations': recommendations
        }
    
    def _get_movie_suggestions(self, movie_title, n=5):
        """Get movie title suggestions for failed searches"""
        all_titles = self.movie_data['title'].tolist()
        suggestions = difflib.get_close_matches(movie_title, all_titles, n=n, cutoff=0.3)
        return suggestions
    
    def fit(self, selected_features=['genres', 'description', 'directors', 'stars']):
        """
        Complete pipeline to fit the recommendation system
        
        Args:
            selected_features (list): Features to use for recommendations
        """
        print("=" * 50)
        print("FITTING MOVIE RECOMMENDATION SYSTEM")
        print("=" * 50)
        
        # Load data
        self.load_data()
        
        # Preprocess features
        combined_features = self.preprocess_features(selected_features)
        
        # Build similarity matrix
        self.build_similarity_matrix(combined_features)
        
        # Clean up memory
        gc.collect()
        
        print("\n" + "=" * 50)
        print("RECOMMENDATION SYSTEM IS READY!")
        print("=" * 50)
        return self
    
    def get_random_movies(self, n=10):
        """Get random movies for testing"""
        sample_movies = self.movie_data.sample(n=n)[['title', 'year', 'genres', 'rating']]
        return sample_movies.to_dict('records')

print("MovieRecommendationSystem class defined successfully!")

MovieRecommendationSystem class defined successfully!


In [3]:
# Initialize and fit the recommendation system
recommender = MovieRecommendationSystem(max_features=5000)
recommender.fit()

FITTING MOVIE RECOMMENDATION SYSTEM
Loading movie data...
Dataset shape: (63249, 24)
Columns: ['id', 'title', 'year', 'duration', 'MPA', 'rating', 'votes', 'meta_score', 'description', 'Movie_Link', 'writers', 'directors', 'stars', 'budget', 'opening_weekend_gross', 'gross_worldwide', 'gross_us_canada', 'release_date', 'countries_origin', 'filming_locations', 'production_companies', 'awards_content', 'genres', 'languages']

First 5 rows:


Unnamed: 0,id,title,year,duration,MPA,rating,votes,meta_score,description,Movie_Link,...,opening_weekend_gross,gross_worldwide,gross_us_canada,release_date,countries_origin,filming_locations,production_companies,awards_content,genres,languages
0,tt0073195,Jaws,1975,2h 4m,PG,8.1,690K,87.0,When a massive killer shark unleashes chaos on...,https://www.imdb.com/title/tt0073195,...,"$7,061,513","$477,916,625","$267,263,625",1975.0,['United States'],"[""Water Street, Edgartown, Martha's Vineyard, ...","['Zanuck/Brown Productions', 'Universal Pictur...","Won 3 Oscars, 16 wins & 20 nominations total","['Monster Horror', 'Sea Adventure', 'Survival'...",['English']
1,tt0073629,The Rocky Horror Picture Show,1975,1h 40m,R,7.4,174K,65.0,A newly-engaged couple have a breakdown in an ...,https://www.imdb.com/title/tt0073629,...,,"$115,827,018","$112,892,319",1975.0,"['United Kingdom', 'United States']","[""Oakley Court, Windsor Road, Oakley Green, Wi...","['Twentieth Century Fox', 'Michael White Produ...","Awards, 3 wins & 4 nominations total","['B-Horror', 'Dark Comedy', 'Parody', 'Raunchy...",['English']
2,tt0073486,One Flew Over the Cuckoo's Nest,1975,2h 13m,R,8.7,1.1M,84.0,"In the Fall of 1963, a Korean War veteran and ...",https://www.imdb.com/title/tt0073486,...,,"$109,115,366","$108,981,275",1975.0,['United States'],['Oregon State Mental Hospital - 2600 Center S...,"['Fantasy Films', 'N.V. Zvaluw']","Won 5 Oscars, 38 wins & 15 nominations total","['Medical Drama', 'Psychological Drama', 'Drama']",['English']
3,tt0072890,Dog Day Afternoon,1975,2h 5m,R,8.0,281K,86.0,Three amateur robbers plan to hold up a Brookl...,https://www.imdb.com/title/tt0072890,...,,"$50,004,527","$50,000,000",1975.0,['United States'],"['285 Prospect Park West, Brooklyn, New York C...","['Warner Bros.', 'Artists Entertainment Complex']","Won 1 Oscar, 14 wins & 20 nominations total","['Dark Comedy', 'Heist', 'True Crime', 'Biogra...",['English']
4,tt0073692,Shampoo,1975,1h 50m,R,6.4,15K,65.0,"On Election Day, 1968, irresponsible hairdress...",https://www.imdb.com/title/tt0073692,...,,"$49,407,734","$49,407,734",1975.0,['United States'],"[""2270 Bowmont Drive, Beverly Hills, Californi...","['Persky-Bright / Vista', 'Columbia Pictures',...","Won 1 Oscar, 3 wins & 11 nominations total","['Satire', 'Comedy', 'Drama']",['English']



Missing values in key columns:
title             0
genres          787
description    2360
directors        51
stars           344
dtype: int64
Preprocessing features: ['genres', 'description', 'directors', 'stars']
Combined features created for 63249 movies
Building TF-IDF vectors...
Feature vector shape: (63249, 5000)
Sparsity: 0.9939
Large dataset detected. Similarity will be computed on-demand.

RECOMMENDATION SYSTEM IS READY!


<__main__.MovieRecommendationSystem at 0x25a70935550>

In [4]:
# Test the recommendation system with a popular movie
test_movie = "The Dark Knight"
recommendations = recommender.get_movie_recommendations(test_movie, num_recommendations=5)

print(f"\nRecommendations for '{test_movie}':")
print("=" * 60)

if 'error' in recommendations:
    print(f"Error: {recommendations['error']}")
    if 'suggestions' in recommendations:
        print(f"Did you mean: {recommendations['suggestions']}")
else:
    print(f"Input Movie: {recommendations['input_movie']}")
    print(f"Genres: {recommendations['input_movie_info']['genres']}")
    print(f"Year: {recommendations['input_movie_info']['year']}")
    print(f"Rating: {recommendations['input_movie_info']['rating']}")
    print("\nTop Recommendations:")
    
    for i, movie in enumerate(recommendations['recommendations'], 1):
        print(f"\n{i}. {movie['title']} ({movie.get('year', 'N/A')})")
        print(f"   Similarity: {movie['similarity_score']:.4f}")
        print(f"   Genres: {movie.get('genres', 'N/A')}")
        print(f"   Rating: {movie.get('rating', 'N/A')}")


Recommendations for 'The Dark Knight':
Input Movie: The Dark Knight
Genres: ['Action Epic', 'Epic', 'Superhero', 'Tragedy', 'Action', 'Crime', 'Drama', 'Thriller']
Year: 2008
Rating: 9.0

Top Recommendations:

1. The Dark Knight Rises (2012)
   Similarity: 0.6814
   Genres: ['Action Epic', 'Epic', 'Superhero', 'Action', 'Drama', 'Thriller']
   Rating: 8.4

2. Batman Begins (2005)
   Similarity: 0.6669
   Genres: ['Action Epic', 'Epic', 'Superhero', 'Tragedy', 'Action', 'Drama']
   Rating: 8.2

3. Athena (2022)
   Similarity: 0.5234
   Genres: ['Action Epic', 'Epic', 'Psychological Thriller', 'Action', 'Drama', 'Thriller']
   Rating: 6.8

4. Salaar 2 (2025)
   Similarity: 0.5107
   Genres: ['Action Epic', 'Epic', 'One-Person Army Action', 'Action', 'Crime', 'Drama', 'Thriller']
   Rating: nan

5. Salaar (2023)
   Similarity: 0.5047
   Genres: ['Action Epic', 'Epic', 'One-Person Army Action', 'Action', 'Crime', 'Drama', 'Thriller']
   Rating: 6.6


In [5]:
# Interactive recommendation function
def get_recommendations_interactive():
    """
    Interactive function to get movie recommendations
    """
    print("\n" + "=" * 60)
    print("INTERACTIVE MOVIE RECOMMENDATION SYSTEM")
    print("=" * 60)
    
    # Show some random movies as examples
    print("\nHere are some random movies from our database:")
    random_movies = recommender.get_random_movies(5)
    for movie in random_movies:
        print(f"- {movie['title']} ({movie['year']}) - {movie['rating']}")
    
    while True:
        movie_title = input("\nEnter a movie title (or 'quit' to exit): ").strip()
        
        if movie_title.lower() == 'quit':
            print("Thanks for using the recommendation system!")
            break
        
        if not movie_title:
            print("Please enter a valid movie title.")
            continue
        
        try:
            num_recs = int(input("How many recommendations do you want? (default: 5): ") or "5")
        except ValueError:
            num_recs = 5
        
        recommendations = recommender.get_movie_recommendations(movie_title, num_recommendations=num_recs)
        
        print("\n" + "-" * 50)
        
        if 'error' in recommendations:
            print(f"❌ {recommendations['error']}")
            if 'suggestions' in recommendations and recommendations['suggestions']:
                print(f"\n💡 Did you mean one of these?")
                for suggestion in recommendations['suggestions']:
                    print(f"   - {suggestion}")
        else:
            print(f"🎬 Input Movie: {recommendations['input_movie']}")
            print(f"📅 Year: {recommendations['input_movie_info']['year']}")
            print(f"⭐ Rating: {recommendations['input_movie_info']['rating']}")
            print(f"🎭 Genres: {recommendations['input_movie_info']['genres']}")
            
            print(f"\n🎯 Top {len(recommendations['recommendations'])} Recommendations:")
            
            for i, movie in enumerate(recommendations['recommendations'], 1):
                print(f"\n{i}. 🎬 {movie['title']} ({movie.get('year', 'N/A')})")
                print(f"   📊 Similarity: {movie['similarity_score']:.4f}")
                print(f"   ⭐ Rating: {movie.get('rating', 'N/A')}")
                print(f"   🎭 Genres: {movie.get('genres', 'N/A')}")
        
        print("-" * 50)

# Uncomment the line below to run the interactive system
# get_recommendations_interactive()

In [6]:
# Evaluation and analysis functions
def analyze_recommendations(movie_title, num_recommendations=10):
    """
    Analyze the quality of recommendations
    """
    recommendations = recommender.get_movie_recommendations(movie_title, num_recommendations)
    
    if 'error' in recommendations:
        return recommendations
    
    # Analyze genre overlap
    input_genres = str(recommendations['input_movie_info']['genres']).lower()
    genre_matches = []
    
    for rec in recommendations['recommendations']:
        rec_genres = str(rec.get('genres', '')).lower()
        # Simple genre overlap check
        overlap = len(set(input_genres.split()) & set(rec_genres.split()))
        genre_matches.append(overlap)
    
    # Calculate average similarity score
    avg_similarity = np.mean([rec['similarity_score'] for rec in recommendations['recommendations']])
    
    analysis = {
        'input_movie': recommendations['input_movie'],
        'average_similarity': round(avg_similarity, 4),
        'genre_overlap_scores': genre_matches,
        'recommendations_with_analysis': []
    }
    
    for i, rec in enumerate(recommendations['recommendations']):
        rec_analysis = rec.copy()
        rec_analysis['genre_overlap'] = genre_matches[i]
        analysis['recommendations_with_analysis'].append(rec_analysis)
    
    return analysis

# Test analysis
analysis = analyze_recommendations("Inception", 5)
if 'error' not in analysis:
    print(f"\nAnalysis for '{analysis['input_movie']}':")
    print(f"Average Similarity Score: {analysis['average_similarity']}")
    print(f"Genre Overlap Scores: {analysis['genre_overlap_scores']}")
else:
    print(f"Analysis failed: {analysis['error']}")


Analysis for 'Inception':
Average Similarity Score: 0.6859
Genre Overlap Scores: [9, 7, 8, 7, 6]


In [7]:
# System statistics and information
def print_system_stats():
    """
    Print comprehensive system statistics
    """
    print("\n" + "=" * 60)
    print("RECOMMENDATION SYSTEM STATISTICS")
    print("=" * 60)
    
    # Dataset statistics
    print(f"📊 Dataset Size: {recommender.movie_data.shape[0]:,} movies")
    print(f"📅 Year Range: {recommender.movie_data['year'].min():.0f} - {recommender.movie_data['year'].max():.0f}")
    print(f"⭐ Rating Range: {recommender.movie_data['rating'].min():.1f} - {recommender.movie_data['rating'].max():.1f}")
    
    # Feature vector statistics
    if recommender.feature_vectors is not None:
        print(f"\n🔢 Feature Vector Shape: {recommender.feature_vectors.shape}")
        print(f"📈 TF-IDF Features: {recommender.max_features:,}")
        sparsity = 1.0 - recommender.feature_vectors.nnz / (recommender.feature_vectors.shape[0] * recommender.feature_vectors.shape[1])
        print(f"🕳️  Matrix Sparsity: {sparsity:.4f} ({sparsity*100:.2f}% zeros)")
    
    # Genre statistics
    all_genres = []
    for genres in recommender.movie_data['genres'].dropna():
        if isinstance(genres, str):
            # Clean up the genre string
            clean_genres = genres.replace('[', '').replace(']', '').replace("'", '').replace('"', '')
            genre_list = [g.strip() for g in clean_genres.split(',') if g.strip()]
            all_genres.extend(genre_list)
    
    genre_counts = pd.Series(all_genres).value_counts()
    print(f"\n🎭 Total Unique Genres: {len(genre_counts)}")
    print(f"🔝 Top 5 Genres:")
    for genre, count in genre_counts.head().items():
        print(f"   - {genre}: {count:,} movies")
    
    # Memory usage
    if recommender.feature_vectors is not None:
        memory_mb = recommender.feature_vectors.data.nbytes / (1024 * 1024)
        print(f"\n💾 Feature Vector Memory: {memory_mb:.2f} MB")
    
    print("=" * 60)

print_system_stats()


RECOMMENDATION SYSTEM STATISTICS
📊 Dataset Size: 63,249 movies
📅 Year Range: 1920 - 2025
⭐ Rating Range: 1.0 - 10.0

🔢 Feature Vector Shape: (63249, 5000)
📈 TF-IDF Features: 5,000
🕳️  Matrix Sparsity: 0.9939 (99.39% zeros)

🎭 Total Unique Genres: 192
🔝 Top 5 Genres:
   - Drama: 35,871 movies
   - Comedy: 19,069 movies
   - Romance: 14,207 movies
   - Thriller: 9,719 movies
   - Crime: 9,624 movies

💾 Feature Vector Memory: 14.67 MB
