Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


Load and Explore Data

In [None]:
# Load datasets
# Both paths are relative to the notebook's working directory.
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

print("Movies shape:", movies.shape)
print("\nFirst few rows:")
print(movies.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
movies.info()
print("\nCredits shape:", credits.shape)


Data Preprocessing

In [None]:
# Merge datasets
# NOTE(review): joining on 'title' pairs every same-titled row across the two
# files, so duplicated titles yield extra (possibly mismatched) rows. Joining
# on the movie id would avoid that -- confirm the id column names first.
movies = movies.merge(credits, on='title')

# Select relevant features
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'vote_average', 'vote_count']]

# Check for missing values
print("Missing values:")
print(movies.isnull().sum())

# Drop rows with missing overviews, then renumber the rows 0..n-1 so that
# index labels equal row positions. The similarity matrices built later are
# purely positional, and a gap in the labels would shift every lookup that
# comes after a dropped row.
movies = movies.dropna(subset=['overview']).reset_index(drop=True)

# Fill remaining missing values with an empty JSON list
# ('overview' needs no fill: rows without one were just dropped)
for json_column in ['keywords', 'cast', 'crew', 'genres']:
    movies[json_column] = movies[json_column].fillna('[]')

print(f"\nCleaned dataset shape: {movies.shape}")


Feature Engineering

In [None]:
import ast

# Function to extract names from JSON-like strings
def extract_names(text, limit=3):
    """Extract names from JSON string

    Parameters:
    text: String holding a list of dicts, each with a 'name' key
    limit: Maximum number of names to return (taken from the front)

    Returns:
    List of names, or an empty list when the text cannot be parsed or an
    entry has no 'name'.
    """
    try:
        data = ast.literal_eval(text)
        return [item['name'] for item in data[:limit]]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Only parsing / lookup failures mean "no usable names"; anything else
        # (e.g. KeyboardInterrupt) must still propagate.
        return []

# Function to get director
def get_director(crew_text):
    """Extract director from crew

    Parameters:
    crew_text: String holding a list of dicts with 'job' and 'name' keys

    Returns:
    One-element list with the name of the first crew member whose job is
    'Director', or an empty list when there is none or the text is unusable.
    """
    try:
        crew = ast.literal_eval(crew_text)
        for member in crew:
            if member['job'] == 'Director':
                return [member['name']]
        return []
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Only parsing / lookup failures mean "no director"; anything else
        # (e.g. KeyboardInterrupt) must still propagate.
        return []

# Replace the raw JSON-like columns with plain lists of names
for list_column in ('genres', 'keywords', 'cast'):
    movies[list_column] = movies[list_column].apply(extract_names)

# The director comes from the crew column, which itself is left untouched
movies['director'] = movies['crew'].apply(get_director)

print("Sample processed features:")
preview_columns = ['title', 'genres', 'keywords', 'cast', 'director']
print(movies[preview_columns].head())


Create Combined Features for Content-Based Filtering

In [None]:
# Function to clean text
def clean_text(text_list):
    """Turn a list of names into one lowercase, space-separated string.

    Spaces inside each item are removed (so "Sam Worthington" becomes
    "samworthington") and the items are then joined with single spaces.
    Anything that is not a list yields an empty string.
    """
    if not isinstance(text_list, list):
        return ''
    tokens = (str(entry).lower().replace(' ', '') for entry in text_list)
    return ' '.join(tokens)

# Flatten each list-valued feature into a single token string
for feature in ('genres', 'keywords', 'cast', 'director'):
    movies[f'{feature}_str'] = movies[feature].apply(clean_text)

# Create combined tags: the overview followed by every flattened feature,
# separated by single spaces
combined_tags = movies['overview']
for tag_source in ('genres_str', 'keywords_str', 'cast_str', 'director_str'):
    combined_tags = combined_tags + ' ' + movies[tag_source]
movies['tags'] = combined_tags

# Create a new dataframe for recommendation, with the tags lowercased
movies_data = (
    movies[['movie_id', 'title', 'tags', 'vote_average', 'vote_count']]
    .assign(tags=lambda frame: frame['tags'].str.lower())
)

print("Tags created successfully!")
print("\nSample tag:")
first_tag = movies_data['tags'].iloc[0]
print(first_tag[:200])


Content-Based Recommendation System (Method 1)

In [None]:
# Bag-of-words model over the tags: English stop words removed and the
# vocabulary capped at 5000 terms
cv = CountVectorizer(stop_words='english', max_features=5000)

# Fit on the tags, then densify the resulting count matrix
tag_counts = cv.fit_transform(movies_data['tags'])
tag_vectors = tag_counts.toarray()

print(f"Vector shape: {tag_vectors.shape}")
print(f"Total movies: {len(movies_data)}")

# Pairwise cosine similarity between all movies (row i <-> row i of movies_data)
similarity_matrix = cosine_similarity(tag_vectors)
print(f"Similarity matrix shape: {similarity_matrix.shape}")


Recommendation Function (Content-Based)

In [None]:
def recommend_movies_content(movie_title, n_recommendations=10):
    """
    Recommend movies based on content similarity

    The title is looked up case-insensitively in ``movies_data``; every other
    movie is then ranked by its score in ``similarity_matrix`` and the top
    entries are printed.

    Parameters:
    movie_title: Name of the movie
    n_recommendations: Number of recommendations to return

    Returns:
    DataFrame with columns rank, title, similarity_score and rating, or None
    when the title is not found.
    """
    # similarity_matrix is indexed by row position, so resolve the title to a
    # position -- a DataFrame index label is not a valid substitute once rows
    # have been dropped.
    title_matches = (movies_data['title'].str.lower() == movie_title.lower()).to_numpy()
    match_positions = np.flatnonzero(title_matches)
    if match_positions.size == 0:
        print(f"Movie '{movie_title}' not found in database.")
        print("\nTry searching from these popular movies:")
        print(movies_data['title'].head(10).tolist())
        return None
    movie_pos = int(match_positions[0])

    # Get similarity scores
    distances = similarity_matrix[movie_pos]

    # Stable descending sort. The query movie is removed explicitly: when
    # another row ties at the top score, the query is not guaranteed to be first.
    ranked_positions = [
        int(pos)
        for pos in np.argsort(-distances, kind='stable')
        if pos != movie_pos
    ][:n_recommendations]

    print(f"\n{'='*60}")
    print(f"Top {n_recommendations} recommendations for: '{movie_title}'")
    print(f"{'='*60}\n")

    recommendations = []
    for idx, pos in enumerate(ranked_positions, 1):
        movie_name = movies_data.iloc[pos]['title']
        rating = movies_data.iloc[pos]['vote_average']
        score = distances[pos]
        recommendations.append({
            'rank': idx,
            'title': movie_name,
            'similarity_score': round(score, 4),
            'rating': rating
        })
        print(f"{idx}. {movie_name}")
        print(f"   Similarity: {score:.4f} | Rating: {rating}/10")
        print()

    return pd.DataFrame(recommendations)

# Test the recommendation system
result = recommend_movies_content('Avatar', 5)


Alternative Method - TF-IDF Based Recommendation

In [None]:
# Using TF-IDF (Term Frequency-Inverse Document Frequency)
# Same vocabulary settings as the count-based model: English stop words
# removed, at most 5000 terms
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_weights = tfidf.fit_transform(movies_data['tags'])
tfidf_vectors = tfidf_weights.toarray()

# Calculate similarity
tfidf_similarity = cosine_similarity(tfidf_vectors)

def recommend_movies_tfidf(movie_title, n_recommendations=10):
    """Recommend using TF-IDF vectors

    The title is looked up case-insensitively in ``movies_data``; the other
    movies are ranked by their score in ``tfidf_similarity`` and printed.

    Parameters:
    movie_title: Name of the movie
    n_recommendations: Number of recommendations to print

    Returns:
    None (results are printed only).
    """
    # tfidf_similarity is indexed by row position, so resolve the title to a
    # position rather than to a DataFrame index label.
    title_matches = (movies_data['title'].str.lower() == movie_title.lower()).to_numpy()
    match_positions = np.flatnonzero(title_matches)
    if match_positions.size == 0:
        print(f"Movie '{movie_title}' not found.")
        return None
    movie_pos = int(match_positions[0])

    distances = tfidf_similarity[movie_pos]

    # Stable descending sort with the query movie removed explicitly (a tie at
    # the top score would otherwise let it slip into the results).
    ranked_positions = [
        int(pos)
        for pos in np.argsort(-distances, kind='stable')
        if pos != movie_pos
    ][:n_recommendations]

    print(f"\n{'='*60}")
    print(f"TF-IDF Based Recommendations for: '{movie_title}'")
    print(f"{'='*60}\n")

    for idx, pos in enumerate(ranked_positions, 1):
        movie_name = movies_data.iloc[pos]['title']
        rating = movies_data.iloc[pos]['vote_average']
        score = distances[pos]
        print(f"{idx}. {movie_name} (Similarity: {score:.4f}, Rating: {rating}/10)")

    return None

# Test TF-IDF method
recommend_movies_tfidf('The Dark Knight', 5)


Search Function

In [None]:
def search_movies(query, limit=10):
    """Search for movies by partial title match

    Parameters:
    query: Text to look for inside the title (case-insensitive, literal)
    limit: Maximum number of matches to print and return

    Returns:
    DataFrame with the title and vote_average of the first `limit` matches,
    or None when nothing matches.
    """
    # regex=False: the query is user-typed text, not a pattern. Treated as a
    # regex, "(" would raise and "." would match every title.
    title_hits = movies_data['title'].str.contains(query, case=False, na=False, regex=False)
    matches = movies_data[title_hits]

    if matches.empty:
        print(f"No movies found matching '{query}'")
        return None

    print(f"Found {len(matches)} movies matching '{query}':\n")
    results = matches[['title', 'vote_average']].head(limit)
    for title, rating in zip(results['title'], results['vote_average']):
        print(f"- {title} (Rating: {rating}/10)")

    return results

# Test search
search_movies('iron', 5)


Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Top rated movies
top_movies = movies_data.nlargest(10, 'vote_average')[['title', 'vote_average']]

# Horizontal bar chart of the ten highest vote averages
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_movies, x='vote_average', y='title', palette='viridis', ax=ax)
ax.set_title('Top 10 Highest Rated Movies', fontsize=16, fontweight='bold')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Movie Title')
fig.tight_layout()
plt.show()

# Distribution of ratings
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(movies_data['vote_average'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Movie Ratings', fontsize=16, fontweight='bold')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()


Interactive Recommendation

In [None]:
def interactive_recommender():
    """Interactive recommendation system

    Repeatedly asks for a movie name. An exact (case-insensitive) title match
    prints content-based recommendations; otherwise a partial-title search is
    shown and the user is asked whether to continue. Typing 'quit' exits.
    """
    print("="*60)
    print("MOVIE RECOMMENDATION SYSTEM")
    print("="*60)

    # Lower-cased titles, built once, for the case-insensitive exact match
    known_titles = set(movies_data['title'].str.lower().dropna())

    while True:
        movie_name = input("\nEnter a movie name (or 'quit' to exit): ").strip()

        if movie_name.lower() == 'quit':
            print("Thank you for using the recommendation system!")
            break

        # Try exact match first. Both sides must be lower-cased: comparing the
        # raw input against lower-cased titles would reject "Avatar".
        if movie_name.lower() in known_titles:
            recommend_movies_content(movie_name, 5)
        else:
            # Search for similar titles
            print(f"\nExact match not found. Searching for similar titles...")
            search_movies(movie_name, 5)

            retry = input("\nTry another search? (yes/no): ").strip().lower()
            if retry != 'yes':
                break


interactive_recommender()


Test Multiple Movies

In [None]:
# Test with different movies
test_movies = ['The Avengers', 'Inception', 'Titanic', 'The Matrix']

for movie in test_movies:
    print(f"\n{'#'*70}\n")
    try:
        recommend_movies_content(movie, 3)
    except Exception as exc:
        # An unknown title is already reported by recommend_movies_content
        # itself, so anything caught here is an unexpected failure: show it
        # instead of hiding it behind a generic message.
        print(f"Could not find recommendations for {movie}: {exc!r}")


Save Results

In [None]:
# Persist the similarity matrix and the processed movie table for later use
similarity_file = 'similarity_matrix.npy'
processed_file = 'processed_movies.csv'

np.save(similarity_file, similarity_matrix)
movies_data.to_csv(processed_file, index=False)

print("Model saved successfully!")
print("Files created:")
for saved_file in (similarity_file, processed_file):
    print(f"- {saved_file}")


User Preference Based Recommendation

In [None]:
def _split_entries(raw_text, lowercase=True):
    """Split a comma-separated answer into trimmed, non-empty entries."""
    entries = [part.strip() for part in raw_text.split(',')]
    # Drop blanks left by stray commas: an empty string would later
    # substring-match every actor / director name.
    entries = [part for part in entries if part]
    return [part.lower() for part in entries] if lowercase else entries


def get_user_preferences():
    """
    Collect user preferences for personalized recommendations

    Prompts on stdin for genres, actors, directors, a minimum rating and liked
    movies; every prompt may be skipped with Enter.

    Returns:
    Dict with keys 'favorite_genres', 'favorite_actors', 'favorite_directors'
    (lists of lower-cased strings), 'min_rating' (number, 0 meaning no
    minimum) and 'favorite_movies' (list of titles as typed).
    """
    print("\n" + "="*70)
    print("PERSONALIZED MOVIE RECOMMENDATION SYSTEM")
    print("="*70)

    preferences = {
        'favorite_genres': [],
        'favorite_actors': [],
        'favorite_directors': [],
        'min_rating': 0,
        'favorite_movies': []
    }

    # Get favorite genres
    print("\nAvailable genres:")
    all_genres = set()
    for genres_list in movies['genres']:
        if isinstance(genres_list, list):
            all_genres.update(genres_list)

    print(", ".join(sorted(all_genres)))

    genre_input = input("\nEnter your favorite genres (comma-separated, or press Enter to skip): ").strip()
    if genre_input:
        preferences['favorite_genres'] = _split_entries(genre_input)

    # Get favorite actors
    actor_input = input("\nEnter your favorite actors (comma-separated, or press Enter to skip): ").strip()
    if actor_input:
        preferences['favorite_actors'] = _split_entries(actor_input)

    # Get favorite directors
    director_input = input("\nEnter your favorite directors (comma-separated, or press Enter to skip): ").strip()
    if director_input:
        preferences['favorite_directors'] = _split_entries(director_input)

    # Get minimum rating
    rating_input = input("\nMinimum rating (0-10, press Enter for any rating): ").strip()
    if rating_input:
        try:
            min_rating = float(rating_input)
        except ValueError:
            min_rating = None
        # The range test also rejects NaN, which float() happily parses
        if min_rating is None or not (0 <= min_rating <= 10):
            print("Invalid rating; no minimum rating will be applied.")
            min_rating = 0
        preferences['min_rating'] = min_rating

    # Get favorite movies (case is kept; lookups lower-case the titles later)
    movie_input = input("\nEnter movies you liked (comma-separated, or press Enter to skip): ").strip()
    if movie_input:
        preferences['favorite_movies'] = _split_entries(movie_input, lowercase=False)

    return preferences

def recommend_by_preferences(preferences, n_recommendations=10):
    """
    Recommend movies based on user preferences

    Each movie that passes the minimum-rating filter gets a preference score:
    3 per matching favourite genre (exact name), 2 per favourite actor and 2
    per favourite director (substring match), plus 5 times its similarity to
    each favourite movie. Movies are ranked by
    0.7 * preference_score + 0.3 * vote_average and the top ones are printed.

    Parameters:
    preferences: Dict as produced by get_user_preferences(); missing keys are
        treated as "no preference"
    n_recommendations: Number of movies to return

    Returns:
    DataFrame of the top movies: the movies_data columns plus
    'preference_score' and 'combined_score'.
    """
    def _wanted(key):
        # Lower-cased, non-empty entries ('' would substring-match everything)
        values = preferences.get(key) or []
        return [str(value).strip().lower() for value in values if str(value).strip()]

    def _count_matches(names, wanted, substring):
        # Number of wanted entries found among `names` (0 for non-list cells)
        if not isinstance(names, list):
            return 0
        lowered = [str(name).lower() for name in names]
        if substring:
            return sum(1 for want in wanted if any(want in name for name in lowered))
        return sum(1 for want in wanted if want in lowered)

    favorite_genres = _wanted('favorite_genres')
    favorite_actors = _wanted('favorite_actors')
    favorite_directors = _wanted('favorite_directors')
    favorite_movies = [
        str(title).strip()
        for title in (preferences.get('favorite_movies') or [])
        if str(title).strip()
    ]
    min_rating = preferences.get('min_rating') or 0

    # Everything below works on row POSITIONS. movies_data was built from
    # movies row-for-row and similarity_matrix from movies_data, so one
    # position identifies the same movie in all three.
    if min_rating > 0:
        keep = (movies_data['vote_average'] >= min_rating).to_numpy()
        kept_positions = np.flatnonzero(keep)
        print(f"\nFiltered by rating >= {min_rating}: {len(kept_positions)} movies remaining")
    else:
        kept_positions = np.arange(len(movies_data))

    filtered_movies = movies_data.iloc[kept_positions].copy()
    filtered_features = movies.iloc[kept_positions]

    # Score movies based on preferences
    preference_score = np.zeros(len(filtered_movies), dtype=float)

    # Score by genres (weighted heavily)
    if favorite_genres:
        preference_score += 3 * np.array(
            [_count_matches(names, favorite_genres, False) for names in filtered_features['genres']],
            dtype=float,
        )

    # Score by actors
    if favorite_actors:
        preference_score += 2 * np.array(
            [_count_matches(names, favorite_actors, True) for names in filtered_features['cast']],
            dtype=float,
        )

    # Score by directors
    if favorite_directors:
        preference_score += 2 * np.array(
            [_count_matches(names, favorite_directors, True) for names in filtered_features['director']],
            dtype=float,
        )

    # Also consider similarity to favorite movies (weighted highly)
    if favorite_movies:
        titles_lower = movies_data['title'].str.lower()
        for fav_movie in favorite_movies:
            hits = np.flatnonzero((titles_lower == fav_movie.lower()).to_numpy())
            if hits.size == 0:
                print(f"Could not find movie: '{fav_movie}'")
                continue
            # Row of the favourite movie, restricted to the surviving movies
            preference_score += 5 * similarity_matrix[int(hits[0])][kept_positions]
            print(f"Added similarity scores based on: '{fav_movie}'")

    filtered_movies['preference_score'] = preference_score

    # Sort by preference score and rating
    filtered_movies['combined_score'] = (
        filtered_movies['preference_score'] * 0.7 +
        filtered_movies['vote_average'] * 0.3
    )

    # Stable descending order, so ties keep their original row order
    combined = filtered_movies['combined_score'].to_numpy(dtype=float)
    top_order = np.argsort(-combined, kind='stable')[:n_recommendations]
    recommendations = filtered_movies.iloc[top_order]
    top_positions = kept_positions[top_order]

    print(f"\n{'='*70}")
    print(f"TOP {n_recommendations} PERSONALIZED RECOMMENDATIONS")
    print(f"{'='*70}\n")

    for idx, (position, (_, row)) in enumerate(zip(top_positions, recommendations.iterrows()), 1):
        genres = movies['genres'].iloc[int(position)]
        if not isinstance(genres, list):
            genres = []

        print(f"{idx}. {row['title']}")
        print(f"   Rating: {row['vote_average']}/10")
        print(f"   Preference Score: {row['preference_score']:.2f}")
        print(f"   Genres: {', '.join(genres) if genres else 'N/A'}")
        print()

    return recommendations

# Test the enhanced system
# Uncomment to run interactively:
# user_prefs = get_user_preferences()
# recommend_by_preferences(user_prefs, 10)


Hybrid Recommendation System

In [None]:
def hybrid_recommender(movie_title=None, preferences=None, n_recommendations=10):
    """
    Hybrid system combining content-based and preference-based filtering

    Up to 20 content-based candidates (when movie_title is given) are merged
    by title with up to 20 preference-based candidates (when preferences are
    given), then ranked by
    0.4 * content_score + 0.4 * preference_score + 0.02 * rating.

    Parameters:
    movie_title: Optional title to seed the content-based candidates
    preferences: Optional dict as produced by get_user_preferences()
    n_recommendations: Number of entries to print and return

    Returns:
    List of dicts with keys title, rating, content_score, preference_score
    and hybrid_score, best first.
    """
    all_recommendations = []

    # Get content-based recommendations if movie provided
    if movie_title:
        # similarity_matrix is positional, so resolve the title to a row
        # position rather than a DataFrame index label.
        title_matches = (movies_data['title'].str.lower() == movie_title.lower()).to_numpy()
        match_positions = np.flatnonzero(title_matches)
        if match_positions.size == 0:
            print(f"Movie '{movie_title}' not found for content-based filtering")
        else:
            movie_pos = int(match_positions[0])
            distances = similarity_matrix[movie_pos]
            # Stable descending sort, with the seed movie removed explicitly
            similar_positions = [
                int(pos)
                for pos in np.argsort(-distances, kind='stable')
                if pos != movie_pos
            ][:20]

            for pos in similar_positions:
                candidate = movies_data.iloc[pos]
                all_recommendations.append({
                    'title': candidate['title'],
                    'rating': candidate['vote_average'],
                    'content_score': distances[pos],
                    'preference_score': 0
                })

    # Add preference-based filtering
    if preferences:
        pref_recs = recommend_by_preferences(preferences, 20)

        # First record seen for each title, for constant-time merging
        first_by_title = {}
        for rec in all_recommendations:
            first_by_title.setdefault(rec['title'], rec)

        for _, row in pref_recs.iterrows():
            existing = first_by_title.get(row['title'])

            if existing:
                existing['preference_score'] = row['preference_score']
            else:
                new_rec = {
                    'title': row['title'],
                    'rating': row['vote_average'],
                    'content_score': 0,
                    'preference_score': row['preference_score']
                }
                all_recommendations.append(new_rec)
                first_by_title[row['title']] = new_rec

    # Calculate hybrid score
    for rec in all_recommendations:
        rec['hybrid_score'] = (
            rec['content_score'] * 0.4 +
            rec['preference_score'] * 0.4 +
            rec['rating'] * 0.02  # i.e. (rating / 10) with weight 0.2
        )

    # Sort by hybrid score
    all_recommendations.sort(key=lambda x: x['hybrid_score'], reverse=True)

    print(f"\n{'='*70}")
    print(f"HYBRID RECOMMENDATIONS")
    print(f"{'='*70}\n")

    for idx, rec in enumerate(all_recommendations[:n_recommendations], 1):
        print(f"{idx}. {rec['title']}")
        print(f"   Rating: {rec['rating']}/10 | Hybrid Score: {rec['hybrid_score']:.4f}")
        print(f"   Content: {rec['content_score']:.3f} | Preference: {rec['preference_score']:.2f}")
        print()

    return all_recommendations[:n_recommendations]

# Test hybrid system
# Example usage:
# Genres are matched by their exact (lower-cased) name, so the genre must be
# spelled 'science fiction' -- 'sci-fi' would never match.
prefs = {
    'favorite_genres': ['action', 'science fiction'],
    'favorite_actors': ['tom cruise'],
    'favorite_directors': [],
    'min_rating': 7.0,
    'favorite_movies': ['The Matrix']
}
hybrid_recommender(movie_title='Inception', preferences=prefs, n_recommendations=10)


Multi-Modal Input System

In [None]:
def _prompt_count(prompt, default=10):
    """Ask for a positive whole number; return `default` for anything else."""
    answer = input(prompt).strip()
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse
    if answer.isdecimal() and int(answer) > 0:
        return int(answer)
    return default


def advanced_recommendation_system():
    """
    Complete interactive system with multiple input modes

    Prints a menu once, then loops: reads a choice (1-5), runs the matching
    recommender and asks whether to continue. An invalid choice is re-prompted
    straight away.
    """
    print("\n" + "="*70)
    print("ADVANCED MOVIE RECOMMENDATION SYSTEM")
    print("="*70)
    print("\nChoose recommendation mode:")
    print("1. Based on a movie you liked")
    print("2. Based on your preferences (genres, actors, directors)")
    print("3. Hybrid (combine both methods)")
    print("4. Random discovery (high-rated movies)")
    print("5. Exit")

    while True:
        choice = input("\nEnter your choice (1-5): ").strip()

        if choice == '1':
            movie_name = input("\nEnter a movie you liked: ").strip()
            n_recs = _prompt_count("How many recommendations? (default 10): ")
            recommend_movies_content(movie_name, n_recs)

        elif choice == '2':
            preferences = get_user_preferences()
            n_recs = _prompt_count("\nHow many recommendations? (default 10): ")
            recommend_by_preferences(preferences, n_recs)

        elif choice == '3':
            movie_name = input("\nEnter a movie you liked (or press Enter to skip): ").strip()
            movie_name = movie_name if movie_name else None

            print("\nNow let's get your preferences...")
            preferences = get_user_preferences()

            n_recs = _prompt_count("\nHow many recommendations? (default 10): ")

            hybrid_recommender(movie_name, preferences, n_recs)

        elif choice == '4':
            n_recs = _prompt_count("\nHow many movies to discover? (default 10): ")

            # Random high-rated movies. The sample size is capped by the size
            # of the high-rated pool itself; asking for more rows than the
            # pool holds would raise ValueError.
            high_rated_pool = movies_data[movies_data['vote_average'] >= 7.5]
            high_rated = high_rated_pool.sample(n=min(n_recs, len(high_rated_pool)))

            print(f"\n{'='*70}")
            print(f"DISCOVER HIGHLY-RATED MOVIES")
            print(f"{'='*70}\n")

            for idx, (_, row) in enumerate(high_rated.iterrows(), 1):
                print(f"{idx}. {row['title']} - Rating: {row['vote_average']}/10")

        elif choice == '5':
            print("\nThank you for using the recommendation system!")
            break

        else:
            print("\nInvalid choice. Please select 1-5.")
            # Nothing was recommended, so ask for the choice again instead of
            # offering "another" recommendation.
            continue

        continue_choice = input("\nWould you like another recommendation? (yes/no): ").strip().lower()
        if continue_choice != 'yes':
            print("\nThank you for using the recommendation system!")
            break

advanced_recommendation_system()


 Genre-Based Filtering Enhancement

In [None]:
def recommend_by_genre(genre, min_rating=6.0, n_recommendations=10):
    """
    Recommend top movies from a specific genre

    Parameters:
    genre: Genre name, matched case-insensitively against each movie's genres
    min_rating: Lowest vote_average a movie may have to be listed
    n_recommendations: Number of movies to print and return

    Returns:
    List of dicts with keys title, rating and vote_count, ordered by rating
    and then vote count, both descending.
    """
    wanted = genre.lower()

    def _has_genre(names):
        return isinstance(names, list) and wanted in [str(name).lower() for name in names]

    # movies_data was built from movies row-for-row, so the two frames are
    # combined by row position. (Labels from iterrows() must not be fed to
    # .iloc: they stop matching positions once rows have been dropped.)
    genre_ok = np.array([_has_genre(names) for names in movies['genres']], dtype=bool)
    rating_ok = (movies_data['vote_average'] >= min_rating).to_numpy()
    selected = movies_data.iloc[np.flatnonzero(genre_ok & rating_ok)]

    genre_movies = [
        {'title': title, 'rating': rating, 'vote_count': votes}
        for title, rating, votes in zip(
            selected['title'], selected['vote_average'], selected['vote_count']
        )
    ]

    # Sort by rating and vote count
    genre_movies.sort(key=lambda x: (x['rating'], x['vote_count']), reverse=True)

    print(f"\n{'='*70}")
    print(f"TOP {genre.upper()} MOVIES (Rating >= {min_rating})")
    print(f"{'='*70}\n")

    for idx, movie in enumerate(genre_movies[:n_recommendations], 1):
        print(f"{idx}. {movie['title']}")
        print(f"   Rating: {movie['rating']}/10 | Votes: {movie['vote_count']}")
        print()

    return genre_movies[:n_recommendations]

# Test genre-based recommendations
recommend_by_genre('Action', min_rating=7.0, n_recommendations=10)


Mood-Based Recommendations


In [None]:
def recommend_by_mood(mood, n_recommendations=10):
    """
    Recommend movies based on user's current mood

    The mood is mapped to a fixed set of genres; movies having at least one of
    them and a vote_average of 6.5 or more are listed, best rated first.

    Parameters:
    mood: One of the keys of the mood map below (case-insensitive)
    n_recommendations: Number of movies to print and return

    Returns:
    List of dicts with keys title, rating and genres, or None when the mood
    is not recognized.
    """
    mood_genre_map = {
        'happy': ['Comedy', 'Animation', 'Family', 'Music'],
        'sad': ['Drama', 'Romance'],
        'excited': ['Action', 'Adventure', 'Thriller'],
        'scared': ['Horror', 'Thriller', 'Mystery'],
        'thoughtful': ['Drama', 'Documentary', 'History'],
        'romantic': ['Romance', 'Drama', 'Comedy'],
        'adventurous': ['Adventure', 'Action', 'Fantasy', 'Science Fiction']
    }

    mood = mood.lower()
    if mood not in mood_genre_map:
        print(f"Mood '{mood}' not recognized.")
        print(f"Available moods: {', '.join(mood_genre_map.keys())}")
        return

    target_genres = {name.lower() for name in mood_genre_map[mood]}

    def _fits_mood(names):
        # True when the movie has at least one of the target genres
        return isinstance(names, list) and any(
            str(name).lower() in target_genres for name in names
        )

    # movies_data was built from movies row-for-row, so the two frames are
    # combined by row position. (Labels from iterrows() must not be fed to
    # .iloc: they stop matching positions once rows have been dropped.)
    genre_ok = np.array([_fits_mood(names) for names in movies['genres']], dtype=bool)
    rating_ok = (movies_data['vote_average'] >= 6.5).to_numpy()
    positions = np.flatnonzero(genre_ok & rating_ok)

    selected = movies_data.iloc[positions]
    selected_genres = movies['genres'].iloc[positions]
    mood_movies = [
        {'title': title, 'rating': rating, 'genres': genres}
        for title, rating, genres in zip(
            selected['title'], selected['vote_average'], selected_genres
        )
    ]

    # Sort by rating
    mood_movies.sort(key=lambda x: x['rating'], reverse=True)

    print(f"\n{'='*70}")
    print(f"MOVIES FOR WHEN YOU'RE FEELING {mood.upper()}")
    print(f"{'='*70}\n")

    for idx, movie in enumerate(mood_movies[:n_recommendations], 1):
        print(f"{idx}. {movie['title']}")
        print(f"   Rating: {movie['rating']}/10")
        print(f"   Genres: {', '.join(movie['genres'])}")
        print()

    return mood_movies[:n_recommendations]

# Test mood-based recommendations
recommend_by_mood('excited', 8)
