In [4]:
# Unique Implementation of Movie Recommendation System
# Author: Sarowar Alam
# Description: Custom implementation of a recommendation system using Content-Based and Collaborative Filtering approaches.

# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# ===========================
# Load and Preprocess Dataset
# ===========================

# Define file paths
movies_file = "C:\\Users\\sarow\\Downloads\\ml-32m\\movies.csv"
ratings_file = "C:\\Users\\sarow\\Downloads\\ml-32m\\ratings.csv"

# Read datasets into DataFrames
movies_df = pd.read_csv(movies_file)
ratings_df = pd.read_csv(ratings_file)

# Restrict to the first 10,000 movies for quicker processing
movies_df = movies_df.head(10000)

# ===========================
# Content-Based Recommendation System
# ===========================
print("Step 1: Content-Based Recommendation System")

# Combine movie title and genres into a single textual feature
movies_df['combined_features'] = movies_df['title'] + " " + movies_df['genres']

# Initialize TF-IDF Vectorizer for text processing
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Convert combined features into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['combined_features'])

# Calculate cosine similarity scores for all movies
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_by_content(movie_title, matrix=similarity_matrix):
    """
    Recommends movies similar to the given movie based on content features.
    
    Parameters:
    - movie_title (str): The title of the movie.
    - matrix (array): Precomputed similarity matrix.
    
    Returns:
    - list: Top 5 similar movie titles.
    """
    try:
        # Find the index of the given movie
        movie_idx = movies_df[movies_df['title'] == movie_title].index[0]
        
        # Get similarity scores for the movie
        scores = list(enumerate(matrix[movie_idx]))
        
        # Sort by score in descending order and extract top results
        scores_sorted = sorted(scores, key=lambda x: x[1], reverse=True)
        top_movies = [movies_df.iloc[i[0]]['title'] for i in scores_sorted[1:6]]
        
        return top_movies
    except IndexError:
        return ["Movie not found in dataset."]

# Example: Recommend movies similar to "Toy Story (1995)"
print("Content-Based Recommendations for 'Toy Story (1995)':")
print(recommend_by_content("Toy Story (1995)"))

# ===========================
# Collaborative Filtering Recommendation System
# ===========================
print("\nStep 2: Collaborative Filtering Recommendation System")

# Prepare data for collaborative filtering
reader = Reader(rating_scale=(0.5, 5.0))
surprise_data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Split data into training and testing sets
train_data, test_data = train_test_split(surprise_data, test_size=0.25)

# Train the SVD model
svd_model = SVD()
svd_model.fit(train_data)

# Evaluate model performance
predictions = svd_model.test(test_data)
rmse_score = accuracy.rmse(predictions)
print(f"Collaborative Filtering Model RMSE: {rmse_score:.4f}")

def recommend_by_collaboration(user_id, model=svd_model, num_recommendations=5):
    """
    Recommends movies to a user based on collaborative filtering.
    
    Parameters:
    - user_id (int): The ID of the user.
    - model: Trained collaborative filtering model.
    - num_recommendations (int): Number of recommendations to return.
    
    Returns:
    - list: Top recommended movie titles.
    """
    try:
        # Retrieve all movie IDs and rated movies by the user
        all_movies = ratings_df['movieId'].unique()
        rated_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
        
        # Predict ratings for unrated movies
        unrated_movies = [movie for movie in all_movies if movie not in rated_movies]
        predicted_ratings = [(movie, model.predict(user_id, movie).est) for movie in unrated_movies]
        
        # Sort predictions by rating and get top recommendations
        top_predictions = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:num_recommendations]
        top_movie_ids = [pred[0] for pred in top_predictions]
        
        # Fetch movie titles
        recommended_titles = movies_df[movies_df['movieId'].isin(top_movie_ids)]['title'].values
        
        return recommended_titles
    except KeyError:
        return ["User not found in dataset."]

# Example: Recommend movies for user with ID 1
print("\nCollaborative Filtering Recommendations for User ID 1:")
print(recommend_by_collaboration(user_id=1))


Step 1: Content-Based Recommendation System
Content-Based Recommendations for 'Toy Story (1995)':
['Toy Story 2 (1999)', 'Toy, The (1982)', "We're Back! A Dinosaur's Story (1993)", 'NeverEnding Story, The (1984)', 'Monsters, Inc. (2001)']

Step 2: Collaborative Filtering Recommendation System
RMSE: 0.7756
Collaborative Filtering Model RMSE: 0.7756

Collaborative Filtering Recommendations for User ID 1:
['Alien (1979)' 'Psycho (1960)' 'Jaws (1975)'
 'Thing from Another World, The (1951)' 'All That Heaven Allows (1955)']
