# Enhancing a Movie Recommender System 

## Part 1: Load and Preprocess the Data

### Importing Required `Libraries`

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity 
from scipy.sparse import csr_matrix 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score, recall_score, f1_score

### Loading and preview data using `pandas`

In [1]:
# Load datasets
movies = pd.read_csv('movie_data/movies.csv')    # Contains: movieId, title, genres
ratings = pd.read_csv('movie_data/ratings.csv')  # Contains: userId, movieId, rating, timestamp

NameError: name 'pd' is not defined

### Preprocessing the `movies.csv` dataset to make genres usable as features

In [None]:
# Checking If There Are Any Missing Values at All
print(movies.isnull().values.any())  # Returns True if there are missing values, False otherwise
print(ratings.isnull().values.any()) # Returns True if there are missing values, False otherwise

#Process the 'genres' column by replacing '|' with a space
movies["genres"] = movies["genres"].str.replace("|", " ")
print(movies.head(5))

False
False
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure Animation Children Comedy Fantasy  
1                   Adventure Children Fantasy  
2                               Comedy Romance  
3                         Comedy Drama Romance  
4                                       Comedy  


### Creating a `user-item interaction matrix` from ratings.csv.

In [None]:
# Pivot the ratings DataFrame
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

## Part 2: Build a Hybrid Recommender System

### Content-Based Recommendation

In [None]:
# Using the `genres` column from `movies.csv` to create a TF-IDF matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
# Computing similarity using cosine similarity.
def content_based_recommendations(title, cosine_sim=cosine_sim, movies=movies):
    indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()
    if title not in indices:
        return []
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Top 10 recommendations
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

### Collaborative Recommendation

In [None]:
#  Using the user-item matrix from `ratings.csv`. 
movie_sparse = csr_matrix(user_movie_matrix.values)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(movie_sparse)

In [None]:
# Implementing collaborative filtering using KNN.
def collaborative_recommendations(movie_id, user_movie_matrix, model_knn, movies, n_neighbors=10):
    if movie_id not in user_movie_matrix.columns:
        return []
    movie_idx = list(user_movie_matrix.columns).index(movie_id)
    distances, indices = model_knn.kneighbors(movie_sparse[movie_idx], n_neighbors=n_neighbors)
    movie_indices = [user_movie_matrix.columns[i] for i in indices.flatten()]
    return movies[movies['movieId'].isin(movie_indices)]['title']

### Hybrid Recommendation System

In [None]:
def hybrid_recommendations(title, movie_id, content_weight=0.5, collab_weight=0.5):
    content_recs = content_based_recommendations(title)
    collab_recs = collaborative_recommendations(movie_id, user_movie_matrix, model_knn, movies)

    hybrid_scores = {}
    for rec in content_recs:
        hybrid_scores[rec] = hybrid_scores.get(rec, 0) + content_weight
    for rec in collab_recs:
        hybrid_scores[rec] = hybrid_scores.get(rec, 0) + collab_weight

    hybrid_recs = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    return [rec[0] for rec in hybrid_recs[:10]]

## Part 3: Incorporate Additional Features

### Incorporate movie descriptions if available

In [None]:
if 'description' in movies.columns:
    movies['combined_features'] = movies['title'] + ' ' + movies['genres'] + ' ' + movies['description'].fillna('')
    tfidf_combined = TfidfVectorizer(stop_words='english')
    tfidf_matrix_combined = tfidf_combined.fit_transform(movies['combined_features'])
    cosine_sim_combined = cosine_similarity(tfidf_matrix_combined, tfidf_matrix_combined)

### Add user demographics to collaborative filtering if available

In [None]:
# Load the users dataset if it exists
try:
    users = pd.read_csv('movie_data/users.csv')  # Adjust the path if necessary
except FileNotFoundError:
    users = pd.DataFrame()  # Create an empty DataFrame if the file is not found

# Define whether demographics data is available
demographics_available = not users.empty and 'age' in users.columns and 'gender' in users.columns

if demographics_available:
    user_demo_matrix = pd.merge(ratings, users, on='userId')
    demo_features = pd.get_dummies(user_demo_matrix[['age', 'gender']])
    user_demo_matrix = pd.concat([user_demo_matrix, demo_features], axis=1)

### Test The Systems

In [None]:
# Sample data for testing
title_test = 'Lion King, The (1994)'
movie_id_test = 364

# Content-Based Model Test
content_preds = content_based_recommendations(title_test)
print(f"Content-Based Recommendations for '{title_test}': {content_preds}")

# Collaborative Model Test
collab_preds = collaborative_recommendations(movie_id_test, user_movie_matrix, model_knn, movies)
print(f"Collaborative Recommendations for Movie ID {movie_id_test}: {collab_preds}")

# Hybrid Model Test
hybrid_preds = hybrid_recommendations(title_test, movie_id_test)
print(f"Hybrid Recommendations for '{title_test}': {hybrid_preds}")

Content-Based Recommendations for 'Lion King, The (1994)': 2381                                 Fantasia 2000 (1999)
6313                                   Open Season (2006)
6347                                    Happy Feet (2006)
7639                                        Cars 2 (2011)
7899            Madagascar 3: Europe's Most Wanted (2012)
1273                                     Anastasia (1997)
512                           Beauty and the Beast (1991)
673     Land Before Time III: The Time of the Great Gi...
787                                  Pete's Dragon (1977)
1562                             Song of the South (1946)
Name: title, dtype: object
Collaborative Recommendations for Movie ID 364: 37                                    Richard III (1995)
228    Like Water for Chocolate (Como agua para choco...
234                   Madness of King George, The (1994)
322                                Lion King, The (1994)
403                                   Hard Target (1993)
4

## Part 4:  Evaluate the Recommender System

In [None]:
# Function to generate ground truth dynamically based on high-rated movies by similar users
def get_dynamic_ground_truth(movie_id, ratings, movies):
    # Find users who rated the target movie highly (4+ stars)
    high_rated_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"]
    
    # Get other movies rated 4+ stars by these users
    relevant_movies = ratings[(ratings["userId"].isin(high_rated_users)) & (ratings["rating"] >= 4)]["movieId"].unique()
    
    # Convert movie IDs to titles
    return movies[movies["movieId"].isin(relevant_movies)]["title"].tolist()

# Generate ground truth dynamically
relevant_movie_titles = get_dynamic_ground_truth(movie_id_test, ratings, movies)

# Function to evaluate recommendations
def evaluate_model(recommended, relevant):
    y_true = [1 if movie in relevant else 0 for movie in recommended]
    y_pred = [1] * len(recommended)  # All recommendations are considered positive predictions
    
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    
    return precision, recall, f1

# Evaluate models using actual recommendations
print("\nEvaluation Results:")
print("Content-Based Filtering Evaluation:")
precision, recall, f1 = evaluate_model(content_preds, relevant_movie_titles)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n")

print("Collaborative Filtering Evaluation:")
precision, recall, f1 = evaluate_model(collab_preds, relevant_movie_titles)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n")

print("Hybrid Model Evaluation:")
precision, recall, f1 = evaluate_model(hybrid_preds, relevant_movie_titles)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")


Evaluation Results:
Content-Based Filtering Evaluation:
Precision: 0.70, Recall: 1.00, F1-Score: 0.82

Collaborative Filtering Evaluation:
Precision: 0.80, Recall: 1.00, F1-Score: 0.89

Hybrid Model Evaluation:
Precision: 0.70, Recall: 1.00, F1-Score: 0.82
