### Part 1: Load and Preprocess the Data

In [617]:
import pandas as pd
import numpy as np

In [618]:
# Load datasets
movies = pd.read_csv('movie_data/movies.csv')    # Contains: movieId, title, genres
ratings = pd.read_csv('movie_data/ratings.csv')  # Contains: userId, movieId, rating, timestamp

In [619]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [620]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [621]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [622]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

***There is no any missing values in movies and ratings data.***

In [623]:
# Replace '|' with space for TF-IDF processing
movies['genres'] = movies['genres'].str.replace('|', ' ')
print(movies['genres'].head())

0    Adventure Animation Children Comedy Fantasy
1                     Adventure Children Fantasy
2                                 Comedy Romance
3                           Comedy Drama Romance
4                                         Comedy
Name: genres, dtype: object


In [624]:
### Create a user-item interaction matrix
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
print(user_movie_matrix.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

### Part 2: Build a Hybrid Recommender System

***Build a Content-Based Filtering model using genres***

In [625]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorization of genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to recommend movies
# Computing similarity using cosine similarity.
def content_based_recommendations(title, cosine_sim=cosine_sim, movies=movies):
    indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()
    if title not in indices:
        return []
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Top 10 recommendations
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

***Implement Collaborative Filtering using KNN for Collaborative recommendations***

In [626]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Convert to sparse matrix
movie_sparse = csr_matrix(user_movie_matrix.values)

# Fit KNN model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(movie_sparse)

# Function for collaborative recommendations
def collaborative_recommendations(movie_id, user_movie_matrix, model_knn, movies, n_neighbors=10):
    if movie_id not in user_movie_matrix.columns:
        return []
    movie_idx = list(user_movie_matrix.columns).index(movie_id)
    distances, indices = model_knn.kneighbors(movie_sparse[movie_idx], n_neighbors=n_neighbors)
    movie_indices = [user_movie_matrix.columns[i] for i in indices.flatten()]
    return movies[movies['movieId'].isin(movie_indices)]['title']


***Combine both models into a Hybrid Recommender System***

In [627]:
def hybrid_recommendations(title, movie_id, content_weight=0.5, collab_weight=0.5, top_n=10):
    content_recs = content_based_recommendations(title)
    collab_recs = collaborative_recommendations(movie_id, user_movie_matrix, model_knn, movies)

    hybrid_scores = {}
    for rec in content_recs:
        hybrid_scores[rec] = hybrid_scores.get(rec, 0) + content_weight
    for rec in collab_recs:
        hybrid_scores[rec] = hybrid_scores.get(rec, 0) + collab_weight

    hybrid_recs = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    return [rec[0] for rec in hybrid_recs[:10]]


### Part 4: Evaluate the Recommender System

In [628]:
# Sample data for testing
title_test = 'Barcelona (1994)'
movie_id_test = 417

# Content-Based Model Test
content_preds = content_based_recommendations(title_test)
print(f"Content-Based Recommendations for '{title_test}': {content_preds}")

# Collaborative Model Test
collab_preds = collaborative_recommendations(movie_id_test, user_movie_matrix, model_knn, movies)
print(f"Collaborative Recommendations for Movie ID {movie_id_test}: {collab_preds}")

# Hybrid Model Test
hybrid_preds = hybrid_recommendations(title_test, movie_id_test)
print(f"Hybrid Recommendations for '{title_test}': {hybrid_preds}")

Content-Based Recommendations for 'Barcelona (1994)': 6                          Sabrina (1995)
35                        Clueless (1995)
57                   Two if by Sea (1996)
60     French Twist (Gazon maudit) (1995)
103                   If Lucy Fell (1996)
106                      Boomerang (1992)
111                 Pie in the Sky (1996)
152                       Mallrats (1995)
157                    Nine Months (1995)
203                   Forget Paris (1995)
Name: title, dtype: object
Collaborative Recommendations for Movie ID 417: 16                Sense and Sensibility (1995)
130                      Canadian Bacon (1995)
232                         Love Affair (1994)
238                          Milk Money (1994)
246    New York Cop (Nyû Yôku no koppu) (1993)
316                     Higher Learning (1995)
361                           Barcelona (1994)
399        Geronimo: An American Legend (1993)
416                     Jimmy Hollywood (1994)
433                What Happ

In [629]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Function to generate ground truth dynamically based on high-rated movies by similar users
def get_dynamic_ground_truth(movie_id, ratings, movies):
    # Find users who rated the target movie highly (4+ stars)
    high_rated_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"]
    
    # Get other movies rated 4+ stars by these users
    relevant_movies = ratings[(ratings["userId"].isin(high_rated_users)) & (ratings["rating"] >= 4)]["movieId"].unique()
    
    # Convert movie IDs to titles
    return movies[movies["movieId"].isin(relevant_movies)]["title"].tolist()

# Generate ground truth dynamically
relevant_movie_titles = get_dynamic_ground_truth(movie_id_test, ratings, movies)

# Function to evaluate recommendations
def evaluate_model(recommended, relevant):
    y_true = [1 if movie in relevant else 0 for movie in recommended]
    y_pred = [1] * len(recommended)  # All recommendations are considered positive predictions
    
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    
    return precision, recall, f1

# Evaluate models using actual recommendations
print("\nEvaluation Results:")
print("Content-Based Filtering Evaluation:")
precision, recall, f1 = evaluate_model(content_preds, relevant_movie_titles)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n")

print("Collaborative Filtering Evaluation:")
precision, recall, f1 = evaluate_model(collab_preds, relevant_movie_titles)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n")

print("Hybrid Model Evaluation:")
precision, recall, f1 = evaluate_model(hybrid_preds, relevant_movie_titles)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")


Evaluation Results:
Content-Based Filtering Evaluation:
Precision: 0.20, Recall: 1.00, F1-Score: 0.33

Collaborative Filtering Evaluation:
Precision: 0.40, Recall: 1.00, F1-Score: 0.57

Hybrid Model Evaluation:
Precision: 0.20, Recall: 1.00, F1-Score: 0.33
