 Data Preprocessing

In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Read the two input tables from the working directory
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")    # movie metadata, used later to look up titles

# Users become rows, movies become columns, cells hold the rating;
# user/movie pairs with no rating are stored as 0
user_item_matrix = pd.pivot_table(
    ratings,
    values='rating',
    index='userId',
    columns='movieId',
).fillna(0)

# Compressed sparse row copy of the same values
sparse_matrix = csr_matrix(user_item_matrix.to_numpy())


User-Based Collaborative Filtering

In [15]:
# Compute similarity between users (one row/column per row of sparse_matrix)
user_similarity = cosine_similarity(sparse_matrix)

# Example: get top 5 similar users to user ID 1
import numpy as np
user_id = 1
# Rows of sparse_matrix follow user_item_matrix.index, so translate the user ID
# to its row position by label instead of assuming IDs are exactly 1..N in order.
user_pos = user_item_matrix.index.get_loc(user_id)
user_ranking = np.argsort(-user_similarity[user_pos])
# Remove the user's own row explicitly: with tied similarity scores the user is
# not guaranteed to be the first entry of the ranking.
similar_users = user_ranking[user_ranking != user_pos][:5]
# Translate row positions back to user IDs for display
print(f"Top 5 similar users to User {user_id}:", user_item_matrix.index[similar_users].to_numpy())


Top 5 similar users to User 1: [325 634 341 310 207]


Compute User Similarity Matrix

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of user_item_matrix
user_similarity = cosine_similarity(user_item_matrix)

# Label both axes with the matrix's row index so similarities can be looked up
# by those labels rather than by position
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)


Item-Based Collaborative Filtering


In [9]:
# Transpose for item-item similarity: rows are now movies, columns are users
item_item_matrix = user_item_matrix.T
item_similarity = cosine_similarity(item_item_matrix.fillna(0))

# Example: get top 5 similar movies to movie ID 1
movie_id = 1
# Label -> row position lookup on the index (raises KeyError if the ID is absent)
movie_idx = item_item_matrix.index.get_loc(movie_id)
movie_ranking = np.argsort(-item_similarity[movie_idx])
# Remove the movie's own row explicitly: with tied similarity scores it is not
# guaranteed to be the first entry of the ranking.
similar_movies = movie_ranking[movie_ranking != movie_idx][:5]
# Translate row positions back to movie IDs
similar_movie_ids = item_item_matrix.index[similar_movies]
print(f"Top 5 similar movies to Movie {movie_id}:", similar_movie_ids.tolist())


Top 5 similar movies to Movie 1: [3114, 260, 356, 780, 1265]


Recommend Movies Based on Similar Users

Recommend top n movies to a target user based on ratings from similar users.

In [19]:
def recommend_movies_for_user(target_user_id, num_recommendations=5):
    """Score unrated movies for a user from the ratings of the other users.

    Reads the module-level ``user_similarity_df`` (user-by-user similarity) and
    ``user_item_matrix`` (users as rows, movies as columns, 0 = not rated).

    Args:
        target_user_id: Label of the user in both frames.
        num_recommendations: Maximum number of movies to return.

    Returns:
        A Series of scores indexed by the columns of ``user_item_matrix``,
        sorted from highest to lowest, restricted to movies the target user
        has not rated.

    Raises:
        KeyError: If ``target_user_id`` is not present in the frames.
    """
    # Get similarity scores for the target user
    similar_users = user_similarity_df[target_user_id].sort_values(ascending=False)
    similar_users = similar_users.drop(index=target_user_id)  # Exclude self

    # Get ratings of similar users.
    # The matrix built earlier in the notebook is named user_item_matrix; the
    # previous name used here (user_movie_matrix) is never defined.
    similar_users_ratings = user_item_matrix.loc[similar_users.index]

    # Weighted average of ratings using similarity scores
    weighted_ratings = similar_users_ratings.T.dot(similar_users)
    normalization = similar_users.sum()
    recommendation_scores = weighted_ratings / normalization

    # Remove movies already rated by the target user (a stored 0 means "not rated")
    watched_movies = user_item_matrix.loc[target_user_id]
    watched_movies = watched_movies[watched_movies > 0].index
    recommendation_scores = recommendation_scores.drop(watched_movies, errors='ignore')

    # Return top N recommended movie IDs
    top_recommendations = recommendation_scores.sort_values(ascending=False).head(num_recommendations)
    return top_recommendations


In [23]:
def recommend_movies_for_user(target_user_id, num_recommendations=5):
    """Recommend movies to a user from similarity-weighted ratings of other users.

    Uses the module-level ``user_similarity_df`` and ``user_item_matrix``
    (users as rows, movies as columns, 0 = not rated).

    Args:
        target_user_id: Label of the user in both frames.
        num_recommendations: Maximum number of movies to return.

    Returns:
        A Series of scores, highest first, indexed by the columns of
        ``user_item_matrix`` and limited to movies the user has not rated.

    Raises:
        KeyError: If ``target_user_id`` is not present in the frames.
    """
    # NOTE: this function is defined more than once in the notebook; the last
    # definition executed is the one that is actually called.
    similar_users = user_similarity_df[target_user_id].sort_values(ascending=False)
    similar_users = similar_users.drop(index=target_user_id)

    # Get ratings of similar users (user_movie_matrix, used here before, is
    # never defined; the matrix is called user_item_matrix)
    similar_ratings = user_item_matrix.loc[similar_users.index]

    # Weighted rating calculation: each user's ratings count in proportion to
    # that user's similarity to the target user
    weighted_scores = similar_ratings.T.dot(similar_users)
    normalization = similar_users.sum()
    scores = weighted_scores / normalization

    # Remove movies the target user has already rated
    watched = user_item_matrix.loc[target_user_id]
    scores = scores.drop(watched[watched > 0].index, errors='ignore')

    # Get top recommended movie IDs
    top_movies = scores.sort_values(ascending=False).head(num_recommendations)
    return top_movies


In [25]:
def recommend_movies_for_user(target_user_id, num_recommendations=5):
    """Rank the movies a user has not rated by similarity-weighted ratings.

    Reads the module-level ``user_similarity_df`` and ``user_item_matrix``
    (users as rows, movies as columns, 0 = not rated).

    Args:
        target_user_id: Label of the user in both frames.
        num_recommendations: Maximum number of movies to return.

    Returns:
        A Series of scores, highest first, indexed by the columns of
        ``user_item_matrix``.
    """
    # Similarity of every other user to the target, most similar first
    neighbour_weights = (
        user_similarity_df[target_user_id]
        .sort_values(ascending=False)
        .drop(index=target_user_id)
    )

    # Ratings given by those users, in the same order as the weights
    neighbour_ratings = user_item_matrix.loc[neighbour_weights.index]

    # Similarity-weighted sum of ratings per movie, divided by the total weight
    weight_total = neighbour_weights.sum()
    scores = neighbour_ratings.T.dot(neighbour_weights) / weight_total

    # Discard every movie the target user already has a positive rating for
    own_ratings = user_item_matrix.loc[target_user_id]
    already_rated = own_ratings[own_ratings > 0].index
    unseen_scores = scores.drop(already_rated, errors='ignore')

    # Highest-scoring movies first, truncated to the requested count
    return unseen_scores.sort_values(ascending=False).head(num_recommendations)

# Run the recommender for user 1 and keep the resulting score Series
top_movies = recommend_movies_for_user(1, 5)

# The Series index holds the movie IDs, its values hold the scores
print("Top 5 recommended movie IDs:", top_movies.index.tolist())
print("Recommendation scores:", top_movies.to_numpy())

Top 5 recommended movie IDs: [260, 296, 1196, 318, 1198]
Recommendation scores: [2.44739659 2.3691164  2.30466924 2.25938908 2.1557039 ]


In [32]:
# Look up titles for the recommended movies.
# NOTE(review): the lookup uses the 'index' column of movies.csv as the movie ID.
# Confirm that this column holds the same IDs as the movieId values in the
# ratings data; if it does not, the titles printed below will not correspond
# to the recommended movies.
id_column = 'index'

# Keep one row per ID so every recommended ID maps to at most one title
titles_by_id = movies.drop_duplicates(subset=id_column).set_index(id_column)

# reindex preserves the recommendation order and produces a NaN row for any
# recommended ID that is missing from movies, where .loc would raise KeyError
recommended_titles = titles_by_id.reindex(top_movies.index)
recommended_titles['score'] = top_movies.values

print(recommended_titles[['title', 'score']])

                            title     score
movieId                                    
260                  Ender's Game  2.447397
296                   End of Days  2.369116
1196                 The Prestige  2.304669
318                     Surf's Up  2.259389
1198     Escape from Planet Earth  2.155704


In [33]:
# NOTE: this cell repeats the item-item similarity cell earlier in the notebook.
# Transpose for item-item similarity: rows are now movies, columns are users
item_item_matrix = user_item_matrix.T
item_similarity = cosine_similarity(item_item_matrix.fillna(0))

# Example: get top 5 similar movies to movie ID 1
movie_id = 1
# Label -> row position lookup on the index (raises KeyError if the ID is absent)
movie_idx = item_item_matrix.index.get_loc(movie_id)
movie_ranking = np.argsort(-item_similarity[movie_idx])
# Remove the movie's own row explicitly: with tied similarity scores it is not
# guaranteed to be the first entry of the ranking.
similar_movies = movie_ranking[movie_ranking != movie_idx][:5]
# Translate row positions back to movie IDs
similar_movie_ids = item_item_matrix.index[similar_movies]
print(f"Top 5 similar movies to Movie {movie_id}:", similar_movie_ids.tolist())


Top 5 similar movies to Movie 1: [3114, 260, 356, 780, 1265]


Evaluation Metrics

In [49]:
def precision_at_k(recommended, relevant, k):
    """Fraction of the first ``k`` recommendations that are relevant.

    Args:
        recommended: Sliceable sequence of recommended item IDs, best first.
        relevant: Iterable of item IDs considered relevant.
        k: Cut-off; only ``recommended[:k]`` is evaluated.

    Returns:
        Number of distinct relevant items among the first ``k``
        recommendations divided by ``k``. Returns 0.0 when ``k`` is not
        positive, since no recommendations are evaluated in that case.
    """
    # k == 0 would divide by zero, and a negative k would slice from the end
    # of the list and produce a negative "precision"
    if k <= 0:
        return 0.0
    hits = set(recommended[:k]) & set(relevant)
    # The denominator is k itself, even when fewer than k items were recommended
    return len(hits) / k


In [50]:
def recall_at_k(recommended, relevant, k):
    """Fraction of the relevant items that appear in the first ``k`` recommendations.

    Args:
        recommended: Sliceable sequence of recommended item IDs, best first.
        relevant: Iterable of item IDs considered relevant.
        k: Cut-off; only ``recommended[:k]`` is evaluated.

    Returns:
        Number of distinct relevant items among the first ``k``
        recommendations divided by the number of distinct relevant items.
        Returns 0.0 when there are no relevant items or ``k`` is not positive.
    """
    # Count each relevant item once so repeated IDs do not inflate the denominator
    relevant_set = set(relevant)
    # No relevant items would divide by zero; a non-positive k evaluates nothing
    if not relevant_set or k <= 0:
        return 0.0
    hits = relevant_set.intersection(recommended[:k])
    return len(hits) / len(relevant_set)


In [52]:
# Hand-written example for a single test user
k = 5
recommended_movies = [260, 296, 1196, 318, 1198]  # top 5 produced by the model
relevant_movies = [296, 318, 1198]  # ground truth, e.g. movies the user rated > 4

# Evaluate both ranking metrics on the same lists and cut-off
for label, metric in (("Precision@5:", precision_at_k), ("Recall@5:", recall_at_k)):
    print(label, metric(recommended_movies, relevant_movies, k))


Precision@5: 0.6
Recall@5: 1.0


In [53]:
# Actual and predicted ratings for testing
actual_ratings = [4.0, 3.5, 5.0, 2.0, 4.5]    # y_true
predicted_ratings = [3.8, 3.0, 4.7, 2.5, 4.6] # y_pred


In [54]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Mean absolute error between the true and predicted ratings
mae = mean_absolute_error(actual_ratings, predicted_ratings)

# Root mean squared error: square root of the mean squared error
mse = mean_squared_error(actual_ratings, predicted_ratings)
rmse = np.sqrt(mse)

# Report both metrics rounded to three decimals
print(" MAE:", round(mae, 3))
print(" RMSE:", round(rmse, 3))


 MAE: 0.32
 RMSE: 0.358


Summary

| **Metric**       | **Type**   | **Best For**                        | **Ideal Value** |
| ---------------- | ---------- | ----------------------------------- | --------------- |
| **MAE**          | Prediction | Accuracy of predicted rating values | Close to 0      |
| **RMSE**         | Prediction | Penalizing large prediction errors  | Close to 0      |
| **Precision\@k** | Top-N      | Relevance of top-K recommendations  | Close to 1      |
| **Recall\@k**    | Top-N      | Capturing all relevant items        | Close to 1      |
