In [51]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

In [52]:
# Locations of the input CSVs and of everything this notebook writes out.
DATA_DIR = "../data"
RESULTS_DIR = "../results"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Build the three input paths in one go; all of them live under DATA_DIR.
items_path, ratings_path, users_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("sleep_items.csv", "sleep_ratings.csv", "sleep_users.csv")
)

# Read each table into its own DataFrame.
items = pd.read_csv(items_path)
ratings = pd.read_csv(ratings_path)
users = pd.read_csv(users_path)

# Quick size check, then preview the ratings table as the cell output.
print("Items shape:", items.shape)
print("Ratings shape:", ratings.shape)
print("Users shape:", users.shape)
ratings.head()

Items shape: (520, 8)
Ratings shape: (51488, 5)
Users shape: (5000, 14)


Unnamed: 0,rating_id,user_id,item_id,rating,date
0,1,2798,18,5,2025-02-11
1,2,848,383,1,2025-11-29
2,3,798,171,4,2025-01-22
3,4,3701,359,5,2025-09-24
4,5,4426,307,4,2025-12-29


In [None]:

# Column names of the ratings table that the rest of the notebook refers to.
user_col, item_col, rating_col = "user_id", "item_id", "rating"

# Rows are users, columns are items, cells hold the rating (NaN where the
# user has not rated the item). pivot_table aggregates repeated (user, item)
# pairs with the mean, which is spelled out here instead of left implicit.
user_item_matrix = pd.pivot_table(
    ratings,
    index=user_col,
    columns=item_col,
    values=rating_col,
    aggfunc="mean",
)
print("User-Item Matrix shape:", user_item_matrix.shape)

User-Item Matrix shape: (5000, 520)


In [None]:
# ------------------------------------------------------------------------------
# 7.1 EDA Visualizations
# ------------------------------------------------------------------------------
PLOTS_DIR = os.path.join(RESULTS_DIR, "plots")
os.makedirs(PLOTS_DIR, exist_ok=True)


def _finish_figure(fig, ax, title, xlabel, ylabel, filename):
    """Label the axes, add the dashed y-grid, save the figure to PLOTS_DIR and show it."""
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    # Save before show(): once the figure has been shown it may be closed,
    # and saving afterwards can produce an empty image.
    fig.savefig(os.path.join(PLOTS_DIR, filename), dpi=150, bbox_inches='tight')
    plt.show()


def _plot_count_histogram(counts, title, xlabel, ylabel, color, filename, bins=30):
    """Draw a histogram of per-user or per-item rating counts and save it."""
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(counts, bins=bins, color=color, edgecolor='black')
    _finish_figure(fig, ax, title, xlabel, ylabel, filename)


# 1. Rating Distribution
# NOTE: the values come from the pivoted matrix, so a (user, item) pair that
# was rated more than once contributes its aggregated value, not each rating.
rating_value_counts = user_item_matrix.stack().value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 5))
rating_value_counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
_finish_figure(fig, ax, 'Rating Distribution', 'Rating', 'Count', 'rating_distribution.png')

# 2. User Activity (Ratings per User)
user_counts = ratings.groupby(user_col)[rating_col].count()
_plot_count_histogram(
    user_counts,
    title='User Activity Distribution',
    xlabel='Number of Ratings per User',
    ylabel='Count of Users',
    color='lightgreen',
    filename='user_activity_distribution.png',
)

# 3. Item Popularity (Ratings per Item)
item_counts = ratings.groupby(item_col)[rating_col].count()
_plot_count_histogram(
    item_counts,
    title='Item Popularity Distribution',
    xlabel='Number of Ratings per Item',
    ylabel='Count of Items',
    color='salmon',
    filename='item_popularity_distribution.png',
)


## 8.1 Collaborative Filtering: User-Based (Mean-Centered Cosine Similarity, a Pearson-Correlation Approximation)

In [None]:
# Centre each user's ratings on that user's own mean rating. Items the user
# has not rated are NaN in the matrix and become 0 after the fill, i.e. they
# sit exactly at the user's mean and add nothing to the dot products below.
user_means = user_item_matrix.mean(axis=1)
R_demeaned = user_item_matrix.subtract(user_means, axis="index").fillna(value=0)

# Pairwise cosine similarity between the centred user rows.
user_similarity = cosine_similarity(R_demeaned)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

print("User Similarity Matrix shape:", user_similarity_df.shape)
user_similarity_df.iloc[:5, :5]

User Similarity Matrix shape: (5000, 5000)


user_id,1,2,3,4,5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.0,0.0,0.0,-0.003928
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0
5,-0.003928,0.0,0.0,0.0,1.0


In [None]:
def predict_user_based(user_id, item_id, k=20):
    """Predict the rating ``user_id`` would give ``item_id`` with user-based CF.

    The prediction is the user's own mean rating plus the similarity-weighted
    average of the neighbours' mean-centred ratings for the item. It reads the
    notebook-level ``user_item_matrix``, ``user_similarity_df`` and
    ``user_means``.

    Parameters
    ----------
    user_id : label in ``user_similarity_df.index``
        User to predict for.
    item_id : label in ``user_item_matrix.columns``
        Item to predict.
    k : int, default 20
        Maximum number of most-similar neighbours to use.

    Returns
    -------
    float
        ``np.nan`` if the item or the user is unknown; the user's mean rating
        if no other user with positive similarity has rated the item;
        otherwise the prediction clipped to the range [1.0, 5.0].
    """
    if item_id not in user_item_matrix.columns:
        return np.nan
    if user_id not in user_similarity_df.index:
        return np.nan

    baseline = user_means.loc[user_id]

    item_ratings = user_item_matrix[item_id]
    rated_users = item_ratings[item_ratings.notna()].index

    # The target user must not be their own neighbour: their self-similarity
    # is 1.0, so if they have already rated the item the "prediction" would
    # largely echo that rating back (leakage when predicting known ratings).
    neighbor_sims = user_similarity_df.loc[user_id, rated_users].drop(
        labels=user_id, errors="ignore"
    )
    # Only positively correlated users are allowed to vote.
    neighbor_sims = neighbor_sims[neighbor_sims > 0]
    top_k_users = neighbor_sims.sort_values(ascending=False).head(k)

    if top_k_users.empty:
        return baseline

    neighbor_ratings = user_item_matrix.loc[top_k_users.index, item_id]
    neighbor_means = user_means.loc[top_k_users.index]
    numerator = (top_k_users * (neighbor_ratings - neighbor_means)).sum()
    denominator = top_k_users.sum()

    # Defensive guard; cannot trigger while every kept similarity is > 0.
    if denominator == 0:
        return baseline

    pred_rating = baseline + (numerator / denominator)
    return np.clip(pred_rating, 1.0, 5.0)
    
# Smoke-test the predictor on the first user and first item of the matrix.
test_user, test_item = user_item_matrix.index[0], user_item_matrix.columns[0]
print(f"Predicting for User {test_user}, Item {test_item}")
pred = predict_user_based(user_id=test_user, item_id=test_item)
print(f"Predicted Rating: {pred:.2f}")

Predicting for User 1, Item 1
Predicted Rating: 4.33


In [None]:
def recommend_user_based(user_id, top_n=10, k=20):
    """Recommend the ``top_n`` unrated items with the highest predicted rating.

    Every item the user has not rated (NaN in their ``user_item_matrix`` row)
    is scored with ``predict_user_based``; the best ``top_n`` are returned
    together with the item's name and category from ``items``.

    Parameters
    ----------
    user_id : label in ``user_item_matrix.index``
        User to recommend for. An unknown label raises ``KeyError``.
    top_n : int, default 10
        Number of recommendations to return.
    k : int, default 20
        Neighbourhood size forwarded to ``predict_user_based``.

    Returns
    -------
    pandas.DataFrame
        Columns ``item_id``, ``predicted_rating``, ``name``, ``category``,
        ordered by descending predicted rating.
    """
    user_ratings = user_item_matrix.loc[user_id]
    unrated_items = user_ratings[user_ratings.isna()].index

    predictions = []
    for item in unrated_items:
        est = predict_user_based(user_id, item, k=k)
        # NaN does not order against numbers, so a NaN estimate would leave
        # the sort below in an undefined order; such items are not ranked.
        if pd.notna(est):
            predictions.append((item, est))

    # list.sort is stable, so equally scored items keep their column order.
    predictions.sort(key=lambda pair: pair[1], reverse=True)

    recs_df = pd.DataFrame(predictions[:top_n], columns=['item_id', 'predicted_rating'])

    # Left join keeps every recommendation even if item metadata is missing.
    recs_df = recs_df.merge(items[['item_id', 'name', 'category']], on='item_id', how='left')

    return recs_df

# Show the user-based recommendations for the sample user as the cell output.
print("Top 10 Recommendations for User", test_user)
recommend_user_based(user_id=test_user, top_n=10)

Top 10 Recommendations for User 1


Unnamed: 0,item_id,predicted_rating,name,category
0,122,5.0,Valerian Root Extract,Supplements
1,349,5.0,Premium Sleep Solution 349,Medical Devices
2,439,5.0,Premium Sleep Solution 439,Sound & Environment
3,491,4.960391,Premium Sleep Solution 491,Therapy & Counseling
4,273,4.902703,Premium Sleep Solution 273,Bedding & Accessories
5,420,4.871805,Premium Sleep Solution 420,Sound & Environment
6,131,4.867382,Magnesium Glycinate,Supplements
7,315,4.848346,Yoga for Sleep,Lifestyle Practices
8,471,4.831259,Premium Sleep Solution 471,Therapy & Counseling
9,65,4.79247,Fitbit NightSense Band,Wearable Devices


## 8.2 Matrix Factorization using SVD (from Section 1)

In [57]:
# Replace every missing rating with the mean rating of that item (column).
R_filled = user_item_matrix.fillna(value=user_item_matrix.mean(axis="index"))

# Work on the raw array and centre each user row on its own mean, taken
# over the filled matrix.
R_matrix = R_filled.to_numpy()
user_ratings_mean = R_matrix.mean(axis=1)
R_demeaned_svd = R_matrix - user_ratings_mean[:, np.newaxis]

# Truncated SVD keeping k latent factors.
k = 20
U, sigma, Vt = svds(R_demeaned_svd, k=k)

# svds returns the singular values as a 1-D array; turn them into a
# diagonal matrix so the factors can be multiplied back together.
sigma = np.diag(sigma)

print("U shape:", U.shape)
print("Sigma shape:", sigma.shape)
print("Vt shape:", Vt.shape)

U shape: (5000, 20)
Sigma shape: (20, 20)
Vt shape: (20, 520)


In [58]:
# Rebuild the rank-k approximation and add each user's mean back on.
all_user_predicted_ratings = U @ sigma @ Vt + user_ratings_mean[:, np.newaxis]
preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

def recommend_svd(user_id, top_n=10):
    """Return the ``top_n`` items with the highest SVD-predicted rating.

    Items the user already has a row for in ``ratings`` are excluded. The
    result has the columns ``item_id``, ``predicted_rating``, ``name`` and
    ``category``, ordered by descending predicted rating.
    """
    # Rank every item for this user, best first.
    ranked = preds_df.loc[user_id].sort_values(ascending=False)

    # Items this user has already rated, taken from the raw ratings table.
    already_rated = ratings.loc[ratings[user_col] == user_id, item_col].tolist()

    # Keep only unseen items, preserving the ranking, then cut to top_n.
    is_seen = ranked.index.isin(already_rated)
    top_unseen = ranked[~is_seen].head(top_n)

    recs_df = top_unseen.to_frame(name='predicted_rating').reset_index()
    # Left join so a recommendation survives even without item metadata.
    return recs_df.merge(
        items[['item_id', 'name', 'category']],
        on='item_id',
        how='left',
    )

# Show the SVD recommendations for the sample user as the cell output.
print("SVD Recommendations for User", test_user)
recommend_svd(user_id=test_user, top_n=10)

SVD Recommendations for User 1


Unnamed: 0,item_id,predicted_rating,name,category
0,40,4.236056,MindRest v5,Mobile Apps
1,62,4.131789,Eight Sleep BiometricWatch,Wearable Devices
2,495,4.130776,Premium Sleep Solution 495,Therapy & Counseling
3,342,4.11356,Breathing Technique Workshop,Lifestyle Practices
4,442,4.108173,Premium Sleep Solution 442,Sound & Environment
5,247,4.095059,Premium Sleep Solution 247,Bedding & Accessories
6,378,4.091839,Premium Sleep Solution 378,Medical Devices
7,303,4.087169,Sleep Psychology Course,Lifestyle Practices
8,391,4.081298,Premium Sleep Solution 391,Medical Devices
9,132,4.079108,Valerian Root Extract,Supplements


In [59]:
# Top-20 recommendations for the sample user from each model.
user_based_recs = recommend_user_based(user_id=test_user, top_n=20)
svd_recs = recommend_svd(user_id=test_user, top_n=20)

# Both result files go straight under RESULTS_DIR.
user_based_path = os.path.join(RESULTS_DIR, "user_based_cf_predictions.csv")
svd_path = os.path.join(RESULTS_DIR, "svd_predictions.csv")

# index=False: the row index is only a positional counter, not data.
user_based_recs.to_csv(path_or_buf=user_based_path, index=False)
svd_recs.to_csv(path_or_buf=svd_path, index=False)

print(f"Saved recommendations to {RESULTS_DIR}")

Saved recommendations to ../results
