In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score, recall_score, f1_score

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [2]:
df1 = pd.read_csv('users_interactions.csv')
df2 = pd.read_csv('products_catalog.csv')

In [3]:
df1.head()

Unnamed: 0,user_id,product_id,rating,timestamp
0,U0001,P0234,4,2025-06-30 00:00:00
1,U0001,P0212,5,2024-11-05 00:00:00
2,U0001,P0425,4,2025-02-08 00:00:00
3,U0001,P0491,4,2024-12-12 00:00:00
4,U0001,P0204,2,2024-07-01 00:00:00


In [9]:
df1 = df1.drop("timestamp", axis=1)
df1

Unnamed: 0,user_id,product_id,rating
0,U0001,P0234,4
1,U0001,P0212,5
2,U0001,P0425,4
3,U0001,P0491,4
4,U0001,P0204,2
...,...,...,...
27497,U1000,P0345,4
27498,U1000,P0412,5
27499,U1000,P0226,4
27500,U1000,P0095,5


In [10]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df1,
    test_size=0.2,
    random_state=42
)

In [11]:
import numpy as np

user_ids = train_df['user_id'].unique()
product_ids = train_df['product_id'].unique()

user_id_map = {u: i for i, u in enumerate(user_ids)}
product_id_map = {p: i for i, p in enumerate(product_ids)}

In [12]:
from scipy.sparse import csr_matrix

rows = train_df['user_id'].map(user_id_map)
cols = train_df['product_id'].map(product_id_map)
ratings = train_df['rating']

sparse_matrix = csr_matrix(
    (ratings, (rows, cols)),
    shape=(len(user_ids), len(product_ids))
)

In [13]:
user_means = np.zeros(sparse_matrix.shape[0])

for i in range(sparse_matrix.shape[0]):
    row_data = sparse_matrix[i].data
    if len(row_data) > 0:
        user_means[i] = row_data.mean()

In [14]:
mean_centered = sparse_matrix.copy().tolil()

for i in range(mean_centered.shape[0]):
    if len(mean_centered.rows[i]) > 0:
        mean_centered.data[i] = [
            val - user_means[i] for val in mean_centered.data[i]
        ]

mean_centered = mean_centered.tocsr()

In [15]:
from scipy.sparse.linalg import svds

k = 50  # latent dimensions

U, sigma, Vt = svds(mean_centered, k=k)
sigma = np.diag(sigma)

reconstructed = np.dot(np.dot(U, sigma), Vt)

predicted_ratings = reconstructed + user_means.reshape(-1, 1)

predicted_ratings = np.clip(predicted_ratings, 1, 5)

In [16]:
from sklearn.metrics import mean_squared_error

y_true = []
y_pred = []

for _, row in test_df.iterrows():
    user = row['user_id']
    product = row['product_id']
    
    if user in user_id_map and product in product_id_map:
        u_idx = user_id_map[user]
        p_idx = product_id_map[product]
        
        y_true.append(row['rating'])
        y_pred.append(predicted_ratings[u_idx, p_idx])

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print("SVD RMSE:", rmse)

SVD RMSE: 1.1649390862932856


In [17]:
def recommend_svd(user_id, top_n=10):
    if user_id not in user_id_map:
        return []
    
    u_idx = user_id_map[user_id]
    user_preds = predicted_ratings[u_idx].copy()
    
    already_rated = sparse_matrix[u_idx].indices
    user_preds[already_rated] = -np.inf
    
    top_indices = np.argsort(user_preds)[-top_n:][::-1]
    top_products = [product_ids[i] for i in top_indices]
    
    return df2[df2['product_id'].isin(top_products)]

In [19]:
recc = recommend_svd('U0999')
recc

Unnamed: 0,product_id,product_name,brand,category,price,description,image_url
26,P0027,XPS 13 Plus 573,Dell,Electronics,43.16,Experience excellence with the XPS 13 Plus 573...,https://images.unsplash.com/photo-149618113320...
43,P0044,WH-1000XM5 9278,Sony,Electronics,1861.83,"Meet the WH-1000XM5 9278, designed for perfect...",https://images.unsplash.com/photo-150574042092...
189,P0190,High Impact Mascara,Clinique,Beauty,67.57,Experience excellence with the High Impact Mas...,https://images.unsplash.com/photo-159646250227...
258,P0259,Moisturizing Lotion,Clinique,Beauty,49.4,Discover the power of the Moisturizing Lotion....,https://images.unsplash.com/photo-157017261964...
293,P0294,Alpha 7 IV 2876,Sony,Electronics,810.4,Experience excellence with the Alpha 7 IV 2876...,https://images.unsplash.com/photo-151603506937...
339,P0340,Voluminous Mascara,L'Oreal,Beauty,42.06,Discover the power of the Voluminous Mascara. ...,https://images.unsplash.com/photo-159646250227...
375,P0376,Fit Me Foundation,Maybelline,Beauty,57.09,Upgrade your lifestyle with the Fit Me Foundat...,https://images.unsplash.com/photo-159646250227...
411,P0412,Galaxy Buds2 Pro 3242,Samsung,Electronics,2406.47,Discover the power of the Galaxy Buds2 Pro 324...,https://images.unsplash.com/photo-150574042092...
445,P0446,Alpha 7 IV 2398,Sony,Electronics,943.59,Experience excellence with the Alpha 7 IV 2398...,https://images.unsplash.com/photo-151603506937...
488,P0489,Tech Fleece Joggers,Nike,Fashion,35.6,Experience excellence with the Tech Fleece Jog...,https://images.unsplash.com/photo-154227245431...


In [27]:
# ==============================
# 9. PRECISION / RECALL / F1
# ==============================

threshold = 3.5  # relevant if rating >= 4

y_true_binary = []
y_pred_binary = []

for i in range(len(y_true)):
    y_true_binary.append(1 if y_true[i] >= threshold else 0)
    y_pred_binary.append(1 if y_pred[i] >= threshold else 0)

precision = precision_score(y_true_binary, y_pred_binary)
recall = recall_score(y_true_binary, y_pred_binary)
f1 = f1_score(y_true_binary, y_pred_binary)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


# ==============================
# 10. PRECISION@K & RECALL@K
# ==============================

def precision_recall_at_k(test_df, K=10, threshold=3.5):
    precisions = []
    recalls = []

    for user in test_df['user_id'].unique():
        if user not in user_id_map:
            continue

        u_idx = user_id_map[user]

        # Get predicted scores
        user_preds = predicted_ratings[u_idx].copy()

        # Remove already rated items
        already_rated = sparse_matrix[u_idx].indices
        user_preds[already_rated] = -np.inf

        # Top K items
        top_k_idx = np.argsort(user_preds)[-K:][::-1]
        top_k_products = set([product_ids[i] for i in top_k_idx])

        # Relevant items in test set
        user_test = test_df[
            (test_df['user_id'] == user) &
            (test_df['rating'] >= threshold)
        ]

        relevant_items = set(user_test['product_id'])

        if len(relevant_items) == 0:
            continue

        hits = len(top_k_products & relevant_items)

        precisions.append(hits / K)
        recalls.append(hits / len(relevant_items))

    return np.mean(precisions), np.mean(recalls)


precision_k, recall_k = precision_recall_at_k(test_df, K=10)

print("Precision@10:", precision_k)
print("Recall@10:", recall_k)

Precision: 0.6517271534761696
Recall: 0.8271365149833518
F1 Score: 0.7290291024700416
Precision@10: 0.016856256463288522
Recall@10: 0.04780169847388048


In [29]:
# ============================================================
# Session-Aware Popularity Recommender
#
# Logic:
#   - Tracks up to 5 most recently viewed products (sliding window)
#   - Recommends 12 products total, split by slot allocation:
#       1 product viewed  → 12 from current category
#       2 products viewed → 6 current + 6 prev1
#       3 products viewed → 6 current + 3 prev1 + 3 prev2
#       4 products viewed → 6 current + 2 prev1 + 2 prev2 + 2 prev3
#       5 products viewed → 4 current + 2 prev1 + 2 prev2 + 2 prev3 + 2 prev4
#   - From 6th product onward: sliding window drops oldest, keeps last 5
#   - Popularity = total interaction count per category (from df1 + df2 join)
#   - Already-viewed products are excluded from recommendations
# ============================================================

import pandas as pd
from collections import deque

# ── 1. Load data ─────────────────────────────────────────────
df1 = pd.read_csv('users_interactions.csv')   # user_id, product_id, rating, ...
df2 = pd.read_csv('products_catalog.csv')     # product_id, category, ...

df1 = df1.drop(columns=["timestamp"], errors="ignore")

# ── 2. Build category popularity from interactions ───────────
# Join df1 interactions with df2 to get category per interaction
interactions_with_cat = df1.merge(df2[['product_id', 'category']], on='product_id', how='left')

# Popularity = number of interactions per category
category_popularity = (
    interactions_with_cat
    .groupby('category')
    .size()
    .reset_index(name='interaction_count')
    .sort_values('interaction_count', ascending=False)
)
print("── Category Popularity ──")
print(category_popularity.to_string(index=False))

# ── 3. Build product-level popularity within each category ───
# For recommending specific products, rank by their own interaction count
product_popularity = (
    df1.groupby('product_id')
    .size()
    .reset_index(name='interaction_count')
)

# Attach category to each product
product_popularity = product_popularity.merge(df2[['product_id', 'category']], on='product_id', how='left')

# Rank products within their category by popularity
product_popularity['rank_in_category'] = (
    product_popularity
    .groupby('category')['interaction_count']
    .rank(method='first', ascending=False)
    .astype(int)
)
product_popularity = product_popularity.sort_values(['category', 'rank_in_category'])

# ── 4. Slot allocation table ─────────────────────────────────
# Index 0 = current (most recent), 1 = prev1, 2 = prev2, etc.
SLOT_ALLOCATION = {
    1: [12],
    2: [6, 6],
    3: [6, 3, 3],
    4: [6, 2, 2, 2],
    5: [4, 2, 2, 2, 2],
}

# ── 5. Helper: get top-N products from a category ────────────
def get_top_products(category, n, exclude_products=None):
    """
    Returns top-N most popular products from a given category,
    excluding any already-viewed products.
    """
    if exclude_products is None:
        exclude_products = set()

    pool = product_popularity[
        (product_popularity['category'] == category) &
        (~product_popularity['product_id'].isin(exclude_products))
    ]

    top = pool.nsmallest(n, 'rank_in_category')['product_id'].tolist()
    return top

# ── 6. Core recommendation function ──────────────────────────
def get_category_for_product(product_id):
    """Look up the category of a product."""
    match = df2.loc[df2['product_id'] == product_id, 'category']
    if match.empty:
        raise ValueError(f"Product '{product_id}' not found in catalog.")
    return match.values[0]


def session_recommend(session_history, top_n=12):
    """
    Generate 12 recommendations based on session history.

    Parameters
    ----------
    session_history : list of product_ids in order of interaction
                      (oldest → newest). Max 5 tracked.
    top_n           : total recommendations to return (default 12)

    Returns
    -------
    DataFrame with recommended products, their category,
    source slot, and popularity rank.
    """
    if not session_history:
        raise ValueError("Session history is empty.")

    # Keep only last 5 (sliding window)
    window = list(session_history[-5:])
    window_size = len(window)

    # Slots: index 0 = most recent product
    slots = SLOT_ALLOCATION[window_size]

    # Products already seen — exclude from recommendations
    viewed = set(session_history)

    recommendations = []

    for slot_idx, slot_count in enumerate(slots):
        # slot_idx 0 = most recent, 1 = second most recent, etc.
        product_id = window[-(slot_idx + 1)]
        category   = get_category_for_product(product_id)

        # Exclude already viewed + already recommended
        already_used = viewed | {r['product_id'] for r in recommendations}
        top_products = get_top_products(category, slot_count, exclude_products=already_used)

        for pid in top_products:
            pop_rank = product_popularity.loc[
                product_popularity['product_id'] == pid, 'rank_in_category'
            ].values[0]

            recommendations.append({
                'product_id':       pid,
                'category':         category,
                'source_product':   product_id,
                'slot':             f"slot_{slot_idx + 1} ({'current' if slot_idx == 0 else f'prev{slot_idx}'})",
                'popularity_rank':  int(pop_rank),
            })

    result = pd.DataFrame(recommendations)

    # Attach full product details from catalog
    # Drop category from df2 first to avoid collision with our own category column
    df2_no_cat = df2.drop(columns=['category'], errors='ignore')
    result = result.merge(df2_no_cat, on='product_id', how='left')

    return result


# ── 7. Demo ───────────────────────────────────────────────────
if __name__ == "__main__":

    # Simulate a user browsing session
    test_cases = [
        ["P0001"],
        ["P0001", "P0004"],
        ["P0001", "P0004", "P0006"],
        ["P0001", "P0004", "P0006", "P0010"],
        ["P0001", "P0004", "P0006", "P0010", "P0015"],
        # 6th product — sliding window drops P0001
        ["P0001", "P0004", "P0006", "P0010", "P0015", "P0020"],
    ]

    for history in test_cases:
        print(f"\n{'='*60}")
        print(f"Session history : {history}")
        window_shown = history[-5:]
        print(f"Active window   : {window_shown}  ({len(window_shown)} products)")
        slots = SLOT_ALLOCATION[len(window_shown)]
        print(f"Slot allocation : {slots}  (sum = {sum(slots)})")
        print(f"{'='*60}")

        recs = session_recommend(history)
        print(recs[['slot', 'category', 'product_id', 'popularity_rank']].to_string(index=False))

── Category Popularity ──
   category  interaction_count
     Beauty              10335
Electronics               8424
       Home               3583
    Fashion               3361
     Sports               1799

Session history : ['P0001']
Active window   : ['P0001']  (1 products)
Slot allocation : [12]  (sum = 12)
            slot    category product_id  popularity_rank
slot_1 (current) Electronics      P0017                1
slot_1 (current) Electronics      P0367                2
slot_1 (current) Electronics      P0006                3
slot_1 (current) Electronics      P0216                4
slot_1 (current) Electronics      P0424                5
slot_1 (current) Electronics      P0018                6
slot_1 (current) Electronics      P0155                7
slot_1 (current) Electronics      P0236                8
slot_1 (current) Electronics      P0446                9
slot_1 (current) Electronics      P0251               10
slot_1 (current) Electronics      P0429               1