In [1]:
# ===== 1. Imports =====
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

In [3]:
# ===== 2. Load and Filter Data =====
# Read the raw event log (path is relative to the notebook's working directory).
# Later cells rely on its 'event', 'visitorid' and 'itemid' columns.
df = pd.read_csv('events.csv')

In [5]:
# Keep only the two event types that are given a weight further down.
# .copy() turns the filtered slice into an independent frame, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['event'].isin(['view', 'transaction'])].copy()

In [7]:
# Assign weights: implicit-feedback strength per event type, with a
# transaction counting five times as much as a view. Any event type missing
# from the mapping would end up as NaN; the preceding filter keeps only the
# two mapped types.
df['weight'] = df['event'].map({'view': 1, 'transaction': 5})

In [9]:
# ===== 3. Encode Users and Items =====
# One encoder per entity so users and items each get their own integer id
# space; both objects are kept so encoded ids can be mapped back to raw ids.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

In [11]:
# Map raw visitor/item ids to integer codes; these codes are later used as
# the row/column indices of the sparse interaction matrix.
# NOTE(review): the encoders are fitted before the activity filter that
# follows, so users/items dropped by that filter still occupy codes —
# consider filtering first if a compact id space is wanted.
df['user'] = user_encoder.fit_transform(df['visitorid'])
df['item'] = item_encoder.fit_transform(df['itemid'])

In [13]:
# ===== 4. Filter Active Users and Popular Items =====
# Minimum number of event rows a user / an item needs in order to be kept.
min_user_interactions = 10
min_item_interactions = 20

# Count rows per encoded id and keep the ids that reach their threshold.
user_counts = df['user'].value_counts()
item_counts = df['item'].value_counts()
active_users = user_counts[user_counts >= min_user_interactions].index
popular_items = item_counts[item_counts >= min_item_interactions].index

# Single pass over the counts computed above: a row survives only when both
# its user and its item qualify.
keep_user = df['user'].isin(active_users)
keep_item = df['item'].isin(popular_items)
df = df[keep_user & keep_item]

In [15]:
# ===== 5. Train-Test Split =====
# Random 80/20 split over individual interaction rows, with a fixed seed so
# the split is repeatable.
# NOTE(review): rows are split without regard to user, so all of a user's
# events can land in the test set — confirm this is acceptable for evaluation.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
# ===== 6. Build Sparse User-Item Matrix =====
# Rows are encoded users, columns are encoded items. The shape is pinned to
# the encoders' full id space instead of being inferred from the largest id
# that happens to appear in train_df; otherwise a user or item that only
# occurs in test_df could index past the edge of the matrix.
# Repeated (user, item) pairs are summed during the COO -> CSR conversion, so
# each stored value is the user's total interaction weight for that item.
train_matrix = coo_matrix(
    (train_df['weight'], (train_df['user'], train_df['item'])),
    shape=(len(user_encoder.classes_), len(item_encoder.classes_)),
).tocsr()

In [19]:
# ===== 7. Fit Nearest Neighbors (User-based) =====
# Brute-force cosine search over the rows (users) of the training matrix.
# Six neighbours by default: the query user itself plus five others.
nn_model = NearestNeighbors(
    n_neighbors=6,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
nn_model.fit(train_matrix)

In [21]:
# ===== 8. Recommend Top-K Items for a User =====
def recommend_items_ann(user_id, k=5, model=None, interactions_matrix=None, n_neighbors=5):
    """Recommend up to ``k`` unseen items for a user by neighbour voting.

    Every item is scored by the number of neighbouring users that interacted
    with it; items the target user already has in ``interactions_matrix`` are
    never recommended.

    Parameters
    ----------
    user_id : int
        Row index of the target user in ``interactions_matrix``.
    k : int, default 5
        Maximum number of items to return.
    model : object
        Fitted neighbour index exposing ``kneighbors(X, n_neighbors=...)``
        returning ``(distances, indices)``. Assumed to be fitted on
        ``interactions_matrix``.
    interactions_matrix : scipy.sparse CSR matrix
        User-by-item interaction matrix (rows must expose ``.indices``).
    n_neighbors : int, default 5
        Number of similar users, not counting the target user, that vote.

    Returns
    -------
    list
        Item column indices ordered by descending vote count; items with the
        same count keep the order in which they were first encountered.

    Raises
    ------
    ValueError
        If ``model`` or ``interactions_matrix`` is not supplied.
    """
    if model is None or interactions_matrix is None:
        raise ValueError("model and interactions_matrix are required")

    user_row = interactions_matrix[user_id]

    # Ask for one extra neighbour because the query user is normally part of
    # its own result set, but never for more rows than the matrix holds.
    n_query = min(n_neighbors + 1, interactions_matrix.shape[0])
    _, indices = model.kneighbors(user_row, n_neighbors=n_query)

    # Remove the query user by id rather than by position: with tied distances
    # (duplicate or empty rows) it is not guaranteed to be returned first.
    similar_users = [u for u in indices.ravel() if u != user_id][:n_neighbors]

    seen_items = set(user_row.indices)
    votes = {}
    for sim_user in similar_users:
        for item in interactions_matrix[sim_user].indices:
            if item not in seen_items:
                votes[item] = votes.get(item, 0) + 1

    # sorted() is stable, so equal vote counts keep first-seen order.
    ranked = sorted(votes.items(), key=lambda pair: -pair[1])
    return [item for item, _ in ranked[:k]]

In [23]:
# ===== 9. Utility to Show Recommendations with Real IDs =====
def show_recommendations_for_user(encoded_user_id, k=5):
    """Print recommendations and already-seen items for one user using raw ids.

    Uses the notebook-level ``nn_model``, ``train_matrix``, ``user_encoder``
    and ``item_encoder``.

    Parameters
    ----------
    encoded_user_id : int
        Encoded user id, i.e. the user's row index in ``train_matrix``.
    k : int, default 5
        Number of recommendations to request.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Items the user already interacted with in the training matrix.
    user_row = train_matrix[encoded_user_id]
    decoded_seen = item_encoder.inverse_transform(user_row.indices)

    # Recommendations come back as encoded item ids; map them to raw ids.
    top_items = recommend_items_ann(
        interactions_matrix=train_matrix,
        model=nn_model,
        user_id=encoded_user_id,
        k=k,
    )
    decoded_items = item_encoder.inverse_transform(top_items)

    print(f"\n🧑 Recommendations for User ID {user_encoder.inverse_transform([encoded_user_id])[0]}:")
    print(f"📦 Recommended Items (Top-{k}): {list(decoded_items)}")
    print(f"✅ Items Already Seen: {list(decoded_seen)}\n")

In [25]:
# ===== 10. Try a Sample User =====
# value_counts() orders by frequency, so its first index label is the encoded
# user with the most rows in the training split.
busiest_users = train_df['user'].value_counts()
sample_user_encoded = busiest_users.index[0]

# Show recommendations
show_recommendations_for_user(sample_user_encoded, k=5)


🧑 Recommendations for User ID 1150086:
📦 Recommended Items (Top-5): [201816, 210002, 278586, 339703, 356029]
✅ Items Already Seen: [25, 42, 546, 550, 869, 909, 1152, 1538, 1590, 1684, 1879, 1976, 2104, 2416, 2455, 2567, 2634, 2641, 2711, 2810, 2980, 3902, 4067, 4437, 4442, 4482, 4537, 4606, 4613, 4740, 5143, 5470, 5675, 5848, 6605, 6656, 6692, 6720, 6848, 6913, 7804, 7943, 8023, 8122, 8259, 8347, 8523, 8588, 8638, 8651, 9385, 9759, 10034, 10040, 10107, 10291, 10572, 10594, 10697, 10885, 10934, 11131, 11279, 11663, 12057, 12217, 12233, 13031, 13092, 13169, 13298, 13417, 13556, 13747, 13852, 13925, 14547, 14644, 15543, 15744, 15903, 16063, 16158, 16206, 16344, 17108, 17163, 17379, 17478, 17698, 17798, 18265, 18287, 18726, 19547, 19677, 19833, 19873, 19934, 19976, 20092, 20158, 20208, 20416, 20740, 20968, 20981, 21013, 21163, 22161, 22257, 22544, 22839, 23325, 23347, 23660, 23762, 24147, 24154, 24728, 24795, 24855, 25027, 25375, 25383, 25515, 25590, 25662, 25762, 25917, 26644, 26747, 268

In [41]:
original_user_id = 76196  # Example visitorid from dataset
# Translate the raw visitorid into its encoded id (the row index used by the
# model); this rebinds sample_user_encoded from the previous example.
sample_user_encoded = user_encoder.transform([original_user_id])[0]

# Show recommendations
show_recommendations_for_user(sample_user_encoded, k=5)


🧑 Recommendations for User ID 76196:
📦 Recommended Items (Top-5): [99093, 155625, 443170, 7943, 25442]
✅ Items Already Seen: [99247, 287218, 353221, 367030]



In [43]:
# ===== 9. Create Ground Truth for Evaluation =====
# One set of held-out item ids per user, restricted to users that have at
# least two distinct items in the test split.
test_group = (
    test_df.groupby('user')['item']
    .apply(set)
    .loc[lambda items_by_user: items_by_user.map(len).ge(2)]
)

In [45]:
# ===== 10. Evaluation Functions =====
def precision_at_k(pred, actual, k):
    """Share of the ``k`` recommendation slots that hold a relevant item.

    Parameters
    ----------
    pred : sequence
        Recommended items, best first; only the first ``k`` are considered.
    actual : collection
        Relevant (ground-truth) items.
    k : int
        Cutoff; also the denominator, even if ``pred`` has fewer entries.

    Returns
    -------
    float or int
        Hits divided by ``k``, or 0 when ``actual`` is empty.
    """
    if not actual:
        return 0
    top_k = set(pred[:k])
    relevant = set(actual)
    return len(top_k.intersection(relevant)) / k

def recall_at_k(pred, actual, k):
    """Share of the distinct relevant items found in the top ``k`` predictions.

    Parameters
    ----------
    pred : sequence
        Recommended items, best first; only the first ``k`` are considered.
    actual : iterable or None
        Relevant (ground-truth) items. Duplicates are counted once, so a list
        with repeated entries gives the same result as the equivalent set.
    k : int
        Cutoff applied to ``pred``.

    Returns
    -------
    float
        Hits divided by the number of distinct relevant items, or 0.0 when
        there are none.
    """
    # Deduplicate first so numerator and denominator count the same things;
    # this also avoids truth-testing array-like inputs.
    relevant = set(actual) if actual is not None else set()
    if not relevant:
        return 0.0
    hits = relevant.intersection(pred[:k])
    return len(hits) / len(relevant)

In [47]:
# ===== 11. Run Evaluation =====
EVAL_K = 5            # cutoff used for the recommendations and both metrics
MAX_EVAL_USERS = 100  # upper bound on the number of sampled test users

precision_list = []
recall_list = []

# Every entry of test_group is a non-empty item set, so no extra length filter
# is needed. Capping n at the population size keeps .sample() from raising
# when fewer than MAX_EVAL_USERS users qualify.
sample_users = test_group.sample(
    n=min(MAX_EVAL_USERS, len(test_group)), random_state=42
).index

for user in sample_users:
    # A user whose events all fell into the test split can lie beyond the
    # last row of train_matrix; there is nothing to base a prediction on.
    if user >= train_matrix.shape[0]:
        continue

    actual = test_group[user]
    pred = recommend_items_ann(user, k=EVAL_K, model=nn_model, interactions_matrix=train_matrix)

    if not pred:
        continue  # skip users for whom nothing could be recommended

    precision_list.append(precision_at_k(pred, actual, EVAL_K))
    recall_list.append(recall_at_k(pred, actual, EVAL_K))

In [48]:
# ===== 12. Show Results =====
# Unweighted mean of the per-user scores collected by the evaluation loop.
# If no user was scored, both lists are empty and np.mean yields nan.
print(f'Average Precision@5: {np.mean(precision_list):.4f}')
print(f'Average Recall@5: {np.mean(recall_list):.4f}')

Average Precision@5: 0.0323
Average Recall@5: 0.0577


In [51]:
# Number of sampled users that actually contributed a score; users skipped by
# the evaluation loop are not counted.
print(f"Evaluated {len(precision_list)} users with valid predictions.")

Evaluated 99 users with valid predictions.
