In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the raw event log; later cells read its `event`, `visitorid` and
# `itemid` columns.
df = pd.read_csv(filepath_or_buffer='events.csv')

In [3]:
# Implicit-feedback strength per event type: the further along the purchase
# funnel an event is, the larger its weight
# (view = weak, add to cart = medium, transaction = strong).
event_weight = dict(view=1, addtocart=2, transaction=3)

# Translate every row's event label into its numeric weight. Labels that are
# not keys of `event_weight` come out as NaN (Series.map behaviour with a dict).
df['interaction'] = df['event'].map(event_weight)


In [4]:
# Collapse repeated events for the same (visitor, item) pair into one row whose
# `interaction` value is the sum of the individual event weights.
interaction_df = df.groupby(['visitorid', 'itemid'], as_index=False)['interaction'].sum()


In [5]:
# How many distinct items each visitor touched, and how many distinct visitors
# touched each item (interaction_df has one row per visitor/item pair).
user_counts = interaction_df['visitorid'].value_counts()
item_counts = interaction_df['itemid'].value_counts()

# Activity thresholds: at least 20 pairs on each side.
active_users = user_counts.loc[user_counts.ge(20)].index
popular_items = item_counts.loc[item_counts.ge(20)].index

# Keep only rows where BOTH the visitor and the item pass their threshold.
# NOTE: this is a single pass; counts are not recomputed after filtering.
filtered_df = interaction_df.loc[
    interaction_df['visitorid'].isin(active_users)
    & interaction_df['itemid'].isin(popular_items)
]


In [6]:
# Dense user-item matrix: one row per visitor, one column per item, each cell
# holding the interaction strength (0 where the pair never interacted).
# 'mean' is pivot_table's default aggregation; it is spelled out here only to
# make the behaviour explicit.
user_item_matrix = pd.pivot_table(
    filtered_df,
    values='interaction',
    index='visitorid',
    columns='itemid',
    aggfunc='mean',
    fill_value=0,
)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Item-item cosine similarity.
# cosine_similarity compares the rows of its input, so the user-by-item matrix
# is transposed first to put one item on each row.
item_similarity = cosine_similarity(user_item_matrix.transpose())

In [8]:
# Label the raw similarity array with item ids on both axes so the similarity
# of any pair of items can be looked up by id.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    columns=user_item_matrix.columns,
    index=user_item_matrix.columns,
)

In [9]:
# Function to recommend similar items
def recommend_items(item_id, top_n=5):
    """Return the ``top_n`` items most similar to ``item_id``.

    Args:
        item_id: Label of the query item. It is looked up as a column of
            ``item_similarity_df``; a missing label raises ``KeyError``.
        top_n: Maximum number of neighbours to return. Values <= 0 yield an
            empty Series.

    Returns:
        pandas.Series of similarity scores indexed by item id, ordered from
        most to least similar, with the query item itself removed.
    """
    # Sort items based on similarity score
    similar_items = item_similarity_df[item_id].sort_values(ascending=False)
    # Remove the query item by LABEL, not by position. Another item whose
    # user vector is identical (or proportional) also scores ~1.0 and may sort
    # ahead of the query item, so slicing off row 0 could drop a genuine
    # neighbour and return the query item as its own recommendation.
    similar_items = similar_items.drop(labels=item_id, errors='ignore')
    return similar_items.iloc[:max(top_n, 0)]

In [10]:
# Use the first item id in the similarity matrix as a demo query
sample_item = item_similarity_df.columns[0]
# Show its five closest items
recommend_items(item_id=sample_item, top_n=5)

Unnamed: 0_level_0,6
itemid,Unnamed: 1_level_1
345004,0.57735
79057,0.57735
287572,0.57735
424932,0.57735
242380,0.57735


In [11]:
# Same query item, widened to the ten closest items (for tuning top_n)
recommend_items(item_id=sample_item, top_n=10)

Unnamed: 0_level_0,6
itemid,Unnamed: 1_level_1
345004,0.57735
79057,0.57735
287572,0.57735
424932,0.57735
242380,0.57735
295168,0.471405
291036,0.408248
345755,0.408248
272144,0.408248
47353,0.408248


MILESTONE-3:

In [22]:
# Ground truth for evaluation: for every visitor, the set of item ids they
# interacted with in the filtered data.
actual_items_per_user = (
    filtered_df['itemid']
    .groupby(filtered_df['visitorid'])
    .apply(set)
)


In [23]:
def precision_recall_f1(actual_items, recommended_items):
    """Compute set-based precision, recall and F1 for one recommendation list.

    Both inputs are de-duplicated (converted to sets) before scoring.

    Args:
        actual_items: Iterable of relevant item ids (ground truth); ``None``
            is treated as empty.
        recommended_items: Iterable of recommended item ids; ``None`` is
            treated as empty.

    Returns:
        Tuple ``(precision, recall, f1)`` of floats. Any metric whose
        denominator would be zero is reported as ``0.0``.
    """
    # Convert first and test emptiness on the sets: truth-testing the raw
    # input raises for array-like containers (e.g. numpy arrays).
    actual_set = set(actual_items) if actual_items is not None else set()
    recommended_set = (
        set(recommended_items) if recommended_items is not None else set()
    )

    if not recommended_set:
        return 0.0, 0.0, 0.0

    true_positives = len(actual_set & recommended_set)

    precision = true_positives / len(recommended_set)
    # Guard against an empty ground-truth set, which previously raised
    # ZeroDivisionError.
    recall = true_positives / len(actual_set) if actual_set else 0.0

    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1


In [25]:
def get_user_recommendations_for_eval(user_id, top_k=5, exclude_seen=False):
    """Rank items for one user by summed similarity to the items they used.

    For every item the user interacted with (matrix value > 0), the similarity
    of each candidate item to it is accumulated; candidates are then ordered
    by that total, highest first.

    Args:
        user_id: Row label in ``user_item_matrix``.
        top_k: Number of item ids to return.
        exclude_seen: When True, items the user already interacted with are
            not considered as candidates. Defaults to False, which keeps the
            original behaviour.

    Returns:
        List of up to ``top_k`` item ids, best first. Empty when ``user_id``
        is not in ``user_item_matrix.index`` or the user has no positive
        interactions.

    NOTE(review): with ``exclude_seen=False`` every seen item contributes its
    own self-similarity (the maximum cosine value), so seen items tend to
    dominate the ranking. Scoring that ranking against the same interactions
    the matrix was built from yields optimistic, in-sample metrics; a held-out
    split together with ``exclude_seen=True`` is needed for an unbiased
    estimate.
    """
    if user_id not in user_item_matrix.index:
        return []

    user_row = user_item_matrix.loc[user_id]
    user_items = user_row[user_row > 0].index.tolist()
    seen_items = set(user_items)

    scores = {}

    for item in user_items:
        if item not in item_similarity_df.columns:
            continue
        for sim_item, score in item_similarity_df[item].items():
            # Seen items are only skipped when the caller asks for it.
            if exclude_seen and sim_item in seen_items:
                continue
            scores[sim_item] = scores.get(sim_item, 0) + score

    # sorted() is stable, so candidates with equal totals keep the order in
    # which they were first encountered.
    ranked_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [item for item, _ in ranked_items[:top_k]]


In [26]:
precision_scores = []
recall_scores = []
f1_scores = []

# Evaluate on a reproducible random subset of users. The sample size is capped
# at the number of available users because Series.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
eval_users = actual_items_per_user.sample(
    min(50, len(actual_items_per_user)), random_state=42
).index

# NOTE(review): the ground truth comes from the same interactions that built
# the similarity matrix, so the figures below are in-sample numbers.
for user in eval_users:
    actual_items = actual_items_per_user[user]
    recommended_items = get_user_recommendations_for_eval(user, top_k=5)

    p, r, f = precision_recall_f1(actual_items, recommended_items)

    precision_scores.append(p)
    recall_scores.append(r)
    f1_scores.append(f)

# Macro-average over the evaluated users
print("Precision@5:", np.mean(precision_scores))
print("Recall@5:", np.mean(recall_scores))
print("F1@5:", np.mean(f1_scores))


Precision@5: 0.92
Recall@5: 0.3363553025683326
F1@5: 0.41731533532883164


In [27]:
# Per-user metric accumulators for the top-10 run
precision_scores_10, recall_scores_10, f1_scores_10 = [], [], []

# Re-score the same sampled users, this time with ten recommendations each
for user in eval_users:
    actual_items = actual_items_per_user[user]
    recommended_items = get_user_recommendations_for_eval(user, top_k=10)

    p, r, f = precision_recall_f1(actual_items, recommended_items)

    precision_scores_10 += [p]
    recall_scores_10 += [r]
    f1_scores_10 += [f]

# Macro-average over the evaluated users
print("Precision@10:", np.mean(precision_scores_10))
print("Recall@10:", np.mean(recall_scores_10))
print("F1@10:", np.mean(f1_scores_10))


Precision@10: 0.8180000000000001
Recall@10: 0.5074604638802588
F1@10: 0.5440136627584474
