In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest

import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares

In [4]:
# File path
events_path = "events.csv"

# Load the events.csv file
events_df = pd.read_csv(events_path)

# Display basic info and first few rows
events_df.info(), events_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


(None,
        timestamp  visitorid event  itemid  transactionid
 0  1433221332117     257597  view  355908            NaN
 1  1433224214164     992329  view  248676            NaN
 2  1433221999827     111016  view  318965            NaN
 3  1433221955914     483717  view  253185            NaN
 4  1433221337106     951259  view  367447            NaN)

In [5]:
# convert the Unix timestamp into a readable date/time format
events_df['datetime'] = pd.to_datetime(events_df['timestamp'], unit='ms')


In [6]:
# how many events of each type exist 
events_df['event'].value_counts()


event
view           2664312
addtocart        69332
transaction      22457
Name: count, dtype: int64

In [7]:
# how many unique visitors there are.
events_df['visitorid'].nunique()


1407580

In [8]:
events_df.isnull().sum()


timestamp              0
visitorid              0
event                  0
itemid                 0
transactionid    2733644
datetime               0
dtype: int64

In [9]:
events_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,datetime
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106


In [10]:
# # Load the events data
# events_df = pd.read_csv("events.csv")

# # Convert timestamp to datetime format
# events_df['datetime'] = pd.to_datetime(events_df['timestamp'], unit='ms')

# Count event types per user
user_events = events_df.pivot_table(index='visitorid', columns='event', aggfunc='size', fill_value=0)

# Rename columns for clarity
user_events.columns = ['num_addtocart', 'num_transaction', 'num_view']

# Add a total event count
user_events['total_events'] = user_events.sum(axis=1)

# Step 1.2: Calculate conversion rates
total_views = len(user_events[user_events['num_view'] > 0])
total_addtocart = len(user_events[user_events['num_addtocart'] > 0])
total_transactions = len(user_events[user_events['num_transaction'] > 0])

view_to_cart_rate = (total_addtocart / total_views) * 100
cart_to_purchase_rate = (total_transactions / total_addtocart) * 100

print(f"View → Add to Cart Conversion Rate: {view_to_cart_rate:.2f}%")
print(f"Add to Cart → Purchase Conversion Rate: {cart_to_purchase_rate:.2f}%")

# Step 1.3: Time difference between events for each user
events_df.sort_values(by=['visitorid', 'datetime'], inplace=True)
events_df['time_diff'] = events_df.groupby('visitorid')['datetime'].diff().dt.total_seconds()

# Step 2: Detecting Bots (Anomaly Detection)
# Define a threshold (e.g., users with extremely high event counts)
threshold = user_events['total_events'].quantile(0.99)  # Top 1% most active users

# Identify potential bot users
bot_users = user_events[user_events['total_events'] > threshold]

print(f"Number of potential bot users: {len(bot_users)}")

# Show the top 10 bot-like users
print(bot_users.sort_values(by='total_events', ascending=False).head(10))


View → Add to Cart Conversion Rate: 2.69%
Add to Cart → Purchase Conversion Rate: 31.07%
Number of potential bot users: 12712
           num_addtocart  num_transaction  num_view  total_events
visitorid                                                        
1150086              719              559      6479          7757
530559               419              286      3623          4328
152963               371              349      2304          3024
895999                56               50      2368          2474
163561               124               92      2194          2410
371606               110               94      2141          2345
286616               120               75      2057          2252
684514               231              189      1826          2246
892013                 1                0      2023          2024
861299               230              188      1573          1991


In [11]:

# # Load data
# events_df = pd.read_csv("events.csv")
# events_df['datetime'] = pd.to_datetime(events_df['timestamp'], unit='ms')

# Step 1: Aggregate User Data
user_events = events_df.pivot_table(index='visitorid', columns='event', aggfunc='size', fill_value=0)
user_events.columns = ['num_addtocart', 'num_transaction', 'num_view']
user_events['total_events'] = user_events.sum(axis=1)

# Step 2: Identify Bots Based on Activity Threshold
event_threshold = user_events['total_events'].quantile(0.99)  # Top 1% users
bots_rule_based = user_events[user_events['total_events'] > event_threshold]

# Step 3: Calculate Time Differences per User
events_df.sort_values(by=['visitorid', 'datetime'], inplace=True)
events_df['time_diff'] = events_df.groupby('visitorid')['datetime'].diff().dt.total_seconds()
mean_time_diff = events_df.groupby('visitorid')['time_diff'].mean()

# Step 4: Apply Isolation Forest for Anomaly Detection
features = user_events[['num_addtocart', 'num_transaction', 'num_view', 'total_events']]
iso_forest = IsolationForest(contamination=0.01, random_state=42)
user_events['anomaly_score'] = iso_forest.fit_predict(features)

# Step 5: Merge Time Difference Data
user_events = user_events.merge(mean_time_diff, on='visitorid', how='left')

# Step 6: Define Final Bot Users
bot_users = user_events[(user_events['anomaly_score'] == -1) | (user_events['total_events'] > event_threshold)]
print(f"Number of bots detected: {len(bot_users)}")

# Step 7: Filter Out Bots
cleaned_events_df = events_df[~events_df['visitorid'].isin(bot_users.index)]
print(f"Remaining users after bot removal: {cleaned_events_df['visitorid'].nunique()}")

# Save cleaned dataset
cleaned_events_df.to_csv("cleaned_events.csv", index=False)


Number of bots detected: 16694
Remaining users after bot removal: 1390886


### Preprocessing item_properties.csv (part1 and 2)

clean and merge the item_properties_part1.1.csv and item_properties_part2.csv

In [13]:
# Load both item properties files
item_properties_1 = pd.read_csv("item_properties_part1.1.csv")
item_properties_2 = pd.read_csv("item_properties_part2.csv")

# Merge the two parts into one DataFrame
item_properties = pd.concat([item_properties_1, item_properties_2])

# Convert timestamp to readable format
item_properties['datetime'] = pd.to_datetime(item_properties['timestamp'], unit='ms')

# Keep only the latest property value for each item
item_properties = item_properties.sort_values(by=['itemid', 'datetime']).drop_duplicates(subset=['itemid'], keep='last')

# Filter for category and availability
item_properties_filtered = item_properties[item_properties['property'].isin(['categoryid', 'available'])]

# Pivot the data so that each item has categoryid & available columns
item_properties_pivot = item_properties_filtered.pivot(index='itemid', columns='property', values='value').reset_index()

# Convert availability to integer (0 or 1)
item_properties_pivot['available'] = item_properties_pivot['available'].astype(float).fillna(0).astype(int)

# Save cleaned dataset
item_properties_pivot.to_csv("cleaned_item_properties.csv", index=False)

print("Item properties preprocessing complete! 🚀")


Item properties preprocessing complete! 🚀


In [14]:
# Load cleaned item properties
item_properties = pd.read_csv("cleaned_item_properties.csv")

# Load cleaned events data (after bot removal)
events_df = pd.read_csv("cleaned_events.csv")

# Merge events with item properties to include category information
events_df = events_df.merge(item_properties, on='itemid', how='left')

# Filter out unavailable products
events_df = events_df[events_df['available'] == 1]

# Step 1: Identify most viewed & purchased products per category
popular_items = events_df.groupby(['categoryid', 'itemid'])['event'].count().reset_index()
popular_items = popular_items.sort_values(by=['categoryid', 'event'], ascending=[True, False])

# Step 2: Define a recommendation function
def recommend_items(categoryid, exclude_itemid=None, num_recommendations=5):
    """ Recommend top N items from the same category, excluding the current item """
    recommendations = popular_items[popular_items['categoryid'] == categoryid]
    if exclude_itemid:
        recommendations = recommendations[recommendations['itemid'] != exclude_itemid]
    return recommendations.head(num_recommendations)['itemid'].tolist()

# Step 3: Generate recommendations for each item
sample_item = events_df.sample(1)  # Pick a random item a user interacted with
sample_category = sample_item['categoryid'].values[0]
sample_itemid = sample_item['itemid'].values[0]

# Get recommendations
recommended_items = recommend_items(sample_category, exclude_itemid=sample_itemid)

print(f"User viewed item {sample_itemid} in category {sample_category}.")
print(f"Recommended items from the same category: {recommended_items}")


User viewed item 30391 in category nan.
Recommended items from the same category: []


Instead of training a full recommendation model, we'll use a User-Item Matrix and find similar users using K-Nearest Neighbors (KNN).

In [None]:
# Load cleaned events data
events_df = pd.read_csv("cleaned_events.csv")

# Convert event types into numerical scores (higher score for transactions)
event_scores = {'view': 1, 'addtocart': 2, 'transaction': 5}
events_df['rating'] = events_df['event'].map(event_scores)

# Step 1: Create a Sparse User-Item Matrix
unique_users = events_df['visitorid'].nunique()
unique_items = events_df['itemid'].nunique()

user_to_index = {user: i for i, user in enumerate(events_df['visitorid'].unique())}
item_to_index = {item: i for i, item in enumerate(events_df['itemid'].unique())}
index_to_item = {v: k for k, v in item_to_index.items()}  # Reverse mapping

rows = events_df['visitorid'].map(user_to_index)
cols = events_df['itemid'].map(item_to_index)
data = events_df['rating']

user_item_sparse = sparse.csr_matrix((data, (rows, cols)), shape=(unique_users, unique_items))

# Step 2: Train the ALS Model
model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)
model.fit(user_item_sparse)

# Step 3: Generate Recommendations for a User
def recommend_for_user(user_id, num_recommendations=5):
    """ Recommend top N items for a given user """
    if user_id not in user_to_index:
        return "User not found in dataset."

    user_idx = user_to_index[user_id]
    recommendations = model.recommend(user_idx, user_item_sparse[user_idx], N=num_recommendations)

    # Extract only integer item indices from recommendations
    recommended_items = [index_to_item[int(i[0])] for i in recommendations]  

    return recommended_items

# Test Recommendation for a Random User
sample_user = events_df['visitorid'].sample(1).values[0]
recommended_items = recommend_for_user(sample_user)

print(f"Top recommendations for User {sample_user}: {recommended_items}")


100%|██████████| 20/20 [02:07<00:00,  6.40s/it]


Top recommendations for User 328227: [np.int64(247909), np.int64(285930)]


In [21]:
import pickle  # For saving the model

# Load item properties
item_properties = pd.read_csv("cleaned_item_properties.csv")

# Step 1: Modify Recommendation Function to Include Item Details
def recommend_for_user(user_id, num_recommendations=5):
    """ Recommend top N items for a given user with details """
    if user_id not in user_to_index:
        return "User not found in dataset."

    user_idx = user_to_index[user_id]
    recommendations = model.recommend(user_idx, user_item_sparse[user_idx], N=num_recommendations)

    # Convert indices back to item IDs
    recommended_items = [index_to_item[int(i[0])] for i in recommendations]

    # Merge with item details
    recommended_df = pd.DataFrame({'itemid': recommended_items})
    recommended_df = recommended_df.merge(item_properties, on="itemid", how="left")

    return recommended_df

# Step 2: Save the ALS Model to a File
with open("als_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Step 3: Test Enhanced Recommendation Output
sample_user = events_df['visitorid'].sample(1).values[0]
recommended_items = recommend_for_user(sample_user)

print(f"Top recommendations for User {sample_user}:\n", recommended_items)


Top recommendations for User 342365:
    itemid  available  categoryid
0    5411        NaN         NaN
1  285930        1.0         NaN
