In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw event log; the profile-building cell groups it by user_id.
events_csv_path = "Dataset/Synthetic_Dataset/user_events.csv"
events_df = pd.read_csv(events_csv_path)

In [4]:
# Build one profile record per user from the event log.
#
# Each record holds:
#   avg_spend      - mean price_seen over the user's purchases, or over all of
#                    the user's events when the user never purchased.
#   eco_awareness  - fraction (0..1) of the user's purchases whose catalog
#                    "Total(kg CO2e)" is below ECO_CO2E_THRESHOLD_KG.
#   refurb_pref    - 1 if at least one purchased item has
#                    refurbished_available == 1, else 0.
#   delay_tolerant - simulated flag, 1 with probability DELAY_TOLERANT_PROB.

SEED = 42                     # fixed seed so Restart & Run All reproduces delay_tolerant
ECO_CO2E_THRESHOLD_KG = 300   # purchases strictly below this count as "eco"
DELAY_TOLERANT_PROB = 0.6     # simulated share of users who accept delayed delivery

rng = np.random.default_rng(SEED)

# Read the catalog once instead of once per user: the read is loop-invariant.
# Duplicate itemID rows are dropped so the left merge below cannot multiply
# event rows, which would otherwise let eco_awareness exceed 1.
products_catalog = (
    pd.read_csv("Dataset/Synthetic_Dataset/products_catalog.csv")[
        ["itemID", "Total(kg CO2e)", "refurbished_available"]
    ]
    .drop_duplicates(subset="itemID")
)

# Single left merge for all users; events without a catalog match keep NaN in
# the catalog columns and therefore never count as eco or refurbished.
events_with_catalog = events_df.merge(products_catalog, on="itemID", how="left")

user_profiles = []

for user_id, user_df in events_with_catalog.groupby("user_id"):
    # Filter purchases once and reuse the result for every metric.
    purchases = user_df[user_df["event_type"] == "purchase"]
    total_purchases = len(purchases)

    if total_purchases > 0:
        avg_spend = purchases["price_seen"].mean()
        eco_count = int((purchases["Total(kg CO2e)"] < ECO_CO2E_THRESHOLD_KG).sum())
        eco_awareness = eco_count / total_purchases
    else:
        # No purchases: fall back to the mean price over everything the user saw.
        avg_spend = user_df["price_seen"].mean()
        eco_awareness = 0

    refurb_pref = 1 if (purchases["refurbished_available"] == 1).any() else 0
    delay_tolerant = 1 if rng.random() < DELAY_TOLERANT_PROB else 0  # Simulate preference for delayed delivery

    user_profiles.append({
        "user_id": user_id,
        "avg_spend": round(avg_spend, 2),
        "eco_awareness": round(eco_awareness, 2),
        "refurb_pref": refurb_pref,
        "delay_tolerant": delay_tolerant
    })


In [5]:
# Turn the list of per-user dicts into a table (one row per user).
user_profiles_df = pd.DataFrame(data=user_profiles)

# Preview the first five rows as the cell's output.
user_profiles_df.head(n=5)

Unnamed: 0,user_id,avg_spend,eco_awareness,refurb_pref,delay_tolerant
0,1,24447.25,1.0,1,0
1,2,28625.0,0.8,1,1
2,3,29251.75,0.75,1,1
3,4,17817.5,0.67,1,1
4,5,28321.83,0.83,1,1


In [6]:
# Persist the profile table; index=False keeps the row index out of the file.
profiles_csv_path = "Dataset/Synthetic_Dataset/user_profiles.csv"
user_profiles_df.to_csv(profiles_csv_path, index=False)