In [1]:
import pandas as pd
import numpy as np

# Load the raw event log; the path is relative to the notebook's working directory.
path = "../data/pricing_abtest_events_sample.csv"
df = pd.read_csv(path)

# Quick look at size and schema before any transformation.
print("Rows, Cols:", df.shape)
print("\nColumns:", df.columns.tolist())

# Convert both timestamp columns from their CSV representation to datetimes.
df = df.assign(
    event_ts=lambda frame: pd.to_datetime(frame["event_ts"]),
    assignment_ts=lambda frame: pd.to_datetime(frame["assignment_ts"]),
)


Rows, Cols: (19122, 25)

Columns: ['event_id', 'user_id', 'session_id', 'experiment_id', 'variant', 'assignment_ts', 'event_ts', 'event_type', 'device_type', 'region', 'traffic_source', 'product_id', 'category', 'is_target_product', 'size_selected', 'color_selected', 'base_price', 'price_shown', 'discount_applied', 'final_price_paid', 'quantity', 'substitute_viewed', 'substitute_purchased', 'return_flag', 'return_reason']


In [2]:
# Columns this analysis expects to find in the loaded frame.
required_cols = [
    "user_id", "session_id", "experiment_id", "variant", "assignment_ts",
    "event_ts", "event_type", "product_id", "is_target_product",
    "price_shown", "final_price_paid",
]

# Report (in the order listed above) every expected column that is absent.
available_cols = set(df.columns)
missing = [name for name in required_cols if name not in available_cols]
print("Missing required columns:", missing)


Missing required columns: []


In [3]:
# Event-level frequency tables; dropna=False keeps missing values visible as their own row.
print("\nVariant counts (events):", df["variant"].value_counts(dropna=False), sep="\n")

print("\nEvent type counts:", df["event_type"].value_counts(dropna=False), sep="\n")



Variant counts (events):
variant
treatment    9923
control      9199
Name: count, dtype: int64

Event type counts:
event_type
view           10810
add_to_cart     4544
purchase        3533
return           235
Name: count, dtype: int64


In [4]:
# A user is eligible once they have at least one "view" event on the target product.
is_target_view = df["event_type"].eq("view") & df["is_target_product"].eq(True)
target_view_users = set(df.loc[is_target_view, "user_id"].unique())

print("Eligible users (target view at least once):", len(target_view_users))
print("Unique users overall:", df["user_id"].nunique())

# Keep every event (of any type) that belongs to an eligible user.
df_eligible = df.loc[df["user_id"].isin(target_view_users)].copy()
print("Eligible events:", len(df_eligible))


Eligible users (target view at least once): 4568
Unique users overall: 4568
Eligible events: 19122


In [5]:
# Sample-ratio check: compare the per-variant user split with the expected
# 50/50 allocation.

# One row per distinct (user_id, variant) pair seen in the event log.
user_assignments = df[["user_id", "variant"]].drop_duplicates()

# A user that shows up under more than one variant value is counted once per
# variant below, which inflates "Total users" and distorts the split.
# Surface that explicitly instead of letting it pass silently.
n_multi_variant_users = (
    user_assignments
    .loc[user_assignments["user_id"].duplicated(), "user_id"]
    .nunique()
)
if n_multi_variant_users > 0:
    print(
        f"WARNING: {n_multi_variant_users} user(s) appear under more than one "
        "variant; the per-variant counts below double-count them."
    )

user_variant = user_assignments.groupby("variant").size()

# Sum of the per-variant counts. This equals the number of unique users only
# when every user has exactly one non-null variant.
total_users = user_variant.sum()

srm_df = (
    user_variant
    .to_frame(name="users")
    .assign(
        pct=lambda x: x["users"] / total_users * 100,
        expected_pct=50.0,
        # Percentage-point gap between the observed and expected share.
        diff_from_expected_pp=lambda x: x["pct"] - x["expected_pct"],
    )
)

print(srm_df)
print("\nTotal users:", total_users)


           users   pct  expected_pct  diff_from_expected_pp
variant                                                    
control     2284  50.0          50.0                    0.0
treatment   2284  50.0          50.0                    0.0

Total users: 4568


In [6]:
# Price integrity audit: confirm each variant was shown its intended price on
# the target product.

# Filter once and reuse it for both the audit table and the mismatch flag.
df_target = df[df["is_target_product"] == True].copy()

# Event counts for every (variant, price_shown) combination that occurs.
price_audit = (
    df_target
    .groupby(["variant", "price_shown"])
    .size()
    .reset_index(name="events")
    .sort_values(["variant", "price_shown"])
)

print(price_audit)

expected_prices = {"control": 19.99, "treatment": 24.99}

# Variants absent from expected_prices map to NaN and are flagged as mismatches.
df_target["price_expected"] = df_target["variant"].map(expected_prices)

# Compare with an absolute tolerance far below one cent rather than exact
# float inequality, so floating-point representation noise in the parsed
# prices is not reported as a pricing error. NaN on either side never
# compares close, so missing prices are still flagged.
# Assumes price_shown is numeric — TODO confirm against the CSV schema.
df_target["price_mismatch"] = ~np.isclose(
    df_target["price_shown"].to_numpy(dtype="float64"),
    df_target["price_expected"].to_numpy(dtype="float64"),
    rtol=0.0,
    atol=1e-6,
)

mismatch_rate = df_target["price_mismatch"].mean() * 100
print(f"\nPrice mismatch rate: {mismatch_rate:.2f}%")


     variant  price_shown  events
0    control        19.99    8788
1  treatment        24.99    8197

Price mismatch rate: 0.00%
