In [2]:
import pandas as pd
import numpy as np


In [14]:
# Read AB_Testing_Dataset.csv (relative path) into `df`; every later cell works off this frame.
df = pd.read_csv(filepath_or_buffer="AB_Testing_Dataset.csv")

In [16]:
# Preview the first 10 rows to sanity-check the columns and value formats.
df.head(n=10)

Unnamed: 0,user_id,variant,channel,is_lookalike,clicked_cta,converted,revenue,visit_date
0,1,B,Email,1,0,0,0.0,7/12/2024
1,2,A,Organic,0,0,0,0.0,7/20/2024
2,3,B,Paid Social,1,0,0,0.0,7/17/2024
3,4,A,Email,0,0,0,0.0,8/9/2024
4,5,B,Organic,1,1,0,0.0,7/26/2024
5,6,B,Email,0,0,0,0.0,7/14/2024
6,7,B,Organic,1,1,0,0.0,7/25/2024
7,8,B,Paid Social,0,0,0,0.0,7/26/2024
8,9,A,Email,0,1,1,32.72,8/16/2024
9,10,B,Paid Social,0,1,0,0.0,7/25/2024


In [41]:
# Headline metrics per variant.
# `converted` shows up as a 0/1 flag in the preview, so its sum is the number of
# conversions and its mean is the conversion rate; `revenue` mean is revenue per row.
summary = df.groupby(by='variant').agg(
    users=pd.NamedAgg(column='user_id', aggfunc='nunique'),
    conversions=pd.NamedAgg(column='converted', aggfunc='sum'),
    conversion_rate=pd.NamedAgg(column='converted', aggfunc='mean'),
    total_revenue=pd.NamedAgg(column='revenue', aggfunc='sum'),
    revenue_per_user=pd.NamedAgg(column='revenue', aggfunc='mean'),
)


In [43]:
# Show the per-variant summary table (plain-text rendering).
print(summary)

         users  conversions  conversion_rate  total_revenue  revenue_per_user
variant                                                                      
A         2581          162         0.062766        6457.94          2.502108
B         2419          209         0.086399        9322.33          3.853795


## Statistical test – conversion (chi-square)

In [26]:
from scipy import stats
import numpy as np

# Contingency table of row counts: one row per variant, one column per value of
# `converted`. This is the observed-frequency input for the chi-square test.
conv_table = pd.crosstab(index=df['variant'], columns=df['converted'])
print(conv_table)



converted     0    1
variant             
A          2419  162
B          2210  209


In [28]:
# Chi-square test of independence between variant and conversion.
# Returns the test statistic, p-value, degrees of freedom and the expected counts.
# Note: with SciPy's default `correction=True`, Yates' continuity correction is
# applied when the table has one degree of freedom (as a 2x2 table does).
chi2, p, dof, expected = stats.chi2_contingency(observed=conv_table)
print("Chi-square p-value:", p)

Chi-square p-value: 0.001734429888783756


In [None]:
## Because the p-value (≈0.0017) is < 0.05, the difference in conversion rate is statistically significant: the new layout (Variant B) converts at 8.64% vs. 6.28% for Variant A.

## Statistical test – revenue per user (t-test)

In [34]:
from scipy.stats import ttest_ind

# Revenue values for each variant (rows with no purchase contribute 0.0).
rev_A = df.loc[df['variant'].eq('A'), 'revenue']
rev_B = df.loc[df['variant'].eq('B'), 'revenue']

# equal_var=False selects Welch's t-test, which does not assume the two
# groups share the same variance. The test is two-sided by default.
tstat, p_val = ttest_ind(rev_A, rev_B, equal_var=False)
print("T-test p-value:", p_val)


T-test p-value: 3.6846388090007605e-05


In [None]:
## Again, p_val (≈3.7e-05) < 0.05 → revenue per user differs significantly between variants, and Variant B's mean is higher (3.85 vs. 2.50 for Variant A).

## Lookalike audience segment

In [37]:
# Same metrics as the overall summary, broken out by variant AND the
# `is_lookalike` flag, flattened back to ordinary columns for printing.
seg_summary = (
    df.groupby(by=['variant', 'is_lookalike'])
    .agg(
        users=pd.NamedAgg(column='user_id', aggfunc='nunique'),
        conversion_rate=pd.NamedAgg(column='converted', aggfunc='mean'),
        revenue_per_user=pd.NamedAgg(column='revenue', aggfunc='mean'),
    )
    .reset_index(drop=False)
)

print(seg_summary)


  variant  is_lookalike  users  conversion_rate  revenue_per_user
0       A             0   1592         0.055905          2.219491
1       A             1    989         0.073812          2.957037
2       B             0   1470         0.080952          3.721224
3       B             1    949         0.094837          4.059146
