# AB Test

In [103]:
import pandas as pd
import numpy as np
from statsmodels.stats import api as sms
from scipy.stats import chi2_contingency

# 1.0 Load Data

In [4]:
df0 = pd.read_csv("data/ab_data.csv")
df0.columns = df0.columns.str.lower()
print( df0.shape )
df0.head(3)

(294478, 5)


Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0


# 2.0 Experiment Desing

## 2.2 Experiment Parameters

In [5]:
confidence_level = 0.95

significance_level = 1 - confidence_level


p1 = .13
p2 = .15

effect_size = sms.proportion_effectsize( p1, p2 )

#statistical power
power = .8

In [6]:
effect_size

-0.0576728617308947

In [81]:
# Sample Size
sample_n = sms.NormalIndPower().solve_power(
    effect_size,
    power=power,
    alpha=significance_level
)

sample_n = int(np.ceil(sample_n))
sample_n

4720

In [82]:
print(f" Total sample size is {2*sample_n}")
print(f" Control sample size is {sample_n}")
print(f" Treatment sample size is {sample_n}")

 Total sample size is 9440
 Control sample size is 4720
 Treatment sample size is 4720


## 2.0 Descriptive analysis

In [112]:
df2 = df0.copy()

In [113]:
df2.shape

(294478, 5)

In [114]:
df2.duplicated().sum()

0

In [115]:
df2.head(1)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0


In [116]:
df2.groupby(['group', 'landing_page'])[['user_id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
group,landing_page,Unnamed: 2_level_1
control,new_page,1928
control,old_page,145274
treatment,new_page,145311
treatment,old_page,1965


In [117]:
user_id_del = df2[['user_id','group']].groupby( 'user_id' ).count().reset_index().query( 'group>1' )['user_id']

In [118]:
print(df2.shape)
df2 = df2[~df2['user_id'].isin( user_id_del )]
print(df2.shape)

(294478, 5)
(286690, 5)


In [119]:
df2.groupby(['group', 'landing_page'])[['user_id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
group,landing_page,Unnamed: 2_level_1
control,old_page,143293
treatment,new_page,143397


In [120]:
df_control_sample = df2[df2['group'] == 'control'].sample( n=sample_n, random_state=42 )
print( df_control_sample.shape[0] )

df_treatment_sample = df2[df2['group'] == 'treatment'].sample( n=sample_n, random_state=42 )
print( df_treatment_sample.shape[0] )

df_ab = pd.concat( [df_control_sample, df_treatment_sample]).reset_index( drop=True )
df_ab

4720
4720


Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,663037,2017-01-05 02:13:38.063443,control,old_page,0
1,932826,2017-01-13 04:50:06.941679,control,old_page,1
2,922999,2017-01-04 07:19:23.813317,control,old_page,0
3,857826,2017-01-19 02:27:37.678241,control,old_page,0
4,762926,2017-01-18 17:50:59.261406,control,old_page,0
...,...,...,...,...,...
9435,649363,2017-01-13 00:10:17.571044,treatment,new_page,0
9436,868879,2017-01-06 12:21:41.396652,treatment,new_page,0
9437,792022,2017-01-24 03:55:24.551307,treatment,new_page,0
9438,663570,2017-01-04 03:04:33.647683,treatment,new_page,0


In [121]:
comparison = df_ab.groupby('group')[['converted']].mean()
comparison

Unnamed: 0_level_0,converted
group,Unnamed: 1_level_1
control,0.115466
treatment,0.129025


In [122]:
df_ab2 = df_ab[['group', 'converted']].groupby( 'group' ).agg( {'converted' : ['sum', 'count']})
df_ab2.columns = ['converted' ,'non_converted']
df_ab2.head()

Unnamed: 0_level_0,converted,non_converted
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,545,4720
treatment,609,4720


In [123]:
chi_val, pval, dof, expected = chi2_contingency( df_ab2 )
pval

0.08059188499562367

In [124]:
if pval < significance_level:
    print( "Reject null hypotesis" )
else:
    print( "Fail to reject null hypotesis" )

Fail to reject null hypotesis


# 3.0 Convertion to financial impact

In [125]:
df3 = df2.copy()

In [132]:
# gmv = gross margin value

gmv_baseline = df3.shape[0] * 0.13 * 4500
gmv_baseline

167713650.00000003

In [133]:
gmv_expected = df3.shape[0] * 0.15 * 4500
gmv_expected

193515750.0

In [147]:
gmv_lift = (gmv_expected / gmv_baseline) -1
print( f"If the new page reachs the 15% conversion (2 percentuals points above the baseline) , the Gross Margin Value would increase in {gmv_lift:.2%}" )

If the new page reachs the 15% conversion (2 percentuals points above the baseline) , the Gross Margin Value would increase in 15.38%
