In [1]:
import pandas as pd
import numpy as np
import math
from math import log
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

In [2]:
df = pd.read_csv("AB_test_data.csv")
df.head()

Unnamed: 0,purchase_TF,Variant,date,id
0,False,A,2019-11-08,0x25b44a
1,False,B,2020-08-27,0x46271e
2,False,A,2020-06-11,0x80b8f1
3,False,B,2020-08-22,0x8d736d
4,False,A,2020-08-05,0x96c9c8


## Question1

In [3]:
Group_A = df.loc[df["Variant"] == "A"]
Group_B = df.loc[df["Variant"] == "B"]

In [4]:
Num_A = Group_A.shape[0]
Num_B = Group_B.shape[0]

In [5]:
Group_A_True = sum(Group_A["purchase_TF"])
Group_B_True = sum(Group_B["purchase_TF"])

##### Conversion rate of Group A

In [6]:
Group_A_conversion_rate = Group_A_True/Num_A
Group_A_conversion_rate

0.149616

##### Conversion rate of Group B

In [7]:
Group_B_conversion_rate = Group_B_True/Num_B
Group_B_conversion_rate

0.1766

In [8]:
successes = [Group_A_True, Group_B_True]
nobs = [Num_A, Num_B]

z_stat, pval = proportions_ztest(successes, nobs=nobs,alternative="smaller")
print(f'p-value: {pval:.10f}') 

p-value: 0.0000000845


#####  p-value < 0.05, which reject the null hypothesis and alternative B improved conversion rate

## Question 2

In [17]:
P_bar = (Group_A_True+Group_B_True)/(Num_A+Num_B)
P_bar

0.15065384615384617

In [18]:
def cal_opt_sample_size(P_a,P_b,P): 
    return (1/(P_b-P_a)**2)*(2.1*(2*P*(1-P))**0.5+0.7*(P_a*(1-P_a)+P_b*(1-P_b)))**2

opt_sample_size = round(cal_opt_sample_size(Group_A_conversion_rate,Group_B_conversion_rate,P_bar)+0.5)
opt_sample_size

2157

In [22]:
sample_list = []
results = []
for i in range(0,10):
    df_temp = df[df["Variant"] == "B"].sample(opt_sample_size)
    sample_list.append(df_temp)

    Num_B_temp = df_temp.shape[0]
    Group_B_True_temp = sum(df_temp["purchase_TF"])
    B_rate = Group_B_True_temp/Num_B_temp
    
    successes = [Group_A_True, Group_B_True_temp]
    nobs = [Num_A, Num_B_temp]

    z_stat, pval = proportions_ztest(successes, nobs=nobs,alternative="smaller")
    results.append(pval)
    print('sample '+ str(i+1))
    print(f'p-value: {pval:.10f}') 
    print(f'z-score: {z_stat:.10f}') 

sample 1
p-value: 0.0040543055
z-score: -2.6475135492
sample 2
p-value: 0.0000003258
z-score: -4.9752380683
sample 3
p-value: 0.0002473989
z-score: -3.4835569736
sample 4
p-value: 0.0000224770
z-score: -4.0804220700
sample 5
p-value: 0.0000102474
z-score: -4.2594314815
sample 6
p-value: 0.0000777799
z-score: -3.7820216595
sample 7
p-value: 0.0002473989
z-score: -3.4835569736
sample 8
p-value: 0.0000001274
z-score: -5.1541319954
sample 9
p-value: 0.0010861746
z-score: -3.0655983503
sample 10
p-value: 0.0000373074
z-score: -3.9610696162


## Question 3

In [23]:
iteration = []
ln_A = log(1/0.05)
ln_B = log(0.2)

In [24]:
for i in range(10):
    sample = sample_list[i]
    lambda_n = 0
    n = 0
    for j in range(opt_sample_size):
        #sample_mean = sample.loc[sample["Variant"] == "B"]["purchase_TF"].mean()
        sample_mean = sample["purchase_TF"].mean()
        lambda_0 = log((1-sample_mean)/(1-Group_A_conversion_rate))
        lambda_1 = log((sample_mean)/(Group_A_conversion_rate))
        if sample["purchase_TF"].iloc[j] == True:
            lambda_n = lambda_n + lambda_1
        else:
            lambda_n = lambda_n + lambda_0
        if lambda_n > ln_B and lambda_n < ln_A:
            n = n + 1
        elif lambda_n <= ln_B:
            print('For sample',i+1,', accept H0 and iteration is ',n+1)
            iteration.append(n)
            break
        elif lambda_n >= ln_A:
            print('For sample',i+1,', accept H1 and iteration is ',n+1)
            iteration.append(n)
            break

For sample 1 , accept H1 and iteration is  1348
For sample 2 , accept H1 and iteration is  77
For sample 3 , accept H1 and iteration is  1097
For sample 4 , accept H1 and iteration is  526
For sample 5 , accept H1 and iteration is  628
For sample 6 , accept H1 and iteration is  1058
For sample 7 , accept H1 and iteration is  1391
For sample 8 , accept H1 and iteration is  126
For sample 9 , accept H1 and iteration is  811
For sample 10 , accept H0 and iteration is  457


In [16]:
iteration

[892,
 239,
 145,
 198,
 1331,
 893,
 1148,
 694,
 791,
 252,
 892,
 239,
 145,
 198,
 1331,
 893,
 1148,
 694,
 791,
 252,
 892,
 239,
 145,
 198,
 1331,
 893,
 1148,
 694,
 791,
 252]