### A/B Testing on Grocery Website Data

#### Introduction
This notebook tests whether a change made to the web interface of an online grocery store increases the number of clicks.

In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.api as sms
from scipy.stats import ttest_1samp, shapiro, levene, ttest_ind, mannwhitneyu,\
    pearsonr, spearmanr, kendalltau,f_oneway, kruskal
from statsmodels.stats.proportion import proportions_ztest

##### data


In [12]:
# Load the raw visit log. The preview below shows its columns:
# RecordID, IP Address, LoggedInFlag, ServerID, VisitPageFlag.
DATA_PATH = "../data/grocery_website_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,RecordID,IP Address,LoggedInFlag,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,124.8.220.3,0,3,0
4,5,60.10.192.7,0,2,0


In [19]:
# Collapse the log to one row per (IP Address, LoggedInFlag, ServerID) combination.
# VisitPageFlagSum is the total of VisitPageFlag within that combination;
# VisitPageFlag is then rebuilt as a 0/1 indicator: 1 when the total is non-zero.
visitor_keys = ["IP Address", "LoggedInFlag", "ServerID"]
df1 = (
    df.groupby(visitor_keys)["VisitPageFlag"]
    .sum()
    .reset_index(name="VisitPageFlagSum")
)
df1["VisitPageFlag"] = df1["VisitPageFlagSum"].ne(0).astype("int64")
df1.head()

Unnamed: 0,IP Address,LoggedInFlag,ServerID,VisitPageFlagSum,VisitPageFlag
0,0.0.108.2,0,1,0,0
1,0.0.109.6,1,1,0,0
2,0.0.111.8,0,3,0,0
3,0.0.160.9,1,2,0,0
4,0.0.163.1,0,2,0,0


In [35]:
# Label every row with its experiment arm, derived from the ServerID column:
# ServerID 1 is the Test group, ServerIDs 2 and 3 form the Control group.
# NOTE: this cell rebinds df1 and removes ServerID, so it can only be run once
# after the aggregation cell; re-running it alone fails on the missing column.
server_to_group = {1: "Test", 2: "Control", 3: "Control"}
df1 = (
    df1.assign(group=df1["ServerID"].map(server_to_group))
    .drop(columns=["VisitPageFlagSum", "ServerID"])
)
df1.head()

Unnamed: 0,IP Address,LoggedInFlag,VisitPageFlag,group
0,0.0.108.2,0,0,Test
1,0.0.109.6,1,0,Test
2,0.0.111.8,0,0,Control
3,0.0.160.9,1,0,Control
4,0.0.163.1,0,0,Control


In [42]:
# Build one frame per experiment arm, each with a fresh 0-based index.
is_control = df1["group"] == "Control"
is_test = df1["group"] == "Test"
df_control = df1[is_control].reset_index(drop=True)
df_test = df1[is_test].reset_index(drop=True)

In [47]:
# Control group: share of rows that reached the target page.
#   control_visit_site = number of non-null VisitPageFlag entries (rows that entered the site)
#   control_visit_page = sum of the 0/1 flag (rows that entered the target page)
control_flags = df_control["VisitPageFlag"]
control_visit_site, control_visit_page = control_flags.count(), control_flags.sum()
control_visit_ratio = control_visit_page / control_visit_site
control_visit_ratio

0.09225097803189888

In [48]:
# Test group: same ratio as for the control group, i.e.
# (sum of the 0/1 VisitPageFlag) / (number of non-null VisitPageFlag entries).
test_flags = df_test["VisitPageFlag"]
test_visit_site, test_visit_page = test_flags.count(), test_flags.sum()
test_visit_ratio = test_visit_page / test_visit_site
test_visit_ratio

0.11551511875806984

##### AB testing

In [50]:
# Shapiro-Wilk normality check, run separately on each group's VisitPageFlag
# (control first, then test).
# H0: the sample comes from a normal distribution.
# H1: the sample does not come from a normal distribution.
# NOTE(review): VisitPageFlag is a 0/1 indicator, so rejecting normality is the expected outcome.

for flags in (df_control["VisitPageFlag"], df_test["VisitPageFlag"]):
    test_stat, p_value = shapiro(flags)
    print("Test Stat = %.4f, p-value = %.4f" % (test_stat, p_value))

# Both p-values are below alpha = 0.05, so H0 is rejected:
# the normality assumption does not hold,
# so the groups are compared with a Mann-Whitney U test.

Test Stat = 0.3266, p-value = 0.0000
Test Stat = 0.3711, p-value = 0.0000


In [51]:
# Mann-Whitney U test comparing VisitPageFlag between the control and test groups.
# H0: there is no significant difference between the two groups
#     in terms of click rate to the desired page.
# H1: there is a difference.

# alternative="two-sided" is stated explicitly because H1 above is non-directional.
test_stat, pvalue = mannwhitneyu(
    df_control["VisitPageFlag"],
    df_test["VisitPageFlag"],
    alternative="two-sided",
)
# Print `pvalue` from this test, not `p_value`, which still holds the
# Shapiro-Wilk result from the normality-check cell.
print("Test Stat = %.4f, p-value = %.4f" % (test_stat, pvalue))

# If p-value < alpha = 0.05, H0 is rejected,
# meaning that there is a difference between the two groups.

Test Stat = 1080913226.5000, p-value = 0.0000


In [53]:
# Number of rows for every (group, VisitPageFlag) combination.
group_count = (
    df1.groupby(["group", "VisitPageFlag"])["group"]
    .count()
    .reset_index(name="Count")
)
# Pivot the counts into a group x VisitPageFlag table with "All" margins.
# The string alias "sum" is used instead of np.sum: recent pandas versions emit a
# FutureWarning when a NumPy reduction callable is passed as aggfunc.
groupped = pd.crosstab(
    group_count["group"],
    group_count["VisitPageFlag"],
    values=group_count["Count"],
    aggfunc="sum",
    margins=True,
)
# Convert each row to percentages of its own "All" total.
100 * groupped.div(groupped["All"], axis=0)

VisitPageFlag,0,1,All
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,90.774902,9.225098,100.0
Test,88.448488,11.551512,100.0
All,89.998296,10.001704,100.0
