In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.stats.api as sms
from scipy.stats import shapiro, levene, mannwhitneyu, ttest_ind, pearsonr, fisher_exact

import warnings

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

# Display settings: no cap on the number of columns or rows shown, no fixed
# display width, and floats rendered with three decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None
pd.options.display.float_format = lambda value: '%.3f' % value

In [2]:
# Read the A/B test CSV from the Kaggle input directory into `df`.
DATA_PATH = '/kaggle/input/ecommerce-ab-testing-2022-dataset1/ecommerce_ab_testing_2022_dataset1/ab_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check the column layout.
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(294480, 5)

In [5]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294480 entries, 0 to 294479
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294480 non-null  int64 
 1   timestamp     294480 non-null  object
 2   group         294480 non-null  object
 3   landing_page  294480 non-null  object
 4   converted     294480 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [6]:
# Count missing values in every column (isna is the canonical name for isnull).
df.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [7]:
# Number of distinct values per column; a user_id count below the row count
# indicates that some users appear more than once.
df.nunique()

user_id         290585
timestamp        35993
group                2
landing_page         2
converted            2
dtype: int64

In [8]:
# Remove every user_id that occurs more than once. keep=False discards all
# rows of a repeated user_id instead of keeping one of them.
# Reassign rather than using inplace=True so the cell reads as an explicit
# transformation step; the original index labels are preserved.
df = df.drop_duplicates(subset='user_id', keep=False)

In [9]:
# Dimensions after removing the repeated user_ids.
df.shape

(286690, 5)

In [10]:
# Re-inspect dtypes and non-null counts after the duplicate removal.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 286690 entries, 0 to 294479
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       286690 non-null  int64 
 1   timestamp     286690 non-null  object
 2   group         286690 non-null  object
 3   landing_page  286690 non-null  object
 4   converted     286690 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 13.1+ MB


In [11]:
# Preview the first five rows of the de-duplicated frame.
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1


In [12]:
# Count the rows for each (group, landing_page) combination.
result = (
    df
    .groupby(by=['group', 'landing_page'])
    .size()
)

In [13]:
# Show the row count for each (group, landing_page) pair.
result

group      landing_page
control    old_page        143293
treatment  new_page        143397
dtype: int64

In [14]:
# Mean of the 0/1 `converted` flag (i.e. the conversion rate) for each
# (group, landing_page) pair, returned as a flat table.
result2 = (
    df
    .groupby(['group', 'landing_page'])['converted']
    .mean()
    .reset_index(name='mean_conversion')
)

In [15]:
# Show the mean of `converted` per (group, landing_page) pair.
result2

Unnamed: 0,group,landing_page,mean_conversion
0,control,old_page,0.12
1,treatment,new_page,0.119


In [16]:
# Share of rows served by each landing page, expressed in percent.
page_share = df['landing_page'].value_counts(normalize=True)
result3 = (page_share * 100).reset_index(name='percentage')

In [17]:
# Show each landing page's share of rows, in percent.
result3

Unnamed: 0,landing_page,percentage
0,new_page,50.018
1,old_page,49.982


In [18]:
# Select rows where the control group was served new_page or the treatment
# group was served old_page.
mismatch_filter = "(group == 'control' & landing_page == 'new_page') | (group == 'treatment' & landing_page == 'old_page')"
result4 = df.query(mismatch_filter)

In [19]:
# Show rows where control saw new_page or treatment saw old_page
# (an empty frame means there are none).
result4

Unnamed: 0,user_id,timestamp,group,landing_page,converted


Control group = old page

Treatment group = new page

### AB Test
#### Assumption of Normality

- H0: The assumption of normal distribution is satisfied
- H1: The assumption of normal distribution is not satisfied

If the p-value is less than 0.05, H0 is rejected (the data are not normally distributed) and a non-parametric test is used. Otherwise, a parametric test is used.

In [20]:
# Shapiro-Wilk normality test on the `converted` values of the old page.
# NOTE(review): `converted` is a 0/1 column and this sample is far larger
# than 5000 rows; SciPy documents that the Shapiro-Wilk p-value may be
# inaccurate for N > 5000 - confirm this check is meaningful here.
old_page_conversions = df.loc[df["landing_page"] == "old_page", "converted"]
test_stat, pvalue = shapiro(old_page_conversions)
print("p-value:", pvalue)
print("test_stat:", test_stat)

p-value: 0.0
test_stat: 0.3792334198951721




In [21]:
# Shapiro-Wilk normality test on the `converted` values of the new page.
# NOTE(review): same caveat as any large binary sample - SciPy documents that
# the p-value may be inaccurate for N > 5000; confirm the check is meaningful.
new_page_conversions = df.loc[df["landing_page"] == "new_page", "converted"]
test_stat, pvalue = shapiro(new_page_conversions)
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 0.3769, p-value = 0.0000


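p-value < 0.05 for both pages, so H0 is rejected: the assumption of normal distribution is not met and a non-parametric test is used.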

### Variance Homogeneity
- H0: Variances are homogeneous
- H1: Variances are not homogeneous

In [22]:
# Levene test for equal variances of `converted` between the two pages.
new_page_conversions = df.loc[df["landing_page"] == "new_page", "converted"]
old_page_conversions = df.loc[df["landing_page"] == "old_page", "converted"]
test_stat, pvalue = levene(new_page_conversions, old_page_conversions)
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 1.4268, p-value = 0.2323


p-value > 0.05

H0 cannot be rejected: the variances are homogeneous.

### Hypothesis Testing

- H0: There is no statistically significant difference between the old page and the new page
- H1: There is a statistically significant difference between the old page and the new page


In [23]:
# Mann-Whitney U test (non-parametric) comparing `converted` between the
# new page and the old page.
new_page_conversions = df.loc[df["landing_page"] == "new_page", "converted"]
old_page_conversions = df.loc[df["landing_page"] == "old_page", "converted"]
test_stat, pvalue = mannwhitneyu(new_page_conversions, old_page_conversions)

print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 10259026653.0000, p-value = 0.2323


p-value > 0.05

H0 cannot be rejected: there is no statistically significant difference between the old page and the new page.