In [17]:
import pandas as pd
import numpy as np


import math
from statsmodels.stats.proportion import proportions_ztest
from scipy import stats

import matplotlib.pylab as plt
%matplotlib inline

import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the marketing A/B-test data; the relative path is resolved against the working directory.
df = pd.read_csv('marketing_AB.csv')

In [8]:
# Preview the first rows to check the raw column names and values.
df.head()

Unnamed: 0.1,Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
0,0,1069124,ad,False,130,Monday,20
1,1,1119715,ad,False,93,Tuesday,22
2,2,1144181,ad,False,21,Tuesday,18
3,3,1435133,ad,False,355,Tuesday,10
4,4,1015700,ad,False,276,Friday,14


In [9]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588101 entries, 0 to 588100
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Unnamed: 0     588101 non-null  int64 
 1   user id        588101 non-null  int64 
 2   test group     588101 non-null  object
 3   converted      588101 non-null  bool  
 4   total ads      588101 non-null  int64 
 5   most ads day   588101 non-null  object
 6   most ads hour  588101 non-null  int64 
dtypes: bool(1), int64(4), object(2)
memory usage: 27.5+ MB


In [11]:
# Clean the raw frame. The steps are chained without in-place mutation and
# renames are done by name (not by position), so re-running this cell is a
# no-op instead of raising KeyError on the already-dropped column.
df = (
    df
    # Drop the leftover index column; errors='ignore' keeps the cell re-runnable
    .drop(columns=['Unnamed: 0'], errors='ignore')
    # Rename columns to snake_case ('converted' already has no space)
    .rename(columns={'user id': 'user_id',
                     'test group': 'test_group',
                     'total ads': 'total_ads',
                     'most ads day': 'most_ads_day',
                     'most ads hour': 'most_ads_hour'})
    # Rename 'ad' and 'psa' to 'treatment' and 'control'
    .replace({'test_group': {'ad': 'treatment',
                             'psa': 'control'}})
)
# Calculate conversion rate/proportion per user group:
# converted users divided by distinct users in the group
df_gr = (
    df.groupby('test_group')
      .agg(converted=('converted', 'sum'),
           unique_users=('user_id', 'nunique'))
      .reset_index()
)
df_gr['proportion'] = df_gr['converted'] / df_gr['unique_users']

In [12]:
# Per-group conversions, unique users and conversion rate.
df_gr

Unnamed: 0,test_group,converted,unique_users,proportion
0,control,420,23524,0.017854
1,treatment,14423,564577,0.025547


####  Let us name the conversion rate for the control group as proportion 1 (p1) and the conversion rate for the treatment group as proportion 2 (p2).


#### H0: p1 = p2
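#### H1: p1 < p2 (one-sided; left-tailed for the difference p1 − p2, i.e. the treatment group converts better than the control group)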

In [18]:
def z_calc(p1, p2, n1, n2):
    """Return the pooled two-proportion z statistic for ``p2 - p1``.

    Parameters
    ----------
    p1, p2 : float
        Proportions of the first and second group.
    n1, n2 : int
        Number of observations in the first and second group.

    Returns
    -------
    float
        ``(p2 - p1)`` divided by the standard error computed from the pooled
        proportion ``p_star``.

    Notes
    -----
    The statistic is undefined when the pooled proportion is exactly 0 or 1,
    because the standard error is then zero.
    """
    # Pooled proportion: size-weighted average of the two group proportions.
    p_star = (p1*n1 + p2*n2) / (n1 + n2)
    # Use the stdlib ``math`` module: the ``np.math`` alias was deprecated in
    # NumPy 1.25 and removed in NumPy 2.0.
    std_err = math.sqrt(p_star*(1 - p_star)*((1.0 / n1) + (1.0 / n2)))
    return (p2 - p1) / std_err

def sample_required(p1, p_diff, alpha):
    """Find the smallest equal per-group size n at which the lift is significant.

    Starting at n = 1, n is increased one at a time until the one-sided
    p-value of ``z_calc(p1, p1 + p_diff, n, n)`` drops below ``alpha``.

    Parameters
    ----------
    p1 : float
        Baseline (first group) proportion.
    p_diff : float
        Difference ``p2 - p1`` to detect; must be positive.
    alpha : float
        Significance level; must be positive.

    Returns
    -------
    int
        The first n for which ``1 - norm.cdf(z) < alpha``.

    Raises
    ------
    ValueError
        If ``p_diff <= 0`` or if ``alpha`` is not greater than 0.

    Notes
    -----
    NOTE(review): no power (beta) term enters the search; the result is the n
    at which the assumed difference just reaches significance. Confirm this is
    the intended definition of "required sample size".
    """
    if p_diff <= 0:  # p_diff = p2-p1
        raise ValueError("p_diff must be > 0")
    # A p-value is never below a non-positive (or NaN) alpha, so without this
    # guard the search loop below would never terminate.
    if not alpha > 0:
        raise ValueError("alpha must be > 0")
    n = 1
    while True:
        z = z_calc(p1, p1+p_diff, n1=n, n2=n)
        # Upper-tail probability of the standard normal at z.
        p = 1 - stats.norm.cdf(z)
        if p < alpha:
            return n
        n += 1


In [19]:
# Look the conversion rates up by group label rather than by row position,
# so the call does not depend on the row order of df_gr.
group_rates = df_gr.set_index('test_group')['proportion']
p_control = group_rates['control']
p_treatment = group_rates['treatment']
# Per-group sample size for the observed lift at alpha = 0.05.
sample_size = sample_required(p_control, p_treatment - p_control, 0.05)

In [20]:
# Partition the rows into one dataframe per experiment arm
df_tr = df.loc[df['test_group'].eq('treatment')]
df_ctrl = df.loc[df['test_group'].eq('control')]


In [21]:
# Draw a seeded random sample of 20000 rows from each arm, then stack the
# two samples (treatment first) into one frame with a fresh index
df_tr_sample, df_ctrl_sample = (
    arm.sample(n=20000, random_state=23) for arm in (df_tr, df_ctrl)
)
df_sample = pd.concat([df_tr_sample, df_ctrl_sample], ignore_index=True)

In [22]:
# Per-group conversions, distinct users and conversion rate for the sample
df_sample_gr = (
    df_sample
    .groupby('test_group')
    .agg(converted=('converted', 'sum'),
         unique_users=('user_id', 'nunique'))
    .reset_index()
)
df_sample_gr['proportion'] = df_sample_gr['converted'].div(df_sample_gr['unique_users'])

In [23]:
# Per-group conversions, unique users and conversion rate in the sample.
df_sample_gr

Unnamed: 0,test_group,converted,unique_users,proportion
0,control,348,20000,0.0174
1,treatment,497,20000,0.02485


In [24]:
# Read the success counts and group sizes from df_sample_gr instead of
# retyping them, so the test inputs cannot drift from the sampled data.
# Order matters: with alternative='smaller' the alternative hypothesis is
# p(first) < p(second), so control must come first and treatment second.
sample_counts = df_sample_gr.set_index('test_group').loc[['control', 'treatment']]
number_of_successes = sample_counts['converted'].tolist()
total_sample_sizes = sample_counts['unique_users'].tolist()
# Calculate z-test statistic and p-value
test_stat, p_value = proportions_ztest(number_of_successes, total_sample_sizes, alternative='smaller')

In [26]:
# Report the z statistic and the one-sided p-value from the test above
print('Computed z-test statistic = ', test_stat)
print('Computed p-value = ', p_value)

Computed z-tetst statistics =  -5.180769799760474
Computed p-value =  1.1048602970777487e-07
