In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

%matplotlib inline

In [23]:
# Effect size for the two expected rates (0.15 vs 0.158).
effect_size = sms.proportion_effectsize(0.15, 0.158)

# Solve for the sample size with 85% power, a 5% significance level and
# equally sized groups (ratio=1), then round up to the next whole number.
power_analysis = sms.NormalIndPower()
required_n = ceil(
    power_analysis.solve_power(
        effect_size,
        power=0.85,
        alpha=0.05,
        ratio=1,
    )
)

# Prints (in Russian): "At least {required_n} users are needed".
print(f'Нужно минимум {required_n} юзеров')

Нужно минимум 36549 юзеров


In [32]:
# Location of the A/B test data export.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere unless this is changed; consider a path relative to the notebook.
ab_data_path = '/Users/maximmigutin/Documents/My_projects/OTUS/classes/Extra_topics_AB/ab_data.csv'
df = pd.read_csv(ab_data_path)

# Preview the first rows of the raw data.
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [33]:
# Sanity check: count rows for every (group, landing_page) combination to see
# whether the control group only saw the old page and the treatment group only
# saw the new one. Non-zero off-diagonal cells are mismatched rows.
group_labels = df['group']
page_labels = df['landing_page']

pd.crosstab(index=group_labels, columns=page_labels)

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1928,145274
treatment,145311,1965


In [34]:
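# Disabled step: drop every user that appears more than once in the data.
# It is commented out because `session_counts` is not defined in the cells
# shown above, so running it raises NameError. To re-enable it, first build the
# per-user row counts, e.g.:
#     session_counts = df['user_id'].value_counts()
# users_to_drop = session_counts[session_counts > 1].index
# df = df[~df['user_id'].isin(users_to_drop)]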

NameError: name 'session_counts' is not defined

In [42]:
# NOTE(review): this hard-coded value replaces whatever `required_n` held before
# this cell ran.
required_n = 100000
# NOTE(review): the treatment arm is sampled at 10000 rows, ten times smaller
# than the control arm -- confirm the imbalance is intentional.
treatment_n = required_n - 90000

in_control = df['group'] == 'control'
in_treatment = df['group'] == 'treatment'

# Fixed random_state keeps both draws reproducible.
control_sample = df.loc[in_control].sample(n=required_n, random_state=21)
treatment_sample = df.loc[in_treatment].sample(n=treatment_n, random_state=21)

# Stack control rows on top of treatment rows and renumber the index from 0.
ab_test = pd.concat([control_sample, treatment_sample], axis=0).reset_index(drop=True)

In [43]:
# Display the combined control + treatment sample (the notebook shows a
# truncated view of the frame).
ab_test

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,659987,2017-01-11 14:33:20.137051,control,old_page,0
1,938810,2017-01-16 08:14:45.586086,control,old_page,0
2,901078,2017-01-04 04:29:25.018222,control,old_page,0
3,888227,2017-01-08 13:26:15.416239,control,old_page,0
4,831311,2017-01-19 14:00:44.455778,control,old_page,0
...,...,...,...,...,...
109995,722345,2017-01-09 11:32:16.868248,treatment,new_page,0
109996,707703,2017-01-12 17:46:56.678622,treatment,new_page,0
109997,682724,2017-01-05 17:27:31.800997,treatment,new_page,0
109998,816884,2017-01-04 04:39:03.647973,treatment,new_page,0


In [44]:
# Number of sampled rows per experiment group.
ab_test.loc[:, 'group'].value_counts()

control      100000
treatment     10000
Name: group, dtype: int64

In [45]:
def std_p(x):
    """Return the standard deviation of `x` computed with ddof=0."""
    return np.std(x, ddof=0)


def se_p(x):
    """Return the standard error of the mean of `x` with ddof=0 (std / sqrt(n))."""
    return stats.sem(x, ddof=0)


# Per-group summary of the `converted` column.
# Named aggregation assigns the output column names directly, so there is no
# positional rename that could silently mislabel columns. The string 'mean' is
# used instead of the np.mean callable, which newer pandas versions warn about
# when passed to GroupBy.agg.
conversion_rates = ab_test.groupby('group')['converted'].agg(
    conversion_rate='mean',
    std_deviation=std_p,
    std_error=se_p,
)

# Render the summary with three decimal places.
conversion_rates.style.format('{:.3f}')

Unnamed: 0_level_0,conversion_rate,std_deviation,std_error
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,0.121,0.326,0.001
treatment,0.121,0.326,0.003


In [46]:
# NOTE(review): this import lives mid-notebook; the other imports are in the
# first cell.
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

# `converted` values for each arm of the experiment.
is_control = ab_test['group'] == 'control'
is_treatment = ab_test['group'] == 'treatment'
control_results = ab_test.loc[is_control, 'converted']
treatment_results = ab_test.loc[is_treatment, 'converted']

# Observations and conversions per arm, ordered [control, treatment].
n_con = control_results.count()
n_treat = treatment_results.count()
nobs = [n_con, n_treat]
successes = [control_results.sum(), treatment_results.sum()]

# Two-sample z-test on the conversion proportions.
z_stat, pval = proportions_ztest(successes, nobs=nobs)

# Confidence bounds come back as (lower bounds, upper bounds), each ordered
# [control, treatment].
ci_lower, ci_upper = proportion_confint(successes, nobs=nobs, alpha=0.05)
lower_con, lower_treat = ci_lower
upper_con, upper_treat = ci_upper

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

z statistic: 0.06
p-value: 0.951
ci 95% for control group: [0.119, 0.123]
ci 95% for treatment group: [0.114, 0.127]
