In [1]:
from scipy.stats import ttest_1samp
import numpy as np

import warnings
# Suppress every warning from here on so library notices do not clutter the
# cell outputs of this notebook.
warnings.filterwarnings('ignore')

from scipy.stats import norm, t, kstest, shapiro
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# NOTE(review): openpyxl (conda install -c conda-forge openpyxl) is only required
# by pd.read_excel; pd.read_csv below does not use it.
# NOTE(review): the preview printed for this cell shows the cookie_cats columns
# (userid, version, sum_gamerounds, retention_1, retention_7), while later cells
# read `con_treat` and `converted` - confirm which file was actually analysed.
data = pd.read_csv ('cookie_cats.csv')
data.head(20)

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,gate_30,3,False,False
1,337,gate_30,38,True,False
2,377,gate_40,165,True,False
3,483,gate_40,1,False,False
4,488,gate_40,179,True,True
5,540,gate_40,187,True,True
6,1066,gate_30,0,False,False
7,1444,gate_40,2,False,False
8,1574,gate_40,108,True,True
9,1587,gate_40,153,True,False


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,id,converted
count,294478.0,294478.0
mean,787974.124733,0.119659
std,91210.823776,0.324563
min,630000.0,0.0
25%,709032.25,0.0
50%,787933.5,0.0
75%,866911.75,0.0
max,945999.0,1.0


In [5]:
# Column names, non-null counts, dtypes and memory usage of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         294478 non-null  int64 
 1   time       294478 non-null  object
 2   con_treat  294478 non-null  object
 3   page       294478 non-null  object
 4   converted  294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [43]:
# (number of rows, number of columns) of the frame.
data.shape

(294478, 5)

In [14]:
# Row count per value of `con_treat` (the experiment group label).
data['con_treat'].value_counts()

treatment    147276
control      147202
Name: con_treat, dtype: int64

In [41]:
# Row count per value of the `converted` flag.
data['converted'].value_counts()

0    259241
1     35237
Name: converted, dtype: int64

In [7]:
# Summary statistics restricted to the rows where `converted` is greater than 0.
data.loc[data['converted'] > 0].describe()

Unnamed: 0,id,converted
count,35237.0,35237.0
mean,788394.376962,1.0
std,91398.565565,0.0
min,630001.0,1.0
25%,709555.0,1.0
50%,787633.0,1.0
75%,867831.0,1.0
max,945991.0,1.0


In [9]:
# Number of missing values in each column.
data.isna().sum()

id           0
time         0
con_treat    0
page         0
converted    0
dtype: int64

In [47]:
# Conversions per experiment group: the sum of the `converted` column over the
# rows of each group (k1 = control, k2 = treatment).
k1 = data.loc[data['con_treat'] == 'control', 'converted'].sum()
k2 = data.loc[data['con_treat'] == 'treatment', 'converted'].sum()
print(f'{k1},{k2}')

17723,17514


In [45]:
# Number of rows in each experiment group (n1 = control, n2 = treatment).
n1 = len(data.loc[data['con_treat'] == 'control'])
n2 = len(data.loc[data['con_treat'] == 'treatment'])
print(f'{n1},{n2}')

147202,147276


In [17]:
# Observed conversion rate of the control group (conversions / group size).
print(k1/n1)

0.12039917935897611


In [18]:
# Observed conversion rate of the treatment group (conversions / group size).
print(k2/n2)

0.11891957956489856


In [34]:
from statsmodels.stats import proportion

# Two-sample z-test comparing the control and treatment conversion proportions:
# `count` holds the conversions per group, `nobs` the group sizes.
z_score, z_pvalue = proportion.proportions_ztest(
    count=np.array([k1, k2]),
    nobs=np.array([n1, n2]),
)
print(f' Z score: {z_score:.3f}, P-value: {z_pvalue:.3f}')

 Z score: 1.237, P-value: 0.216


In [36]:
# Chi-square test of equal conversion proportions across the two groups.
# Per the statsmodels documentation the third return value bundles the
# contingency table together with the expected counts.
chisq, pvalue, table = proportion.proportions_chisquare(
    count=np.array([k1, k2]),
    nobs=np.array([n1, n2]),
)

print(f'ChiSq: {chisq:.3f}, P-value: {pvalue:.3f}')

ChiSq: 1.530, P-value: 0.216


<b>Проверка мощности</b>

In [54]:
import math
import statsmodels.stats.power as smp
from tqdm.notebook import tqdm


# Apply matplotlib's 'ggplot' style sheet to figures drawn after this point.
plt.style.use('ggplot')

#### Критерий пропорций (нужен для кликов, конверсий)

In [60]:
# Significance level and target power used by the power / sample-size cells.
alpha = 0.05
power = 0.95
# Per-group sample size: take the observed control-group size rather than a
# hard-coded copy of it, so the value always follows the loaded data.
n = n1
# Observed conversion rates: control (p_x) and treatment (p_y).
p_x = k1/n1
p_y = k2/n2

# h is the effect size (Cohen's h): the difference between the
# arcsine-transformed proportions of the two groups.
h = 2*math.asin(np.sqrt(p_x)) - 2*math.asin(np.sqrt(p_y))
h

0.004558767502302552

#### Расчет мощности

In [59]:
# Power achieved by a two-sided z-test for effect size `h` with `n`
# observations in the first group (statsmodels' default ratio=1 gives the second
# group the same size). The result gets its own name so the target `power`
# read by the sample-size calculations is not overwritten when the notebook is
# run top to bottom.
achieved_power = smp.zt_ind_solve_power(effect_size=h, nobs1=n, alpha=alpha, alternative='two-sided')
achieved_power

0.23547563385118275

#### Расчет количества наблюдений, необходимого для обнаружения заданного эффекта при alpha = 5% и power = 95%

In [70]:
# Solve the z-test power equation for the sample size of the first group:
# every other quantity is supplied, nobs1 is the unknown.
number_to_observe = smp.zt_ind_solve_power(
    effect_size=h,
    nobs1=None,  # left unset, so this is the value being solved for
    alpha=alpha,
    power=power,
    alternative='two-sided',
)
print(f'Number to observe: {number_to_observe:.0f}')

Number to observe: 1250552


In [56]:
# For every per-group sample size from 10 to 9999, solve the two-sample t-test
# power equation for the effect size, given `alpha` and `power`.
sample_sizes = list(range(10, 10000))
effects = [
    smp.tt_ind_solve_power(nobs1=nobs, alpha=alpha, power=power)
    for nobs in tqdm(sample_sizes)
]

  0%|          | 0/9990 [00:00<?, ?it/s]

<b>Различия между группами статистически не значимы (p-value = 0.216 > alpha = 0.05), то есть нулевая разница входит в диапазон возможных значений.</b>
<p><b>Вывод: изменения по результатам теста не принимаем.</b></p>