In [30]:
from typing import Union

import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats
from statsmodels.stats import proportion
from statsmodels.stats.meta_analysis import effectsize_smd
from statsmodels.stats.power import tt_ind_solve_power
from statsmodels.stats.power import zt_ind_solve_power
from tqdm import tqdm

In [31]:
# Load the experiment log from the Excel workbook; the preview below shows
# USER_ID, VARIANT_NAME ('control' / 'variant') and REVENUE columns.
data = pd.read_excel('gb_sem_8_hm.xlsx')


Unknown extension is not supported and will be removed



In [32]:
# Preview the first rows to check the column layout.
data.head(10)

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
0,737,variant,0.0
1,2423,control,0.0
2,9411,control,0.0
3,7311,control,0.0
4,6174,variant,0.0
5,2380,variant,0.0
6,2849,control,0.0
7,9168,control,0.0
8,6205,variant,0.0
9,7548,control,0.0


In [33]:
# Row and column count of the raw log.
data.shape

(10000, 3)

In [34]:
# Number of distinct users; a value below the row count means some users have several rows.
data.USER_ID.nunique()

6324

In [35]:
# Collapse the log to one row per (USER_ID, VARIANT_NAME) pair, summing that pair's revenue.
data = (
    data
    .groupby(['USER_ID', 'VARIANT_NAME'], as_index=False)['REVENUE']
    .sum()
)

In [36]:
# Shape after aggregating to one row per (USER_ID, VARIANT_NAME).
data.shape

(7865, 3)

In [37]:
# Rows per user after aggregation, i.e. how many variants each user appears in,
# tabulated as "number of variants -> number of users".
data.groupby('USER_ID')['VARIANT_NAME'].count().value_counts()

1    4783
2    1541
Name: VARIANT_NAME, dtype: int64

In [38]:
# IDs of users that appear in exactly one variant; users present in both
# groups are left out of this array.
unique_ids = (
    data
    .groupby('USER_ID')['VARIANT_NAME']
    .count()
    .loc[lambda variant_counts: variant_counts == 1]
    .index
    .to_numpy()
)

In [39]:
# Keep only users that belong to a single variant; copy so later edits never touch `data`.
data_new = data.loc[data['USER_ID'].isin(unique_ids)].copy()

In [40]:
# Summary statistics of the filtered frame (single-variant users only).
data_new.describe()

Unnamed: 0,USER_ID,REVENUE
count,4783.0,4783.0
mean,4994.395777,0.135873
std,2898.618472,3.011392
min,2.0,0.0
25%,2476.0,0.0
50%,4975.0,0.0
75%,7515.0,0.0
max,9998.0,196.01


In [41]:
# Summary statistics of the unfiltered aggregated frame, for comparison with `data_new`.
data.describe()

Unnamed: 0,USER_ID,REVENUE
count,7865.0,7865.0
mean,4977.856707,0.126442
std,2890.696828,2.61386
min,2.0,0.0
25%,2463.0,0.0
50%,4965.0,0.0
75%,7504.0,0.0
max,10000.0,196.01


In [42]:
# First rows of the filtered frame.
data_new.head(10)

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
0,2,control,0.0
3,4,variant,0.0
4,5,variant,0.0
5,6,variant,0.0
6,9,variant,0.0
9,11,control,0.0
10,12,control,0.0
11,13,control,0.0
12,15,variant,0.0
15,19,variant,0.0


In [43]:
# First rows of the unfiltered aggregated frame; users listed under both variants are still present here.
data.head(10)

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
0,2,control,0.0
1,3,control,0.0
2,3,variant,0.0
3,4,variant,0.0
4,5,variant,0.0
5,6,variant,0.0
6,9,variant,0.0
7,10,control,0.0
8,10,variant,0.0
9,11,control,0.0


In [44]:
# For comparison: `data` had shape (7865, 3) before users seen in both variants were removed.
data_new.shape

(4783, 3)

In [45]:
def continious_result(control: pd.DataFrame,
                      treatment: pd.DataFrame,
                      column: str,
                      n_iters: int = 10_000,
                      random_state: Union[int, np.random.Generator, None] = None) -> pd.DataFrame:
    """Compare a continuous metric between two groups with a bootstrap.

    Each group is resampled with replacement at its own size ``n_iters`` times
    and the difference of the resampled means (control minus treatment) is
    recorded. The report is built from that bootstrap distribution plus a
    standardized effect size and the power of a two-sample t-test at
    ``alpha=0.05``.

    Parameters
    ----------
    control, treatment : pd.DataFrame
        Observations of the two groups; both must contain ``column``.
        The column is presumably numeric -- the function takes means of it.
    column : str
        Name of the metric column to compare.
    n_iters : int, default 10_000
        Number of bootstrap resamples (at least 2).
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator for the resampling. ``None`` draws fresh entropy,
        so repeated calls give slightly different results.

    Returns
    -------
    pd.DataFrame
        One row indexed by ``column`` with the columns:

        * ``effect_size`` -- standardized mean difference, treatment minus
          control (note: opposite sign convention to ``difference``);
        * ``alpha`` -- two-sided p-value from a normal approximation of the
          bootstrap distribution (the key name is historical);
        * ``beta`` -- ``1 - power`` of the t-test for the observed effect size;
        * ``CI`` -- 95% percentile bootstrap interval of the difference,
          formatted as a string;
        * ``difference`` -- mean bootstrap difference, control minus treatment.

    Raises
    ------
    ValueError
        If either group is empty or ``n_iters`` is smaller than 2.
    """
    control_col = control.loc[:, column]
    treatment_col = treatment.loc[:, column]
    n_control, n_treatment = control_col.shape[0], treatment_col.shape[0]

    if n_control == 0 or n_treatment == 0:
        raise ValueError('control and treatment must both contain at least one row')
    if n_iters < 2:
        raise ValueError('n_iters must be at least 2')

    # Sample statistics
    control_mean = control_col.mean()
    treatment_mean = treatment_col.mean()

    control_std = control_col.std(ddof=1)
    treatment_std = treatment_col.std(ddof=1)

    # Bootstrap: pull the raw arrays out once instead of on every iteration,
    # and resample each group at its own size so unequal groups are handled.
    control_values = control_col.to_numpy()
    treatment_values = treatment_col.to_numpy()
    rng = np.random.default_rng(random_state)

    booted_diff = np.empty(n_iters)
    for i in tqdm(range(n_iters)):
        control_sample = rng.choice(control_values, size=n_control, replace=True)
        treatment_sample = rng.choice(treatment_values, size=n_treatment, replace=True)
        booted_diff[i] = control_sample.mean() - treatment_sample.mean()

    # Statistics of the bootstrap distribution
    md_ci, std_ci = np.mean(booted_diff), np.std(booted_diff, ddof=1)
    left_ci, right_ci = np.percentile(booted_diff, [2.5, 97.5])
    if std_ci == 0:
        # Degenerate distribution (every resample gave the same difference):
        # avoid dividing by zero and report the limiting p-value directly.
        p_value_ci = 1.0 if md_ci == 0 else 0.0
    else:
        # sf(x) == 1 - cdf(x), but keeps precision for large |z|.
        p_value_ci = 2 * stats.norm.sf(np.abs(md_ci / std_ci))

    # Power of the experiment; group 1 is treatment, group 2 is control
    effect_size, _ = effectsize_smd(mean1=treatment_mean, sd1=treatment_std, nobs1=n_treatment,
                                    mean2=control_mean, sd2=control_std, nobs2=n_control)
    power = tt_ind_solve_power(effect_size=effect_size,
                               nobs1=n_treatment,
                               alpha=.05,
                               power=None,
                               ratio=n_control / n_treatment)
    # Build the report
    result = pd.DataFrame({'effect_size': effect_size,
                           'alpha': p_value_ci,
                           'beta': (1-power),
                           'CI': f'[{np.round(left_ci, 3)}, {np.round(right_ci, 3)}]',
                           'difference': md_ci,},
                          index=[column])
    return result

In [46]:
# Split the filtered users into the two experiment arms as independent copies.
control = data_new.loc[data_new['VARIANT_NAME'].eq('control')].copy()
treatment = data_new.loc[data_new['VARIANT_NAME'].eq('variant')].copy()

In [47]:
# Summary statistics of the control arm.
control.describe()

Unnamed: 0,USER_ID,REVENUE
count,2390.0,2390.0
mean,5020.88159,0.196887
std,2904.850992,4.172201
min,2.0,0.0
25%,2517.25,0.0
50%,5012.5,0.0
75%,7616.0,0.0
max,9998.0,196.01


In [48]:
# Summary statistics of the treatment ('variant') arm.
treatment.describe()

Unnamed: 0,USER_ID,REVENUE
count,2393.0,2393.0
mean,4967.943168,0.074935
std,2892.745368,0.858207
min,4.0,0.0
25%,2435.0,0.0
50%,4955.0,0.0
75%,7379.0,0.0
max,9995.0,23.04


In [49]:
# Overlaid revenue histograms for the two variants, with a box plot per
# variant in the margin. `px` is imported in the notebook's top import cell.
fig = px.histogram(
    data_new,
    x='REVENUE',
    color='VARIANT_NAME',
    title='Доход',
    marginal='box',
    nbins=100,
    barmode='overlay',
)
fig.show()

In [51]:
continious_result(control, treatment, column='REVENUE') # revenue is a continuous metric

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 3608.21it/s]


Unnamed: 0,effect_size,alpha,beta,CI,difference
REVENUE,-0.040483,0.170408,0.712143,"[-0.006, 0.323]",0.121175


#### p-value (0.170, столбец `alpha` в отчёте) больше уровня значимости alpha = 0.05
#### Вероятность ошибки II рода beta = 0.712 превышает допустимые 0.2 (мощность теста около 29%)
#### Доверительный интервал включает 0
#### Ответ: результат теста не статистически значим
#### Тест неудачный: необходимо либо перезапустить тест, либо увеличить срок его проведения для отслеживания ключевой метрики
