In [44]:
import pandas as pd

from scipy.stats import ttest_1samp, ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import shapiro
from statsmodels.stats.power import TTestIndPower

import numpy as np

In [2]:
df = pd.read_csv('ab_test_results.csv', delimiter=';')
df

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
0,737,variant,0
1,2423,control,0
2,9411,control,0
3,7311,control,0
4,6174,variant,0
...,...,...,...
9995,1981,control,0
9996,502,variant,0
9997,9214,variant,0
9998,7741,control,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   USER_ID       10000 non-null  int64 
 1   VARIANT_NAME  10000 non-null  object
 2   REVENUE       10000 non-null  object
dtypes: int64(1), object(2)
memory usage: 234.5+ KB


In [4]:
df['REVENUE'] = df['REVENUE'].replace(',', '.', regex=True).astype(float)
df.describe()

Unnamed: 0,USER_ID,REVENUE
count,10000.0,10000.0
mean,4981.0802,0.099447
std,2890.590115,2.318529
min,2.0,0.0
25%,2468.75,0.0
50%,4962.0,0.0
75%,7511.5,0.0
max,10000.0,196.01


In [5]:
duplicat_user_id = df['USER_ID'].duplicated()
df[duplicat_user_id]

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
106,7704,control,0.0
195,8406,variant,0.0
224,7042,variant,0.0
283,4064,variant,0.0
302,4409,control,0.0
...,...,...,...
9992,9303,variant,0.0
9993,2400,variant,0.0
9994,3129,control,0.0
9995,1981,control,0.0


In [6]:
duplicat_row = df.duplicated()
df[duplicat_row]

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
106,7704,control,0.0
195,8406,variant,0.0
224,7042,variant,0.0
302,4409,control,0.0
422,1621,variant,0.0
...,...,...,...
9989,2444,variant,0.0
9993,2400,variant,0.0
9994,3129,control,0.0
9995,1981,control,0.0


In [7]:
df.groupby('VARIANT_NAME')['REVENUE'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
VARIANT_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
control,4984.0,0.129013,3.007524,0.0,0.0,0.0,0.0,196.01
variant,5016.0,0.07007,1.314802,0.0,0.0,0.0,0.0,58.63


In [17]:
control_filter = df['VARIANT_NAME'] == 'control'
variant_filter = df['VARIANT_NAME'] == 'variant'
control_ids = df[control_filter]['USER_ID']
variant_ids = df[variant_filter]['USER_ID']
if control_ids.isin(variant_ids).any():
    print("Обнаружены USER_ID из группы control, которые есть в группе variant:\n", control_ids[control_ids.isin(variant_ids)])
else:
    print("USER_ID из группы control отсутствуют в группе variant.")

Обнаружены USER_ID из группы control, которые есть в группе variant:
 1       2423
2       9411
3       7311
6       2849
27      2667
        ... 
9971    7925
9975    8036
9976    4779
9984    2069
9995    1981
Name: USER_ID, Length: 1958, dtype: int64


In [18]:
duplicates = control_ids[control_ids.isin(variant_ids)]
df_new = df[~((df['USER_ID'].isin(duplicates)))]
df_new

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
0,737,variant,0.0
4,6174,variant,0.0
5,2380,variant,0.0
7,9168,control,0.0
9,7548,control,0.0
...,...,...,...
9993,2400,variant,0.0
9994,3129,control,0.0
9996,502,variant,0.0
9998,7741,control,0.0


Проверяем распределение на нормальность:

In [16]:
alpha = 0.05

st = shapiro(df[control_filter].REVENUE)
print('Distribution is {}normal\n'.format( {True:'not ',
False:''}[st[1] < alpha]));
st = shapiro(df[variant_filter].REVENUE)
print('Distribution is {}normal\n'.format( {True:'not ',
False:''}[st[1] < alpha]));

Distribution is not normal

Distribution is not normal



Применяем критерий Манна-Уитни к обоим вариантам данных:

In [10]:
mw_stats = mannwhitneyu(x=df[df['VARIANT_NAME'] == 'variant']['REVENUE'].values,
                                                  y=df[df['VARIANT_NAME'] == 'control']['REVENUE'].values)
mw_stats

MannwhitneyuResult(statistic=12478180.0, pvalue=0.47825247965294926)

In [30]:
mw_stats2 = mannwhitneyu(x=df_new[df_new['VARIANT_NAME'] == 'variant']['REVENUE'].values,
                                                  y=df_new[df_new['VARIANT_NAME'] == 'control']['REVENUE'].values)
mw_stats2

MannwhitneyuResult(statistic=4588312.0, pvalue=0.2444173738649208)

Проверяем мощность:

In [50]:
control = df_new.query('VARIANT_NAME == "control"')
test = df_new.query('VARIANT_NAME == "variant"')
ttest_ind(control.REVENUE.values, test.REVENUE.values, equal_var = False)
C_mean = control.REVENUE.values.mean()
T_mean = test.REVENUE.values.mean()
C_std = control.REVENUE.values.std()
T_std = test.REVENUE.values.std()
print(len(control.REVENUE.values), len(test.REVENUE.values))

3026 3044


In [51]:
n=len(control.REVENUE.values)
S = ((T_std**2 + C_std **2)/ 2) ** 0.5
      
ef =float((T_mean-C_mean)/ S) 
ef

-0.0360905955737073

In [52]:
alpha = 0.05
analysis = TTestIndPower()
result = analysis.solve_power(ef, power=None,
nobs1=n, ratio=1.0, alpha=alpha)

result

0.28936707257547545

По результатам данного А/B тестирования можно сделать вывод, что мы не можем отвергнуть нулевую гипотезу в пользу альтернативной. Статистически значимой разницы нет.
Но стоит обратить внимание, что в данных А/В тестирования было обнаружено 39,3% пересекающихся идентификаторов пользователей в контрольной и тестовой группах. Оставшихся данных недостаточно для корректного анализа. Для достоверных выводов необходимо провести эксперимент повторно, на большем количестве пользователей, исключив попадания пользователей из контрольной группы в тестовую и наоборот.