## Импорты и данные

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import seaborn as sns
sns.set_theme()

In [2]:
# Load the experiment data set from a CSV file in the working directory.
# The rendered output below shows the columns gmv_hist, gmv_exp and group_name
# (group_name takes the values 'test' and 'control').
df = pd.read_csv(filepath_or_buffer='synthetic_gmv_data_1.3.csv')
df

Unnamed: 0,gmv_hist,gmv_exp,group_name
0,200.78,123.19,test
1,363.80,134.49,control
2,39.93,116.72,control
3,150.99,177.67,control
4,208.93,65.30,test
...,...,...,...
249995,221.87,65.14,test
249996,307.51,183.22,test
249997,283.07,309.94,test
249998,76.17,121.67,test


In [11]:
def safe_divide(x, y):
    """Return ``x / y``, or NaN when a scalar division by zero is attempted.

    NumPy scalars (for example the results of ``np.mean`` / ``np.var``) do not
    raise ``ZeroDivisionError`` on division by zero; they emit a
    ``RuntimeWarning`` and produce ``inf``. Catching the exception alone
    therefore never protects NumPy-based callers, so the zero divisor is
    detected explicitly here.

    Parameters
    ----------
    x : scalar or array-like
        Dividend.
    y : scalar or array-like
        Divisor.

    Returns
    -------
    The quotient ``x / y``. ``np.nan`` is returned when both operands are
    scalars and ``y`` equals zero, or when the division itself raises
    ``ZeroDivisionError``. Array operands are divided element-wise with the
    usual NumPy/pandas semantics (no zero masking is applied to them).
    """
    # Explicit check: covers Python numbers and NumPy scalars alike.
    if np.ndim(x) == 0 and np.ndim(y) == 0 and y == 0:
        return np.nan
    try:
        return x / y
    except ZeroDivisionError:
        # Kept for operands whose division still raises, e.g. object-dtype
        # arrays holding plain Python numbers.
        return np.nan
    
def delta_var(numerator, denominator):
    """Estimate the variance of the ratio ``mean(numerator) / mean(denominator)``.

    Applies the first-order delta-method expansion

        (var_x / mu_y**2 - 2 * cov_xy * mu_x / mu_y**3 + var_y * mu_x**2 / mu_y**4) / n

    where the variances and the covariance are sample estimates (``ddof=1``)
    and ``n = len(numerator)``.

    Parameters
    ----------
    numerator, denominator : array-like
        Per-observation numerator and denominator values; both are passed
        together to ``np.cov``, so they are expected to have equal length.

    Returns
    -------
    Scalar variance estimate. Every division goes through ``safe_divide``.
    """
    size = len(numerator)
    mean_num = np.mean(numerator)
    mean_den = np.mean(denominator)
    s2_num = np.var(numerator, ddof=1)
    s2_den = np.var(denominator, ddof=1)
    # np.cov returns the 2x2 covariance matrix; the off-diagonal entry is cov(x, y).
    s_cross = np.cov(numerator, denominator, ddof=1)[0][1]

    num_term = safe_divide(s2_num, mean_den**2)
    cross_term = 2*s_cross*safe_divide(mean_num, mean_den**3)
    den_term = s2_den*safe_divide(mean_num**2, mean_den**4)
    return safe_divide(num_term - cross_term + den_term, size)

def delta_ratio(x_num, x_denom, y_num, y_denom):
    """Two-sided t-test for the difference between two ratio metrics.

    Each sample's metric is ``mean(num) / mean(denom)``; its variance is
    estimated with ``delta_var``. The statistic is
    ``(ratio_x - ratio_y) / sqrt(var_x + var_y)`` and is referred to a
    Student t distribution with ``len(x_num) + len(y_num) - 2`` degrees of
    freedom.

    Parameters
    ----------
    x_num, x_denom : array-like
        Numerator and denominator observations of the first sample.
    y_num, y_denom : array-like
        Numerator and denominator observations of the second sample.

    Returns
    -------
    (tt, p_value) : the test statistic (sign follows ``ratio_x - ratio_y``)
    and the two-sided p-value.
    """
    dof = len(x_num) + len(y_num) - 2
    var_x = delta_var(x_num, x_denom)
    var_y = delta_var(y_num, y_denom)
    std_err = np.sqrt(var_x + var_y)
    ratio_x = safe_divide(np.mean(x_num), np.mean(x_denom))
    ratio_y = safe_divide(np.mean(y_num), np.mean(y_denom))
    tt = safe_divide(ratio_x - ratio_y, std_err)
    # Survival function of |t|, doubled, gives the two-sided p-value.
    p_value = 2*stats.t.sf(np.abs(tt), dof)
    return tt, p_value

In [12]:
# Split the data by experiment arm: `a` holds the control rows, `b` the test rows.
a = df[df['group_name'] == 'control']
b = df[df['group_name'] == 'test']

In [13]:
# Summary statistics of the numeric columns for the control group.
# NOTE(review): the saved output below already lists a gmv_cuped column, but
# that column is only added to df in the later "Расчёт с CUPED" cell — the
# output appears to come from an out-of-order run; on a fresh
# Restart & Run All it would not include gmv_cuped.
a.describe()

Unnamed: 0,gmv_hist,gmv_exp,gmv_cuped
count,125162.0,125162.0,125162.0
mean,179.52463,148.829971,48.674901
std,134.006455,111.940001,83.426008
min,0.0,0.0,-268.737513
25%,72.3625,59.46,-8.967849
50%,152.84,126.16,41.009089
75%,259.61,215.0,101.985139
max,946.87,929.64,484.190651


In [14]:
# Summary statistics of the numeric columns for the test group.
# NOTE(review): the saved output below already lists a gmv_cuped column, but
# that column is only added to df in the later "Расчёт с CUPED" cell — the
# output appears to come from an out-of-order run; on a fresh
# Restart & Run All it would not include gmv_cuped.
b.describe()

Unnamed: 0,gmv_hist,gmv_exp,gmv_cuped
count,124838.0,124838.0,124838.0
mean,179.294113,149.363889,49.337423
std,133.609983,111.735136,83.126385
min,0.0,1.0,-267.72366
25%,73.0325,60.35,-8.231474
50%,152.88,126.7,41.840883
75%,258.72,215.12,102.096712
max,942.23,814.08,512.808274


## Расчёт без CUPED

In [15]:
# Compare raw gmv_exp between the arms. The denominators are all ones, so each
# ratio metric reduces to the plain mean of gmv_exp.
# The control group is passed as the first (x) sample, so tt carries the sign
# of control - test; the two-sided p-value does not depend on that order.
tt, pvalue = delta_ratio(
    x_num=a['gmv_exp'], x_denom=np.ones(len(a)),
    y_num=b['gmv_exp'], y_denom=np.ones(len(b)),
)
print(f'P-value: {round(pvalue, 3)}')

P-value: 0.233


## Расчёт с CUPED

In [16]:
# CUPED coefficient: theta = cov(gmv_hist, gmv_exp) / var(gmv_hist), estimated
# on the pooled data of both groups.
# np.cov normalizes by N-1 by default while np.var defaults to ddof=0, so
# ddof=1 is passed explicitly to keep numerator and denominator consistent.
theta = np.cov(df.gmv_hist, df.gmv_exp)[0, 1] / np.var(df.gmv_hist, ddof=1)
# The covariate is subtracted without centering, so gmv_cuped has a different
# mean than gmv_exp; the between-group difference is unaffected by that shift.
df['gmv_cuped'] = df.gmv_exp - theta * df.gmv_hist
df

Unnamed: 0,gmv_hist,gmv_exp,group_name,gmv_cuped
0,200.78,123.19,test,11.176763
1,363.80,134.49,control,-68.470531
2,39.93,116.72,control,94.443436
3,150.99,177.67,control,93.434127
4,208.93,65.30,test,-51.260043
...,...,...,...,...
249995,221.87,65.14,test,-58.639145
249996,307.51,183.22,test,11.663120
249997,283.07,309.94,test,152.017962
249998,76.17,121.67,test,79.175487


In [17]:
# Repeat the comparison on the CUPED-adjusted metric.
# NOTE: `a` and `b` are rebound here from per-group DataFrames to gmv_cuped
# Series, so the earlier cells that access a.gmv_exp / b.gmv_exp will fail if
# re-run after this cell without re-running the group split first.
a = df.loc[df['group_name'] == 'control', 'gmv_cuped']
b = df.loc[df['group_name'] == 'test', 'gmv_cuped']

# Denominators of ones reduce each ratio metric to the plain mean of gmv_cuped;
# control is passed as the first (x) sample, test as the second (y).
tt, pvalue = delta_ratio(
    x_num=a, x_denom=np.ones(len(a)),
    y_num=b, y_denom=np.ones(len(b)),
)
print(f'P-value: {round(pvalue, 3)}')

P-value: 0.047
