# Ноутбук для расчета результата экспериментов

Ноутбук для анализа проведённых экспериментов со сплитованием по конкретным водителям

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import logging
import sys
import re
from citymobil_python_mysql_wrapper import MysqlWrapper
import pyexasol
from random import randrange
from itertools import combinations
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
# Raise the pandas display limits so that wide / long result tables are not
# truncated when they are rendered in the notebook output.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, 999)

In [3]:
# импорт модулей с sql-запросами и функциями для обработки
import utils as u
import query as q

# Запрашиваем из БД все необходимые данные

In [4]:
# Read the database credentials from a local JSON file. The login is taken from
# the first cell of the first row and the password from the second cell.
# NOTE(review): this relies on the column order inside the JSON file - confirm.
cred = pd.read_json(r'/Users/skostuchik/crd_exa.json')
user, password = cred.iloc[0, 0], cred.iloc[0, 1]

# Open the Exasol connection used by all queries below.
# fetch_dict=True asks pyexasol to return each fetched row as a dict.
C = pyexasol.connect(
    dsn='ex1..3.city-srv.ru:8563',
    user=user,
    password=password,
    fetch_dict=True,
)

## !!!В следующей ячейке необходимо ввести значения переменных!!!

In [5]:
# Experiment parameters: the locality_id and the first / last date of the
# experiment. Edit these values before running the notebook.
params = dict(
    locality_id=338,
    date_from='2021-12-08',
    date_to='2021-12-14',
)

# Query that returns the driver ids together with their split.
# It must expose the columns ID and exp_group; the {locality_id} placeholder is
# filled in later with str.format.
drivers_cte = '''
    select d.ID
                  , case when mod(UDF.CRC32(CONCAT(d.ID, '_20210406')), 100) < 50 then 'B' else 'A' end as exp_group
        from REPLICA.DRIVERS d
                      left join md.LOCALITY l on l.LOCALITY_RK = d.ID_LOCALITY
        where 1 = 1
               and LOCALITY_RK in ({locality_id})
               and d.role = 11
               and d.is_test != 1
               and d.status = 'A'
               and LAST_ORDER_DATE >= current_date - 30
'''

Важно:
- В запросе должны быть поля ID (id водителя) и exp_group (сплит)
- Запрос далее подставляется в другие запросы в виде CTE, а значит, в нём самом нельзя использовать CTE — вместо этого следует использовать подзапрос (вместо `with t as (...)` нужно `select * from (select ...)`)

In [6]:
# Values substituted into every SQL template from the `q` module: the locality,
# the driver-split sub-query (with its own locality placeholder already filled
# in) and the experiment date range.
query_params = dict(
    locality_id=params.get('locality_id'),
    drivers_cte=drivers_cte.format(locality_id=params.get('locality_id')),
    date_from=params.get('date_from'),
    date_to=params.get('date_to'),
)

In [7]:
# Render each SQL template with query_params and execute it, one after another,
# keeping one statement handle per metric family.
e_ar, e_of2r, e_mph, e_copt, e_dist, e_ch = (
    C.execute(sql_template.format(**query_params))
    for sql_template in (
        q.ar_query,
        q.of2r_query,
        q.mph_query,
        q.copt_query,
        q.dist_query,
        q.churn_query,
    )
)

In [8]:
# Pull the full result set of every executed statement into its own DataFrame.
ar_suggest_data, of2r_data, mph_data, copt_data, dist_data, churn_data = (
    pd.DataFrame(statement.fetchall())
    for statement in (e_ar, e_of2r, e_mph, e_copt, e_dist, e_ch)
)

In [9]:
# Dictionary of split pairs: every unordered pair of experiment groups found in
# ar_suggest_data, keyed as '<first>_<second>' (groups sorted), e.g. 'A_B' -> ['A', 'B'].
_exp_groups = sorted(ar_suggest_data['EXP_GROUP'].unique())
split_pairs = {'_'.join(pair): list(pair) for pair in combinations(_exp_groups, 2)}

In [10]:
# The supply-hours (SH) table in the DWH is known to be filled incorrectly,
# so keep only the rows where SUPPLY_HOURS is present.
_has_supply_hours = mph_data['SUPPLY_HOURS'].notna()
mph_data = mph_data.loc[_has_supply_hours]

In [11]:
# Configuration dictionary: the significance level, the power and one entry per
# metric describing how that metric has to be calculated. Every metric entry has:
#   'df'            - the source DataFrame;
#   'add_breakdown' - {segment value: column to filter on}; the pair
#                     'Total': 'Total' means "no extra filter";
#   'options'       - the column (and, where given, the aggregation) for x / y / n;
#   'type'          - the metric type string handed to u.stat_res_calculation.


def _offer_breakdown():
    """Return a fresh breakdown mapping used by the offer-level metrics AR, RR and FR."""
    return {
        'Total': 'Total',
        'ChainOffer': 'SPECIFICATION_NAME',
        '0 - 3 km': 'DISTANCE_CATEGORY',
        '3 - 6 km': 'DISTANCE_CATEGORY',
        '6 - 10 km': 'DISTANCE_CATEGORY',
        '+10 km': 'DISTANCE_CATEGORY',
    }


def _binomial(df, x_col, n_col, breakdown):
    """Build the entry of a 'binomial' metric: x is a single column, n a distinct count."""
    return {
        'df': df,
        'add_breakdown': breakdown,
        'options': {
            'x': [x_col],
            'n': [n_col, pd.Series.nunique],
        },
        'type': 'binomial',
    }


def _ratio(df, x, y, breakdown=None):
    """Build the entry of a 'ratio' metric (x over y); n is the distinct DRIVER_RK count.

    Without an explicit breakdown only the 'Total' segment is configured.
    """
    return {
        'df': df,
        'add_breakdown': {'Total': 'Total'} if breakdown is None else breakdown,
        'options': {
            'x': x,
            'y': y,
            'n': ['DRIVER_RK', pd.Series.nunique],
        },
        'type': 'ratio',
    }


def _continuous(df, x_col, agg):
    """Build the entry of a continuous metric; n is the distinct ORDER_RK count."""
    return {
        'df': df,
        'add_breakdown': {'Total': 'Total'},
        'options': {
            'x': [x_col, agg],
            'n': ['ORDER_RK', pd.Series.nunique],
        },
        # The spelling 'continious' is a runtime value and is kept exactly as is.
        # NOTE(review): presumably utils matches on this exact string - confirm.
        'type': 'continious',
    }


config = {
    'alpha': 0.01,
    'power': 0.9,
    'metrics': {
        'AR': _binomial(ar_suggest_data, 'ACCEPT', 'FSS_ID', _offer_breakdown()),
        'RR': _binomial(ar_suggest_data, 'REJECT', 'FSS_ID', _offer_breakdown()),
        'FR': _binomial(ar_suggest_data, 'FRAUD', 'FSS_ID', _offer_breakdown()),
        'CHURN_7DAY': _binomial(churn_data, 'CHURN_7DAY', 'DRIVER_RK', {'Total': 'Total'}),
        'CHURN_14DAY': _binomial(churn_data, 'CHURN_14DAY', 'DRIVER_RK', {'Total': 'Total'}),
        'CHURN_21DAY': _binomial(churn_data, 'CHURN_21DAY', 'DRIVER_RK', {'Total': 'Total'}),
        'MPH_GROSS': _ratio(mph_data, ['MONEY_GROSS', np.sum], ['SUPPLY_HOURS', np.sum]),
        'MPH_NET': _ratio(mph_data, ['MONEY_NET', np.sum], ['SUPPLY_HOURS', np.sum]),
        'ORGANIC_MPH_GROSS': _ratio(mph_data, ['ORGANIC_MONEY_GROSS', np.sum], ['SUPPLY_HOURS', np.sum]),
        'ORGANIC_MPH_NET': _ratio(mph_data, ['ORGANIC_MONEY_NET', np.sum], ['SUPPLY_HOURS', np.sum]),
        'SHpD': _ratio(mph_data, ['SUPPLY_HOURS', np.sum], ['DRIVER_RK', pd.Series.nunique]),
        'DE': _ratio(
            mph_data, ['ON_TRIP', np.sum], ['SUPPLY_HOURS', np.sum],
            breakdown={
                'Total': 'Total',
                'Not_park': 'PARK',
                'Park': 'PARK',
                'not_brand': 'BRAND',
                'brand': 'BRAND',
                'TOP': 'SEGMENT',
                'HIGH': 'SEGMENT',
            },
        ),
        'TPD': _ratio(mph_data, ['TRIPS', np.sum], ['DRIVER_RK', pd.Series.nunique]),
        'OF2R': _ratio(of2r_data, ['RIDES', np.sum], ['OFFERS', np.sum]),
        'GMVPT': _ratio(copt_data, ['GMV', np.sum], ['RIDES', np.sum]),
        'COPT': _ratio(copt_data, ['CONTRIBUTION', np.sum], ['RIDES', np.sum]),
        'DIPT': _ratio(copt_data, ['DI', np.sum], ['RIDES', np.sum]),
        'COMPT': _ratio(copt_data, ['COMMISSION', np.sum], ['RIDES', np.sum]),
        'DIST_MEAN': _continuous(dist_data, 'EXP_DIST_KM', np.mean),
        'DIST_MEDIAN': _continuous(dist_data, 'EXP_DIST_KM', np.median),
    },
}

In [12]:
# The actual calculation of the metric values.
# result_dict maps (split_pair, metric_name) to whatever
# u.stat_res_calculation returns for that metric / segment / pair of groups.
result_dict = {}

for metric, v in tqdm(config.get('metrics').items()):
    # These lookups do not depend on the split pair, so do them once per metric.
    df = v.get('df')
    breakdowns = v.get('add_breakdown')
    # The segment name is appended only when the metric has more than one segment.
    has_segments = len(breakdowns) > 1

    for split_pair, splits in split_pairs.items():
        metric_data = df[df.EXP_GROUP.isin(splits)]

        for value, fltr in breakdowns.items():
            if fltr != 'Total':
                add_breakdown_data = metric_data[metric_data[fltr] == value]
            else:
                add_breakdown_data = metric_data

            # Nothing to compare for this segment.
            if len(add_breakdown_data) == 0:
                continue

            metric_name = metric + '_' + value if has_segments else metric

            try:
                src = u.stat_res_calculation(metric=metric, m_type=v.get('type'), df=add_breakdown_data,
                                             splits=splits, cfg=config)
            except Exception:
                # Best effort: one failing metric must not abort the whole run,
                # but the failure has to be visible instead of silently dropped.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                logging.warning("Skipping metric %r for split pair %r: calculation failed",
                                metric_name, split_pair, exc_info=True)
                continue

            result_dict[(split_pair, metric_name)] = src

Failed to converge on a solution.

Failed to converge on a solution.

  x = np.asarray((x - loc)/scale, dtype=dtyp)
100%|██████████| 20/20 [00:35<00:00,  1.80s/it]


In [13]:
# result_dict is keyed by (split_pair, metric) tuples; building a frame from it
# puts those keys on the columns, so transpose to get one row per key.
experiment_summary = pd.DataFrame(result_dict).transpose()

# Делаем поправку БХ на множественные сравнения

In [14]:
# One-column frame with the raw p-values; missing p-values are replaced by 1.
p_res = experiment_summary[['p_val']].fillna(1)

In [15]:
# Benjamini-Hochberg correction ('fdr_bh'), applied separately to the p-values
# of every first-level index value (the split pair).
# p_adj_list maps each index key of p_res to a (p_val, p_adj) tuple.
p_adj_list = {}

_pair_level = p_res.index.get_level_values(0)
for _pair in _pair_level.unique():
    _pair_p = p_res.loc[_pair_level == _pair, 'p_val']
    # Item 1 of the multipletests result holds the corrected p-values.
    _pair_p_adj = multipletests(_pair_p, alpha=config.get('alpha'), method='fdr_bh')[1]

    for _key, _raw_p, _adj_p in zip(_pair_p.index, _pair_p, _pair_p_adj):
        p_adj_list[_key] = (_raw_p, _adj_p)

In [16]:
# Turn the {key: (p_val, p_adj)} mapping into a frame with one row per key
# and the columns 'p_val' and 'p_adj'.
p_adj = pd.DataFrame(p_adj_list, index=['p_val', 'p_adj']).transpose()

# Итоги теста:

In [17]:
# Final column order of the summary table.
_summary_columns = [
    'split_pair', 'metric',
    'split_a', 'split_b', 'lift',
    'split_a_n_obs', 'split_b_n_obs', 'n_obs_difference',
    'split_a_drivers', 'split_b_drivers', 'unique_drivers_difference',
    'p_val', 'p_adj',
    'nobs_needed', 'mde_ok',
]

# Attach the adjusted p-values, move the two index levels into regular columns
# named split_pair / metric, then apply the column order above.
experiment_summary = (
    experiment_summary
    .join(p_adj['p_adj'])
    .reset_index()
    .rename(columns={"level_0": "split_pair", "level_1": "metric"})
)
experiment_summary = experiment_summary[_summary_columns]

In [18]:
# Sort-order dictionary: the position of a metric in this list is its rank.
_metric_sequence = [
    'AR_Total', 'RR_Total', 'FR_Total',
    'AR_ChainOffer', 'RR_ChainOffer', 'FR_ChainOffer',
    'AR_0 - 3 km', 'AR_3 - 6 km', 'AR_6 - 10 km', 'AR_+10 km',
    'RR_0 - 3 km', 'RR_3 - 6 km', 'RR_6 - 10 km', 'RR_+10 km',
    'FR_0 - 3 km', 'FR_3 - 6 km', 'FR_6 - 10 km', 'FR_+10 km',
    'OF2R',
    'CHURN_7DAY', 'CHURN_14DAY', 'CHURN_21DAY',
    'MPH_GROSS', 'MPH_NET', 'ORGANIC_MPH_GROSS', 'ORGANIC_MPH_NET',
    'SHpD', 'TPD',
    'GMVPT', 'COPT', 'DIPT', 'COMPT',
    'DIST_MEAN', 'DIST_MEDIAN',
    'DE_Total', 'DE_Park', 'DE_Not_park', 'DE_not_brand', 'DE_brand', 'DE_HIGH', 'DE_TOP',
]
m_order = {name: rank for rank, name in enumerate(_metric_sequence)}

# The same dictionary also ranks the split pairs (alphabetically), because it is
# later applied to both the split_pair and the metric column.
_sorted_pairs = sorted(experiment_summary.split_pair.unique())
m_order.update({pair: rank for rank, pair in enumerate(_sorted_pairs)})

In [19]:
# Order the rows by split pair and then by metric, using the ranks from m_order
# (the key function is applied to each of the two sort columns in turn).
experiment_summary = experiment_summary.sort_values(by=['split_pair', 'metric'],
                                                    key=lambda x: x.map(m_order))

# Rows whose adjusted p-value is within the significance level and whose mde_ok
# flag equals 1. The threshold is read from config so that it always matches the
# alpha passed to the multiple-testing correction instead of a duplicated 0.01.
# NOTE(review): mde_ok presumably marks that enough observations were collected
# for the detected effect - confirm against utils.
_alpha = config.get('alpha')
prokras = experiment_summary[(experiment_summary['p_adj'] <= _alpha) & (experiment_summary['mde_ok'] == 1)]

In [20]:
# Display the full summary without the unique_drivers_difference column
# (the stored frame itself is not modified).
experiment_summary.drop(columns=['unique_drivers_difference'])

Unnamed: 0,split_pair,metric,split_a,split_b,lift,split_a_n_obs,split_b_n_obs,n_obs_difference,split_a_drivers,split_b_drivers,p_val,p_adj,nobs_needed,mde_ok
0,A_B,AR_Total,0.491957,0.465714,-0.026243,49858.0,46593.0,-0.065486,761.0,726.0,3.755748e-16,7.699283e-15,5399.0,1.0
6,A_B,RR_Total,0.466625,0.495096,0.028471,49858.0,46593.0,-0.065486,761.0,726.0,9.806838e-19,4.0208040000000006e-17,4568.0,1.0
12,A_B,FR_Total,0.041418,0.03919,-0.002227,49858.0,46593.0,-0.065486,761.0,726.0,0.08180292,0.1989678,119092.0,
1,A_B,AR_ChainOffer,0.729892,0.717617,-0.012276,2325.0,1930.0,-0.169892,435.0,406.0,0.391073,0.5134949,19465.0,
7,A_B,RR_ChainOffer,0.246882,0.249741,0.002859,2325.0,1930.0,-0.169892,435.0,406.0,0.8576939,0.9254065,338410.0,
13,A_B,FR_ChainOffer,0.023226,0.032642,0.009417,2325.0,1930.0,-0.169892,435.0,406.0,0.07575272,0.1989678,3806.0,
2,A_B,AR_0 - 3 km,0.642477,0.619227,-0.02325,7510.0,6907.0,-0.080293,644.0,607.0,0.004060619,0.01513503,6322.0,1.0
3,A_B,AR_3 - 6 km,0.613037,0.580068,-0.032969,10386.0,9673.0,-0.06865,668.0,633.0,2.109819e-06,1.44171e-05,3247.0,1.0
4,A_B,AR_6 - 10 km,0.51274,0.491861,-0.02088,13775.0,12839.0,-0.067949,680.0,642.0,0.0006939669,0.003161405,8526.0,1.0
5,A_B,AR_+10 km,0.344917,0.320019,-0.024898,18187.0,17174.0,-0.055699,693.0,648.0,7.250097e-07,5.94508e-06,5423.0,1.0


In [21]:
# Display the filtered (significant) rows without the unique_drivers_difference
# column (the stored frame itself is not modified).
prokras.drop(columns=['unique_drivers_difference'])

Unnamed: 0,split_pair,metric,split_a,split_b,lift,split_a_n_obs,split_b_n_obs,n_obs_difference,split_a_drivers,split_b_drivers,p_val,p_adj,nobs_needed,mde_ok
0,A_B,AR_Total,0.491957,0.465714,-0.026243,49858.0,46593.0,-0.065486,761.0,726.0,3.755748e-16,7.699283e-15,5399.0,1.0
6,A_B,RR_Total,0.466625,0.495096,0.028471,49858.0,46593.0,-0.065486,761.0,726.0,9.806838e-19,4.0208040000000006e-17,4568.0,1.0
3,A_B,AR_3 - 6 km,0.613037,0.580068,-0.032969,10386.0,9673.0,-0.06865,668.0,633.0,2.109819e-06,1.44171e-05,3247.0,1.0
4,A_B,AR_6 - 10 km,0.51274,0.491861,-0.02088,13775.0,12839.0,-0.067949,680.0,642.0,0.0006939669,0.003161405,8526.0,1.0
5,A_B,AR_+10 km,0.344917,0.320019,-0.024898,18187.0,17174.0,-0.055699,693.0,648.0,7.250097e-07,5.94508e-06,5423.0,1.0
9,A_B,RR_3 - 6 km,0.354034,0.391399,0.037364,10386.0,9673.0,-0.06865,668.0,633.0,4.866771e-08,6.651254e-07,2437.0,1.0
10,A_B,RR_6 - 10 km,0.443412,0.467638,0.024226,13775.0,12839.0,-0.067949,680.0,642.0,7.705138e-05,0.0004513009,6257.0,1.0
11,A_B,RR_+10 km,0.604553,0.630371,0.025819,18187.0,17174.0,-0.055699,693.0,648.0,6.326404e-07,5.94508e-06,5336.0,1.0


In [22]:
# Export to Excel: one workbook per table, written without the index column.
for _frame, _path, _sheet in (
    (experiment_summary, r'res_ttl.xlsx', 'experiment_summary'),
    (prokras, r'res_prokras.xlsx', 'prokras'),
):
    _frame.to_excel(_path, sheet_name=_sheet, index=False)