# Packages

In [1]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library
from ambrosia.designer import Designer
from ambrosia.tester import Tester

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl
from typing import List, Tuple, Union


# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def cycle_sql(start, end, query, weeks=False):
    """
    Run ``query`` once per date in ``[start, end]`` and stack the results.

    The query text must contain a ``{date}`` placeholder. On every iteration
    it is filled in through ``str.format`` with one date rendered as
    ``YYYY-MM-DD``, so any other literal braces in the query must be doubled.
    Queries are executed with the module-level ``bigquery_client``.

    Parameters
    ----------
    start, end : str
        Inclusive range boundaries in ``%Y-%m-%d`` format.
    query : str
        SQL template containing ``{date}``.
    weeks : bool, default False
        When truthy, step from ``start`` in 7-day increments instead of daily.

    Returns
    -------
    pd.DataFrame
        The per-date results concatenated with the most recent date first
        (row indexes of the individual results are kept as-is). An empty
        DataFrame is returned when the range contains no dates.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    span_days = (date_end - date_start).days

    if weeks:
        step, n_steps = timedelta(weeks=1), span_days // 7 + 1
    else:
        step, n_steps = timedelta(days=1), span_days + 1

    # Collect the per-date frames and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on each iteration.
    frames = []
    for counter, offset in enumerate(range(n_steps), start=1):
        date = (date_start + offset * step).strftime('%Y-%m-%d')
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        df_cycle = bigquery_client.query(query.format(date=date)).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        return pd.DataFrame()
    # Reversed so that the latest date comes first, as callers have always received it.
    return pd.concat(frames[::-1])

def read_bq(query, project='analytics-dev-333113'):
    """Run ``query`` in BigQuery under ``project`` and return the result as a DataFrame.

    A new ``bigquery.Client`` is created on every call.
    """
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each argument is converted with ``DataFrame.to_html()``; every opening
    ``<table`` tag gets an inline ``display:inline`` style so the tables flow
    horizontally instead of stacking.
    """
    # Imported here because the file-level import only brings in `display` and
    # `HTML`; without this the call below raised NameError.
    from IPython.display import display_html

    html_str = ''.join(df.to_html() for df in args)
    display_html(
        # Only opening tags are patched: replacing the bare word 'table' also
        # rewrote closing tags and any cell text that contained "table".
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True
    )

## Functions

In [2]:
def ztest_proportion(
    df: pd.DataFrame,
    metric_col: str,
    ab_group_col: str,
    pairs_list: List[Tuple[int, int]] = [(0, 1)],
    corrected_ci: float = 0.95,
    flag_notation: bool = False
    ) -> pd.DataFrame:
    """Compare the proportion of ``metric_col`` between pairs of groups.

    For every ``(group0, group1)`` pair the proportion is computed as
    ``sum / count`` of ``metric_col`` within each group and the pair is passed
    to ``test_proportions_2indep`` (Wald method, two-sided, difference).

    Parameters
    ----------
    df : pd.DataFrame
        Data with one row per observation.
    metric_col : str
        Column whose sum is used as the number of successes and whose
        non-null count is used as the number of trials.
    ab_group_col : str
        Column holding the group label.
    pairs_list : list of (group0, group1) tuples
        Pairs to compare. The default list is never mutated here.
    corrected_ci : float
        Confidence level of the interval for the difference.
    flag_notation : bool
        When truthy, print a short report for every pair.

    Returns
    -------
    pd.DataFrame
        One row per pair. The statistic, p-value, proportions, difference
        and interval columns are floats; the first five columns (metric name,
        sample sizes, group labels) are left as strings, because each row is
        assembled through a single NumPy array.
    """
    res_table = pd.DataFrame()
    # Two-sided interval: normal quantile at (1 + level) / 2, identical for all pairs.
    z_crit = stats.norm.ppf((1 + corrected_ci) / 2)
    for pair in pairs_list:
        values0 = df.loc[df[ab_group_col] == pair[0], metric_col]
        values1 = df.loc[df[ab_group_col] == pair[1], metric_col]
        num0, denom0 = values0.sum(), values0.count()
        num1, denom1 = values1.sum(), values1.count()
        p0 = num0 / denom0
        p1 = num1 / denom1
        r = test_proportions_2indep(
            num0, denom0,
            num1, denom1,
            value=0,
            method='wald',
            compare='diff',
            alternative='two-sided',
            return_results=True
        )
        se = np.sqrt(r.variance)
        delta = p1 - p0
        delta_per = (p1 / p0 - 1) * 100
        lb = delta - z_crit * se
        ub = delta + z_crit * se
        # Interval bounds expressed relative to the group0 proportion.
        lb_per = lb * 100 / p0
        ub_per = ub * 100 / p0

        if flag_notation:
            print(f'\nComparison between groups: {pair[0]} and {pair[1]}')
            print(f'statistic: {r.statistic}, pvalue: {r.pvalue}')
            print(f'delta = {delta}')
            print(f'delta,% = {delta_per}%')
            print(f'Confidence interval for delta: ({lb}, {ub})')
            print(f'Confidence interval for delta, %: ({lb_per}, {ub_per})')

        result = pd.DataFrame(
            np.array([metric_col, denom0, denom1, pair[0], pair[1], r.statistic, r.pvalue, p0, p1, delta, delta_per, lb, ub, lb_per, ub_per]).reshape(1, -1),
            columns=['metric_name',
                     'group0_sample_size',
                     'group1_sample_size',
                     'group0',
                     'group1',
                     'statistic',
                     'pvalue',
                     'mean0',
                     'mean1',
                     'diff_mean',
                     'diff_mean_%',
                     'lower_boundary',
                     'upper_boundary',
                     'lower_boundary_%',
                     'upper_boundary_%',]
        )
        res_table = pd.concat([res_table, result])

    # The mixed-type array above stores everything as text; restore the numeric
    # columns once, after all pairs have been collected (same as in ttest).
    for column in res_table.columns[5:]:
        res_table[column] = res_table[column].astype(float)

    return res_table

def ttest(
    df: pd.DataFrame,
    metric_col: str,
    ab_group_col: str,
    pairs_list: List[Tuple[int, int]] = [(0, 1)],
    corrected_ci: float = 0.95,
    flag_notation: bool = False
    ) -> pd.DataFrame:
    """Run a two-sided, unequal-variance t-test of ``metric_col`` for each group pair.

    Returns one row per ``(group0, group1)`` pair with the sample sizes, the
    statistic and p-value from ``ws.ttest_ind``, both means, their absolute
    and relative difference, and a confidence interval at level
    ``corrected_ci``. Set ``flag_notation`` to True to also print the numbers.
    The degrees of freedom are printed but not stored in the table.
    """
    column_names = [
        'metric_name',
        'group0_sample_size',
        'group1_sample_size',
        'group0',
        'group1',
        'statistic',
        'pvalue',
        'mean0',
        'mean1',
        'diff_mean',
        'diff_mean_%',
        'lower_boundary',
        'upper_boundary',
        'lower_boundary_%',
        'upper_boundary_%',
    ]
    upper_quantile = (1 + corrected_ci) / 2
    collected = pd.DataFrame()

    for pair in pairs_list:
        label0, label1 = pair[0], pair[1]
        group0 = df.loc[df[ab_group_col] == label0, metric_col]
        group1 = df.loc[df[ab_group_col] == label1, metric_col]

        mean0, mean1 = group0.mean(), group1.mean()
        var0, var1 = group0.std()**2, group1.std()**2
        size0, size1 = len(group0), len(group1)

        t_stat, p_val, dof = ws.ttest_ind(
            group0,
            group1,
            alternative='two-sided',
            usevar='unequal'
        )

        # Half-width of the interval: t quantile times the unpooled standard error.
        std_err = np.sqrt(var0 / size0 + var1 / size1)
        diff = mean1 - mean0
        diff_pct = (mean1 / mean0 - 1) * 100
        lower = diff - stats.t.ppf(upper_quantile, dof) * std_err
        upper = diff + stats.t.ppf(upper_quantile, dof) * std_err
        lower_pct = lower * 100 / mean0
        upper_pct = upper * 100 / mean0

        if flag_notation == True:
            print(f'\nComparison between groups: {label0} and {label1}')
            print(f't-statistic: {t_stat}, pvalue: {p_val}, df: {dof}')
            print(f'delta = {diff}')
            print(f'delta,% = {diff_pct}%')
            print(f'Confidence interval for delta: ({lower}, {upper})')
            print(f'Confidence interval for delta, %: ({lower_pct}, {upper_pct})')

        row_values = np.array([
            metric_col, size0, size1, label0, label1, t_stat,
            p_val, mean0, mean1, diff, diff_pct, lower, upper, lower_pct, upper_pct,
        ]).reshape(1, -1)
        collected = pd.concat([collected, pd.DataFrame(row_values, columns=column_names)])

    # Everything from 'statistic' onwards went through a text array; cast it back.
    for name in collected.columns[5:]:
        collected[name] = collected[name].astype(float)

    return collected

def method_benjamini_hochberg(
    pvalues: np.ndarray,
    alpha: float = 0.05
    ) -> np.ndarray:
    """Apply the Benjamini-Hochberg step-up procedure for multiple hypothesis testing.

    The p-values are sorted ascending and compared with the thresholds
    ``k * alpha / m`` (``k`` = rank, ``m`` = number of tests). The procedure
    finds the largest rank whose p-value does not exceed its threshold and
    rejects every hypothesis up to and including that rank, even if some of
    the smaller p-values exceeded their own thresholds.

    Parameters
    ----------
    pvalues : array-like of float
        Raw p-values, in any order.
    alpha : float
        Target false discovery rate.

    Returns
    -------
    np.ndarray of int
        Array aligned with ``pvalues``: 1 where the hypothesis is rejected,
        0 otherwise.
    """
    pvalues = np.asarray(pvalues, dtype=float)
    m = pvalues.size
    res = np.zeros(m, dtype=int)
    if m == 0:
        return res

    sorted_pvalue_indexes = np.argsort(pvalues)
    array_alpha = np.arange(1, m + 1) * alpha / m
    passed_ranks = np.nonzero(pvalues[sorted_pvalue_indexes] <= array_alpha)[0]
    if passed_ranks.size:
        # Step-up: the last passing rank decides; stopping at the first failure
        # would turn this into a more conservative step-down rule.
        res[sorted_pvalue_indexes[:passed_ranks[-1] + 1]] = 1
    return res

# Shapiro-Wilk test & Distributions
def check_normality(df, group_column, value_column):
    """Run the Shapiro-Wilk test on ``value_column`` for every group and print the outcome.

    For each distinct value of ``group_column`` the non-null values are passed
    to ``stats.shapiro``; the statistic and p-value are printed, followed by a
    verdict based on the fixed 0.05 threshold. Nothing is returned.
    """
    for group in df[group_column].unique():
        sample = df.loc[df[group_column] == group, value_column].dropna()
        w_stat, p_value = stats.shapiro(sample)
        print(f'Group {group}: W={w_stat:.4f}, p-value={p_value:.4f}')

        if p_value > 0.05:
            verdict = f'Group {group}, Metric: {value_column}: Data is normal distributed'
        else:
            verdict = f'Group {group}, Metric: {value_column}: Data is not normal distributed'
        print(verdict)

def plot_distribution(df, group_column, value_column):
    """Draw a 2x2 overview of ``value_column`` split by ``group_column``.

    Top row: overlaid histograms with KDE for all groups, and a box plot per
    group. Bottom row: separate histograms for the first and the second group
    in order of appearance. The figure is shown with ``plt.show()``.
    """
    groups = df[group_column].unique()
    fig, axes = plt.subplots(2, 2, figsize=(14, 10), gridspec_kw={'height_ratios': [1, 1.5]})

    # Top-left: all groups on one histogram.
    overview_ax = axes[0, 0]
    sns.histplot(data=df, x=value_column, hue=group_column, kde=True, bins=30, alpha=0.4, ax=overview_ax)
    overview_ax.set_title("Graph + KDE")
    overview_ax.set_xlabel(value_column)
    overview_ax.set_ylabel("Frequence")

    # Top-right: one box per group.
    box_ax = axes[0, 1]
    sns.boxplot(data=df, x=group_column, y=value_column, ax=box_ax)
    box_ax.set_title("Boxplot grouped")
    box_ax.set_xlabel(group_column)
    box_ax.set_ylabel(value_column)

    # Bottom row: one histogram per group for the first two groups.
    for position, colour, y_label in ((0, 'blue', "frequence"), (1, 'orange', "Frequence")):
        group = groups[position]
        panel = axes[1, position]
        group_values = df[df[group_column] == group][value_column]
        sns.histplot(group_values, bins=30, kde=True, color=colour, alpha=0.5, ax=panel)
        panel.set_title(f'Hist for the {group}')
        panel.set_xlabel(value_column)
        panel.set_ylabel(y_label)

    plt.tight_layout()
    plt.show()

# Levene's & Bartlett's test
def levene(df, indicator, metric):
    """Run Levene's test on ``indicator`` between groups 0 and 1 and print the verdict.

    ``df`` must have a ``group_name`` column; ``metric`` is forwarded to
    ``st.levene`` as its ``center`` argument and is also echoed in the
    printed message. The decision threshold is fixed at 0.05. Nothing is
    returned.
    """
    group0_values = df.loc[df['group_name'] == 0, indicator]
    group1_values = df.loc[df['group_name'] == 1, indicator]
    _, p_value = st.levene(group0_values, group1_values, center=metric)

    if p_value > 0.05:
        message = f"Variance are from the same population on {metric}"
    else:
        message = f"Variance are from the different population on {metric}"
    print(message)
    
# Cohen's D
def cohens_d(df, metric):
    """Return Cohen's d of ``metric`` for group 1 versus group 0.

    Groups are selected through the ``group_name`` column. The difference of
    the group means (group 1 minus group 0) is divided by the pooled standard
    deviation built from the sample (``ddof=1``) standard deviations.
    """
    treated = df.loc[df['group_name'] == 1, metric]
    control = df.loc[df['group_name'] == 0, metric]

    size_t, size_c = len(treated), len(control)
    std_t = np.std(treated, ddof=1)
    std_c = np.std(control, ddof=1)

    weighted_var = (size_t - 1) * std_t ** 2 + (size_c - 1) * std_c ** 2
    pooled_std = np.sqrt(weighted_var / (size_t + size_c - 2))

    return (np.mean(treated) - np.mean(control)) / pooled_std

# SRM
def srm(df):
    """Check every city for a sample ratio mismatch between groups 0 and 1.

    For each distinct ``city_name`` the non-null ``user_id`` values are counted
    in group 0 and in group 1 (``group_name`` column) and compared with an
    even 50/50 split using a chi-square goodness-of-fit test.

    Returns
    -------
    pd.DataFrame
        One row per city, sorted by ``city_name`` then ``total_size`` in
        descending order, with columns ``city_name``, ``sample_sizes``
        (observed counts), ``total_size``, ``expected_sizes``, ``chi_value``
        (the test's p-value rounded to 3 decimals) and ``conclusion``
        (mismatch is flagged when the p-value is below 0.01).
    """
    columns = ['city_name',  'sample_sizes', 'total_size', 'expected_sizes', 'chi_value', 'conclusion']
    rows = []

    # Boolean masks instead of a string-built query: a city name containing a
    # double quote made the query expression unparsable.
    in_group0 = df['group_name'] == 0
    in_group1 = df['group_name'] == 1

    for city in df['city_name'].unique():
        in_city = df['city_name'] == city

        observed = [
            df.loc[in_city & in_group0, 'user_id'].count(),
            df.loc[in_city & in_group1, 'user_id'].count(),
        ]
        total_traffic = sum(observed)
        expected = [total_traffic/2, total_traffic/2]

        p_value = st.chisquare(observed, f_exp=expected)[1]

        if p_value < 0.01:
            conclusion = "Sample ratio mismatch (SRM) may be present"
        else:
            conclusion = "Sample ratio mismatch (SRM) probably not present"
            print(f"{city}, {p_value}")

        rows.append([city, observed, total_traffic, expected, round(p_value, 3), conclusion])

    # Build and sort the table once instead of re-concatenating and re-sorting per city.
    return (
        pd.DataFrame(rows, columns=columns)
        .sort_values(['city_name', 'total_size'], ascending=False)
        .reset_index(drop=True)
    )

# Calculating the significance by cities
def _significance_block(df_cr, df_abs, region, segment):
    """Test conversion, rides, gmv and orders on one data slice and label the rows."""
    cr_df = ztest_proportion(df_cr, 'has_ride', 'group_name')
    cr_df['metric'] = 'Conversion'
    cr_df['cohen_d'] = cohens_d(df_cr, 'has_ride')
    frames = [cr_df]

    for metric_col in ('rides', 'gmv', 'orders'):
        metric_df = ttest(df_abs, metric_col, 'group_name')
        metric_df['metric'] = 'Quantitive'
        metric_df['cohen_d'] = cohens_d(df_abs, metric_col)
        frames.append(metric_df)

    block = pd.concat(frames)
    block['region'] = region
    block['segment'] = segment
    block['significance'] = (block['pvalue'] < 0.05) * 1
    # Holds the 0/1 rejection flags returned by method_benjamini_hochberg for
    # the metrics of this slice (the column name is kept for existing consumers).
    block['corrected_pvalue'] = method_benjamini_hochberg(block['pvalue'].values)
    return block


def calcualate_result(df_cr, df_abs):
    """Compute significance tables per city and for all data combined.

    Parameters
    ----------
    df_cr : pd.DataFrame
        Conversion data with ``city_name``, ``group_name`` and ``has_ride``.
    df_abs : pd.DataFrame
        Absolute metrics with ``city_name``, ``group_name``, ``rides``,
        ``gmv`` and ``orders``.

    Returns
    -------
    pd.DataFrame
        For every city found in ``df_cr`` (``segment == 'By city'``) and for
        the whole data set (``region == 'All'``, ``segment == 'Total'``):
        one row per metric with the output of ``ztest_proportion`` / ``ttest``
        plus ``metric``, ``cohen_d``, ``region``, ``segment``,
        ``significance`` and ``corrected_pvalue``.
    """
    frames = [
        _significance_block(
            df_cr[df_cr['city_name'] == city],
            df_abs[df_abs['city_name'] == city],
            city,
            'By city',
        )
        for city in df_cr['city_name'].unique()
    ]
    # The total block is evaluated on its own p-values; it used to reuse the
    # flags computed for the last city in the loop.
    frames.append(_significance_block(df_cr, df_abs, 'All', 'Total'))

    return pd.concat(frames)

# Design of experiment

### Pulling the data

In [7]:
# Progress marker for the first of three queries.
print('1')
# Verification ("liveness") events joined to rides.
#   liveness: per user / day / city / flow, the earliest timestamp of an 'approve'
#             verification result among the two listed events, from 2025-02-01, in the listed cities.
#   rides:    orders from 2025-02-01 in the same cities that a driver accepted and that
#             neither side cancelled.
# The final SELECT LEFT JOINs rides placed within one day after the approval and keeps
# one row per user (ROW_NUMBER() ... ORDER BY order_timestamp ASC = 1).
# NOTE(review): BigQuery sorts NULLs first in ascending order, so for a user with several
# liveness rows the kept row may be one with a NULL order_timestamp even when another row
# matched a ride — confirm this is intended.
df_to_approve_ride = read_bq("""
   WITH liveness AS (SELECT user_id,
                         os_name,
                         event_dt_part,
                         city_id,
                         country_id,
                         city_name,
                         country_name,
                         fulfilled_flow,
                         MIN(IF(name = 'client.verification_flow_result_status.show' AND
                                LOWER(status) = 'approve',
                                client_time,
                                NULL)) AS status_result
                  FROM (SELECT user_id,
                               name,
                               os_name,
                               event_dt_part,
                               TIMESTAMP_MILLIS(client_time)                                           AS client_time,
                               t1.city_id,
                               t2.city_name,
                               t2.country_id,
                               t2.country_name,
                               IF(LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                      OVER (PARTITION BY user_id, event_dt_part ORDER BY client_time) IS NULL,
                                  JSON_EXTRACT_SCALAR(payload, '$.verification_flow'),
                                  LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                      OVER (PARTITION BY user_id, event_dt_part ORDER BY client_time)) AS fulfilled_flow,
                               JSON_EXTRACT_SCALAR(payload, '$.status')                                AS status
                        FROM indriver-e6e40.ods_event_tracker.event t1
                                 JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                      ON
                                          t1.city_id = t2.city_id
                        WHERE 1 = 1
                          AND name IN (
                                       'client.verification_start.show',
                                       'client.verification_flow_result_status.show'
                            )
                          AND event_dt_part >= '2025-02-01'
                          AND t2.city_id IN
                              (4263, 4267, 4243, 4545, 4540, 4197, 4530, 5568, 4255, 4559, 4300, 4227, 19943, 4261,
                               5573, 4266, 4196, 4376, 4154, 798, 4225, 4198, 4385, 4271, 4374, 4299, 5368, 4229, 4199,
                               4524, 4242, 4143, 4155, 4517, 5589, 5548, 4755, 4397, 4226, 4269, 4404, 5600, 4373, 4375,
                               4153, 4231, 5535, 4200, 5528, 4234, 4825, 4142, 5536, 4264, 4549, 4228, 5291, 4257,
                               4516))
                  GROUP BY 1, 2, 3, 4, 5, 6, 7, 8),
     rides AS (SELECT order_uuid,
                      user_id    AS pass_id,
                      driver_id,
                      city_id    AS order_city_id,
                      country_id AS order_country_id,
                      status_order,
                      order_timestamp,
                      at_pickup_dttm,
                      departed_pickup_dttm,
                      at_destination_dttm,
                      departed_destination_dttm,
                      driveraccept_timestamp,
                      driverarrived_timestamp,
                      driverstarttheride_timestamp,
                      driverdone_timestamp,
                      clientdone_timestamp,
                      clientcancel_timestamp,
                      drivercancel_timestamp,
                      user_reg_date,
                      driver_reg_date,
                      stage,
                      created_date_order_part,
                      duration_in_seconds
               FROM indriver-e6e40.imart.incity_detail_new_order
               WHERE created_date_order_part >= '2025-02-01'
                 AND city_id IN
                     (4263, 4267, 4243, 4545, 4540, 4197, 4530, 5568, 4255, 4559, 4300, 4227, 19943, 4261, 5573, 4266,
                      4196, 4376, 4154, 798, 4225, 4198, 4385, 4271, 4374, 4299, 5368, 4229, 4199, 4524, 4242, 4143,
                      4155, 4517, 5589, 5548, 4755, 4397, 4226, 4269, 4404, 5600, 4373, 4375, 4153, 4231, 5535, 4200,
                      5528, 4234, 4825, 4142, 5536, 4264, 4549, 4228, 5291, 4257, 4516)
                 AND driveraccept_timestamp IS NOT NULL
                 AND (clientcancel_timestamp IS NULL
                   AND drivercancel_timestamp IS NULL))
    SELECT t1.user_id,
        t1.os_name,
        t1.city_id,
        t1.country_id,
        t1.city_name,
        t1.country_name,
        t1.status_result,
        t2.order_timestamp,
        status_order
    FROM liveness t1
            LEFT JOIN rides t2
                    ON t1.user_id = t2.pass_id
                        AND t2.order_timestamp BETWEEN status_result AND DATE_ADD(status_result, INTERVAL + 1 DAY)
    WHERE 1 = 1
    QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY order_timestamp ASC) = 1
""")

# Progress marker for the second query.
print('2')
# Per-user absolute metrics for passengers (user_type = 'pass') in the listed cities.
#   rides:   per user / country / city from 2025-03-02 onwards: average orders_count,
#            average rides_count and summed gmv_clean_usd.
#   newbies: users that had at least one ride between 2024-01-01 and 2025-03-01.
# The final SELECT keeps only users that are NOT in `newbies`, i.e. users without a ride
# in that earlier window.
# NOTE(review): orders and rides are averaged while gmv is summed — confirm the mix is intended.
df_rides_gmv_orders = read_bq("""
    WITH rides AS (SELECT t1.user_id,
                      t1.country_id,
                      t2.country_name,
                      t1.city_id,
                      t2.city_name,
                      AVG(orders_count)  AS orders,
                      AVG(rides_count)   AS rides,
                      SUM(gmv_clean_usd) AS gmv
               FROM indriver-bi.incity.tbl_incity_growth_metrics_detail t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                             ON t1.city_id = t2.city_id
               WHERE user_type = 'pass'
                 AND t1.city_id IN (4263, 4267, 4243, 4545, 4540, 4197, 4530, 5568, 4255, 4559, 4300, 4227, 19943,
                                    4261, 5573, 4266, 4196, 4376, 4154, 798, 4225, 4198, 4385, 4271, 4374, 4299,
                                    5368, 4229, 4199, 4524, 4242, 4143, 4155, 4517, 5589, 5548, 4755, 4397, 4226,
                                    4269, 4404, 5600, 4373, 4375, 4153, 4231, 5535, 4200, 5528, 4234, 4825, 4142,
                                    5536, 4264, 4549, 4228, 5291, 4257, 4516)
                 AND metric_date_utc >= '2025-03-02'
               GROUP BY t1.user_id, t1.country_id, t2.country_name, t1.city_id, t2.city_name),
     newbies AS (SELECT DISTINCT user_id
                 FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                 WHERE user_type = 'pass'
                   AND rides_count > 0
                   AND city_id IN (4263, 4267, 4243, 4545, 4540, 4197, 4530, 5568, 4255, 4559, 4300, 4227, 19943,
                                   4261, 5573, 4266, 4196, 4376, 4154, 798, 4225, 4198, 4385, 4271, 4374, 4299,
                                   5368, 4229, 4199, 4524, 4242, 4143, 4155, 4517, 5589, 5548, 4755, 4397, 4226,
                                   4269, 4404, 5600, 4373, 4375, 4153, 4231, 5535, 4200, 5528, 4234, 4825, 4142,
                                   5536, 4264, 4549, 4228, 5291, 4257, 4516)
                   AND metric_date_utc BETWEEN '2024-01-01' AND '2025-03-01')
    SELECT r.*
    FROM rides r
    WHERE r.user_id NOT IN (SELECT user_id FROM newbies);
    """)

# Progress marker for the third query.
print('3')
# Traffic estimate for the experiment design: number of distinct users with an 'approve'
# verification result per day (event_dt_part), together with the week and month each day
# falls into, for events between 2025-02-01 and 2025-03-30 in the listed cities.
# Inside `checker` only the first event per user / event name / day is kept (QUALIFY).
df_sample_size = read_bq("""
        WITH checker AS (SELECT user_id,
                        os_name,
                        event_dt_part,
                        DATE_TRUNC(event_dt_part, WEEK)  AS weekly,
                        DATE_TRUNC(event_dt_part, MONTH) AS monthly,
                        city_id,
                        country_id,
                        city_name,
                        country_name,
                        fulfilled_flow,
                        MIN(IF(name = 'client.verification_start.show',
                               client_time,
                               NULL))                    AS banner_show,
                        MIN(IF(name = 'client.verification_start.click',
                               client_time,
                               NULL))                    AS banner_click,
                        MIN(IF(name = 'client.verification_flow_result_status.show' AND
                               LOWER(status) = 'approve',
                               client_time,
                               NULL))                    AS status_result
                 FROM (SELECT user_id,
                              name,
                              os_name,
                              event_dt_part,
                              TIMESTAMP_MILLIS(client_time)                            AS client_time,
                              t1.city_id,
                              t2.city_name,
                              t2.country_id,
                              t2.country_name,
                              IF(LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                     OVER (PARTITION BY user_id ORDER BY client_time) IS NULL,
                                 JSON_EXTRACT_SCALAR(payload, '$.verification_flow'),
                                 LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                     OVER (PARTITION BY user_id ORDER BY client_time)) AS fulfilled_flow,
                              JSON_EXTRACT_SCALAR(payload, '$.status')                 AS status
                       FROM indriver-e6e40.ods_event_tracker.event t1
                                JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                     ON
                                         t1.city_id = t2.city_id
                       WHERE 1 = 1
                         AND name IN (
                                      'client.verification_start.show',
                                      'client.verification_flow_result_status.show',
                                      'client.verification_start.click'
                           )
                         AND event_dt_part BETWEEN '2025-02-01' AND '2025-03-30'
                         AND t2.city_id IN
                             (4263, 4267, 4243, 4545, 4540, 4197, 4530, 5568, 4255, 4559, 4300, 4227, 19943, 4261,
                              5573, 4266, 4196, 4376, 4154, 798, 4225, 4198, 4385, 4271, 4374, 4299, 5368, 4229, 4199,
                              4524, 4242, 4143, 4155, 4517, 5589, 5548, 4755, 4397, 4226, 4269, 4404, 5600, 4373, 4375,
                              4153, 4231, 5535, 4200, 5528, 4234, 4825, 4142, 5536, 4264, 4549, 4228, 5291, 4257,
                              4516)
                       QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id, name, event_dt_part ORDER BY client_time) = 1)
                 GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
       SELECT event_dt_part, weekly, monthly, COUNT(DISTINCT user_id) AS users
       FROM checker
       WHERE status_result IS NOT NULL
       GROUP BY 1, 2, 3
""")


3


### MDE & period calculation

In [8]:
def _avg_users(period_col):
    """Mean of the per-period user totals in df_sample_size, scaled by 0.6 and truncated to int."""
    per_period = df_sample_size.groupby([period_col], as_index=False)['users'].sum()
    return int(per_period['users'].mean()*0.6)


def _design_effect_table(dataframe, metric_col, label):
    """Configure a Designer for one metric, run it for the shared grid and tag the result with ``label``."""
    designer = Designer(
        dataframe=dataframe,
        metrics=metric_col
    )
    designer.set_first_errors(first_type_errors)
    designer.set_second_errors(second_type_errors)
    table = designer.run(
        to_design='effect',
        method='theory',
        effects=effects,
        sizes=sizes
    )
    table['metric'] = label
    return designer, table


# Expected traffic per day / week / month.
# NOTE(review): the 0.6 factor is not explained in this notebook — confirm what share it represents.
daily = _avg_users('event_dt_part')
weekly = _avg_users('weekly')
monthly = _avg_users('monthly')

# Shared design grid.
effects = [1.01, 1.015, 1.05, 1.1]  # effect multipliers (presumably 1.01 = +1% — TODO confirm)
sizes = [daily, weekly, weekly*2, monthly]  # size of each group
first_type_errors = [0.01, 0.05]
second_type_errors = [0.1, 0.2]

# Per user and city: number of non-null order timestamps / approval timestamps.
df_cr_to_ride_agg = df_to_approve_ride.groupby(['user_id', 'city_name'], as_index=False)['order_timestamp'].count()
df_cr_to_approve = df_to_approve_ride.groupby(['user_id', 'city_name'], as_index=False)['status_result'].count()

design_cr_to_ride, df_cr_to_ride_design = _design_effect_table(df_cr_to_ride_agg, 'order_timestamp', 'cr_to_ride')
design_gmv, df_gmv_design = _design_effect_table(df_rides_gmv_orders, 'gmv', 'gmv')
design_rides, df_rides_design = _design_effect_table(df_rides_gmv_orders, 'rides', 'rides')

# Last expression of the cell: the combined design table is displayed as the cell output.
pd.concat([df_cr_to_ride_design, df_rides_design, df_gmv_design])


"Errors ($\alpha$, $\beta$)",(0.01; 0.1),(0.01; 0.2),(0.05; 0.1),(0.05; 0.2),metric
Group sizes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
25389,3.9%,3.4%,3.2%,2.8%,cr_to_ride
147259,1.6%,1.4%,1.3%,1.2%,cr_to_ride
294518,1.1%,1.0%,1.0%,0.8%,cr_to_ride
736298,0.7%,0.6%,0.6%,0.5%,cr_to_ride
25389,3.3%,2.9%,2.8%,2.4%,rides
147259,1.4%,1.2%,1.2%,1.0%,rides
294518,1.0%,0.9%,0.8%,0.7%,rides
736298,0.6%,0.5%,0.5%,0.4%,rides
25389,6.4%,5.6%,5.4%,4.6%,gmv
147259,2.6%,2.3%,2.2%,1.9%,gmv


# Calculating the results. Summarizing

### Data pulling

### Check the key statistics over the tests

### Applying the tests to calculate significance