# Packages

In [67]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
from math import ceil

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """Run ``query`` in BigQuery and return the result as a pandas DataFrame.

    A fresh ``bigquery.Client`` bound to ``project`` is created on every call.
    """
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each positional argument must expose ``to_html()``. The generated tables
    are concatenated and every opening ``<table`` tag gets an inline display
    style so the tables flow horizontally instead of stacking.
    """
    # Imported here because the notebook-level import only brings in
    # `display` and `HTML`; calling `display_html` without this raised NameError.
    from IPython.display import display_html

    html_str = ''.join(df.to_html() for df in args)
    display_html(
        # Only opening tags are patched, so closing tags and any cell text
        # containing the word "table" are left untouched.
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True
    )

def cycle_sql(start, end, query, weeks=False):
    """
    Run ``query`` once per date between ``start`` and ``end`` and stack the results.

    You have to use {date} in your script: the cycle date is substituted into
    these brackets with ``str.format``, so any other literal curly braces in
    the script must be doubled.

    Args:
        start: first date, formatted as ``YYYY-MM-DD``.
        end: last date (inclusive), formatted as ``YYYY-MM-DD``.
        query: SQL text containing the ``{date}`` placeholder.
        weeks: when truthy, step by 7 days from ``start`` instead of by 1 day.

    Returns:
        One DataFrame with the results of every run. Rows of the most recent
        date come first. An empty DataFrame is returned when the date range
        is empty.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    span_days = (date_end - date_start).days

    if weeks:
        # Whole weeks only: a trailing partial week is not queried.
        offsets = [timedelta(weeks=x) for x in range(span_days // 7 + 1)]
    else:
        offsets = [timedelta(days=x) for x in range(span_days + 1)]
    daterange = [(date_start + offset).strftime('%Y-%m-%d') for offset in offsets]

    frames = []
    for counter, date in enumerate(daterange, start=1):
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date=date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of on every iteration; reversing keeps the
    # newest-first row order that callers already receive.
    return pd.concat(frames[::-1])

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four DataFrames into ``{name}.xlsx``, one sheet per dataset.

    Sheets are named ``"<position>-<name>"`` and are written with the index
    included. Datasets are taken in order and writing stops at the first one
    that is ``None``: anything passed after a gap is not written.
    """
    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        candidates = (dataset1, dataset2, dataset3, dataset4)
        for position, frame in enumerate(candidates, start=1):
            if frame is None:
                # A missing dataset ends the sequence, later ones are ignored.
                break
            frame.to_excel(writer, sheet_name=f"{position}-{name}")

        print('DataFrame is written to Excel File successfully.') 

## Functions

In [68]:
def method_benjamini_hochberg(
    pvalues: np.ndarray,
    alpha: float = 0.05
    ) -> np.ndarray:
    """Apply the Benjamini-Hochberg procedure for multiple hypothesis testing.

    The procedure is step-up: with the p-values sorted ascending, find the
    largest rank ``k`` such that ``p_(k) <= k * alpha / m`` and reject every
    hypothesis with rank ``<= k``, even if some of the smaller p-values are
    above their own threshold.

    Args:
        pvalues: 1-D array-like of p-values.
        alpha: target false discovery rate.

    Returns:
        Integer array aligned with ``pvalues``: 1 where the hypothesis is
        rejected, 0 otherwise. These are decision flags, not adjusted p-values.
    """
    pvalues = np.asarray(pvalues, dtype=float)
    m = pvalues.size
    res = np.zeros(m, dtype=int)
    if m == 0:
        return res

    sorted_pvalue_indexes = np.argsort(pvalues)
    array_alpha = np.arange(1, m + 1) * alpha / m

    # Ranks (0-based) whose sorted p-value is within its threshold.
    passing_ranks = np.nonzero(pvalues[sorted_pvalue_indexes] <= array_alpha)[0]
    if passing_ranks.size:
        # Reject everything up to the LARGEST passing rank, not just up to
        # the first failure.
        res[sorted_pvalue_indexes[:passing_ranks[-1] + 1]] = 1
    return res

# Shapiro-Wilk test & Distributions
def check_normality(df, group_column, value_column):
    """Print the Shapiro-Wilk statistic and a normality verdict for every group.

    For each distinct value in ``df[group_column]`` the non-null values of
    ``value_column`` are tested; a p-value above 0.05 is reported as normal.
    Nothing is returned.
    """
    for label in df[group_column].unique():
        sample = df.loc[df[group_column] == label, value_column].dropna()
        w_stat, p_val = stats.shapiro(sample)
        print(f'Group {label}: W={w_stat:.4f}, p-value={p_val:.4f}')

        looks_normal = p_val > 0.05
        if not looks_normal:
            print(f'Group {label}, Metric: {value_column}: Data is not normal distributed')
            continue
        print(f'Group {label}, Metric: {value_column}: Data is normal distributed')

def plot_distribution(df, group_column, value_column):
    """Draw a 2x2 figure describing ``value_column`` split by ``group_column``.

    Top row: overlaid histogram with KDE for all groups, and a grouped boxplot.
    Bottom row: separate histograms for the first and the second distinct
    group value (in order of appearance). The figure is shown, not returned.
    """
    groups = df[group_column].unique()
    fig, axes = plt.subplots(2, 2, figsize=(14, 10), gridspec_kw={'height_ratios': [1, 1.5]})

    def _annotate(ax, title, x_label, y_label):
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)

    overlay_ax = axes[0, 0]
    sns.histplot(data=df, x=value_column, hue=group_column, kde=True, bins=30, alpha=0.4, ax=overlay_ax)
    _annotate(overlay_ax, "Graph + KDE", value_column, "Frequence")

    box_ax = axes[0, 1]
    sns.boxplot(data=df, x=group_column, y=value_column, ax=box_ax)
    _annotate(box_ax, "Boxplot grouped", group_column, value_column)

    # (position of the group, bar colour, y-axis label) for the bottom row.
    per_group_panels = ((0, 'blue', "frequence"), (1, 'orange', "Frequence"))
    for position, colour, y_label in per_group_panels:
        group = groups[position]
        panel_ax = axes[1, position]
        values = df[df[group_column] == group][value_column]
        sns.histplot(values, bins=30, kde=True, color=colour, alpha=0.5, ax=panel_ax)
        _annotate(panel_ax, f'Hist for the {group}', value_column, y_label)

    plt.tight_layout()
    plt.show()

# Levene's & Bartlett's test
def levene(df, indicator, metric):
    """Run Levene's test on ``indicator`` between group 0 and group 1 and print the verdict.

    ``metric`` is forwarded to ``scipy.stats.levene`` as its ``center``
    argument and is also echoed in the printed message. A p-value above 0.05
    is reported as equal variances. Nothing is returned.
    """
    alpha = 0.05

    control_values = df.loc[df['group_name'] == 0, indicator]
    treatment_values = df.loc[df['group_name'] == 1, indicator]
    _, p_value = st.levene(control_values, treatment_values, center=metric)

    messages = {
        True: f"Variance are from the same population on {metric}",
        False: f"Variance are from the different population on {metric}",
    }
    print(messages[bool(p_value > alpha)])
    
# Cohen's D
def cohens_d(df, metric):
    """Return Cohen's d for ``metric`` between group 1 and group 0.

    The effect size is (mean of group 1 - mean of group 0) divided by the
    pooled standard deviation, where each group's standard deviation uses
    ``ddof=1``. Groups are selected by ``df['group_name']``.
    """
    treatment = df.loc[df['group_name'] == 1, metric]
    control = df.loc[df['group_name'] == 0, metric]

    size_t, size_c = len(treatment), len(control)
    spread_t = np.std(treatment, ddof=1)
    spread_c = np.std(control, ddof=1)

    # Pooled standard deviation weighted by each group's degrees of freedom.
    weighted_variance = (size_t - 1) * spread_t ** 2 + (size_c - 1) * spread_c ** 2
    pooled_std = np.sqrt(weighted_variance / (size_t + size_c - 2))

    return (np.mean(treatment) - np.mean(control)) / pooled_std

# SRM
def srm(df):
    """Check every city for a sample ratio mismatch against an expected 50/50 split.

    For each distinct ``city_name`` the non-null ``user_id`` values are counted
    in group 0 and group 1 and compared with an even split using a chi-square
    goodness-of-fit test. A p-value below 0.01 is flagged as a possible SRM.
    For cities that are not flagged, the city and its unrounded p-value are
    printed.

    Args:
        df: DataFrame with ``city_name``, ``group_name`` (0 / 1) and ``user_id``.

    Returns:
        DataFrame with one row per city and the columns ``city_name``,
        ``sample_sizes``, ``total_size``, ``expected_sizes``, ``chi_value``
        (the chi-square p-value rounded to 3 decimals) and ``conclusion``,
        sorted by ``city_name`` then ``total_size`` descending. When ``df``
        has no cities the result is empty but still carries these columns.
    """
    columns = ['city_name', 'sample_sizes', 'total_size', 'expected_sizes', 'chi_value', 'conclusion']
    rows = []

    in_control = df['group_name'] == 0
    in_treatment = df['group_name'] == 1

    for city in df['city_name'].unique():
        # Boolean masks instead of an f-string query: city names containing
        # quotes no longer break the expression, and non-string city values
        # are matched correctly.
        in_city = df['city_name'] == city
        observed = [
            df.loc[in_city & in_control, 'user_id'].count(),
            df.loc[in_city & in_treatment, 'user_id'].count(),
        ]

        total_traffic = sum(observed)
        expected = [total_traffic / 2, total_traffic / 2]

        p_value = st.chisquare(observed, f_exp=expected)[1]

        if p_value < 0.01:
            conclusion = "Sample ratio mismatch (SRM) may be present"
        else:
            conclusion = "Sample ratio mismatch (SRM) probably not present"
            print(f"{city}, {p_value}")

        rows.append([city, observed, total_traffic, expected, round(p_value, 3), conclusion])

    # Build and sort once instead of re-concatenating and re-sorting per city.
    return (
        pd.DataFrame(rows, columns=columns)
        .sort_values(['city_name', 'total_size'], ascending=False)
        .reset_index(drop=True)
    )

# Calculating the significance by cities
def _metric_tests(df_cr, df_abs):
    """Run the conversion z-test and the rides / gmv / orders t-tests and stack the results."""
    cr_df = ztest_proportion(df_cr, 'has_ride', 'group_name')
    cr_df['metric'] = 'Conversion'
    cr_df['cohen_d'] = cohens_d(df_cr, 'has_ride')

    frames = [cr_df]
    for column in ('rides', 'gmv', 'orders'):
        quantitative_df = ttest(df_abs, column, 'group_name')
        quantitative_df['metric'] = 'Quantitive'
        quantitative_df['cohen_d'] = cohens_d(df_abs, column)
        frames.append(quantitative_df)

    return pd.concat(frames)


def _label_and_flag(df_tests, region, segment):
    """Add region / segment labels and the significance columns computed from ``df_tests`` itself."""
    df_tests['region'] = region
    df_tests['segment'] = segment
    df_tests['significance'] = (df_tests['pvalue']<0.05)*1
    df_tests['corrected_pvalue'] = method_benjamini_hochberg(df_tests['pvalue'].values)
    return df_tests


def calcualate_result(df_cr, df_abs):
    """Test conversion and quantitative metrics per city and for all cities together.

    For every distinct ``city_name`` in ``df_cr``, and then once for the full
    data, the function runs a z-test on ``has_ride`` (from ``df_cr``) and
    t-tests on ``rides``, ``gmv`` and ``orders`` (from ``df_abs``), each split
    by ``group_name``, and attaches Cohen's d.

    Args:
        df_cr: rows used for the conversion test; needs ``city_name``,
            ``group_name`` and ``has_ride``.
        df_abs: rows used for the quantitative tests; needs ``city_name``,
            ``group_name``, ``rides``, ``gmv`` and ``orders``.

    Returns:
        DataFrame with the per-city rows (``segment == 'By city'``) followed by
        the overall rows (``region == 'All'``, ``segment == 'Total'``). Each
        segment gets ``significance`` (p-value < 0.05) and ``corrected_pvalue``
        (Benjamini-Hochberg 0/1 flag) computed over its own four tests.
    """
    parts = []

    for city in df_cr['city_name'].unique():
        city_cr = df_cr[df_cr['city_name']==f'{city}']
        city_abs = df_abs[df_abs['city_name']==f'{city}'].copy()
        parts.append(_label_and_flag(_metric_tests(city_cr, city_abs), city, 'By city'))

    # The overall flags are derived from the overall p-values. Previously they
    # were taken from the last city's results, and the function raised
    # NameError when there were no cities at all.
    parts.append(_label_and_flag(_metric_tests(df_cr, df_abs), 'All', 'Total'))

    return pd.concat(parts)

def sequential_wald_test(df, date_col, metric_col, group_col, user_col, alpha=0.05, beta=0.2):
    """
    Follow a two-group conversion comparison date by date and report when a stopping bound is crossed.

    Rows of ``df`` are aggregated per ``date_col`` and ``group_col`` (number of
    unique ``user_col`` values and the sum of ``metric_col``), running totals are
    taken within each group, and the group-0 and group-1 series are joined by
    date. For every date a two-sided Wald test for the difference of proportions
    is run on the running totals, and ``log(p_B / p_A)`` is compared with the
    bounds ``A = log(beta / (1 - alpha))`` and ``B = log((1 - beta) / alpha)``
    (both rounded to 2 decimals). The loop stops at the first date where a bound
    is reached. A chart of the p-values is shown before returning.

    NOTE(review): the value stored as "LLR" is the log of the ratio of the
    cumulative conversion rates, not a log-likelihood ratio of the observed
    data — confirm this is the intended statistic.
    NOTE(review): cumulative users are a running sum of per-date unique users,
    so a user present on several dates is counted more than once — confirm.

    Args:
        df: input rows.
        date_col: name of the date column.
        metric_col: name of the column that is summed into conversions.
        group_col: name of the group column; values 0 and 1 are used.
        user_col: name of the user identifier column.
        alpha: value used in both bounds and as the reference line on the chart.
        beta: value used in both bounds.

    Returns:
        The merged per-date frame, cut at the last processed date, with the extra
        columns ``p_value``, ``LLR``, ``A/B`` and ``alpha_threshold``.
    """
    
    # Stopping bounds: lower (A) and upper (B).
    A = np.round(np.log(beta / (1 - alpha)), 2)   
    B = np.round(np.log((1 - beta) / alpha), 2) 
    
    # Per date and group: unique users and summed metric.
    df_grouped = df.groupby([date_col, group_col]).agg(
        users=(user_col, 'nunique'), 
        conversions=(metric_col, 'sum') 
    ).reset_index()

    # Running totals within each group.
    df_grouped["cum_users"] = df_grouped.groupby(group_col)["users"].cumsum()
    df_grouped["cum_conversions"] = df_grouped.groupby(group_col)["conversions"].cumsum()

    # Split into group 0 (A) and group 1 (B) with suffixed column names.
    df_A = df_grouped[df_grouped[group_col] == 0].drop(columns=[group_col]).rename(
        columns={"users": "users_A", "conversions": "conv_A", "cum_users": "cum_users_A", "cum_conversions": "cum_conv_A"}
    )
    df_B = df_grouped[df_grouped[group_col] == 1].drop(columns=[group_col]).rename(
        columns={"users": "users_B", "conversions": "conv_B", "cum_users": "cum_users_B", "cum_conversions": "cum_conv_B"}
    )

    # Dates present in only one group get 0 in the other group's columns,
    # including the cumulative ones.
    df_merged = pd.merge(df_A, df_B, on=date_col, how="outer").fillna(0)

    print("Колонки в df_merged:", df_merged.columns)

    p_values, llr_values = [], []
    stop_day = None

    for i in range(len(df_merged)):
        try:
            users_A, conv_A = df_merged.loc[i, ["cum_users_A", "cum_conv_A"]]
            users_B, conv_B = df_merged.loc[i, ["cum_users_B", "cum_conv_B"]]

            # Cumulative conversion rates; 0 when a group has no users yet.
            p_A = conv_A / users_A if users_A > 0 else 0
            p_B = conv_B / users_B if users_B > 0 else 0

            r = test_proportions_2indep(
                conv_A, users_A,
                conv_B, users_B,
                value=0,
                method='wald',
                compare='diff',
                alternative='two-sided',
                return_results=True
            )

            p_value = r.pvalue
            p_values.append(p_value)

            # Falls back to 0 when either rate is 0.
            llr = np.log(p_B / p_A) if p_B > 0 and p_A > 0 else 0
            llr_values.append(llr)

            # Stop at the first date where a bound is reached.
            if llr <= A:
                stop_day = df_merged.loc[i, date_col]
                print(f"On {stop_day} might be stopped: LLR={llr:.3f} <= {A:.3f} (Accept H0)")
                break
            elif llr >= B:
                stop_day = df_merged.loc[i, date_col]
                print(f"On {stop_day} might be stopped: LLR={llr:.3f} >= {B:.3f} (Accept H1)")
                break

        except Exception as e:
            # Any failure for a date is reported and recorded as NaN; the loop continues.
            print(f"⚠️ Ошибка на дне {df_merged.loc[i, date_col]}: {e}")
            p_values.append(np.nan)
            llr_values.append(np.nan)

    # Build the DataFrame with the results
    df_results = df_merged.iloc[:len(p_values)].copy()
    df_results["p_value"] = p_values
    df_results["LLR"] = llr_values
    df_results["A/B"] = str([A, B])
    df_results["alpha_threshold"] = np.linspace(alpha, alpha / np.sqrt(len(df_results)), len(p_values))  # Alpha correction

    # Visualise the results
    plt.figure(figsize=(12, 6))
    plt.plot(df_results[date_col], df_results["p_value"], label="P-value", marker="o")
    plt.plot(df_results[date_col], df_results["alpha_threshold"], label="Corrected Alpha", linestyle="dashed")
    plt.axhline(y=alpha, color="red", linestyle="--", label="Standard Alpha (0.05)")
    plt.xticks(rotation=45)
    plt.xlabel("Date")
    plt.ylabel("P-Value")
    plt.title("P-value daily vs. Corrected Alpha")
    plt.legend()
    plt.grid()
    plt.show()

    return df_results

def _criterion_for_metric(df, metric, city_label):
    """Run the z-test for ``has_ride`` or the t-test for any other metric and tag it with ``city_label``."""
    if metric == 'has_ride':
        result = expab.ztest_proportion(df, metric, 'group_name')
    else:
        result = expab.ttest(df, metric, 'group_name')
    result['city_name'] = city_label
    return result


def calculate_criteria(df):
    """Test ``has_ride``, ``gmv`` and ``rides`` between groups, overall and per city.

    ``has_ride`` is tested with ``expab.ztest_proportion``; ``gmv`` and
    ``rides`` with ``expab.ttest``; all are split by ``group_name``. The
    overall results come first (labelled ``'all the cities together'``),
    followed by the results for every distinct ``city_name``.

    Args:
        df: DataFrame with ``city_name``, ``group_name``, ``has_ride``,
            ``rides`` and ``gmv``.

    Returns:
        DataFrame with all test results plus ``city_name``,
        ``corrected_pvalue`` (output of ``expab.method_benjamini_hochberg``
        over every p-value in the result) and ``significance``
        (1 when p-value < 0.05).
    """
    frames = [
        _criterion_for_metric(df, metric, 'all the cities together')
        for metric in ('has_ride', 'gmv', 'rides')
    ]

    for city in df['city_name'].unique():
        # Boolean mask instead of an f-string query, so city names containing
        # an apostrophe no longer break the filter expression.
        in_city = df['city_name'] == city
        for metric in ('has_ride', 'rides', 'gmv'):
            frames.append(_criterion_for_metric(df[in_city], metric, city))

    # One concat at the end instead of growing the frame inside the loops.
    df_res_1 = pd.concat(frames)

    df_res_1['corrected_pvalue'] = expab.method_benjamini_hochberg(df_res_1['pvalue'].values)
    df_res_1['significance'] = (df_res_1['pvalue']<0.05)*1

    return df_res_1

def calculate_numbers(df):
    """Aggregate users, riders, rides and GMV per city and group, and add conversion rates.

    Output columns: ``group_name`` ('Control' for 0, 'Treatment' for 1),
    ``city_name``, ``user_id`` (row count), ``has_ride``, ``rides``, ``gmv``
    (sums), ``cr_ride_%`` (percentage as text, e.g. ``'50.0%'``) and
    ``cr_ride`` (share rounded to 5 decimals).

    NOTE(review): ``orders`` is selected but has no aggregation, so it must
    exist in ``df`` yet does not appear in the result — confirm whether it
    was meant to be summed.
    """
    selected = ['user_id', 'has_ride', 'rides', 'orders', 'gmv']
    aggregations = {
        'user_id': 'count',
        'has_ride': 'sum',
        'rides': 'sum',
        'gmv': 'sum',
    }

    summary = (
        df.groupby(['group_name', 'city_name'], as_index=False)[selected]
        .agg(aggregations)
        .sort_values(['city_name', 'group_name', 'user_id'], ascending=True)
    )

    group_labels = {'0': 'Control', '1': 'Treatment'}
    summary['group_name'] = summary['group_name'].astype(str)
    summary['group_name'] = summary['group_name'].replace(group_labels)

    percent = np.round(summary['has_ride'] / summary['user_id'] * 100, 2)
    summary['cr_ride_%'] = percent
    summary['cr_ride'] = np.round(summary['has_ride'] / summary['user_id'], 5)
    summary['cr_ride_%'] = summary['cr_ride_%'].astype(str) + '%'

    return summary

# Design

### Test in Colombia, Ecuador, Peru

In [16]:
# Per-user dataset for the verification funnel, loaded through read_bq.
# CTEs used by the query:
#   newbies   - (user_id, metric_date) for passengers with rides_count > 0 during the last year, country_id = 25
#   gmv       - per-user sums of gmv_clean_usd, rides_count and orders_count from 2025-04-01 to yesterday
#   total     - group assignment for test_id 3139
#               NOTE(review): no later CTE or the final SELECT appears to reference `total` - confirm it is still needed
#   liveness  - per-user latest timestamps of the verification show / click / approve / not-approve events
#               for the five listed city_ids
#   total_liv - liveness plus newbie_flag (1 when `newbies` has no metric_date before the show date)
#   rides     - orders with status RIDE_STATUS_DONE, accepted by a driver and not cancelled
# The final SELECT joins orders created from the show date up to 3 days later and keeps
# one row per user_id (the first one when ordered by order_timestamp).
# NOTE(review): the heading above mentions Colombia, Ecuador and Peru while the filters use a single country_id = 25 - confirm.
cr_to_ride = read_bq("""
WITH newbies AS (SELECT user_id,
                        metric_date
                 FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                 WHERE user_type = 'pass'
                   AND rides_count > 0
                   AND metric_date >= DATE_ADD(CURRENT_DATE(), INTERVAL -1 YEAR)
                   AND country_id = 25),
     gmv AS (SELECT user_id,
                    SUM(gmv_clean_usd) AS gmv,
                    SUM(rides_count)   AS rides,
                    SUM(orders_count)  AS orders
             FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
             WHERE user_type = 'pass'
               AND metric_date BETWEEN '2025-04-01' AND DATE_ADD(CURRENT_DATE(), INTERVAL -1 DAY)
               AND country_id = 25
             GROUP BY 1),
     total AS (SELECT t1.user_id,
                      t1.city_id,
                      geo.city_name,
                      geo.country_id,
                      geo.country_name,
                      IF(group_id = 4540557, 0, 1) AS group_id,
                      created_dt_part,
                      t2.metric_date,
                      CASE
                          WHEN t2.metric_date IS NULL THEN 1
                          ELSE 0
                          END                         newbie_flag
               FROM indriver-e6e40.ss_ab_platform_mart.markup_users t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                             ON
                                 t1.city_id = geo.city_id
                        LEFT JOIN newbies t2 ON t1.user_id = t2.user_id AND t2.metric_date < t1.created_dt_part
               WHERE test_id = 3139
               QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY metric_date DESC) = 1),
     liveness AS (SELECT user_id,
                         os_name,
                         city_id,
                         city_name,
                         country_id,
                         country_name,
                         COALESCE(filled_flow, 'liveness')                                  AS filled_flow,
                         COALESCE(MAX(IF(name = 'client.verification_start.show', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS show_dt,
                         COALESCE(MAX(IF(name = 'client.verification_start.click', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS click_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) = 'approve'), client_time, NULL))            AS approve_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) != 'approve'), client_time, NULL))           AS not_approve_dt
                  FROM (SELECT t1.user_id,
                               t1.name,
                               t1.os_name,
                               DATE(TIMESTAMP_MILLIS(t1.client_time))                  AS event_dt_part,
                               TIMESTAMP_MILLIS(t1.client_time)                        AS client_time,
                               t1.city_id,
                               t2.city_name,
                               t2.country_id,
                               t2.country_name,
                               JSON_EXTRACT_SCALAR(payload, '$.verification_flow')     AS verification_flow,
                               IF(JSON_EXTRACT_SCALAR(payload, '$.verification_flow') IS NULL,
                                  LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                      OVER (PARTITION BY t1.user_id ORDER BY client_time),
                                  JSON_EXTRACT_SCALAR(payload, '$.verification_flow')) AS filled_flow,
                               JSON_EXTRACT_SCALAR(payload, '$.status')                AS status
                        FROM (SELECT *
                              FROM indriver-e6e40.emart.product_event t1
                              WHERE 1 = 1
                                AND name IN (
                                             'client.verification_start.show',
                                             'client.verification_start.click',
                                             'client.verification_flow_result_status.show'
                                  )
                                AND event_dt_part BETWEEN '2025-04-01' AND DATE_ADD(CURRENT_DATE(), INTERVAL -1 DAY)
                                AND city_id IN (4231, 4242, 4226, 4255, 4278)
                              QUALIFY
                                  ROW_NUMBER() OVER (PARTITION BY user_id, name, os_name, event_dt_part, JSON_EXTRACT_SCALAR(payload, '$.verification_flow') ORDER BY client_time DESC) =
                                  1) t1
                                 JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                      ON
                                          t1.city_id = t2.city_id
                        WHERE 1 = 1
                          AND name IN (
                                       'client.verification_start.show', 'client.verification_start.click',
                                       'client.verification_flow_result_status.show'
                            ))
                  GROUP BY 1, 2, 3, 4, 5, 6, 7),
     total_liv AS (SELECT t1.user_id,
                          CASE
                              WHEN t2.metric_date IS NULL THEN 1
                              ELSE 0
                              END newbie_flag,
                          t1.os_name,
                          t1.city_id,
                          t1.city_name,
                          t1.country_id,
                          t1.country_name,
                          t1.filled_flow,
                          t1.show_dt,
                          t1.click_dt,
                          t1.approve_dt,
                          t1.not_approve_dt
                   FROM liveness t1
                            LEFT JOIN newbies t2 ON t1.user_id = t2.user_id AND t2.metric_date < DATE(t1.show_dt)
                   QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY t2.metric_date DESC) = 1),
     rides AS (SELECT order_uuid,
                      user_id    AS pass_id,
                      driver_id,
                      city_id    AS order_city_id,
                      country_id AS order_country_id,
                      status_order,
                      order_timestamp,
                      at_pickup_dttm,
                      departed_pickup_dttm,
                      at_destination_dttm,
                      departed_destination_dttm,
                      driveraccept_timestamp,
                      driverarrived_timestamp,
                      driverstarttheride_timestamp,
                      driverdone_timestamp,
                      clientdone_timestamp,
                      clientcancel_timestamp,
                      drivercancel_timestamp,
                      user_reg_date,
                      driver_reg_date,
                      stage,
                      created_date_order_part,
                      duration_in_seconds
               FROM indriver-e6e40.imart.incity_detail_new_order
               WHERE created_date_order_part BETWEEN '2025-04-01'
                   AND DATE_ADD(CURRENT_DATE(), INTERVAL -1 DAY)
                 AND status_order = 'RIDE_STATUS_DONE'
                 AND driveraccept_timestamp IS NOT NULL
                 AND (clientcancel_timestamp IS NULL
                   AND drivercancel_timestamp IS NULL))
SELECT t1.user_id,
       t1.os_name,
       t1.city_id,
       t1.city_name,
       t1.country_id,
       t1.country_name,
       t1.newbie_flag,
       filled_flow,
       show_dt,
       click_dt,
       COALESCE(approve_dt, t2.order_timestamp)                    AS approve_dt,
       IF(approve_dt IS NOT NULL, 1, 0)                            AS approve_flag,
       not_approve_dt,
       IF(not_approve_dt IS NOT NULL AND approve_dt IS NULL, 1, 0) AS not_approve_flag,
       t2.order_timestamp,
       IF(t2.order_timestamp IS NOT NULL, 1, 0)                    AS order_flag,
       t3.gmv,
       t3.rides,
       t3.orders
FROM total_liv t1
         LEFT JOIN rides t2
                   ON t1.user_id = t2.pass_id AND
                      t2.created_date_order_part BETWEEN DATE(show_dt) AND DATE_ADD(DATE(show_dt), INTERVAL +3 DAY)
         LEFT JOIN gmv t3 ON t1.user_id = t3.user_id
QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY t2.order_timestamp) = 1
""")

# CR to order upon check

cr_to_order = read_bq("""
WITH newbies AS (SELECT user_id,
                        metric_date
                 FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                 WHERE user_type = 'pass'
                   AND rides_count > 0
                   AND metric_date >= DATE_ADD(CURRENT_DATE(), INTERVAL -1 YEAR)
                   AND country_id = 25),
     gmv AS (SELECT user_id,
                    SUM(gmv_clean_usd) AS gmv,
                    SUM(rides_count)   AS rides,
                    SUM(orders_count)  AS orders
             FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
             WHERE user_type = 'pass'
               AND metric_date BETWEEN '2025-04-01' AND DATE_ADD(CURRENT_DATE(), INTERVAL -1 DAY)
               AND country_id = 25
             GROUP BY 1),
     total AS (SELECT t1.user_id,
                      t1.city_id,
                      geo.city_name,
                      geo.country_id,
                      geo.country_name,
                      IF(group_id = 4540557, 0, 1) AS group_id,
                      created_dt_part,
                      t2.metric_date,
                      CASE
                          WHEN t2.metric_date IS NULL THEN 1
                          ELSE 0
                          END                         newbie_flag
               FROM indriver-e6e40.ss_ab_platform_mart.markup_users t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                             ON
                                 t1.city_id = geo.city_id
                        LEFT JOIN newbies t2 ON t1.user_id = t2.user_id AND t2.metric_date < t1.created_dt_part
               WHERE test_id = 3139
               QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY metric_date DESC) = 1),
     liveness AS (SELECT user_id,
                         os_name,
                         city_id,
                         city_name,
                         country_id,
                         country_name,
                         COALESCE(filled_flow, 'liveness')                                  AS filled_flow,
                         COALESCE(MAX(IF(name = 'client.verification_start.show', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS show_dt,
                         COALESCE(MAX(IF(name = 'client.verification_start.click', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS click_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) = 'approve'), client_time, NULL))            AS approve_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) != 'approve'), client_time, NULL))           AS not_approve_dt
                  FROM (SELECT t1.user_id,
                               t1.name,
                               t1.os_name,
                               DATE(TIMESTAMP_MILLIS(t1.client_time))                  AS event_dt_part,
                               TIMESTAMP_MILLIS(t1.client_time)                        AS client_time,
                               t1.city_id,
                               t2.city_name,
                               t2.country_id,
                               t2.country_name,
                               JSON_EXTRACT_SCALAR(payload, '$.verification_flow')     AS verification_flow,
                               IF(JSON_EXTRACT_SCALAR(payload, '$.verification_flow') IS NULL,
                                  LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                      OVER (PARTITION BY t1.user_id ORDER BY client_time),
                                  JSON_EXTRACT_SCALAR(payload, '$.verification_flow')) AS filled_flow,
                               JSON_EXTRACT_SCALAR(payload, '$.status')                AS status
                        FROM (SELECT *
                              FROM indriver-e6e40.emart.product_event t1
                              WHERE 1 = 1
                                AND name IN (
                                             'client.verification_start.show',
                                             'client.verification_start.click',
                                             'client.verification_flow_result_status.show'
                                  )
                                AND event_dt_part BETWEEN '2025-04-01' AND DATE_ADD(CURRENT_DATE(), INTERVAL -1 DAY)
                                AND city_id IN (4231, 4242, 4226, 4255, 4278)
                              QUALIFY
                                  ROW_NUMBER() OVER (PARTITION BY user_id, name, os_name, event_dt_part, JSON_EXTRACT_SCALAR(payload, '$.verification_flow') ORDER BY client_time DESC) =
                                  1) t1
                                 JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                      ON
                                          t1.city_id = t2.city_id
                        WHERE 1 = 1
                          AND name IN (
                                       'client.verification_start.show', 'client.verification_start.click',
                                       'client.verification_flow_result_status.show'
                            ))
                  GROUP BY 1, 2, 3, 4, 5, 6, 7),
     total_liv AS (SELECT t1.user_id,
                          CASE
                              WHEN t2.metric_date IS NULL THEN 1
                              ELSE 0
                              END newbie_flag,
                          t1.os_name,
                          t1.city_id,
                          t1.city_name,
                          t1.country_id,
                          t1.country_name,
                          t1.filled_flow,
                          t1.show_dt,
                          t1.click_dt,
                          t1.approve_dt,
                          t1.not_approve_dt
                   FROM liveness t1
                            LEFT JOIN newbies t2 ON t1.user_id = t2.user_id AND t2.metric_date < DATE(t1.show_dt)
                   QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY t2.metric_date DESC) = 1),
     rides AS (SELECT order_uuid,
                      user_id    AS pass_id,
                      driver_id,
                      city_id    AS order_city_id,
                      country_id AS order_country_id,
                      status_order,
                      order_timestamp,
                      at_pickup_dttm,
                      departed_pickup_dttm,
                      at_destination_dttm,
                      departed_destination_dttm,
                      driveraccept_timestamp,
                      driverarrived_timestamp,
                      driverstarttheride_timestamp,
                      driverdone_timestamp,
                      clientdone_timestamp,
                      clientcancel_timestamp,
                      drivercancel_timestamp,
                      user_reg_date,
                      driver_reg_date,
                      stage,
                      created_date_order_part,
                      duration_in_seconds
               FROM indriver-e6e40.imart.incity_detail_new_order
               WHERE created_date_order_part BETWEEN '2025-04-01'
                         AND DATE_ADD(CURRENT_DATE(), INTERVAL -1 DAY))
SELECT t1.user_id,
       t1.os_name,
       t1.city_id,
       t1.city_name,
       t1.country_id,
       t1.country_name,
       t1.newbie_flag,
       filled_flow,
       show_dt,
       click_dt,
       COALESCE(approve_dt, t2.order_timestamp)                    AS approve_dt,
       IF(approve_dt IS NOT NULL, 1, 0)                            AS approve_flag,
       not_approve_dt,
       IF(not_approve_dt IS NOT NULL AND approve_dt IS NULL, 1, 0) AS not_approve_flag,
       t2.order_timestamp,
       IF(t2.order_timestamp IS NOT NULL, 1, 0)                    AS order_flag,
       t3.gmv,
       t3.rides,
       t3.orders
FROM total_liv t1
         LEFT JOIN rides t2
                   ON t1.user_id = t2.pass_id AND
                      t2.created_date_order_part BETWEEN DATE(show_dt) AND DATE_ADD(DATE(show_dt), INTERVAL +3 DAY)
         LEFT JOIN gmv t3 ON t1.user_id = t3.user_id
QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY t2.order_timestamp) = 1
""")



In [18]:
# Traffic sizing input: distinct users who fired 'client.verification_start.click',
# counted per event name / day / city for the five listed city_ids, from 2025-04-01 onwards.
# The `weekly` and `monthly` columns are the event date truncated to week / month.
# NOTE(review): the date filter has no upper bound, so the current day's partition is included
# if it exists — confirm whether a partial day is acceptable for the averages computed later.
# NOTE(review): `users` is a per-day, per-city distinct count; summing it over a week or month
# counts a user once per active day/city, not once per period — confirm this is intended.
df_sample_size_id = read_bq("""
  SELECT name,
       event_dt_part,
       DATE_TRUNC(event_dt_part, WEEK)  AS weekly,
       DATE_TRUNC(event_dt_part, MONTH) AS monthly,
       t1.city_id,
       t2.city_name,
       t2.country_id,
       t2.country_name,
       COUNT(DISTINCT user_id)          AS users
FROM indriver-e6e40.ods_event_tracker.event t1
         JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
              ON
                  t1.city_id = t2.city_id
WHERE 1 = 1
  AND name IN (
    'client.verification_start.click'
    )
  AND event_dt_part >= '2025-04-01'
  AND t1.city_id IN
      (4231, 4242, 4226, 4255, 4278)
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8
""")

# Convert the event date column to pandas datetimes.
df_sample_size_id['event_dt_part'] = pd.to_datetime(df_sample_size_id['event_dt_part'])

In [17]:
# Independent copy of the rows whose newbie_flag equals 1.
cr_to_ride_newbies = cr_to_ride.loc[cr_to_ride['newbie_flag'] == 1].copy()


In [None]:
# Share of rows with each flag set (mean of the 0/1 flag columns) among newbies.
cr_to_ride_newbies.loc[:, ['approve_flag', 'not_approve_flag', 'order_flag']].mean()


approve_flag       0.82
not_approve_flag   0.01
order_flag         0.57
dtype: Float64

In [15]:

# Average users per day / ISO-truncated week / month: sum `users` within each period,
# average across periods, round, then halve.
# NOTE(review): the 0.5 factor presumably converts total traffic into the size of one
# of two equal groups (see the `sizes` comment below) — confirm.
daily, weekly, month = (
    df_sample_size_id.groupby([period_col], as_index=False)['users'].sum()['users'].mean().round(0) * 0.5
    for period_col in ('event_dt_part', 'weekly', 'monthly')
)

# Candidate effects, written as multipliers of the baseline
# (presumably 1.01 means a +1% relative lift — confirm).
effects = [1.01, 1.015, 1.05, 1.1]
# Size of each group: one day, one week, two weeks, one month of traffic.
sizes = [int(daily), int(weekly), int(weekly) * 2, int(month)]
first_type_errors = [0.01, 0.05]   # alpha levels
second_type_errors = [0.1, 0.2]    # beta levels


def calculate_mde_unequal_split(
    baseline_conversion_rate: float,
    alpha: float = 0.05,
    power: float = 0.8,
    n_total: int = None,
    allocation_ratio: tuple = (0.5, 0.5)  # (control share, test share), e.g. (0.7, 0.3)
) -> dict:
    """
    Compute the minimum detectable effect (MDE) of a two-group A/B test on a
    conversion rate when the groups may have unequal sizes.

    The detectable effect size (Cohen's h) is solved with
    ``NormalIndPower.solve_power`` for the given alpha, power and group sizes,
    then converted back into the test-group conversion rate for an *increase*
    over the baseline.

    Args:
        baseline_conversion_rate (float): Control-group conversion, strictly between 0 and 1.
        alpha (float): Significance level (type I error), strictly between 0 and 1.
        power (float): Statistical power (1 - type II error), strictly between 0 and 1.
        n_total (int): Total sample size across both groups; must be at least 2
            so that each group gets at least one observation.
        allocation_ratio (tuple): ``(control share, test share)``; exactly two
            positive numbers that sum to 1.0.

    Returns:
        dict: On success, the inputs plus ``n_control``, ``n_variant``,
        ``effect_size_cohen_h``, ``detectable_conversion_rate_variant``,
        ``mde_absolute`` and ``mde_relative_percentage``.
        If ``n_total`` is None, a dict with a single ``"error"`` key is
        returned instead (callers test for that key).

    Raises:
        ValueError: If any argument is outside the ranges described above.
    """
    if n_total is None:
        # Soft error on purpose: analyze_mde_scenarios checks for the "error" key.
        return {"error": "Для расчета MDE необходимо указать общий размер выборки (n_total)."}

    if not (0 < baseline_conversion_rate < 1):
        raise ValueError("Базовая конверсия должна быть между 0 и 1.")
    if not (0 < alpha < 1):
        raise ValueError("Уровень значимости (альфа) должен быть между 0 и 1.")
    if not (0 < power < 1):
        raise ValueError("Мощность должна быть между 0 и 1.")
    # Only the first two shares are ever used, so anything but a pair is a caller error.
    if not (
        len(allocation_ratio) == 2
        and np.isclose(sum(allocation_ratio), 1.0)
        and all(r > 0 for r in allocation_ratio)
    ):
        raise ValueError("Соотношение распределения должно быть кортежем положительных чисел, сумма которых равна 1.0.")
    # With fewer than two observations one group would be empty (nobs1 == 0 or a
    # negative size), which makes the power calculation meaningless.
    if n_total < 2:
        raise ValueError("Общий размер выборки (n_total) должен быть не меньше 2.")

    control_share, variant_share = allocation_ratio

    # Split the total according to the shares; ceil() can overshoot by one.
    n_control = ceil(n_total * control_share)
    n_variant = ceil(n_total * variant_share)

    # Take the overshoot out of the larger group so the sizes add up to n_total.
    if n_control + n_variant > n_total:
        if control_share > variant_share:
            n_control = n_total - n_variant
        else:
            n_variant = n_total - n_control

    # Safety net for float rounding at tiny n_total: never leave a group empty.
    if n_control == 0:
        n_control = 1
        n_variant = n_total - 1
    if n_variant == 0:
        n_variant = 1
        n_control = n_total - 1

    # NormalIndPower takes nobs1 (first group) and ratio = nobs2 / nobs1;
    # control is used as group 1 and the test group as group 2.
    ratio_nobs = n_variant / n_control

    power_calculator = NormalIndPower()

    # With effect_size=None, solve_power solves for the effect size (Cohen's h).
    effect_size_cohen_h = power_calculator.solve_power(
        effect_size=None,
        nobs1=n_control,
        alpha=alpha,
        power=power,
        ratio=ratio_nobs,
        alternative='two-sided'  # two-sided test: detects both increases and decreases
    )

    # Invert h = 2*arcsin(sqrt(p2)) - 2*arcsin(sqrt(p1)) for p2 (increase over p1).
    arcsin_sqrt_p1 = np.arcsin(np.sqrt(baseline_conversion_rate))
    # arcsin(sqrt(p)) lives in [0, pi/2]; past pi/2, sin()**2 would fold back
    # below 1 and understate the MDE, so the angle is clamped (i.e. p2 saturates at 1).
    arcsin_sqrt_p2 = min(arcsin_sqrt_p1 + (effect_size_cohen_h / 2), np.pi / 2)
    detectable_conversion_rate_variant = np.sin(arcsin_sqrt_p2) ** 2

    # Absolute MDE (difference of conversion rates).
    mde_absolute = detectable_conversion_rate_variant - baseline_conversion_rate

    # Relative MDE in percent of the baseline (baseline > 0 is guaranteed above).
    mde_relative_percentage = (mde_absolute / baseline_conversion_rate) * 100

    return {
        "baseline_conversion_rate": baseline_conversion_rate,
        "alpha": alpha,
        "power": power,
        "n_total": n_total,
        "n_control": int(n_control),
        "n_variant": int(n_variant),
        "allocation_ratio": allocation_ratio,
        "effect_size_cohen_h": effect_size_cohen_h,
        "detectable_conversion_rate_variant": detectable_conversion_rate_variant,
        "mde_absolute": mde_absolute,
        "mde_relative_percentage": mde_relative_percentage
    }

def analyze_mde_scenarios(
    baseline_conversion_rate: float,
    alpha_levels: list[float],
    power_levels: list[float],
    n_total_levels: list[int],
    allocation_ratio: tuple = (0.5, 0.5)
) -> pd.DataFrame:
    """
    Tabulate the MDE for every combination of sample size, alpha and power.

    Each scenario is evaluated with ``calculate_mde_unequal_split``. Scenarios
    for which it returns an ``"error"`` dict are printed and skipped.

    Args:
        baseline_conversion_rate (float): Control-group conversion (between 0 and 1).
        alpha_levels (list[float]): Significance levels to evaluate.
        power_levels (list[float]): Power levels to evaluate.
        n_total_levels (list[int]): Total sample sizes to evaluate.
        allocation_ratio (tuple): ``(control share, test share)``.

    Returns:
        pd.DataFrame: One row per successful scenario, ordered by sample size,
        then alpha, then power, with the columns ``"Errors (α, β)"``,
        ``"N_Total"``, ``"Allocation_Ratio"``, ``"Group sizes"``,
        ``"MDE_Absolute"`` and ``"MDE_Relative_Percentage"``. If no scenario
        succeeds (e.g. one of the level lists is empty) an empty DataFrame
        with these columns is returned.
    """
    final_columns = [
        "Errors (α, β)",
        "N_Total",
        "Allocation_Ratio",
        "Group sizes",
        "MDE_Absolute",
        "MDE_Relative_Percentage"
    ]

    rows = []
    for n_total in n_total_levels:
        for alpha in alpha_levels:
            for power in power_levels:
                mde_result = calculate_mde_unequal_split(
                    baseline_conversion_rate=baseline_conversion_rate,
                    alpha=alpha,
                    power=power,
                    n_total=n_total,
                    allocation_ratio=allocation_ratio
                )

                if "error" in mde_result:
                    print(f"Ошибка для N_Total={n_total}, Alpha={alpha}, Power={power}: {mde_result['error']}")
                    continue

                beta = 1 - power  # type II error
                rows.append({
                    # Human-readable labels are built here directly rather than
                    # with a row-wise DataFrame.apply afterwards.
                    "Errors (α, β)": f"({alpha}; {beta:.1f})",
                    "N_Total": n_total,
                    "Allocation_Ratio": allocation_ratio,
                    "Group sizes": f"Control: {mde_result['n_control']}; Test: {mde_result['n_variant']}",
                    "MDE_Absolute": mde_result["mde_absolute"],
                    "MDE_Relative_Percentage": mde_result["mde_relative_percentage"]
                })

    # Passing `columns` fixes the column order and keeps the schema when `rows` is empty.
    return pd.DataFrame(rows, columns=final_columns)

# Scenario grid for the MDE table.
# NOTE(review): 0.57 is hard-coded; it equals the rounded order_flag mean printed for
# newbies earlier in the notebook — confirm that is the intended baseline.
baseline_cr = 0.57
alpha_levels_to_test = [0.01, 0.05]
power_levels_to_test = [0.8]
# One week, two weeks and one month of traffic, passed as *total* sample sizes.
# NOTE(review): `weekly` and `month` were already multiplied by 0.5 when computed —
# confirm they are meant to be totals here rather than per-group sizes.
n_total_levels_to_test = [int(weekly), 2 * int(weekly), int(month)]
allocation_ratio_to_test = (0.9, 0.1)  # (control share, test share)

mde_analysis_df = analyze_mde_scenarios(
    baseline_cr,
    alpha_levels_to_test,
    power_levels_to_test,
    n_total_levels_to_test,
    allocation_ratio_to_test,
)

mde_analysis_df



Unnamed: 0,"Errors (α, β)",N_Total,Allocation_Ratio,Group sizes,MDE_Absolute,MDE_Relative_Percentage
0,(0.01; 0.2),44784,"(0.9, 0.1)",Control: 40305; Test: 4479,0.03,4.65
1,(0.05; 0.2),44784,"(0.9, 0.1)",Control: 40305; Test: 4479,0.02,3.82
2,(0.01; 0.2),89568,"(0.9, 0.1)",Control: 80611; Test: 8957,0.02,3.3
3,(0.05; 0.2),89568,"(0.9, 0.1)",Control: 80611; Test: 8957,0.02,2.7
4,(0.01; 0.2),179139,"(0.9, 0.1)",Control: 161225; Test: 17914,0.01,2.33
5,(0.05; 0.2),179139,"(0.9, 0.1)",Control: 161225; Test: 17914,0.01,1.91


# Monitoring

In [None]:
# Shared query for the per-country liveness-verification monitoring tables.
# Placeholders (filled by _load_liveness_monitoring via str.format):
#   {country_id}        - country filter for newbies / gmv / product events
#   {test_id}           - A/B test id in markup_users
#   {control_group_id}  - raw group_id that is mapped to group 0 (every other id -> 1)
#   {gmv_start_date}    - first metric_date included in the gmv / rides / orders totals
#   {start_date}        - first date for verification events and completed rides
# CTE overview:
#   newbies   - passenger metric rows with rides_count > 0 within the last year
#   gmv       - per-user sums of gmv_clean_usd, rides_count, orders_count
#   total     - test participants with group flag and newbie_flag
#               (1 when no `newbies` row precedes created_dt_part)
#   liveness  - latest show / click / approve / not-approve timestamps per user and flow
#   total_liv - participants joined to liveness rows shown on/after created_dt_part
#   rides     - completed, non-cancelled orders since {start_date}
# The final SELECT keeps one row per user: the earliest matching ride, if any.
_LIVENESS_MONITORING_SQL = """
WITH newbies AS (SELECT user_id,
                        metric_date
                 FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                 WHERE user_type = 'pass'
                   AND rides_count > 0
                   AND metric_date >= DATE_ADD(CURRENT_DATE(), INTERVAL -1 YEAR)
                   AND country_id = {country_id}),
     gmv AS (SELECT user_id,
                    SUM(gmv_clean_usd) AS gmv,
                    SUM(rides_count)   AS rides,
                    SUM(orders_count)  AS orders
             FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
             WHERE user_type = 'pass'
               AND metric_date BETWEEN '{gmv_start_date}' AND CURRENT_DATE()
               AND country_id = {country_id}
             GROUP BY 1),
     total AS (SELECT t1.user_id,
                      t1.city_id,
                      geo.city_name,
                      geo.country_id,
                      geo.country_name,
                      IF(group_id = {control_group_id}, 0, 1) AS group_id,
                      created_dt_part,
                      t2.metric_date,
                      CASE
                          WHEN t2.metric_date IS NULL THEN 1
                          ELSE 0
                          END                         newbie_flag
               FROM indriver-e6e40.ss_ab_platform_mart.markup_users t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                             ON
                                 t1.city_id = geo.city_id
                        LEFT JOIN newbies t2 ON t1.user_id = t2.user_id AND t2.metric_date < t1.created_dt_part
               WHERE test_id = {test_id}
               QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY metric_date DESC) = 1),
     liveness AS (SELECT user_id,
                         os_name,
                         city_id,
                         city_name,
                         country_id,
                         country_name,
                         COALESCE(filled_flow, 'liveness')                                  AS filled_flow,
                         COALESCE(MAX(IF(name = 'client.verification_start.show', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS show_dt,
                         COALESCE(MAX(IF(name = 'client.verification_start.click', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS click_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) = 'approve'), client_time, NULL))            AS approve_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) != 'approve'), client_time, NULL))           AS not_approve_dt
                  FROM (SELECT t1.user_id,
                               t1.name,
                               t1.os_name,
                               DATE(TIMESTAMP_MILLIS(t1.client_time))                  AS event_dt_part,
                               TIMESTAMP_MILLIS(t1.client_time)                        AS client_time,
                               t1.city_id,
                               t2.city_name,
                               t2.country_id,
                               t2.country_name,
                               JSON_EXTRACT_SCALAR(payload, '$.verification_flow')     AS verification_flow,
                               IF(JSON_EXTRACT_SCALAR(payload, '$.verification_flow') IS NULL,
                                  LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                      OVER (PARTITION BY t1.user_id ORDER BY client_time),
                                  JSON_EXTRACT_SCALAR(payload, '$.verification_flow')) AS filled_flow,
                               JSON_EXTRACT_SCALAR(payload, '$.status')                AS status
                        FROM (SELECT *
                              FROM indriver-e6e40.emart.product_event t1
                              WHERE 1 = 1
                                AND name IN (
                                             'client.verification_start.show',
                                             'client.verification_start.click',
                                             'client.verification_flow_result_status.show'
                                  )
                                AND event_dt_part BETWEEN '{start_date}' AND CURRENT_DATE()
                                AND country_id IN ({country_id})
                              QUALIFY
                                  ROW_NUMBER() OVER (PARTITION BY user_id, name, os_name, event_dt_part, JSON_EXTRACT_SCALAR(payload, '$.verification_flow') ORDER BY client_time DESC) =
                                  1) t1
                                 JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                      ON
                                          t1.city_id = t2.city_id
                        WHERE 1 = 1
                          AND name IN (
                                       'client.verification_start.show', 'client.verification_start.click',
                                       'client.verification_flow_result_status.show'
                            ))
                  GROUP BY 1, 2, 3, 4, 5, 6, 7),
     total_liv AS (SELECT t1.user_id,
                          t1.city_id,
                          t1.city_name,
                          t1.country_id,
                          t1.country_name,
                          t1.group_id,
                          t1.created_dt_part,
                          t1.newbie_flag,
                          t2.user_id AS user_with_svf,
                          t2.os_name,
                          t2.filled_flow,
                          t2.show_dt,
                          t2.click_dt,
                          t2.approve_dt,
                          t2.not_approve_dt
                   FROM total t1
                            JOIN liveness t2
                                 ON t1.user_id = t2.user_id AND DATE(t2.show_dt) >= t1.created_dt_part),
     rides AS (SELECT order_uuid,
                      user_id    AS pass_id,
                      driver_id,
                      city_id    AS order_city_id,
                      country_id AS order_country_id,
                      status_order,
                      order_timestamp,
                      at_pickup_dttm,
                      departed_pickup_dttm,
                      at_destination_dttm,
                      departed_destination_dttm,
                      driveraccept_timestamp,
                      driverarrived_timestamp,
                      driverstarttheride_timestamp,
                      driverdone_timestamp,
                      clientdone_timestamp,
                      clientcancel_timestamp,
                      drivercancel_timestamp,
                      user_reg_date,
                      driver_reg_date,
                      stage,
                      created_date_order_part,
                      duration_in_seconds
               FROM indriver-e6e40.imart.incity_detail_new_order
               WHERE created_date_order_part BETWEEN '{start_date}'
                   AND CURRENT_DATE()
                 AND status_order = 'RIDE_STATUS_DONE'
                 AND driveraccept_timestamp IS NOT NULL
                 AND (clientcancel_timestamp IS NULL
                   AND drivercancel_timestamp IS NULL))
SELECT t1.user_id,
       t1.group_id,
       t1.created_dt_part,
       t1.os_name,
       t1.city_id,
       t1.city_name,
       t1.country_id,
       t1.country_name,
       t1.newbie_flag,
       filled_flow,
       show_dt,
       click_dt,
       approve_dt                                                  AS approve_dt,
       IF(approve_dt IS NOT NULL, 1, 0)                            AS approve_flag,
       not_approve_dt,
       IF(not_approve_dt IS NOT NULL AND approve_dt IS NULL, 1, 0) AS not_approve_flag,
       t2.order_timestamp,
       IF(t2.order_timestamp IS NOT NULL, 1, 0)                    AS order_flag,
       t3.gmv,
       t3.rides,
       t3.orders
FROM total_liv t1
         LEFT JOIN rides t2
                   ON t1.user_id = t2.pass_id AND
                      t2.created_date_order_part >= created_dt_part
         LEFT JOIN gmv t3 ON t1.user_id = t3.user_id
QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY t2.order_timestamp) = 1
"""


def _load_liveness_monitoring(country_id, test_id, control_group_id,
                              gmv_start_date, start_date='2025-05-01'):
    """Run the liveness monitoring query for one country / A-B test and return its DataFrame.

    Args:
        country_id: Country filter applied to the newbies, gmv and product-event sources.
        test_id: Test id used to select participants from markup_users.
        control_group_id: Raw group_id mapped to group 0; every other id becomes 1.
        gmv_start_date: 'YYYY-MM-DD' lower bound for the gmv / rides / orders totals.
        start_date: 'YYYY-MM-DD' lower bound for verification events and rides.

    Returns:
        The DataFrame produced by ``read_bq`` for the rendered query.

    Raises:
        ValueError / TypeError: If an id cannot be converted with ``int()``.
    """
    query = _LIVENESS_MONITORING_SQL.format(
        # int() guarantees that only numeric literals are spliced into the SQL text.
        country_id=int(country_id),
        test_id=int(test_id),
        control_group_id=int(control_group_id),
        # Dates are trusted notebook constants; they are inserted as quoted literals.
        gmv_start_date=gmv_start_date,
        start_date=start_date,
    )
    return read_bq(query)


# country_id 24 with test 3206 (Peru, per the variable name).
# NOTE(review): the gmv window starts on 2025-04-01 here but on 2025-05-01 for Colombia and
# for every other date filter in this query — kept as in the original; confirm it is intended.
df_peru = _load_liveness_monitoring(
    country_id=24,
    test_id=3206,
    control_group_id=4541490,
    gmv_start_date='2025-04-01',
)

# country_id 22 with test 3223 (Colombia, per the variable name).
df_colombia = _load_liveness_monitoring(
    country_id=22,
    test_id=3223,
    control_group_id=4541552,
    gmv_start_date='2025-05-01',
)

In [35]:
# Verification funnel for Peru: per experiment group and newbie flag, count the users
# that reached each step (only users with a recorded show event are kept).
df_grouped = (
    df_peru[df_peru['show_dt'].notna()]
    .groupby(['group_id', 'newbie_flag'], as_index=False)
    .agg(
        show_dt=('show_dt', 'count'),
        click_dt=('click_dt', 'count'),
        approve_dt=('approve_dt', 'count'),
        not_approve_dt=('not_approve_dt', 'count'),
        order_flag=('order_flag', 'sum'),
    )
)

# Conversion rates in percent: approvals and rejections relative to clicks,
# users with an order relative to approvals.
df_grouped['cr_to_approve'] = df_grouped['approve_dt'].div(df_grouped['click_dt']).mul(100)
df_grouped['cr_to_ride'] = df_grouped['order_flag'].div(df_grouped['approve_dt']).mul(100)
df_grouped['cr_to_not_approve'] = df_grouped['not_approve_dt'].div(df_grouped['click_dt']).mul(100)

df_grouped

Unnamed: 0,group_id,newbie_flag,show_dt,click_dt,approve_dt,not_approve_dt,order_flag,cr_to_approve,cr_to_ride,cr_to_not_approve
0,0,0,722,718,655,9,646,91.23,98.63,1.25
1,0,1,1704,1664,1408,5,915,84.62,64.99,0.3
2,1,0,80,80,74,1,74,92.5,100.0,1.25
3,1,1,213,202,128,6,83,63.37,64.84,2.97


In [65]:
# Peru newbies (newbie_flag = 1): cr_to_not_approve of group 1 (2.97) minus group 0 (0.3),
# i.e. the difference in percentage points taken from the table above.
2.97-0.3

2.6700000000000004

In [36]:
# Verification funnel for Colombia: per experiment group and newbie flag, count the users
# that reached each step (only users with a recorded show event are kept).
df_grouped = (
    df_colombia[df_colombia['show_dt'].notna()]
    .groupby(['group_id', 'newbie_flag'], as_index=False)
    .agg(
        show_dt=('show_dt', 'count'),
        click_dt=('click_dt', 'count'),
        approve_dt=('approve_dt', 'count'),
        not_approve_dt=('not_approve_dt', 'count'),
        order_flag=('order_flag', 'sum'),
    )
)

# Conversion rates in percent: approvals and rejections relative to clicks,
# users with an order relative to approvals.
df_grouped['cr_to_approve'] = df_grouped['approve_dt'].div(df_grouped['click_dt']).mul(100)
df_grouped['cr_to_ride'] = df_grouped['order_flag'].div(df_grouped['approve_dt']).mul(100)
df_grouped['cr_to_not_approve'] = df_grouped['not_approve_dt'].div(df_grouped['click_dt']).mul(100)

df_grouped

Unnamed: 0,group_id,newbie_flag,show_dt,click_dt,approve_dt,not_approve_dt,order_flag,cr_to_approve,cr_to_ride,cr_to_not_approve
0,0,0,54594,53750,41336,1686,30596,76.9,74.02,3.14
1,0,1,14704,14228,11620,116,7577,81.67,65.21,0.82
2,1,0,5950,5821,3162,1556,2098,54.32,66.35,26.73
3,1,1,2133,2045,626,525,433,30.61,69.17,25.67


# Summarising

In [41]:
# Missing rides / gmv values are replaced with 0 so these users count as zeros in the tests below.
df_peru[['rides', 'gmv']] = df_peru[['rides', 'gmv']].fillna(0)
df_peru.head()

Unnamed: 0,user_id,group_id,created_dt_part,os_name,city_id,city_name,country_id,country_name,newbie_flag,filled_flow,show_dt,click_dt,approve_dt,approve_flag,not_approve_dt,not_approve_flag,order_timestamp,order_flag,gmv,rides,orders
0,12587106,1,2025-05-28,ios,4272,Chiclayo,24,Peru,0,liveness,2025-05-31 14:18:52.214000+00:00,2025-05-31 14:18:57.129000+00:00,2025-05-31 14:19:44.223000+00:00,1,NaT,0,2025-05-28 01:46:07+00:00,1,72.59,33,54.0
1,13837432,0,2025-06-01,ios,4272,Chiclayo,24,Peru,0,liveness,2025-06-02 08:56:49.763000+00:00,2025-06-02 08:56:51.131000+00:00,2025-06-02 08:57:22.166000+00:00,1,NaT,0,2025-06-01 00:38:49+00:00,1,101.5,42,53.0
2,148377114,0,2025-06-01,android,4272,Chiclayo,24,Peru,1,liveness,2025-06-01 08:22:31.072000+00:00,2025-06-01 08:22:32.725000+00:00,2025-06-01 08:23:14.108000+00:00,1,NaT,0,2025-06-01 08:23:17+00:00,1,3.29,1,1.0
3,169355421,0,2025-05-28,ios,4272,Chiclayo,24,Peru,0,liveness,2025-06-01 23:23:14.911000+00:00,2025-06-01 23:23:16.460000+00:00,2025-06-01 23:23:49.105000+00:00,1,NaT,0,2025-06-01 18:19:39+00:00,1,15.86,8,10.0
4,169896480,0,2025-06-02,android,4272,Chiclayo,24,Peru,1,liveness,2025-06-02 20:23:53.598000+00:00,2025-06-02 20:24:07.055000+00:00,NaT,0,NaT,0,NaT,0,0.0,0,


In [42]:
# Missing rides / gmv values are replaced with 0 so these users count as zeros in the tests below.
df_colombia[['rides', 'gmv']] = df_colombia[['rides', 'gmv']].fillna(0)
df_colombia.head()

Unnamed: 0,user_id,group_id,created_dt_part,os_name,city_id,city_name,country_id,country_name,newbie_flag,filled_flow,show_dt,click_dt,approve_dt,approve_flag,not_approve_dt,not_approve_flag,order_timestamp,order_flag,gmv,rides,orders
0,36150212,0,2025-05-29,android,4242,Medellin,22,Colombia,0,liveness,2025-05-29 20:01:32.623000+00:00,2025-05-29 20:01:35.955000+00:00,2025-05-29 20:02:24.344000+00:00,1,NaT,0,2025-05-29 20:02:38+00:00,1,46.04,10,12
1,76732329,0,2025-05-28,android,4242,Medellin,22,Colombia,0,liveness,2025-05-28 22:36:56.904000+00:00,2025-05-28 22:36:58.282000+00:00,2025-05-28 22:37:26.393000+00:00,1,NaT,0,2025-05-28 22:37:30+00:00,1,160.59,40,65
2,107662317,0,2025-05-28,ios,4242,Medellin,22,Colombia,0,liveness,2025-05-29 19:30:09.213000+00:00,2025-05-29 19:30:16.496000+00:00,2025-05-29 19:30:44.855000+00:00,1,NaT,0,2025-05-28 19:39:47+00:00,1,98.47,46,91
3,125849185,0,2025-05-31,ios,4242,Medellin,22,Colombia,0,liveness,2025-06-02 23:49:55.078000+00:00,2025-06-02 23:49:56.154000+00:00,2025-06-02 23:50:37.699000+00:00,1,NaT,0,2025-05-31 14:02:54+00:00,1,26.62,12,12
4,164040409,0,2025-06-01,ios,4242,Medellin,22,Colombia,0,liveness,2025-06-01 02:00:00.198000+00:00,2025-06-01 02:00:01.595000+00:00,2025-06-01 02:00:43.179000+00:00,1,NaT,0,2025-06-01 02:12:26+00:00,1,42.78,18,32


In [55]:
# Significance tests for newbies who saw the verification screen, Colombia and Peru.
# Binary flags are compared with a z-test for proportions, rides / gmv with a t-test;
# all resulting p-values (both regions together) then go through the
# Benjamini-Hochberg correction in a single pass.
indicators = ['approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv']
continuous_metrics = ('rides', 'gmv')
test_columns = ['user_id', 'group_id', 'os_name', 'approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv']

region_results = []
for region, region_df in (('Colombia', df_colombia), ('Peru', df_peru)):
    # The sample does not depend on the metric, so it is filtered once per region.
    sample = region_df[(region_df['newbie_flag'] == 1) & (~region_df['show_dt'].isna())][test_columns]

    for metric in indicators:
        if metric in continuous_metrics:
            test_result = expab.ttest(sample, metric, 'group_id')
            test_result['criteria'] = 'ttest'
        else:
            test_result = expab.ztest_proportion(sample, metric, 'group_id')
            test_result['criteria'] = 'ztest'

        test_result['region'] = region
        region_results.append(test_result)

# One concat instead of growing the frame row by row; the original (repeated) index is kept.
res_df = pd.concat(region_results)

# Computed once on the complete set of p-values; doing it inside the loop only
# produced intermediate values that the last iteration overwrote.
res_df['significance'] = (res_df['pvalue'] < 0.05) * 1
res_df['result_with_corr'] = method_benjamini_hochberg(res_df['pvalue'].values)


res_df

Unnamed: 0,metric_name,group0_sample_size,group1_sample_size,group0,group1,statistic,pvalue,mean0,mean1,diff_mean,diff_mean_%,lb,ub,lb_%,ub_%,criteria,region,significance,result_with_corr
0,approve_flag,14704,2133,0,1,47.7,0.0,0.79,0.29,-0.5,-62.86,-0.52,-0.48,-65.45,-60.28,ztest,Colombia,1,1
0,not_approve_flag,14704,2133,0,1,-24.72,0.0,0.01,0.23,0.23,3664.95,0.21,0.24,3374.35,3955.56,ztest,Colombia,1,1
0,order_flag,14704,2133,0,1,32.41,0.0,0.52,0.2,-0.31,-60.61,-0.33,-0.29,-64.27,-56.94,ztest,Colombia,1,1
0,rides,14704,2133,0,1,22.14,0.0,0.86,0.35,-0.51,-59.28,-0.55,-0.46,-64.53,-54.03,ttest,Colombia,1,1
0,gmv,14704,2133,0,1,20.66,0.0,3.26,1.36,-1.9,-58.21,-2.08,-1.72,-63.74,-52.69,ttest,Colombia,1,1
0,approve_flag,1704,213,0,1,6.48,0.0,0.83,0.6,-0.23,-27.27,-0.29,-0.16,-35.52,-19.02,ztest,Peru,1,1
0,not_approve_flag,1704,213,0,1,-2.37,0.02,0.0,0.03,0.03,2300.0,0.0,0.05,401.82,4198.18,ztest,Peru,1,1
0,order_flag,1704,213,0,1,4.15,0.0,0.54,0.39,-0.15,-27.43,-0.22,-0.08,-40.4,-14.46,ztest,Peru,1,1
0,rides,1704,213,0,1,3.56,0.0,0.92,0.63,-0.29,-31.3,-0.45,-0.13,-48.62,-13.98,ttest,Peru,1,1
0,gmv,1704,213,0,1,3.6,0.0,2.61,1.77,-0.84,-32.32,-1.31,-0.38,-50.01,-14.64,ttest,Peru,1,1


In [72]:
# Load one row per participant of experiment 3420 (country_id = 12) who has a verification
# "show" event on or after participant_first_toggle_date.
#
# CTEs of the query:
#   newbies   - passengers with rides_count > 0 during the last year in country 12.
#   gmv       - per-user sums of gmv_clean_usd, rides_count and orders_count from 2025-06-01 to today.
#   total     - experiment participants; group_id is recoded (4543357 -> 0, anything else -> 1);
#               newbie_flag = 1 when no `newbies` row exists before participant_first_toggle_date.
#   liveness  - verification events since 2025-06-01 folded into show / click / approve / not-approve
#               timestamps; show_dt and click_dt fall back to the result-status timestamps.
#   total_liv - inner join of `total` and `liveness`, so users without a show event are dropped.
#   rides     - done, accepted, non-cancelled orders since 2025-06-01.
# The final SELECT left-joins rides created on/after participant_first_toggle_date and keeps a
# single row per user (QUALIFY ROW_NUMBER ... ORDER BY order_timestamp).
#
# NOTE(review): the gmv CTE is joined on user_id only, so gmv / rides / orders cover the whole
# period since 2025-06-01 rather than the period after the user's toggle date - confirm this is intended.
# NOTE(review): CURRENT_DATE() bounds make the result depend on the day the cell is run.
df_mexico = read_bq("""
WITH newbies AS (SELECT user_id,
                        metric_date
                 FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                 WHERE user_type = 'pass'
                   AND rides_count > 0
                   AND metric_date >= DATE_ADD(CURRENT_DATE(), INTERVAL -1 YEAR)
                   AND country_id = 12),
     gmv AS (SELECT user_id,
                    SUM(gmv_clean_usd) AS gmv,
                    SUM(rides_count)   AS rides,
                    SUM(orders_count)  AS orders
             FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
             WHERE user_type = 'pass'
               AND metric_date BETWEEN '2025-06-01' AND CURRENT_DATE()
               AND country_id = 12
             GROUP BY 1),
     total AS (SELECT t1.user_id,
                      t1.city_id,
                      geo.city_name,
                      geo.country_id,
                      geo.country_name,
                      IF(group_id = 4543357, 0, 1) AS group_id,
                      participant_first_toggle_date,
                      t2.metric_date,
                      CASE
                          WHEN t2.metric_date IS NULL THEN 1
                          ELSE 0
                          END                         newbie_flag
               FROM indrive-core.ab_platform.tbl_ab_experiment_markup t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                             ON
                                 t1.city_id = geo.city_id
                        LEFT JOIN newbies t2
                                  ON t1.user_id = t2.user_id AND t2.metric_date < t1.participant_first_toggle_date
               WHERE experiment_id = 3420
               QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY metric_date DESC) = 1),
     liveness AS (SELECT user_id,
                         os_name,
                         city_id,
                         city_name,
                         country_id,
                         country_name,
                         COALESCE(filled_flow, 'liveness')                                  AS filled_flow,
                         COALESCE(MAX(IF(name = 'client.verification_start.show', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS show_dt,
                         COALESCE(MAX(IF(name = 'client.verification_start.click', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS click_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) = 'approve'), client_time, NULL))            AS approve_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) != 'approve'), client_time, NULL))           AS not_approve_dt
                  FROM (SELECT t1.user_id,
                               t1.name,
                               t1.os_name,
                               DATE(TIMESTAMP_MILLIS(t1.client_time))                  AS event_dt_part,
                               TIMESTAMP_MILLIS(t1.client_time)                        AS client_time,
                               t1.city_id,
                               t2.city_name,
                               t2.country_id,
                               t2.country_name,
                               JSON_EXTRACT_SCALAR(payload, '$.verification_flow')     AS verification_flow,
                               IF(JSON_EXTRACT_SCALAR(payload, '$.verification_flow') IS NULL,
                                  LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                      OVER (PARTITION BY t1.user_id ORDER BY client_time),
                                  JSON_EXTRACT_SCALAR(payload, '$.verification_flow')) AS filled_flow,
                               JSON_EXTRACT_SCALAR(payload, '$.status')                AS status
                        FROM (SELECT *
                              FROM indriver-e6e40.emart.product_event t1
                              WHERE 1 = 1
                                AND name IN (
                                             'client.verification_start.show',
                                             'client.verification_start.click',
                                             'client.verification_flow_result_status.show'
                                  )
                                AND event_dt_part BETWEEN '2025-06-01' AND CURRENT_DATE()
                                AND country_id IN (12)
                              QUALIFY
                                  ROW_NUMBER() OVER (PARTITION BY user_id, name, os_name, event_dt_part, JSON_EXTRACT_SCALAR(payload, '$.verification_flow') ORDER BY client_time DESC) =
                                  1) t1
                                 JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                      ON
                                          t1.city_id = t2.city_id
                        WHERE 1 = 1
                          AND name IN (
                                       'client.verification_start.show', 'client.verification_start.click',
                                       'client.verification_flow_result_status.show'
                            ))
                  GROUP BY 1, 2, 3, 4, 5, 6, 7),
     total_liv AS (SELECT t1.user_id,
                          t1.city_id,
                          t1.city_name,
                          t1.country_id,
                          t1.country_name,
                          t1.group_id,
                          t1.participant_first_toggle_date,
                          t1.newbie_flag,
                          t2.user_id AS user_with_svf,
                          t2.os_name,
                          t2.filled_flow,
                          t2.show_dt,
                          t2.click_dt,
                          t2.approve_dt,
                          t2.not_approve_dt
                   FROM total t1
                            JOIN liveness t2
                                 ON t1.user_id = t2.user_id AND DATE(t2.show_dt) >= t1.participant_first_toggle_date),
     rides AS (SELECT order_uuid,
                      user_id    AS pass_id,
                      driver_id,
                      city_id    AS order_city_id,
                      country_id AS order_country_id,
                      status_order,
                      order_timestamp,
                      at_pickup_dttm,
                      departed_pickup_dttm,
                      at_destination_dttm,
                      departed_destination_dttm,
                      driveraccept_timestamp,
                      driverarrived_timestamp,
                      driverstarttheride_timestamp,
                      driverdone_timestamp,
                      clientdone_timestamp,
                      clientcancel_timestamp,
                      drivercancel_timestamp,
                      user_reg_date,
                      driver_reg_date,
                      stage,
                      created_date_order_part,
                      duration_in_seconds
               FROM indriver-e6e40.imart.incity_detail_new_order
               WHERE created_date_order_part BETWEEN '2025-06-01'
                   AND CURRENT_DATE()
                 AND status_order = 'RIDE_STATUS_DONE'
                 AND driveraccept_timestamp IS NOT NULL
                 AND (clientcancel_timestamp IS NULL
                   AND drivercancel_timestamp IS NULL))
SELECT t1.user_id,
       t1.group_id,
       t1.participant_first_toggle_date,
       t1.os_name,
       t1.city_id,
       t1.city_name,
       t1.country_id,
       t1.country_name,
       t1.newbie_flag,
       filled_flow,
       show_dt,
       click_dt,
       approve_dt                                                  AS approve_dt,
       IF(approve_dt IS NOT NULL, 1, 0)                            AS approve_flag,
       not_approve_dt,
       IF(not_approve_dt IS NOT NULL AND approve_dt IS NULL, 1, 0) AS not_approve_flag,
       t2.order_timestamp,
       IF(t2.order_timestamp IS NOT NULL, 1, 0)                    AS order_flag,
       t3.gmv,
       t3.rides,
       t3.orders
FROM total_liv t1
         LEFT JOIN rides t2
                   ON t1.user_id = t2.pass_id AND
                      t2.created_date_order_part >= participant_first_toggle_date
         LEFT JOIN gmv t3 ON t1.user_id = t3.user_id
QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY t2.order_timestamp) = 1
""")

# Quick look at the first rows of the loaded frame.
df_mexico.head()

Unnamed: 0,user_id,group_id,participant_first_toggle_date,os_name,city_id,city_name,country_id,country_name,newbie_flag,filled_flow,show_dt,click_dt,approve_dt,approve_flag,not_approve_dt,not_approve_flag,order_timestamp,order_flag,gmv,rides,orders
0,69554358,0,2025-06-21,android,4231,San Luis Potosi,12,Mexico,0,liveness,2025-06-22 19:24:57.653000+00:00,2025-06-22 19:25:01.442000+00:00,2025-06-22 19:25:31.148000+00:00,1,NaT,0,2025-06-22 19:35:54+00:00,1,12.36,9,10
1,268673902,0,2025-06-20,android,4231,San Luis Potosi,12,Mexico,0,liveness,2025-06-20 22:07:33.428000+00:00,2025-06-20 22:07:35.686000+00:00,2025-06-20 22:07:55.345000+00:00,1,NaT,0,2025-06-22 03:35:51+00:00,1,52.68,13,39
2,301580868,0,2025-06-21,ios,4231,San Luis Potosi,12,Mexico,1,liveness,2025-06-21 21:10:08.375000+00:00,2025-06-21 21:10:09.485000+00:00,2025-06-21 21:10:28.477000+00:00,1,NaT,0,2025-06-21 22:13:29+00:00,1,26.73,9,12
3,239776795,0,2025-06-24,ios,4228,Tijuana,12,Mexico,0,liveness,2025-06-24 12:37:50.728000+00:00,2025-06-24 12:37:51.830000+00:00,2025-06-24 12:38:32.884000+00:00,1,NaT,0,2025-06-24 12:40:20+00:00,1,63.73,8,12
4,236372252,0,2025-06-21,ios,4228,Tijuana,12,Mexico,0,liveness,2025-06-21 09:42:34.932000+00:00,2025-06-21 09:42:37.736000+00:00,2025-06-21 09:43:14.565000+00:00,1,NaT,0,2025-06-22 07:30:49+00:00,1,48.96,8,19


In [77]:
# Missing rides / gmv values are replaced with 0 so these users count as zeros in the tests below.
df_mexico[['rides', 'gmv']] = df_mexico[['rides', 'gmv']].fillna(0)
df_mexico.head()

Unnamed: 0,user_id,group_id,participant_first_toggle_date,os_name,city_id,city_name,country_id,country_name,newbie_flag,filled_flow,show_dt,click_dt,approve_dt,approve_flag,not_approve_dt,not_approve_flag,order_timestamp,order_flag,gmv,rides,orders
0,69554358,0,2025-06-21,android,4231,San Luis Potosi,12,Mexico,0,liveness,2025-06-22 19:24:57.653000+00:00,2025-06-22 19:25:01.442000+00:00,2025-06-22 19:25:31.148000+00:00,1,NaT,0,2025-06-22 19:35:54+00:00,1,12.36,9,10
1,268673902,0,2025-06-20,android,4231,San Luis Potosi,12,Mexico,0,liveness,2025-06-20 22:07:33.428000+00:00,2025-06-20 22:07:35.686000+00:00,2025-06-20 22:07:55.345000+00:00,1,NaT,0,2025-06-22 03:35:51+00:00,1,52.68,13,39
2,301580868,0,2025-06-21,ios,4231,San Luis Potosi,12,Mexico,1,liveness,2025-06-21 21:10:08.375000+00:00,2025-06-21 21:10:09.485000+00:00,2025-06-21 21:10:28.477000+00:00,1,NaT,0,2025-06-21 22:13:29+00:00,1,26.73,9,12
3,239776795,0,2025-06-24,ios,4228,Tijuana,12,Mexico,0,liveness,2025-06-24 12:37:50.728000+00:00,2025-06-24 12:37:51.830000+00:00,2025-06-24 12:38:32.884000+00:00,1,NaT,0,2025-06-24 12:40:20+00:00,1,63.73,8,12
4,236372252,0,2025-06-21,ios,4228,Tijuana,12,Mexico,0,liveness,2025-06-21 09:42:34.932000+00:00,2025-06-21 09:42:37.736000+00:00,2025-06-21 09:43:14.565000+00:00,1,NaT,0,2025-06-22 07:30:49+00:00,1,48.96,8,19


In [78]:
indicators = ['approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv']

# Single t-test on rides between experiment groups, restricted to Mexico newbies
# that have a recorded verification show event.
ttest = expab.ttest(
    df_mexico.loc[
        (df_mexico['newbie_flag'] == 1) & df_mexico['show_dt'].notna(),
        ['user_id', 'group_id', 'os_name', 'approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv'],
    ],
    'rides',
    'group_id',
)
ttest['criteria'] = 'ttest'
ttest

Unnamed: 0,metric_name,group0_sample_size,group1_sample_size,group0,group1,statistic,pvalue,mean0,mean1,diff_mean,diff_mean_%,lb,ub,lb_%,ub_%,criteria
0,rides,7336,1157,0,1,6.67,0.0,0.77,0.49,-0.28,-36.64,-0.37,-0.2,-47.41,-25.87,ttest


In [80]:
# Significance tests for Mexico newbies who saw the verification screen.
# Binary flags are compared with a z-test for proportions, rides / gmv with a t-test;
# the resulting p-values then go through the Benjamini-Hochberg correction in a single pass.
indicators = ['approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv']
continuous_metrics = ('rides', 'gmv')
test_columns = ['user_id', 'group_id', 'os_name', 'approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv']

# The sample does not depend on the metric, so it is filtered once instead of per test.
sample = df_mexico[(df_mexico['newbie_flag'] == 1) & (~df_mexico['show_dt'].isna())][test_columns]

metric_results = []
for metric in indicators:
    if metric in continuous_metrics:
        test_result = expab.ttest(sample, metric, 'group_id')
        test_result['criteria'] = 'ttest'
    else:
        test_result = expab.ztest_proportion(sample, metric, 'group_id')
        test_result['criteria'] = 'ztest'

    metric_results.append(test_result)

# One concat instead of growing the frame row by row; the original (repeated) index is kept.
res_df = pd.concat(metric_results)

# Computed once on the complete set of p-values; doing it inside the loop only
# produced intermediate values that the last iteration overwrote.
res_df['significance'] = (res_df['pvalue'] < 0.05) * 1
res_df['result_with_corr'] = method_benjamini_hochberg(res_df['pvalue'].values)

res_df

Unnamed: 0,metric_name,group0_sample_size,group1_sample_size,group0,group1,statistic,pvalue,mean0,mean1,diff_mean,diff_mean_%,lb,ub,lb_%,ub_%,criteria,significance,result_with_corr
0,approve_flag,7336,1157,0,1,20.44,0.0,0.77,0.45,-0.32,-41.07,-0.35,-0.29,-45.01,-37.13,ztest,1,1
0,not_approve_flag,7336,1157,0,1,-1.77,0.08,0.01,0.01,0.01,79.65,-0.0,0.01,-8.79,168.08,ztest,0,0
0,order_flag,7336,1157,0,1,13.38,0.0,0.41,0.23,-0.18,-44.12,-0.21,-0.16,-50.59,-37.66,ztest,1,1
0,rides,7336,1157,0,1,6.67,0.0,0.77,0.49,-0.28,-36.64,-0.37,-0.2,-47.41,-25.87,ttest,1,1
0,gmv,7336,1157,0,1,5.61,0.0,3.96,2.58,-1.38,-34.79,-1.86,-0.9,-46.96,-22.62,ttest,1,1
