# Packages

In [2]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
from math import ceil

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    client = bigquery.Client(project=project)
    query_job = client.query(query)
    result_df = query_job.to_dataframe()
    return result_df

def display_side_by_side(*args):

    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(
        html_str.replace('table','table style="display:inline"'), 
        raw=True
    )

def cycle_sql(start, end, query, weeks=False):
    """
    You have to use {date} in your script to add cycle date into this backets
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')

    if weeks == False:
        daterange = [(date_start + timedelta(days=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days)+1)]
    else:
        daterange = [(date_start + timedelta(weeks=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days//7)+1)] # weeks dividing days by 7

    total_df = pd.DataFrame()

    counter = 0

    for date in daterange:
        counter+=1
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date = date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty == True:
            print('Dataframe is empty')
        total_df = pd.concat([df_cycle, total_df])
    return total_df  

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    with pd.ExcelWriter(f"{name}.xlsx") as writer:

    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet

        if dataset1 is not None:
            if dataset2 is not None:
                if dataset3 is not None:
                    if dataset4 is not None:
                        dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                        #   index=False
                                            )
                        dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                        #   index=False
                                            )
                        dataset3.to_excel(writer, sheet_name=f"3-{name}", 
                                        #   index=False
                                            )
                        dataset4.to_excel(writer, sheet_name=f"4-{name}", 
                                        #   index=False
                                            )
                    else:
                        dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                        #   index=False
                                            )
                        dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                        #   index=False
                                            )
                        dataset3.to_excel(writer, sheet_name=f"3-{name}", 
                                        #   index=False
                                            )
                else:
                    dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                    #   index=False
                                        )
                    dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                    #   index=False
                                        )
            else:
                dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                #   index=False
                                    )

        print('DataFrame is written to Excel File successfully.') 

## Functions

In [3]:
def method_benjamini_hochberg(
    pvalues: np.ndarray,
    alpha: float = 0.05
    ) -> np.ndarray:
    """Apply the Benjamini-Hochberg procedure for multiple hypothesis testing."""
    m = len(pvalues)
    array_alpha = np.arange(1, m + 1) * alpha / m
    sorted_pvalue_indexes = np.argsort(pvalues)
    res = np.zeros(m)
    for idx, pvalue_index in enumerate(sorted_pvalue_indexes):
        pvalue = pvalues[pvalue_index]
        alpha_ = array_alpha[idx]
        if pvalue <= alpha_:
            res[pvalue_index] = 1
        else:
            break
    return res.astype(int)

# Shapiro-Wilk test & Distributions
def check_normality(df, group_column, value_column):
    groups = df[group_column].unique()

    for group in groups:
        group_data = df[df[group_column] == group][value_column].dropna() 
        stat, p = stats.shapiro(group_data)
        print(f'Group {group}: W={stat:.4f}, p-value={p:.4f}')
        if p > 0.05:
            print(f'Group {group}, Metric: {value_column}: Data is normal distributed')
        else:
            print(f'Group {group}, Metric: {value_column}: Data is not normal distributed')

def plot_distribution(df, group_column, value_column):

    groups = df[group_column].unique()
    fig, axes = plt.subplots(2, 2, figsize=(14, 10), gridspec_kw={'height_ratios': [1, 1.5]})

    sns.histplot(data=df, x=value_column, hue=group_column, kde=True, bins=30, alpha=0.4, ax=axes[0, 0])
    axes[0, 0].set_title("Graph + KDE")
    axes[0, 0].set_xlabel(value_column)
    axes[0, 0].set_ylabel("Frequence")

    sns.boxplot(data=df, x=group_column, y=value_column, ax=axes[0, 1])
    axes[0, 1].set_title("Boxplot grouped")
    axes[0, 1].set_xlabel(group_column)
    axes[0, 1].set_ylabel(value_column)

    sns.histplot(df[df[group_column] == groups[0]][value_column], bins=30, kde=True, color='blue', alpha=0.5, ax=axes[1, 0])
    axes[1, 0].set_title(f'Hist for the {groups[0]}')
    axes[1, 0].set_xlabel(value_column)
    axes[1, 0].set_ylabel("frequence")

    sns.histplot(df[df[group_column] == groups[1]][value_column], bins=30, kde=True, color='orange', alpha=0.5, ax=axes[1, 1])
    axes[1, 1].set_title(f'Hist for the {groups[1]}')
    axes[1, 1].set_xlabel(value_column)
    axes[1, 1].set_ylabel("Frequence")

    plt.tight_layout()
    plt.show()

# Levene's & Bartlet's test
def levene(df, indicator, metric):
    w_stats, p_value = st.levene(
        df[df['group_name'] == 0][indicator], 
        df[df['group_name'] == 1][indicator],
                            center=metric)
    
    alpha = 0.05
    
    if p_value > alpha:
        print(f"Variance are from the same population on {metric}")
    else:
        print(f"Variance are from the different population on {metric}")
    
# Cohen's D
def cohens_d(df, metric):
    group1 = df[df['group_name']==1][metric]
    group2 = df[df['group_name']==0][metric]
    mean1, mean2 = np.mean(group1), np.mean(group2)
     
    std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
    n1, n2 = len(group1), len(group2)
    pooled_std = np.sqrt(((n1 - 1) * std1 ** 2 + (n2 - 1) * std2 ** 2) / (n1 + n2 - 2))
     
    d = (mean1 - mean2) / pooled_std
     
    # if d <= 0.3:
    #     print(f'Small effect: d ≈ 0-0.3 ({d:.3f})')
    # elif 0.31 <= d <= 0.8:
    #     print(f'Medium effect: d ≈ 0.3-0.8 ({d:.3f})')
    # elif 0.81 <= d <= 1:
    #     print(f'Large effect: d ≈ 0.8-1 ({d:.3f})')

    return d

# SRM
def srm(df):
    srm_df = pd.DataFrame()

    for city in df['city_name'].unique():
        
        observed = [
            (df.query(f'group_name == 0 and city_name == "{city}"')['user_id'].count()), 
            (df.query(f'group_name == 1 and city_name == "{city}"')['user_id'].count())
            ]

        total_traffic = sum(observed)

        expected = [total_traffic/2, total_traffic/2]

        chi = st.chisquare(observed, f_exp = expected)

        if chi[1] < 0.01:
            conclusion = "Sample ratio mismatch (SRM) may be present"
        else:
            conclusion = "Sample ratio mismatch (SRM) probably not present"
            print(f"{city}, {chi[1]}")

        
        new_srm_df = pd.DataFrame(
            [[city, observed, total_traffic, expected, round(chi[1], 3), conclusion]], 
            columns=['city_name',  'sample_sizes', 'total_size', 'expected_sizes', 'chi_value', 'conclusion']
            )

        srm_df = pd.concat([srm_df, new_srm_df]).sort_values(['city_name', 'total_size'], ascending=False).reset_index(drop=True)

    return srm_df

# Calcualting the significance by cities
def calcualate_result(df_cr, df_abs):
    df_results = pd.DataFrame()

    for city in df_cr['city_name'].unique():

        absolute_values_keys_result = df_abs[df_abs['city_name']==f'{city}'].copy()

        cr_df = ztest_proportion(df_cr[df_cr['city_name']==f'{city}'], 'has_ride', 'group_name')
        cr_df['metric'] = 'Conversion'
        cr_df['cohen_d'] = cohens_d(df_cr[df_cr['city_name']==f'{city}'], 'has_ride')

        rides_df = ttest(absolute_values_keys_result, 'rides', 'group_name')
        rides_df['metric'] = 'Quantitive'
        rides_df['cohen_d'] = cohens_d(absolute_values_keys_result, 'rides')

        gmv_df = ttest(absolute_values_keys_result, 'gmv', 'group_name')
        gmv_df['metric'] = 'Quantitive'
        gmv_df['cohen_d'] = cohens_d(absolute_values_keys_result, 'gmv')

        orders_df = ttest(absolute_values_keys_result, 'orders', 'group_name')
        orders_df['metric'] = 'Quantitive'
        orders_df['cohen_d'] = cohens_d(absolute_values_keys_result, 'orders')

        df_total = pd.concat([cr_df, rides_df, gmv_df, orders_df])

        df_total['region'] = city
        df_total['segment'] = 'By city'
        df_total['significance'] = (df_total['pvalue']<0.05)*1
        df_total['corrected_pvalue'] = method_benjamini_hochberg(df_total['pvalue'].values)

        df_results = pd.concat([df_results, df_total])

    total_cr_df = ztest_proportion(df_cr, 'has_ride', 'group_name')
    total_cr_df['metric'] = 'Conversion'
    total_cr_df['cohen_d'] = cohens_d(df_cr, 'has_ride')

    total_rides_df = ttest(df_abs, 'rides', 'group_name')
    total_rides_df['metric'] = 'Quantitive'
    total_rides_df['cohen_d'] = cohens_d(df_abs, 'rides')

    total_gmv_df = ttest(df_abs, 'gmv', 'group_name')
    total_gmv_df['metric'] = 'Quantitive'
    total_gmv_df['cohen_d'] = cohens_d(df_abs, 'gmv')

    total_orders_df = ttest(df_abs, 'orders', 'group_name')
    total_orders_df['metric'] = 'Quantitive'
    total_orders_df['cohen_d'] = cohens_d(df_abs, 'orders')


    total_total_df = pd.concat([total_cr_df, total_rides_df, total_gmv_df, total_orders_df])
    total_total_df['region'] = 'All'
    total_total_df['segment'] = 'Total'
    total_total_df['significance'] = (df_total['pvalue']<0.05)*1
    total_total_df['corrected_pvalue'] = method_benjamini_hochberg(df_total['pvalue'].values)

    df_results = pd.concat([df_results, total_total_df])

    df_results

    return df_results

def sequential_wald_test(df, date_col, metric_col, group_col, user_col, alpha=0.05, beta=0.2):
    
    A = np.round(np.log(beta / (1 - alpha)), 2)   
    B = np.round(np.log((1 - beta) / alpha), 2) 
    
    df_grouped = df.groupby([date_col, group_col]).agg(
        users=(user_col, 'nunique'), 
        conversions=(metric_col, 'sum') 
    ).reset_index()

    df_grouped["cum_users"] = df_grouped.groupby(group_col)["users"].cumsum()
    df_grouped["cum_conversions"] = df_grouped.groupby(group_col)["conversions"].cumsum()

    df_A = df_grouped[df_grouped[group_col] == 0].drop(columns=[group_col]).rename(
        columns={"users": "users_A", "conversions": "conv_A", "cum_users": "cum_users_A", "cum_conversions": "cum_conv_A"}
    )
    df_B = df_grouped[df_grouped[group_col] == 1].drop(columns=[group_col]).rename(
        columns={"users": "users_B", "conversions": "conv_B", "cum_users": "cum_users_B", "cum_conversions": "cum_conv_B"}
    )

    df_merged = pd.merge(df_A, df_B, on=date_col, how="outer").fillna(0)

    print("Колонки в df_merged:", df_merged.columns)

    p_values, llr_values = [], []
    stop_day = None

    for i in range(len(df_merged)):
        try:
            users_A, conv_A = df_merged.loc[i, ["cum_users_A", "cum_conv_A"]]
            users_B, conv_B = df_merged.loc[i, ["cum_users_B", "cum_conv_B"]]

            p_A = conv_A / users_A if users_A > 0 else 0
            p_B = conv_B / users_B if users_B > 0 else 0

            r = test_proportions_2indep(
                conv_A, users_A,
                conv_B, users_B,
                value=0,
                method='wald',
                compare='diff',
                alternative='two-sided',
                return_results=True
            )

            p_value = r.pvalue
            p_values.append(p_value)

            llr = np.log(p_B / p_A) if p_B > 0 and p_A > 0 else 0
            llr_values.append(llr)

            if llr <= A:
                stop_day = df_merged.loc[i, date_col]
                print(f"On {stop_day} might be stopped: LLR={llr:.3f} <= {A:.3f} (Accept H0)")
                break
            elif llr >= B:
                stop_day = df_merged.loc[i, date_col]
                print(f"On {stop_day} might be stopped: LLR={llr:.3f} >= {B:.3f} (Accept H1)")
                break

        except Exception as e:
            print(f"⚠️ Ошибка на дне {df_merged.loc[i, date_col]}: {e}")
            p_values.append(np.nan)
            llr_values.append(np.nan)

    # Создаем DataFrame с результатами
    df_results = df_merged.iloc[:len(p_values)].copy()
    df_results["p_value"] = p_values
    df_results["LLR"] = llr_values
    df_results["A/B"] = str([A, B])
    df_results["alpha_threshold"] = np.linspace(alpha, alpha / np.sqrt(len(df_results)), len(p_values))  # Коррекция alpha

    # Визуализация результатов
    plt.figure(figsize=(12, 6))
    plt.plot(df_results[date_col], df_results["p_value"], label="P-value", marker="o")
    plt.plot(df_results[date_col], df_results["alpha_threshold"], label="Corrected Alpha", linestyle="dashed")
    plt.axhline(y=alpha, color="red", linestyle="--", label="Standard Alpha (0.05)")
    plt.xticks(rotation=45)
    plt.xlabel("Date")
    plt.ylabel("P-Value")
    plt.title("P-value daily vs. Corrected Alpha")
    plt.legend()
    plt.grid()
    plt.show()

    return df_results

def calculate_criteria(df):
    df_res_1 = pd.DataFrame()
    indicators = ['has_ride', 'rides', 'gmv']

    total_res_z = expab.ztest_proportion(df, 'has_ride', 'group_name')
    total_res_z['city_name'] = 'all the cities together'
    total_res_t = expab.ttest(df, 'gmv', 'group_name')
    total_res_t['city_name'] = 'all the cities together'
    total_res_t2 = expab.ttest(df, 'rides', 'group_name')
    total_res_t2['city_name'] = 'all the cities together'

    df_res_1 = pd.concat([df_res_1, total_res_z, total_res_t, total_res_t2])

    for city in df['city_name'].unique():
        
        for metric in indicators:
            if metric == 'has_ride':
                city_df_z = expab.ztest_proportion(df.query(f"city_name == '{city}'"), metric, 'group_name')
                city_df_z['city_name'] = city

                df_res_1 = pd.concat([df_res_1, city_df_z])

            else:
                city_df_t = expab.ttest(df.query(f"city_name == '{city}'"), metric, 'group_name')
                city_df_t['city_name'] = city

                df_res_1 = pd.concat([df_res_1, city_df_t])

    df_res_1['corrected_pvalue'] = expab.method_benjamini_hochberg(df_res_1['pvalue'].values)
    df_res_1['significance'] = (df_res_1['pvalue']<0.05)*1


    return df_res_1

def calculate_numbers(df):

    df_agg = df.groupby(['group_name', 'city_name'], as_index=False)[['user_id', 'has_ride', 'rides', 'orders', 'gmv']].agg(
        {'user_id':'count', 
        'has_ride':'sum', 
        'rides':'sum', 
        'gmv':'sum'}
        ).sort_values(['city_name', 'group_name', 'user_id'], ascending=True)

    df_agg['group_name'] = df_agg['group_name'].astype(str)

    df_agg['group_name'] = df_agg['group_name'].replace({'0':'Control', '1':'Treatment'})

    df_agg['cr_ride_%'] = np.round(df_agg['has_ride'] / df_agg['user_id'] * 100,2)
    df_agg['cr_ride'] = np.round(df_agg['has_ride'] / df_agg['user_id'],5)
    df_agg['cr_ride_%'] = df_agg['cr_ride_%'].astype(str)
    df_agg['cr_ride_%'] = df_agg['cr_ride_%'] + '%'

    return df_agg

# Monitoring

In [75]:
df = read_bq("""
WITH newbies AS (SELECT user_id,
                        metric_date
                 FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                 WHERE user_type = 'pass'
                   AND rides_count > 0
                   AND metric_date >= DATE_ADD(CURRENT_DATE(), INTERVAL -1 YEAR)
                   AND country_id = 25),
     registered AS (SELECT id, DATE(created) AS created
                    FROM dwh-storage-327422.personal_data.tbl_user_act
                    WHERE DATE(created) BETWEEN '2020-01-01' AND CURRENT_DATE()
                      AND country_id IN (25)),
     gmv AS (SELECT user_id,
                    SUM(gmv_clean_usd) AS gmv,
                    SUM(rides_count)   AS rides,
                    SUM(orders_count)  AS orders
             FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
             WHERE user_type = 'pass'
               AND metric_date BETWEEN '2025-06-01' AND CURRENT_DATE()
               AND country_id = 25
             GROUP BY 1),
     total AS (SELECT t1.user_id,
                      t1.city_id,
                      geo.city_name,
                      geo.country_id,
                      geo.country_name,
                      IF(group_id = 4546283, 0, 1) AS group_id,
                      participant_first_toggle_date,
                      t2.metric_date,
                      t3.created                   AS registration_date,
                      CASE
                          WHEN t2.metric_date IS NULL THEN 1
                          ELSE 0
                          END                         newbie_flag,
                      CASE
                          WHEN t3.created = t1.participant_first_toggle_date THEN 1
                          ELSE 0
                          END                         reg_same_day_flag
               FROM indrive-core.ab_platform.tbl_ab_experiment_markup t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                             ON
                                 t1.city_id = geo.city_id
                        LEFT JOIN newbies t2
                                  ON t1.user_id = t2.user_id AND t2.metric_date < t1.participant_first_toggle_date
                        LEFT JOIN registered t3
                                  ON t1.user_id = t3.id
               WHERE experiment_id = 3684
               QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY metric_date DESC) = 1),
     liveness AS (SELECT user_id,
                         os_name,
                         city_id,
                         city_name,
                         country_id,
                         country_name,
                         COALESCE(filled_flow, 'liveness')                                  AS filled_flow,
                         MAX(IF(name = 'registration.success', client_time, NULL))          AS registration_dt,
                         COALESCE(MAX(IF(name = 'client.verification_start.show', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS show_dt,
                         COALESCE(MAX(IF(name = 'client.verification_start.click', client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) = 'approve'), client_time, NULL)),
                                  MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                          LOWER(status) != 'approve'), client_time, NULL))) AS click_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) = 'approve'), client_time, NULL))            AS approve_dt,
                         MAX(IF((name = 'client.verification_flow_result_status.show' AND
                                 LOWER(status) != 'approve'), client_time, NULL))           AS not_approve_dt
                  FROM (SELECT t1.user_id,
                               t1.name,
                               t1.os_name,
                               DATE(TIMESTAMP_MILLIS(t1.client_time))                  AS event_dt_part,
                               TIMESTAMP_MILLIS(t1.client_time)                        AS client_time,
                               t1.city_id,
                               t2.city_name,
                               t2.country_id,
                               t2.country_name,
                               JSON_EXTRACT_SCALAR(payload, '$.verification_flow')     AS verification_flow,
                               IF(JSON_EXTRACT_SCALAR(payload, '$.verification_flow') IS NULL,
                                  LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                                      OVER (PARTITION BY t1.user_id ORDER BY client_time),
                                  JSON_EXTRACT_SCALAR(payload, '$.verification_flow')) AS filled_flow,
                               JSON_EXTRACT_SCALAR(payload, '$.status')                AS status
                        FROM (SELECT *
                              FROM indriver-e6e40.emart.product_event t1
                              WHERE 1 = 1
                                AND name IN ('registration.success',
                                             'client.verification_start.show',
                                             'client.verification_start.click',
                                             'client.verification_flow_result_status.show'
                                  )
                                AND event_dt_part BETWEEN '2025-06-01' AND CURRENT_DATE()
                                AND country_id IN (25)
                              QUALIFY
                                  ROW_NUMBER() OVER (PARTITION BY user_id, name, os_name, event_dt_part, JSON_EXTRACT_SCALAR(payload, '$.verification_flow') ORDER BY client_time DESC) =
                                  1) t1
                                 JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                      ON
                                          t1.city_id = t2.city_id
                        WHERE 1 = 1
                          AND name IN ('registration.success',
                                       'client.verification_start.show', 'client.verification_start.click',
                                       'client.verification_flow_result_status.show'
                            ))
                  GROUP BY 1, 2, 3, 4, 5, 6, 7),
     total_liv AS (SELECT t1.user_id,
                          t1.city_id,
                          t1.city_name,
                          t1.country_id,
                          t1.country_name,
                          t1.group_id,
                          t1.participant_first_toggle_date,
                          t1.registration_date,
                          t1.newbie_flag,
                          t1.reg_same_day_flag,
                          t2.user_id AS user_with_svf,
                          t2.os_name,
                          t2.filled_flow,
                          t2.registration_dt,
                          t2.show_dt,
                          t2.click_dt,
                          t2.approve_dt,
                          t2.not_approve_dt
                   FROM total t1
                            JOIN liveness t2
                                 ON t1.user_id = t2.user_id AND
                                    DATE(t2.registration_dt) >= t1.participant_first_toggle_date),
     rides AS (SELECT order_uuid,
                      user_id    AS pass_id,
                      driver_id,
                      city_id    AS order_city_id,
                      country_id AS order_country_id,
                      status_order,
                      order_timestamp,
                      at_pickup_dttm,
                      departed_pickup_dttm,
                      at_destination_dttm,
                      departed_destination_dttm,
                      driveraccept_timestamp,
                      driverarrived_timestamp,
                      driverstarttheride_timestamp,
                      driverdone_timestamp,
                      clientdone_timestamp,
                      clientcancel_timestamp,
                      drivercancel_timestamp,
                      user_reg_date,
                      driver_reg_date,
                      stage,
                      created_date_order_part,
                      duration_in_seconds
               FROM indriver-e6e40.imart.incity_detail_new_order
               WHERE created_date_order_part BETWEEN '2025-06-01'
                   AND CURRENT_DATE()
                 AND status_order = 'RIDE_STATUS_DONE'
                 AND driveraccept_timestamp IS NOT NULL
                 AND (clientcancel_timestamp IS NULL
                   AND drivercancel_timestamp IS NULL))
SELECT t1.user_id,
       t1.group_id,
       t1.participant_first_toggle_date,
       t1.registration_date,
       t1.os_name,
       t1.city_id,
       t1.city_name,
       t1.country_id,
       t1.country_name,
       t1.newbie_flag,
       t1.reg_same_day_flag,
       filled_flow,
       registration_dt,
       show_dt,
       click_dt,
       approve_dt                                                  AS approve_dt,
       IF(approve_dt IS NOT NULL, 1, 0)                            AS approve_flag,
       not_approve_dt,
       IF(not_approve_dt IS NOT NULL AND approve_dt IS NULL, 1, 0) AS not_approve_flag,
       t2.order_timestamp,
       IF(t2.order_timestamp IS NOT NULL, 1, 0)                    AS order_flag,
       t3.gmv,
       t3.rides,
       t3.orders
FROM total_liv t1
         LEFT JOIN rides t2
                   ON t1.user_id = t2.pass_id AND
                      t2.created_date_order_part >= participant_first_toggle_date
         LEFT JOIN gmv t3 ON t1.user_id = t3.user_id
QUALIFY ROW_NUMBER() OVER (PARTITION BY t1.user_id ORDER BY t2.order_timestamp) = 1
""")

df.head()

Unnamed: 0,user_id,group_id,participant_first_toggle_date,registration_date,os_name,city_id,city_name,country_id,country_name,newbie_flag,reg_same_day_flag,filled_flow,registration_dt,show_dt,click_dt,approve_dt,approve_flag,not_approve_dt,not_approve_flag,order_timestamp,order_flag,gmv,rides,orders
0,230148120,1,2025-07-29,2024-04-25,android,4200,Santiago,25,Chile,1,0,liveness,2025-07-29 18:17:23.807000+00:00,NaT,NaT,NaT,0,NaT,0,NaT,0,,,
1,304889087,0,2025-08-02,2025-07-11,ios,4200,Santiago,25,Chile,1,0,liveness,2025-08-02 18:30:09.670000+00:00,2025-08-02 18:36:52.791000+00:00,2025-08-02 18:36:53.540000+00:00,NaT,0,2025-08-02 18:37:12.366000+00:00,1,NaT,0,,,
2,305079415,0,2025-07-31,2025-07-13,android,4267,Arica,25,Chile,1,0,liveness,2025-07-31 23:57:00.446000+00:00,NaT,NaT,NaT,0,NaT,0,NaT,0,,,
3,307494834,0,2025-07-30,2025-07-27,android,4200,Santiago,25,Chile,1,0,liveness,2025-07-30 15:42:02.807000+00:00,NaT,NaT,NaT,0,NaT,0,NaT,0,,,
4,307652767,0,2025-07-28,2025-07-28,android,4200,Santiago,25,Chile,1,1,liveness,2025-07-28 01:59:21.729000+00:00,NaT,NaT,NaT,0,NaT,0,NaT,0,,,


In [79]:
df_grouped = df[(df['newbie_flag']==1)&(df['reg_same_day_flag']==1)].groupby(['group_id', 'newbie_flag'], as_index=False)[['registration_dt', 'show_dt', 'click_dt', 'approve_dt', 'not_approve_dt', 'order_flag']].agg({'registration_dt':'count', 'show_dt':'count', 'click_dt':'count', 'approve_dt':'count', 'not_approve_dt':'count', 'order_flag':'sum'})

df_grouped['cr_to_show'] = df_grouped['show_dt'] / df_grouped['registration_dt'] * 100
df_grouped['cr_to_approve'] = df_grouped['approve_dt'] / df_grouped['click_dt'] * 100
df_grouped['cr_to_ride'] = df_grouped['order_flag'] / df_grouped['approve_dt'] * 100
df_grouped['cr_to_ride_2'] = df_grouped['order_flag'] / df_grouped['registration_dt'] * 100
df_grouped['cr_to_not_approve'] = df_grouped['not_approve_dt'] / df_grouped['click_dt'] * 100




df_grouped

Unnamed: 0,group_id,newbie_flag,registration_dt,show_dt,click_dt,approve_dt,not_approve_dt,order_flag,cr_to_show,cr_to_approve,cr_to_ride,cr_to_ride_2,cr_to_not_approve
0,0,1,8169,5179,5052,2856,722,1903,63.4,56.53,66.63,23.3,14.29
1,1,1,966,851,788,409,106,256,88.1,51.9,62.59,26.5,13.45


In [103]:
time_anal = df.query("reg_same_day_flag == 1 and newbie_flag == 1")[['group_id', 'registration_dt', 'show_dt']]

time_anal['registration_dt'] = pd.to_datetime(time_anal['registration_dt'])
time_anal['show_dt'] = pd.to_datetime(time_anal['show_dt'])

time_anal['time_diff'] = (time_anal['show_dt'] - time_anal['registration_dt']).dt.seconds

time_anal = time_anal.groupby(['group_id'], as_index=False)['time_diff'].agg(['median', 'mean'])

time_anal['minutes'] = time_anal['median']/60
time_anal['minutes_2'] = time_anal['mean']/60

time_anal

Unnamed: 0,group_id,median,mean,minutes,minutes_2
0,0,295.0,11347.13,4.92,189.12
1,1,20.0,7892.66,0.33,131.54


In [107]:
8000*0.7

5600.0

In [105]:
2257/19000*100

11.878947368421052

# Summarising

In [82]:
df['rides'] = df['rides'].fillna(0)
df['gmv'] = df['gmv'].fillna(0)

df_grouped = df.query("reg_same_day_flag == 1").groupby(['group_id', 'newbie_flag'], as_index=False)[['registration_dt', 'show_dt', 'click_dt', 'approve_dt', 'not_approve_dt', 'order_flag']].agg({'registration_dt':'count', 'show_dt':'count', 'click_dt':'count', 'approve_dt':'count', 'not_approve_dt':'count', 'order_flag':'sum'})

df_grouped['cr_to_approve'] = df_grouped['approve_dt'] / df_grouped['registration_dt'] * 100
df_grouped['cr_to_ride'] = df_grouped['order_flag'] / df_grouped['registration_dt'] * 100
df_grouped['cr_to_not_approve'] = df_grouped['not_approve_dt'] / df_grouped['registration_dt'] * 100


df_grouped.query("newbie_flag == 1")

Unnamed: 0,group_id,newbie_flag,registration_dt,show_dt,click_dt,approve_dt,not_approve_dt,order_flag,cr_to_approve,cr_to_ride,cr_to_not_approve
1,0,1,8169,5179,5052,2856,722,1903,34.96,23.3,8.84
3,1,1,966,851,788,409,106,256,42.34,26.5,10.97


In [84]:
indicators = ['approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv']
res_df = pd.DataFrame()

for metric in indicators:

    if metric in ['rides', 'gmv']:
        ttest = expab.ttest(
            df[(df['newbie_flag']==1)&(df['reg_same_day_flag']==1)][['user_id', 'group_id', 'os_name', 'approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv']],
            metric,
            'group_id'
            )
        ttest['criteria'] = 'ttest'
        res_df = pd.concat([res_df, ttest])

    else:
        ztest = expab.ztest_proportion(
            df[(df['newbie_flag']==1)&(df['reg_same_day_flag']==1)][['user_id', 'group_id', 'os_name', 'approve_flag', 'not_approve_flag', 'order_flag', 'rides', 'gmv']],
            metric,
            'group_id'
            )
        ztest['criteria'] = 'ztest'
        res_df = pd.concat([res_df, ztest])

    res_df['significance'] = (res_df['pvalue']<0.05)*1
    res_df['result_with_corr'] = method_benjamini_hochberg(res_df['pvalue'].values)

res_df

Unnamed: 0,metric_name,group0_sample_size,group1_sample_size,group0,group1,statistic,pvalue,mean0,mean1,diff_mean,diff_mean_%,lb,ub,lb_%,ub_%,criteria,significance,result_with_corr
0,approve_flag,8169,966,0,1,-4.4,0.0,0.35,0.42,0.07,21.1,0.04,0.11,11.71,30.49,ztest,1,1
0,not_approve_flag,8169,966,0,1,-1.41,0.16,0.09,0.1,0.01,16.81,-0.01,0.03,-6.48,40.1,ztest,0,0
0,order_flag,8169,966,0,1,-2.14,0.03,0.23,0.27,0.03,13.76,0.0,0.06,1.18,26.34,ztest,1,0
0,rides,8169,966,0,1,-2.27,0.02,0.57,0.74,0.16,28.01,0.02,0.3,3.79,52.23,ttest,1,0
0,gmv,8169,966,0,1,-1.91,0.06,3.03,3.66,0.63,20.72,-0.02,1.27,-0.51,41.96,ttest,0,0
