# Packages

In [1]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.3f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
from math import ceil

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """Run ``query`` on BigQuery and return the result as a DataFrame.

    A new ``bigquery.Client`` bound to ``project`` is created on every call.
    """
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """Render the given DataFrames next to each other in the notebook output.

    Each positional argument must expose ``to_html()``; the generated tables
    are concatenated and given an inline display style so they share a row.
    """
    # Imported locally: the notebook's import cell only brings in `display`
    # and `HTML` from IPython.display, so `display_html` was an undefined name.
    from IPython.display import display_html

    html_str = ''.join(df.to_html() for df in args)
    display_html(
        # Patch only the opening tag; replacing every 'table' substring would
        # also rewrite closing tags and any cell text containing that word.
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True,
    )

def cycle_sql(start, end, query, weeks=False):
    """Run ``query`` once per date between ``start`` and ``end`` and stack the results.

    ``query`` must contain a ``{date}`` placeholder; it is filled with each
    date of the cycle (formatted as ``YYYY-MM-DD``) via ``str.format``.

    Parameters:
        start, end: inclusive bounds as ``'YYYY-MM-DD'`` strings.
        query: SQL template containing ``{date}``.
        weeks: when truthy, step in whole weeks from ``start`` instead of days.

    Returns:
        One DataFrame with the per-date results stacked newest date first
        (the order the original loop produced). An empty DataFrame is
        returned when the range contains no dates.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    span_days = (date_end - date_start).days

    if weeks:
        # Number of whole weeks that fit into the day span.
        offsets = [timedelta(weeks=x) for x in range(span_days // 7 + 1)]
    else:
        offsets = [timedelta(days=x) for x in range(span_days + 1)]
    daterange = [(date_start + offset).strftime('%Y-%m-%d') for offset in offsets]

    # Collect the frames and concatenate once: concatenating inside the loop
    # re-copies everything accumulated so far on every iteration.
    frames = []
    for counter, date in enumerate(daterange, start=1):
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date=date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        return pd.DataFrame()
    # Reverse so that the latest date comes first, as before.
    return pd.concat(frames[::-1])

def writing_excel(name: str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four DataFrames into ``{name}.xlsx``, one sheet each.

    Sheets are named ``"1-{name}"``, ``"2-{name}"``, ... in argument order and
    keep the DataFrame index. Datasets are taken up to the first ``None``:
    anything passed after a ``None`` argument is ignored, which matches the
    previous nested-``if`` behaviour.
    """
    to_write = []
    for dataset in (dataset1, dataset2, dataset3, dataset4):
        if dataset is None:
            break
        to_write.append(dataset)

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for position, dataset in enumerate(to_write, start=1):
            dataset.to_excel(writer, sheet_name=f"{position}-{name}")

    # Reported only after the writer has been closed, i.e. after the save.
    print('DataFrame is written to Excel File successfully.')

## Functions

In [2]:
def method_benjamini_hochberg(
    pvalues: np.ndarray,
    alpha: float = 0.05
    ) -> np.ndarray:
    """Apply the Benjamini-Hochberg procedure for multiple hypothesis testing.

    The procedure is step-up: with the p-values sorted ascending, find the
    largest rank ``k`` such that ``p_(k) <= k * alpha / m`` and reject every
    hypothesis ranked ``1..k`` - including those whose own p-value exceeds
    its own threshold. (The previous loop stopped at the first failing rank,
    which rejects too few hypotheses.)

    Parameters:
        pvalues: 1-D array-like of p-values.
        alpha: target false discovery rate.

    Returns:
        Integer array aligned with ``pvalues``: 1 where the hypothesis is
        rejected, 0 otherwise.
    """
    pvalues = np.asarray(pvalues, dtype=float)
    m = len(pvalues)
    res = np.zeros(m, dtype=int)
    if m == 0:
        return res

    array_alpha = np.arange(1, m + 1) * alpha / m
    sorted_pvalue_indexes = np.argsort(pvalues)
    # Ranks (0-based) whose sorted p-value is under its own threshold.
    passing_ranks = np.nonzero(pvalues[sorted_pvalue_indexes] <= array_alpha)[0]
    if passing_ranks.size:
        last_rank = passing_ranks[-1]
        res[sorted_pvalue_indexes[:last_rank + 1]] = 1
    return res

# Shapiro-Wilk test & Distributions
def check_normality(df, group_column, value_column):
    """Print a Shapiro-Wilk normality verdict for every group in ``df``.

    For each distinct value of ``group_column`` the non-null values of
    ``value_column`` are tested; the W statistic, the p-value and a verdict
    at the 0.05 level are printed. Nothing is returned.
    """
    for label in df[group_column].unique():
        sample = df.loc[df[group_column] == label, value_column].dropna()
        w_stat, p_value = stats.shapiro(sample)
        print(f'Group {label}: W={w_stat:.4f}, p-value={p_value:.4f}')
        verdict = (
            f'Group {label}, Metric: {value_column}: Data is normal distributed'
            if p_value > 0.05
            else f'Group {label}, Metric: {value_column}: Data is not normal distributed'
        )
        print(verdict)

def plot_distribution(df, group_column, value_column):
    """Draw a 2x2 overview of ``value_column`` split by ``group_column``.

    Top row: a combined histogram with KDE coloured by group, and a boxplot
    per group. Bottom row: one histogram for each of the first two groups
    found in ``group_column``. The figure is shown; nothing is returned.
    """
    groups = df[group_column].unique()
    fig, axes = plt.subplots(2, 2, figsize=(14, 10), gridspec_kw={'height_ratios': [1, 1.5]})
    combined_ax, box_ax = axes[0]

    sns.histplot(data=df, x=value_column, hue=group_column, kde=True, bins=30, alpha=0.4, ax=combined_ax)
    combined_ax.set_title("Graph + KDE")
    combined_ax.set_xlabel(value_column)
    combined_ax.set_ylabel("Frequence")

    sns.boxplot(data=df, x=group_column, y=value_column, ax=box_ax)
    box_ax.set_title("Boxplot grouped")
    box_ax.set_xlabel(group_column)
    box_ax.set_ylabel(value_column)

    # One panel per group; colour and y-label follow the original per-panel settings.
    panel_settings = (('blue', "frequence"), ('orange', "Frequence"))
    for position, (color, y_label) in enumerate(panel_settings):
        group = groups[position]
        panel = axes[1, position]
        group_values = df[df[group_column] == group][value_column]
        sns.histplot(group_values, bins=30, kde=True, color=color, alpha=0.5, ax=panel)
        panel.set_title(f'Hist for the {group}')
        panel.set_xlabel(value_column)
        panel.set_ylabel(y_label)

    plt.tight_layout()
    plt.show()

# Levene's & Bartlett's test
def levene(df, indicator, metric):
    """Print whether groups 0 and 1 have equal variance of ``indicator``.

    Runs Levene's test on the ``indicator`` column for the rows with
    ``group_name == 0`` versus ``group_name == 1``. ``metric`` is forwarded
    as the ``center`` argument of the test and echoed in the printed message.
    The decision uses a fixed 0.05 level; nothing is returned.
    """
    control = df.loc[df['group_name'] == 0, indicator]
    treatment = df.loc[df['group_name'] == 1, indicator]
    _, p_value = st.levene(control, treatment, center=metric)

    significance_level = 0.05
    if p_value <= significance_level:
        print(f"Variance are from the different population on {metric}")
    else:
        print(f"Variance are from the same population on {metric}")
    
# Cohen's D
def cohens_d(df, metric):
    """Return Cohen's d for ``metric`` between group 1 and group 0.

    Groups are selected by ``group_name``; the effect size is the difference
    of the group means (group 1 minus group 0) divided by the pooled sample
    standard deviation (``ddof=1``).
    """
    treatment = df[df['group_name'] == 1][metric]
    control = df[df['group_name'] == 0][metric]

    size_t, size_c = len(treatment), len(control)
    var_t = np.std(treatment, ddof=1) ** 2
    var_c = np.std(control, ddof=1) ** 2

    pooled_variance = ((size_t - 1) * var_t + (size_c - 1) * var_c) / (size_t + size_c - 2)
    mean_difference = np.mean(treatment) - np.mean(control)

    return mean_difference / np.sqrt(pooled_variance)

# SRM
def srm(df):
    """Check every city for a sample ratio mismatch between groups 0 and 1.

    For each distinct ``city_name`` the non-null ``user_id`` values are
    counted per ``group_name`` (0 and 1) and compared against an even 50/50
    split with a chi-square goodness-of-fit test. A p-value below 0.01 is
    reported as a possible mismatch; otherwise the city and its p-value are
    also printed.

    Returns:
        DataFrame with one row per city and the columns ``city_name``,
        ``sample_sizes``, ``total_size``, ``expected_sizes``, ``chi_value``
        (the test p-value rounded to 3 decimals) and ``conclusion``, sorted
        by ``city_name`` then ``total_size`` in descending order. An empty
        DataFrame is returned when ``df`` has no cities.
    """
    rows = []

    for city in df['city_name'].unique():
        # Boolean masks instead of a query string, so city names containing
        # quotes cannot break the expression.
        in_city = df['city_name'] == city
        observed = [
            df.loc[in_city & (df['group_name'] == group), 'user_id'].count()
            for group in (0, 1)
        ]

        total_traffic = sum(observed)
        expected = [total_traffic / 2, total_traffic / 2]

        p_value = st.chisquare(observed, f_exp=expected)[1]

        if p_value < 0.01:
            conclusion = "Sample ratio mismatch (SRM) may be present"
        else:
            conclusion = "Sample ratio mismatch (SRM) probably not present"
            print(f"{city}, {p_value}")

        rows.append([city, observed, total_traffic, expected, round(p_value, 3), conclusion])

    if not rows:
        return pd.DataFrame()

    # Build and sort once instead of concatenating and re-sorting per city.
    srm_df = pd.DataFrame(
        rows,
        columns=['city_name', 'sample_sizes', 'total_size', 'expected_sizes', 'chi_value', 'conclusion'],
    )
    return srm_df.sort_values(['city_name', 'total_size'], ascending=False).reset_index(drop=True)

# Calculating the significance by city
def _collect_metric_tests(df_cr, df_abs):
    """Run the conversion z-test and the rides/gmv/orders t-tests and stack the results."""
    # NOTE(review): `ztest_proportion` and `ttest` are used as bare names here,
    # exactly as before; they are not defined in the visible part of the file
    # (elsewhere the `expab.` versions are used) - confirm they are in scope.
    conversion = ztest_proportion(df_cr, 'has_ride', 'group_name')
    conversion['metric'] = 'Conversion'
    conversion['cohen_d'] = cohens_d(df_cr, 'has_ride')

    frames = [conversion]
    for column in ('rides', 'gmv', 'orders'):
        quantitative = ttest(df_abs, column, 'group_name')
        quantitative['metric'] = 'Quantitive'
        quantitative['cohen_d'] = cohens_d(df_abs, column)
        frames.append(quantitative)

    return pd.concat(frames)


def _label_and_flag(df_tests, region, segment):
    """Tag a block of test results with its region/segment and significance flags."""
    df_tests['region'] = region
    df_tests['segment'] = segment
    df_tests['significance'] = (df_tests['pvalue'] < 0.05) * 1
    # Holds the 0/1 output of method_benjamini_hochberg, computed within this block only.
    df_tests['corrected_pvalue'] = method_benjamini_hochberg(df_tests['pvalue'].values)
    return df_tests


def calcualate_result(df_cr, df_abs):
    """Test conversion and volume metrics per city and for all cities together.

    For every distinct ``city_name`` in ``df_cr`` (and once more for the full
    data) four tests are run: a proportion z-test on ``has_ride`` from
    ``df_cr`` and t-tests on ``rides``, ``gmv`` and ``orders`` from
    ``df_abs``, each with its Cohen's d. Every block of four results gets
    ``region``, ``segment``, a raw ``significance`` flag (p < 0.05) and a
    Benjamini-Hochberg flag in ``corrected_pvalue``.

    The "Total" block is now flagged from its own p-values; previously it
    reused the p-values of the last processed city.

    Returns:
        DataFrame with the per-city blocks (``segment == 'By city'``)
        followed by the overall block (``region == 'All'``,
        ``segment == 'Total'``).
    """
    blocks = []

    for city in df_cr['city_name'].unique():
        city_cr = df_cr[df_cr['city_name'] == f'{city}']
        city_abs = df_abs[df_abs['city_name'] == f'{city}'].copy()
        blocks.append(_label_and_flag(_collect_metric_tests(city_cr, city_abs), city, 'By city'))

    blocks.append(_label_and_flag(_collect_metric_tests(df_cr, df_abs), 'All', 'Total'))

    return pd.concat(blocks)

def sequential_wald_test(df, date_col, metric_col, group_col, user_col, alpha=0.05, beta=0.2):
    """Track a two-group experiment date by date and look for an early stopping point.

    Per date and group, unique ``user_col`` values are counted and
    ``metric_col`` is summed; both are accumulated over time per group.
    For every date a two-sided Wald test on the difference of the cumulative
    proportions (group 0 vs group 1) gives a p-value, and a statistic called
    ``LLR`` is compared with two boundaries derived from ``alpha`` and
    ``beta``. The loop stops at the first date where a boundary is crossed.
    A plot of the p-values is shown as a side effect.

    Parameters:
        df: input rows; must contain the four named columns.
        date_col, metric_col, group_col, user_col: column names.
            ``group_col`` is expected to hold 0 (A) and 1 (B).
        alpha, beta: error rates used for the boundaries.

    Returns:
        DataFrame with the merged cumulative A/B columns for the processed
        dates plus ``p_value``, ``LLR``, ``A/B`` (the two boundaries as a
        string) and ``alpha_threshold``.

    NOTE(review): ``LLR`` is ``log(p_B / p_A)``, i.e. the log of the ratio of
    cumulative rates, not a likelihood ratio - confirm this is the intended
    statistic to compare against the Wald boundaries.
    """
    
    # Lower (A) and upper (B) stopping boundaries, rounded to 2 decimals.
    A = np.round(np.log(beta / (1 - alpha)), 2)   
    B = np.round(np.log((1 - beta) / alpha), 2) 
    
    # Daily totals per group: distinct users and summed metric.
    df_grouped = df.groupby([date_col, group_col]).agg(
        users=(user_col, 'nunique'), 
        conversions=(metric_col, 'sum') 
    ).reset_index()

    # Running totals per group.
    # NOTE(review): summing per-day distinct counts counts a user once for
    # every day they appear - verify that is acceptable for `cum_users`.
    df_grouped["cum_users"] = df_grouped.groupby(group_col)["users"].cumsum()
    df_grouped["cum_conversions"] = df_grouped.groupby(group_col)["conversions"].cumsum()

    # Split into group 0 (suffix _A) and group 1 (suffix _B).
    df_A = df_grouped[df_grouped[group_col] == 0].drop(columns=[group_col]).rename(
        columns={"users": "users_A", "conversions": "conv_A", "cum_users": "cum_users_A", "cum_conversions": "cum_conv_A"}
    )
    df_B = df_grouped[df_grouped[group_col] == 1].drop(columns=[group_col]).rename(
        columns={"users": "users_B", "conversions": "conv_B", "cum_users": "cum_users_B", "cum_conversions": "cum_conv_B"}
    )

    # NOTE(review): a date missing for one group gets 0 in that group's
    # cumulative columns too (fillna(0)), not the previous running total.
    df_merged = pd.merge(df_A, df_B, on=date_col, how="outer").fillna(0)

    print("Колонки в df_merged:", df_merged.columns)

    p_values, llr_values = [], []
    stop_day = None

    for i in range(len(df_merged)):
        try:
            users_A, conv_A = df_merged.loc[i, ["cum_users_A", "cum_conv_A"]]
            users_B, conv_B = df_merged.loc[i, ["cum_users_B", "cum_conv_B"]]

            # Cumulative rates; 0 when a group has no users yet.
            p_A = conv_A / users_A if users_A > 0 else 0
            p_B = conv_B / users_B if users_B > 0 else 0

            r = test_proportions_2indep(
                conv_A, users_A,
                conv_B, users_B,
                value=0,
                method='wald',
                compare='diff',
                alternative='two-sided',
                return_results=True
            )

            p_value = r.pvalue
            p_values.append(p_value)

            # Falls back to 0 when either rate is 0.
            llr = np.log(p_B / p_A) if p_B > 0 and p_A > 0 else 0
            llr_values.append(llr)

            # The stopping row is already appended above, so it is part of the output.
            if llr <= A:
                stop_day = df_merged.loc[i, date_col]
                print(f"On {stop_day} might be stopped: LLR={llr:.3f} <= {A:.3f} (Accept H0)")
                break
            elif llr >= B:
                stop_day = df_merged.loc[i, date_col]
                print(f"On {stop_day} might be stopped: LLR={llr:.3f} >= {B:.3f} (Accept H1)")
                break

        except Exception as e:
            # Any failure for a date is reported and recorded as NaN; the loop continues.
            # NOTE(review): if the failure happens after p_values.append above,
            # this adds a second p-value for the same date - verify.
            print(f"⚠️ Ошибка на дне {df_merged.loc[i, date_col]}: {e}")
            p_values.append(np.nan)
            llr_values.append(np.nan)

    # Build the results DataFrame (only the dates processed before stopping)
    df_results = df_merged.iloc[:len(p_values)].copy()
    df_results["p_value"] = p_values
    df_results["LLR"] = llr_values
    df_results["A/B"] = str([A, B])
    df_results["alpha_threshold"] = np.linspace(alpha, alpha / np.sqrt(len(df_results)), len(p_values))  # alpha correction: linear from alpha down to alpha / sqrt(n)

    # Plot the results
    plt.figure(figsize=(12, 6))
    plt.plot(df_results[date_col], df_results["p_value"], label="P-value", marker="o")
    plt.plot(df_results[date_col], df_results["alpha_threshold"], label="Corrected Alpha", linestyle="dashed")
    # The line is drawn at `alpha`; the legend text is fixed to 0.05 regardless.
    plt.axhline(y=alpha, color="red", linestyle="--", label="Standard Alpha (0.05)")
    plt.xticks(rotation=45)
    plt.xlabel("Date")
    plt.ylabel("P-Value")
    plt.title("P-value daily vs. Corrected Alpha")
    plt.legend()
    plt.grid()
    plt.show()

    return df_results

def calculate_criteria(df):
    """Test ``has_ride``, ``gmv`` and ``rides`` overall and for every city.

    ``has_ride`` is tested with ``expab.ztest_proportion``; ``gmv`` and
    ``rides`` with ``expab.ttest``; all compare groups by ``group_name``.
    The overall results (labelled ``'all the cities together'``) come first,
    followed by the results for each distinct ``city_name``.

    Returns:
        DataFrame of all test results with ``city_name``,
        ``corrected_pvalue`` (output of ``expab.method_benjamini_hochberg``
        over every p-value in the table) and ``significance`` (1 when the
        raw p-value is below 0.05).
    """
    def run_test(frame, metric, label):
        # Conversion flag -> proportion z-test; volume metrics -> t-test.
        test = expab.ztest_proportion if metric == 'has_ride' else expab.ttest
        result = test(frame, metric, 'group_name')
        result['city_name'] = label
        return result

    frames = [
        run_test(df, metric, 'all the cities together')
        for metric in ('has_ride', 'gmv', 'rides')
    ]

    for city in df['city_name'].unique():
        # Boolean mask instead of a query string, so city names containing
        # apostrophes cannot break the expression.
        city_rows = df[df['city_name'] == city]
        for metric in ('has_ride', 'rides', 'gmv'):
            frames.append(run_test(city_rows, metric, city))

    # Single concat instead of growing the frame inside the loops.
    df_res_1 = pd.concat(frames)

    df_res_1['corrected_pvalue'] = expab.method_benjamini_hochberg(df_res_1['pvalue'].values)
    df_res_1['significance'] = (df_res_1['pvalue'] < 0.05) * 1

    return df_res_1

def calculate_numbers(df):
    """Summarise users, riders, rides and GMV per experiment group and city.

    Rows are grouped by ``group_name`` and ``city_name``; ``user_id`` is
    counted and ``has_ride``, ``rides`` and ``gmv`` are summed. Group labels
    ``0``/``1`` become ``Control``/``Treatment``, and the ride conversion is
    added both as a fraction (``cr_ride``) and as a percent string
    (``cr_ride_%``). The result is sorted by city, group and user count.
    """
    selected_columns = ['user_id', 'has_ride', 'rides', 'orders', 'gmv']
    aggregations = {
        'user_id': 'count',
        'has_ride': 'sum',
        'rides': 'sum',
        'gmv': 'sum',
    }

    summary = (
        df.groupby(['group_name', 'city_name'], as_index=False)[selected_columns]
        .agg(aggregations)
        .sort_values(['city_name', 'group_name', 'user_id'], ascending=True)
    )

    group_labels = {'0': 'Control', '1': 'Treatment'}
    summary['group_name'] = summary['group_name'].astype(str).replace(group_labels)

    conversion = summary['has_ride'] / summary['user_id']
    summary['cr_ride_%'] = np.round(conversion * 100, 2).astype(str) + '%'
    summary['cr_ride'] = np.round(conversion, 5)

    return summary

# Design

### ID & Liveness hist exps

In [None]:
# liveness = read_bq("""
# DECLARE start_dt DATE;
# DECLARE end_dt DATE;
# SET start_dt = '2025-03-07';
# SET end_dt = '2025-03-24';
# WITH old_users AS (
#     SELECT 
#         DISTINCT user_id
#     FROM 
#         indriver-e6e40.emart.incity_detail
#     WHERE 1=1
#         AND DATE(created_date_order_part) < start_dt
#         AND city_id IN (6587,4230,5495,4272,4396)
#     GROUP BY 1
# ),
# ab as (
#     SELECT user_id,
#            max(case
#                    when group_id = 4535352 then 0
#                    when group_id = 4535353 then 1
#                end) has_treatment
#     FROM 
#         indriver-e6e40.ss_ab_platform_mart.markup_users
#     WHERE 
#         group_id IN (4535352, 4535353)
#         AND test_id = 2699
#         AND user_id NOT IN (SELECT user_id FROM old_users)
#     GROUP BY 1 
# ),
# appeals AS (
#     SELECT
#         target_id user_id,
#         COUNT(DISTINCT uuid) cnt_appeals
#     FROM
#         indriver-e6e40.ods_moderation_feed_red_pill.appeal t1
#     LEFT JOIN
#         indriver-e6e40.ods_ds_moderation_system_cdc.violation_review_v3 t3 ON t1.uuid = JSON_EXTRACT_SCALAR(t3.payload, '$.uuid')
#     WHERE
#         DATE(t1.created_at) BETWEEN start_dt AND end_dt
#         AND DATE(t3.export_raw_dt) BETWEEN start_dt AND end_dt
#         --AND t1.initiator_id = 1 --жалоба от пассажира водиле
#         AND t1.initiator_id = 0 --жалоба от водилы пассажиру
#         AND t1.city_id IN (6587,4230,5495,4272,4396)
#         AND JSON_EXTRACT_SCALAR(t3.model, '$.result.top_category[0]') NOT IN ('CATEGORY_TEXT_NOT_RECOGNISED',
#                                                                          'CATEGORY_LOCATION_DISPUTE',
#                                                                          'CATEGORY_RIDE_REFUSAL',
#                                                                          'CATEGORY_CANCELLED_BY_DRIVER_REQUEST',
#                                                                          'CATEGORY_BARGAINING_AFTER_ACCEPT',
#                                                                          'CATEGORY_PASSENGER_WAS_LATE',
#                                                                          'CATEGORY_DRIVER_WAS_LATE',
#                                                                          'CATEGORY_APP_PROBLEM',
#                                                                          'CATEGORY_POSITIVE_REVIEW',
#                                                                          'CATEGORY_DIFFERENT_CAR',
#                                                                          'CATEGORY_DRIVER_REPORTED_CAR_MALFUNCTION',
#                                                                          'CATEGORY_STRANGER_IN_CAR')
#     GROUP BY 1
# ),
# reviews AS (
#     SELECT
#         target_id as user_id,
#         COUNT(DISTINCT r.uuid) cnt_reviews
#     FROM 
#         indriver-e6e40.ods_moderation_feed_red_pill.review r
#     LEFT JOIN 
#         indriver-e6e40.ods_ds_moderation_system_cdc.violation_review_v3 src on JSON_EXTRACT_SCALAR(src.payload, '$.uuid') = r.uuid
#     WHERE
#         DATE(r.created_at) BETWEEN start_dt AND end_dt
#         AND DATE(src.export_raw_dt) BETWEEN start_dt AND end_dt
#         AND rating < 5
#         AND r.visibility_id = 1 --жалобы от водителя на пассажира 
#         AND r.city_id IN (6587,4230,5495,4272,4396)
#         AND JSON_EXTRACT_SCALAR(src.model, '$.result.top_category[0]') NOT IN ('CATEGORY_TEXT_NOT_RECOGNISED',
#                                                                              'CATEGORY_DIRTY_CABIN',
#                                                                              'CATEGORY_SUSPICIOUS_AREA', --УДАЛИТЬ?
#                                                                              'CATEGORY_ASSAULT', --УДАЛИТЬ?
#                                                                              'CATEGORY_BARGAINING_AFTER_ACCEPT',
#                                                                              'CATEGORY_POSITIVE_REVIEW',
#                                                                              'CATEGORY_DIFFERENT_CAR',
#                                                                              'CATEGORY_PASSENGER_REPORTED_CAR_MALFUNCTION',
#                                                                              'CATEGORY_NO_CHANGE',
#                                                                              'CATEGORY_DANGEROUS_DRIVING',
#                                                                              'CATEGORY_CANCELLED_BY_PASSENGER_REQUEST')
#     GROUP BY 1
# ),
# orders_raw AS (
#     SELECT 
#         DISTINCT
#         order_uuid, 
#         user_id
#     FROM indriver-e6e40.emart.incity_detail
#     WHERE 
#         created_date_order_part BETWEEN start_dt AND end_dt
#         AND city_id IN (6587,4230,5495,4272,4396)
# ),
# support_raw AS (
#     SELECT
#         DISTINCT
#         t1.id support_id,
#         lower(t4.order_id) order_uuid
#     FROM 
#         indriver-e6e40.ods_customer_support.request t1
#     JOIN 
#         indriver-bi.customer_service.tbl_customer_support_chats_just_detail t2 ON t1.id = t2.request_id
#     JOIN
#         dwh-storage-327422.ods_customer_support.chat_request_entry t4 ON t1.id = t4.request_id
#     JOIN
#         (select distinct country_id, country_name, city_id from indriver-bi.heap.vw_geo_mapping) t3 ON t2.country_name = t3.country_name
#     WHERE
#         DATE(t1.created_dt_part) BETWEEN start_dt AND end_dt
#         AND t4.created_dt_part BETWEEN start_dt AND end_dt
#         AND contact_category is not null
#         AND contact_reason is not null
#         AND who_contacts = 'Driver'
#         AND contact_category IN ('Complaints against Passenger', 'Safety')
#         AND t4.city_id IN (6587,4230,5495,4272,4396)
# ),
# support AS (
#     SELECT
#         orders_raw.user_id,
#         COUNT(DISTINCT support_id) cnt_support
#     FROM
#         orders_raw
#     JOIN
#         support_raw ON orders_raw.order_uuid = support_raw.order_uuid
#     GROUP BY 1
# )
# SELECT
#     ab.user_id,
#     ab.has_treatment,
#     appeals.cnt_appeals,
#     reviews.cnt_reviews,
#     support.cnt_support
# FROM
#     ab
# LEFT JOIN
#     appeals ON ab.user_id = appeals.user_id
# LEFT JOIN
#     reviews ON ab.user_id = reviews.user_id
# LEFT JOIN
#     support ON ab.user_id = support.user_id
# """)

# id = read_bq("""

# """)

Unnamed: 0,user_id,group_id,created_dt_part,os_name,city_id,city_name,country_id,country_name,newbie_flag,filled_flow,show_dt,click_dt,approve_dt,approve_flag,not_approve_dt,not_approve_flag,order_timestamp,order_flag,gmv,rides,orders
0,10977349,0,2025-05-15,,4267,Arica,25,Chile,0,,NaT,NaT,NaT,0,NaT,0,2025-05-15 12:11:06+00:00,1,69.03,24,41
1,11354201,0,2025-05-18,,4200,Santiago,25,Chile,0,,NaT,NaT,NaT,0,NaT,0,2025-05-18 23:33:50+00:00,1,120.7,17,22
2,12499562,0,2025-05-15,android,4200,Santiago,25,Chile,1,liveness,2025-05-15 12:40:40.313000+00:00,2025-05-15 12:40:51.559000+00:00,2025-05-15 12:41:50.602000+00:00,1,NaT,0,2025-05-15 12:41:54+00:00,1,11.24,3,5
3,12736336,0,2025-05-16,,4200,Santiago,25,Chile,0,,NaT,NaT,NaT,0,NaT,0,2025-05-16 13:25:16+00:00,1,46.96,6,10
4,13132543,0,2025-05-18,,4267,Arica,25,Chile,0,,NaT,NaT,NaT,0,NaT,0,NaT,0,2.57,1,1


### Soft, medium, hard launch: MDE calculation

- Жесткий (опасный) сценарий: берем все опасные и большие города и запускаем 50/50 на 2-4 недели
- Средний сценарий: берем 50% опасных городов и запускаем 50/50 на 2-4 недели
- Мягкий сценарий: берем 10% опасных городов и запускаем 30/70 на 2-4 недели

#### Cities with metrics

In [None]:
# City-level baseline table, one row per city.
# - `incidents` CTE: per city and month (incident_date >= 2025-01-01, country_id IN (24, 12, 22, 25)),
#   total and confirmed incident counts joined to monthly passenger rides, with rates per 100,000 rides.
# - `metric` CTE: per city and month, distinct users, newbies (users whose metric_date equals their
#   earliest metric_date since 2023-01-01), share of newbies, rides, GMV and rides per user row.
# - Final SELECT: monthly values averaged per city, plus lower / middle / upper
#   = AVG(newbies) * 0.5 * (0.25 / 0.3 / 0.35) and the same three numbers as a text interval.
# NOTE(review): the meaning of the 0.5 and 0.25-0.35 factors is not stated here - confirm with the author.
df_cities = read_bq("""
WITH incidents AS (SELECT t1.city_id,
                          t1.city_name,
                          geo.country_id,
                          geo.country_name,
                          t1.monthly,
                          SUM(rides)                                                                 AS rides,
                          SUM(incidents)                                                             AS incidents,
                          SUM(conf_incidents)                                                        AS confirmed_incidents,
                          SUM(SAFE_DIVIDE(t1.incidents, t2.rides) * 100000) / COUNT(t1.city_id)      AS inc_rate,
                          SUM(SAFE_DIVIDE(t1.conf_incidents, t2.rides) * 100000) / COUNT(t1.city_id) AS conf_inc_rate
                   FROM (SELECT DATE_TRUNC(incident_date, MONTH)                              AS monthly,
                                t1.city_id,
                                t1.city_name,
                                COUNT(redmine_id)                                             AS incidents,
                                COUNT(IF(information_status = 'Confirmed', redmine_id, NULL)) AS conf_incidents
                         FROM indriver-bi.safety.vw_safety_incidents_detail t1
                                  JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                       ON t1.city_id = t2.city_id
                         WHERE incident_date >= '2025-01-01'
                           AND t2.country_id IN (24, 12, 22, 25)
                         GROUP BY 1, 2, 3) t1
                            JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                                 ON t1.city_id = geo.city_id
                            LEFT JOIN (SELECT t1.city_id,
                                              DATE_TRUNC(metric_date, MONTH) AS monthly,
                                              SUM(rides_count)               AS rides
                                       FROM indriver-bi.incity.tbl_incity_growth_metrics_detail t1
                                                JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                                     ON t1.city_id = t2.city_id
                                       WHERE user_type = 'pass'
                                         AND t2.country_id IN (24, 12, 22, 25)
                                         AND metric_date >= '2025-01-01'
                                       GROUP BY 1, 2) t2 ON t1.city_id = t2.city_id AND t1.monthly = t2.monthly
                   GROUP BY 1, 2, 3, 4, 5),
     metric AS (SELECT t1.city_id,
                       DATE_TRUNC(metric_date, MONTH)                                       AS monthly,
                       COUNT(DISTINCT t1.user_id)                                           AS total_users,
                       COUNT(DISTINCT IF(t1.metric_date = t3.first_ride, t3.user_id, NULL)) AS newbies,
                       COUNT(DISTINCT IF(t1.metric_date = t3.first_ride, t3.user_id, NULL)) /
                       COUNT(DISTINCT t1.user_id)                                           AS share_of_newbies,
                       SUM(rides_count)                                                     AS rides,
                       SUM(gmv_unclean_usd)                                                 AS gmv,
                       SUM(rides_count) / COUNT(t1.user_id)                                 AS avg_rides_by_one_user
                FROM indriver-bi.incity.tbl_incity_growth_metrics_detail t1
                         JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                              ON t1.city_id = t2.city_id
                         LEFT JOIN (SELECT user_id,
                                           MIN(metric_date) AS first_ride
                                    FROM indriver-bi.incity.tbl_incity_growth_metrics_detail t1
                                             JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                                  ON t1.city_id = t2.city_id
                                    WHERE user_type = 'pass'
                                      AND t2.country_id IN (24, 12, 22, 25)
                                      AND metric_date >= '2023-01-01'
                                    GROUP BY 1) t3 ON t1.user_id = t3.user_id
                WHERE user_type = 'pass'
                  AND t2.country_id IN (24, 12, 22, 25)
                  AND metric_date >= '2025-01-01'
                GROUP BY 1, 2)
SELECT t1.city_id,
       t1.city_name,
       t1.country_id,
       t1.country_name,
       AVG(t1.rides)                           AS rides,
       ROUND(AVG(t2.gmv), 2)                   AS gmv,
       ROUND(AVG(t2.avg_rides_by_one_user), 2) AS avg_rides_by_one_user,
       ROUND(AVG(t2.total_users), 0)           AS total_users,
       ROUND(AVG(t2.share_of_newbies), 2)      AS share_of_newbies,
       ROUND(AVG(t2.newbies), 0)               AS newbies,
       ROUND(AVG(t1.incidents), 0)             AS incidents,
       ROUND(AVG(t1.confirmed_incidents), 0)   AS confirmed_incidents,
       ROUND(AVG(t1.inc_rate), 2)              AS inc_rate,
       ROUND(AVG(t1.conf_inc_rate), 2)         AS conf_inc_rate,
       '(' || (ROUND(AVG(t2.newbies) * 0.5 * 0.25, 0) || ', ' || ROUND(AVG(t2.newbies) * 0.5 * 0.3, 0) || ', ' ||
               ROUND(AVG(t2.newbies) * 0.5 * 0.35, 0)) ||
       ')'                                     AS loss_interval_newbies,
       ROUND(AVG(t2.newbies) * 0.5 * 0.25, 0)  AS lower,
       ROUND(AVG(t2.newbies) * 0.5 * 0.3, 0)   AS middle,
       ROUND(AVG(t2.newbies) * 0.5 * 0.35, 0)  AS upper
FROM incidents t1
         JOIN metric t2
              ON t1.city_id = t2.city_id AND t1.monthly = t2.monthly
GROUP BY 1, 2, 3, 4
""")

# Preview the first rows of the result.
df_cities.head()

#### MDE

In [3]:


# City ids used to filter the MDE query below.
# The literal contains repeated ids (e.g. 4524 ends one row and starts the next);
# the query interpolates `tuple(set(list_of_cities))`, so duplicates are dropped there.
list_of_cities = [
    4199,4142,4197,4242,4143,4825,4404,4373,4198,4524,4200,4540, 
    4255,4257,4252,4231,4538,4258,4263,4229,4194,4193,4226,4516,4228,5543,4244,4266,4271,4545,4148,4559,4519,4154,4374,4155,4196,5512,4236,4549,4542,4515,5536,4272,4377,4163,4269,4233,4178,4267,5568,4241,4261,4144,4264,4180,4354,5528,4230,4281,4555,5504,4143,4825,4404,4373,4198,4524, 
    4524,4200,4540,4375,4376,4385,4225,5548,5512,4194,4236,4241,4515,4244,5504,4263,4148,4180,5543,4271,4272,4196,4542,4229,4381,4240,4267,4266,5568,4516,4275,4226,4228,4155
]

# Per-user dataset for the MDE estimate.
# One row per user who fired 'client.verification_start.click' on/after 2025-04-01 in one of
# the selected cities (`users` CTE), enriched with:
#   * cnt_appeals   - distinct appeals where the user is the target (initiator_id = 0),
#   * cnt_reviews   - distinct reviews with rating < 5 where the user is the target,
#   * cnt_support   - distinct driver-initiated support requests tied to the user's orders,
#   * incident_flag - 1 if the user appears as pass_id of an incident with aggressor = 'Passenger'
#                     and status 'Confirmed' / 'Automated ML decision', else 0.
# The Russian notes embedded in the SQL text translate as:
#   "жалоба от пассажира водиле"      -> complaint from the passenger about the driver
#   "жалоба от водилы пассажиру"      -> complaint from the driver about the passenger
#   "жалобы от водителя на пассажира" -> complaints from the driver about the passenger
#   "УДАЛИТЬ?"                        -> "delete?"
# NOTE(review): `appeals` and `reviews` use LEFT JOIN but then filter on right-table columns
# (export_raw_dt, model) in WHERE, which drops unmatched rows - they behave like inner joins.
# NOTE(review): `users` keeps MAX(city_id) per user, so a user seen in several cities is
# attributed to the one with the largest id.
# NOTE(review): tuple(set(...)) would render as "(x,)" for a single-city list - presumably
# not valid in the IN clause; fine for the current multi-city list.
df = read_bq(f"""
WITH users AS (SELECT user_id, MAX(city_id) AS city_id
               FROM indriver-e6e40.ods_event_tracker.event
               WHERE 1 = 1
                 AND name IN (
                   'client.verification_start.click'
                   )
                 AND event_dt_part >= '2025-04-01'
                 AND city_id IN {tuple(set(list_of_cities))}
               GROUP BY 1)
   , incidents AS (SELECT DISTINCT pass_id AS agg_id
                   FROM indriver-bi.safety.vw_safety_incidents_detail
                   WHERE aggressor = 'Passenger'
                     AND information_status IN ('Confirmed', 'Automated ML decision')
                     AND pass_id IS NOT NULL
                     AND city_id IN {tuple(set(list_of_cities))})      -- City
   , appeals AS (SELECT target_id            user_id
                      , COUNT(DISTINCT uuid) cnt_appeals
                 FROM indriver-e6e40.ods_moderation_feed_red_pill.appeal t1
                          LEFT JOIN
                      indriver-e6e40.ods_ds_moderation_system_cdc.violation_review_v3 t3
                      ON t1.uuid = JSON_EXTRACT_SCALAR(t3.payload
                          , '$.uuid')
                 WHERE DATE(t1.created_at) BETWEEN '2025-04-01' AND CURRENT_DATE()
                   AND DATE(t3.export_raw_dt) BETWEEN '2025-04-01' AND CURRENT_DATE()
--AND t1.initiator_id = 1 --жалоба от пассажира водиле
                   AND t1.initiator_id = 0 --жалоба от водилы пассажиру
                   AND t1.city_id IN
                       {tuple(set(list_of_cities))}              -- City
                   AND JSON_EXTRACT_SCALAR(t3.model
                           , '$.result.top_category[0]') NOT IN
                       ('CATEGORY_TEXT_NOT_RECOGNISED', 'CATEGORY_LOCATION_DISPUTE', 'CATEGORY_RIDE_REFUSAL',
                        'CATEGORY_CANCELLED_BY_DRIVER_REQUEST', 'CATEGORY_BARGAINING_AFTER_ACCEPT',
                        'CATEGORY_PASSENGER_WAS_LATE', 'CATEGORY_DRIVER_WAS_LATE', 'CATEGORY_APP_PROBLEM',
                        'CATEGORY_POSITIVE_REVIEW', 'CATEGORY_DIFFERENT_CAR',
                        'CATEGORY_DRIVER_REPORTED_CAR_MALFUNCTION', 'CATEGORY_STRANGER_IN_CAR')
                 GROUP BY 1)
   , reviews AS (SELECT target_id AS           user_id
                      , COUNT(DISTINCT r.uuid) cnt_reviews
                 FROM indriver-e6e40.ods_moderation_feed_red_pill.review r
                          LEFT JOIN
                      indriver-e6e40.ods_ds_moderation_system_cdc.violation_review_v3 src
                      ON JSON_EXTRACT_SCALAR(src.payload
                             , '$.uuid') = r.uuid
                 WHERE DATE(r.created_at) BETWEEN '2025-04-01'
                     AND CURRENT_DATE()
                   AND DATE(src.export_raw_dt) BETWEEN '2025-04-01'
                     AND CURRENT_DATE()
                   AND rating
                     < 5
                   AND r.visibility_id = 1 --жалобы от водителя на пассажира
                   AND r.city_id IN {tuple(set(list_of_cities))} -- City
                   AND JSON_EXTRACT_SCALAR(src.model
                           , '$.result.top_category[0]') NOT IN
                       ('CATEGORY_TEXT_NOT_RECOGNISED', 'CATEGORY_DIRTY_CABIN', 'CATEGORY_SUSPICIOUS_AREA', --УДАЛИТЬ?
                        'CATEGORY_ASSAULT', --УДАЛИТЬ?
                        'CATEGORY_BARGAINING_AFTER_ACCEPT', 'CATEGORY_POSITIVE_REVIEW', 'CATEGORY_DIFFERENT_CAR',
                        'CATEGORY_PASSENGER_REPORTED_CAR_MALFUNCTION', 'CATEGORY_NO_CHANGE',
                        'CATEGORY_DANGEROUS_DRIVING', 'CATEGORY_CANCELLED_BY_PASSENGER_REQUEST')
                 GROUP BY 1)
   , orders_raw AS (SELECT DISTINCT order_uuid
                                  , user_id
                    FROM indriver-e6e40.emart.incity_detail
                    WHERE created_date_order_part BETWEEN '2025-04-01'
                        AND CURRENT_DATE()
                      AND city_id IN {tuple(set(list_of_cities))})     -- City
   , support_raw AS (SELECT DISTINCT t1.id              support_id
                                   , LOWER(t4.order_id) order_uuid
                     FROM indriver-e6e40.ods_customer_support.request t1
                              JOIN
                          indriver-bi.customer_service.tbl_customer_support_chats_just_detail t2
                          ON t1.id = t2.request_id
                              JOIN
                          dwh-storage-327422.ods_customer_support.chat_request_entry t4 ON t1.id = t4.request_id
                              JOIN
                          (SELECT DISTINCT country_id
                                         , country_name
                                         , city_id
                           FROM indriver-bi.heap.vw_geo_mapping) t3
                          ON t2.country_name = t3.country_name
                     WHERE DATE(t1.created_dt_part) BETWEEN '2025-04-01'
                         AND CURRENT_DATE()
                       AND t4.created_dt_part BETWEEN '2025-04-01'
                         AND CURRENT_DATE()
                       AND contact_category IS NOT NULL
                       AND contact_reason IS NOT NULL
                       AND who_contacts = 'Driver'
                       AND contact_category IN ('Complaints against Passenger', 'Safety')
                       AND t4.city_id IN {tuple(set(list_of_cities))}) -- City
   , support AS (SELECT orders_raw.user_id
                      , COUNT(DISTINCT support_id) cnt_support
                 FROM orders_raw
                          JOIN
                      support_raw
                      ON orders_raw.order_uuid = support_raw.order_uuid
                 GROUP BY 1)
SELECT t0.user_id,
       t0.city_id,
       COALESCE(t1.cnt_appeals, 0) AS cnt_appeals,
       COALESCE(t2.cnt_reviews, 0) AS cnt_reviews,
       COALESCE(t3.cnt_support, 0) AS cnt_support,
       IF(agg_id IS NULL, 0, 1)    AS incident_flag
FROM users t0
         LEFT JOIN appeals t1 ON t0.user_id = t1.user_id
         LEFT JOIN reviews t2 ON t0.user_id = t2.user_id
         LEFT JOIN support t3 ON t0.user_id = t3.user_id
         LEFT JOIN incidents t4 ON t0.user_id = t4.agg_id
""")

# Composite complaint metric: appeals + reviews + support contacts per user.
df['composed_metrics'] = df['cnt_appeals']+df['cnt_reviews']+df['cnt_support']

df.head()

Unnamed: 0,user_id,city_id,cnt_appeals,cnt_reviews,cnt_support,incident_flag,composed_metrics
0,302021628,4281,0,0,0,0,0
1,284485087,4374,0,0,0,0,0
2,296076598,4519,0,0,0,0,0
3,55299089,4377,0,0,0,0,0
4,173440584,4515,0,0,0,0,0


In [4]:
# Share of users with incident_flag = 1, in percent (incident_flag is 0/1, so the mean is a proportion).
df['incident_flag'].mean()*100

0.21349767469355427

In [6]:
# Mean of the composite complaint count per user, scaled by 100 (complaints per 100 users).
df['composed_metrics'].mean()*100

4.966634830603582

In [None]:
# NOTE(review): unexecuted scratch call with no arguments; the MDE cell further down calls
# expab.get_mde(mean, std, sizes, alpha=..., beta=...). Presumably left over from inspecting
# the function - confirm and remove if so.
expab.get_mde()

In [19]:
# Traffic available for the experiment: distinct users who fired
# 'client.verification_start.click' per day and city since 2025-02-01, with the week/month
# each day falls into and the city/country names from the macro-region mapping.
# NOTE: `users` is a per-day, per-city distinct count, so summing it across days (as the
# weekly/monthly aggregations do later) may count the same user more than once.
df_sample = read_bq(f"""
SELECT name,
       event_dt_part,
       DATE_TRUNC(event_dt_part, WEEK)  AS weekly,
       DATE_TRUNC(event_dt_part, MONTH) AS monthly,
       t1.city_id,
       t2.city_name,
       t2.country_id,
       t2.country_name,
       COUNT(DISTINCT user_id)          AS users
FROM indriver-e6e40.ods_event_tracker.event t1
         JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
              ON
                  t1.city_id = t2.city_id
WHERE 1 = 1
  AND name IN (
    'client.verification_start.click'
    )
  AND event_dt_part >= '2025-02-01'
  AND t1.city_id IN {tuple(set(list_of_cities))}
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8
""")

df_sample.head()

Unnamed: 0,name,event_dt_part,weekly,monthly,city_id,city_name,country_id,country_name,users
0,client.verification_start.click,2025-05-07,2025-05-04,2025-05-01,4199,Lima,24,Peru,5288
1,client.verification_start.click,2025-04-04,2025-03-30,2025-04-01,4267,Arica,25,Chile,359
2,client.verification_start.click,2025-04-04,2025-03-30,2025-04-01,4228,Tijuana,12,Mexico,875
3,client.verification_start.click,2025-04-04,2025-03-30,2025-04-01,4229,Leon de los Aldama,12,Mexico,299
4,client.verification_start.click,2025-05-03,2025-04-27,2025-05-01,4197,Bogota,22,Colombia,3552


In [None]:
# NOTE(review): unfinished scratch cell - Series.isin() requires a `values` argument, so this
# raises TypeError as written. Pass the list of city ids to filter by before running it.
df_sample[df_sample['city_id'].isin()]

In [75]:

# MDE (minimum detectable effect) per city tier, metric and experiment duration.
#
# For each tier the available sample is estimated from `df_sample` (daily / weekly /
# two-week / monthly traffic) and the metric mean/std are taken from the per-user frame `df`.
# NOTE(review): the tiers are not disjoint - e.g. 4143, 4825, 4404, 4373, 4198 and 4524 are in
# both `hard` and `medium`, and 4524, 4200, 4540 are in both `hard` and `low` - so those cities
# feed more than one tier's estimate. Confirm this is intended.
segment = ['hard', 'medium', 'low']
hard = [4199,4142,4197,4242,4143,4825,4404,4373,4198,4524,4200,4540]
medium = [4255,4257,4252,4231,4538,4258,4263,4229,4194,4193,4226,4516,4228,5543,4244,4266,4271,4545,4148,4559,4519,4154,4374,4155,4196,5512,4236,4549,4542,4515,5536,4272,4377,4163,4269,4233,4178,4267,5568,4241,4261,4144,4264,4180,4354,5528,4230,4281,4555,5504,4143,4825,4404,4373,4198,4524]
low = [4524,4200,4540,4375,4376,4385,4225,5548,5512,4194,4236,4241,4515,4244,5504,4263,4148,4180,5543,4271,4272,4196,4542,4229,4381,4240,4267,4266,5568,4516,4275,4226,4228,4155]
cities_by_segment = {'hard': hard, 'medium': medium, 'low': low}

# Output label -> column of `df` whose mean/std feed the MDE calculation.
# The order matches the row order of the result (incident, composed, reviews).
metric_columns = {
    'incident': 'incident_flag',
    'composed': 'composed_metrics',
    'reviews': 'cnt_reviews',
}

df_sample['event_dt_part'] = pd.to_datetime(df_sample['event_dt_part'])


def _expected_sample_sizes(sample, cities, cutoff='2025-05-20', share=0.4):
    """Return [daily, weekly, 2 * weekly, monthly] expected sample sizes for `cities`.

    Each size is the mean of `users` summed per period (day / week / month) over rows of
    `sample` dated on or before `cutoff`, rounded, multiplied by `share` and truncated to int.
    `users` is a per-day count, so weekly/monthly totals may count a user more than once.

    NOTE(review): `share` = 0.4 is presumably the fraction of traffic entering the test and
    `cutoff` presumably trims incomplete recent data - confirm both.

    Raises:
        ValueError: if no rows of `sample` match `cities` within the cutoff.
    """
    in_scope = sample[(sample['event_dt_part'] <= cutoff) & (sample['city_id'].isin(cities))]
    if in_scope.empty:
        # Without this guard the mean is NaN and int() fails with an unhelpful message.
        raise ValueError(f"no traffic rows on or before {cutoff} for the requested cities")

    def scaled_mean(period_col):
        return in_scope.groupby(period_col)['users'].sum().mean().round(0) * share

    daily = int(scaled_mean('event_dt_part'))
    weekly = int(scaled_mean('weekly'))
    monthly = int(scaled_mean('monthly'))
    return [daily, weekly, weekly * 2, monthly]


frames = []
for category in segment:
    cities = cities_by_segment[category]
    df_group = df[df['city_id'].isin(cities)]
    sizes = _expected_sample_sizes(df_sample, cities)

    for metric, column in metric_columns.items():
        mean = df_group[column].mean()
        std = df_group[column].std()
        # alpha / beta are passed straight through to expab; presumably the significance
        # level and type II error rate (i.e. 80% power) - verify against expab.
        mde = expab.get_mde(mean, std, sizes, alpha=0.1, beta=0.2)

        mde['mean'] = mean
        mde['std'] = std
        mde['Category'] = category
        mde['Metric'] = metric
        frames.append(mde)

# Single concat instead of growing `res` inside the loop; row order is unchanged.
res = pd.concat(frames)

res[res['Category']=='low']

Unnamed: 0,sample_size,mde_abs,mde_%,alpha,beta,mean,std,Category,Metric
0,4412,0.004,144.952,0.1,0.2,0.003,0.052,low,incident
1,28290,0.002,57.243,0.1,0.2,0.003,0.052,low,incident
2,56580,0.001,40.477,0.1,0.2,0.003,0.052,low,incident
3,120234,0.001,27.767,0.1,0.2,0.003,0.052,low,incident
0,4412,0.022,44.39,0.1,0.2,0.049,0.289,low,composed
1,28290,0.009,17.53,0.1,0.2,0.049,0.289,low,composed
2,56580,0.006,12.396,0.1,0.2,0.049,0.289,low,composed
3,120234,0.004,8.503,0.1,0.2,0.049,0.289,low,composed
0,4412,0.01,69.256,0.1,0.2,0.014,0.128,low,reviews
1,28290,0.004,27.35,0.1,0.2,0.014,0.128,low,reviews


In [76]:
# Relative difference (%) between two MDE percentages, one line per pair.
# The last pair (8.503 vs 27.767) matches the 'composed' and 'incident' mde_% values for the
# 'low' tier at the largest sample size; the other pairs are presumably the same comparison
# for the other tiers - confirm.
print(
    *(
        (composed - incident) / incident * 100
        for composed, incident in ((5.891, 21.706), (6.875, 23.815), (8.503, 27.767))
    ),
    sep='\n',
)

-72.86003869897723
-71.13163972286374
-69.37731839953902


In [89]:
# Probability (%) of at least one false positive across 5 tests, each run at 0.05/5;
# presumably a check of the family-wise error rate under a Bonferroni split, which holds
# only if the tests are independent.
(1-(1-(0.05/5))**5)*100

4.900995010000009