# Packages

In [261]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.3f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
from math import ceil

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    client = bigquery.Client(project=project)
    query_job = client.query(query)
    result_df = query_job.to_dataframe()
    return result_df

def display_side_by_side(*args):

    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(
        html_str.replace('table','table style="display:inline"'), 
        raw=True
    )

def cycle_sql(start, end, query, weeks=False):
    """
    You have to use {date} in your script to add cycle date into this backets
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')

    if weeks == False:
        daterange = [(date_start + timedelta(days=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days)+1)]
    else:
        daterange = [(date_start + timedelta(weeks=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days//7)+1)] # weeks dividing days by 7

    total_df = pd.DataFrame()

    counter = 0

    for date in daterange:
        counter+=1
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date = date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty == True:
            print('Dataframe is empty')
        total_df = pd.concat([df_cycle, total_df])
    return total_df  

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    with pd.ExcelWriter(f"{name}.xlsx") as writer:

    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet

        if dataset1 is not None:
            if dataset2 is not None:
                if dataset3 is not None:
                    if dataset4 is not None:
                        dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                        #   index=False
                                            )
                        dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                        #   index=False
                                            )
                        dataset3.to_excel(writer, sheet_name=f"3-{name}", 
                                        #   index=False
                                            )
                        dataset4.to_excel(writer, sheet_name=f"4-{name}", 
                                        #   index=False
                                            )
                    else:
                        dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                        #   index=False
                                            )
                        dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                        #   index=False
                                            )
                        dataset3.to_excel(writer, sheet_name=f"3-{name}", 
                                        #   index=False
                                            )
                else:
                    dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                    #   index=False
                                        )
                    dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                    #   index=False
                                        )
            else:
                dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                #   index=False
                                    )

        print('DataFrame is written to Excel File successfully.') 

# Research

### Common data

In [19]:
df_scale = read_bq("""
                   WITH banned AS (SELECT user_id,
                       unban_applied_cd,
                       DATE(created_dttm) AS created_dt
                FROM dwh-storage-327422.ods_expel.tbl_ban
                WHERE type_cd = 9
                  AND unban_applied_cd = 0
                  AND DATE(created_dttm) >= '2025-05-29'
                QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_dttm ASC) = 1),
     user_region_data AS (SELECT id AS user_id, country_id, city_id
                          FROM dwh-storage-327422.personal_data.tbl_user_act
                          WHERE country_id IN (12, 23, 54, 25, 13, 43, 24, 75, 72, 77, 11, 22, 10)),
     triggers AS (SELECT user_id, requester
                  FROM indriver-e6e40.ods_facechecker.user_liveness
                  WHERE DATE(created_at) >= '2020-01-01'
                    AND TRIM(status) NOT IN ('requested', 'decline', 'resubmission')
                  QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) = 1)
SELECT t1.user_id,
       t0.requester,
       t1.unban_applied_cd,
       t1.created_dt,
       DATE_TRUNC(t1.created_dt, WEEK) AS created_dt_week,
       t2.country_id,
       geo.country_name,
       t2.city_id,
       geo.city_name
FROM banned t1
         LEFT JOIN triggers t0 ON t1.user_id = t0.user_id
         LEFT JOIN user_region_data t2 ON t1.user_id = t2.user_id
         JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
              ON
                  t2.city_id = geo.city_id
                   """)

df_scale.head()

Unnamed: 0,user_id,requester,unban_applied_cd,created_dt,created_dt_week,country_id,country_name,city_id,city_name
0,168723089,banhammer,0,2025-06-03,2025-06-01,25,Chile,5600,Rancagua
1,137872631,user,0,2025-07-04,2025-06-29,25,Chile,4281,Valparaíso - Viña del Mar
2,161633640,banhammer,0,2025-06-28,2025-06-22,22,Colombia,22827,Firavitoba
3,296685985,user,0,2025-06-05,2025-06-01,25,Chile,4261,Concepción
4,296481221,user,0,2025-07-09,2025-07-06,25,Chile,5589,Osorno


In [22]:
df_scale.groupby(['created_dt_week'], as_index=False)['user_id'].nunique()['user_id'].sum()

11458

In [5]:
df_scale.groupby('country_name', as_index=False)['user_id'].nunique().sort_values('user_id', ascending=False)

Unnamed: 0,country_name,user_id
1,Brazil,2684
9,Mexico,1537
2,Chile,1263
11,Peru,1197
3,Colombia,1191
5,Ecuador,545
4,Costa Rica,454
6,Guatemala,354
10,Panama,342
0,Argentina,46


In [18]:
fig = px.bar(
    df_scale.groupby(['country_name'], as_index=False)['user_id'].nunique().sort_values('user_id', ascending=False), 
    x='country_name', 
    y='user_id',
    text_auto=True
    )
fig.show()

In [25]:
df_scale['requester'] = df_scale['requester'].replace({'banhammer':'Triggers', 'user':'Common liveness'})

In [29]:
fig = px.bar(
    df_scale.groupby(['country_name', 'requester'], as_index=False)['user_id'].nunique().sort_values('user_id', ascending=False), 
    x='country_name', 
    y='user_id',
    color='requester',
    labels={'user_id': 'Users', 'country_name': 'Country name', 'requester': 'Segment', 'user':'Common liveness', 'banhammer':'Triggers'},
    text_auto=True
    )
fig.show()

In [15]:
fig = px.bar(
    df_scale.groupby(['created_dt_week'], as_index=False)['user_id'].nunique(), 
    x='created_dt_week', 
    y='user_id',
    text_auto=True
    )
fig.show()

### Comparing the exp cities

In [70]:
df_proxy = read_bq("""
WITH users AS (SELECT DISTINCT user_id,
                               metric_date_utc,
                               t1.city_id,
                               t2.city_name,
                               t1.country_id,
                               t2.country_name
               FROM indriver-bi.incity.tbl_incity_growth_metrics_detail t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                             ON t1.city_id = t2.city_id
               WHERE user_type = 'pass'
                 AND orders_count > 0
                 AND metric_date_utc >= '2025-05-01'
                 AND t2.macroregion_name IN ('Latin America', 'Brazil', 'Africa')
               QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id, metric_date_utc ORDER BY city_id) = 1),
     appeals AS (SELECT target_id              user_id,
                        DATE(t1.created_at) AS date,
                        COUNT(DISTINCT uuid)   cnt_appeals
                 FROM indriver-e6e40.ods_moderation_feed_red_pill.appeal t1
                          JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                               ON t1.city_id = t2.city_id
                          LEFT JOIN
                      indriver-e6e40.ods_ds_moderation_system_cdc.violation_review_v3 t3
                      ON t1.uuid = JSON_EXTRACT_SCALAR(t3.payload, '$.uuid')
                 WHERE DATE(t1.created_at) BETWEEN '2025-05-01' AND '2025-07-01'
                   AND DATE(t3.export_raw_dt) BETWEEN '2025-05-01' AND '2025-07-01'
                   --AND t1.initiator_id = 1 --жалоба от пассажира водиле
                   AND t1.initiator_id = 0 --жалоба от водилы пассажиру
                   AND t2.macroregion_name IN ('Latin America', 'Brazil', 'Africa')
                   AND JSON_EXTRACT_SCALAR(t3.model, '$.result.top_category[0]') NOT IN ('CATEGORY_TEXT_NOT_RECOGNISED',
                                                                                         'CATEGORY_LOCATION_DISPUTE',
                                                                                         'CATEGORY_RIDE_REFUSAL',
                                                                                         'CATEGORY_CANCELLED_BY_DRIVER_REQUEST',
                                                                                         'CATEGORY_BARGAINING_AFTER_ACCEPT',
                                                                                         'CATEGORY_PASSENGER_WAS_LATE',
                                                                                         'CATEGORY_DRIVER_WAS_LATE',
                                                                                         'CATEGORY_APP_PROBLEM',
                                                                                         'CATEGORY_POSITIVE_REVIEW',
                                                                                         'CATEGORY_DIFFERENT_CAR',
                                                                                         'CATEGORY_DRIVER_REPORTED_CAR_MALFUNCTION',
                                                                                         'CATEGORY_STRANGER_IN_CAR')
                 GROUP BY 1, 2),
     reviews AS (SELECT target_id          AS  user_id,
                        DATE(r.created_at) AS  date,
                        COUNT(DISTINCT r.uuid) cnt_reviews
                 FROM indriver-e6e40.ods_moderation_feed_red_pill.review r
                          JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                               ON r.city_id = t2.city_id
                          LEFT JOIN
                      indriver-e6e40.ods_ds_moderation_system_cdc.violation_review_v3 src
                      ON JSON_EXTRACT_SCALAR(src.payload, '$.uuid') = r.uuid
                 WHERE DATE(r.created_at) BETWEEN '2025-05-01' AND '2025-07-01'
                   AND DATE(src.export_raw_dt) BETWEEN '2025-05-01' AND '2025-07-01'
                   AND rating < 5
                   AND r.visibility_id = 1 --жалобы от водителя на пассажира
                   AND t2.macroregion_name IN ('Latin America', 'Brazil', 'Africa')
                   AND
                     JSON_EXTRACT_SCALAR(src.model, '$.result.top_category[0]') NOT IN ('CATEGORY_TEXT_NOT_RECOGNISED',
                                                                                        'CATEGORY_DIRTY_CABIN',
                                                                                        'CATEGORY_SUSPICIOUS_AREA', --УДАЛИТЬ?
                                                                                        'CATEGORY_ASSAULT', --УДАЛИТЬ?
                                                                                        'CATEGORY_BARGAINING_AFTER_ACCEPT',
                                                                                        'CATEGORY_POSITIVE_REVIEW',
                                                                                        'CATEGORY_DIFFERENT_CAR',
                                                                                        'CATEGORY_PASSENGER_REPORTED_CAR_MALFUNCTION',
                                                                                        'CATEGORY_NO_CHANGE',
                                                                                        'CATEGORY_DANGEROUS_DRIVING')
                 GROUP BY 1, 2),
     orders_raw AS (SELECT DISTINCT order_uuid,
                                    created_date_order_part AS date,
                                    user_id
                    FROM indriver-e6e40.emart.incity_detail t1
                             JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                  ON t1.city_id = t2.city_id
                    WHERE created_date_order_part BETWEEN '2025-05-01' AND '2025-07-01'
                      AND t2.macroregion_name IN ('Latin America', 'Brazil', 'Africa')),
     support_raw AS (SELECT DISTINCT t1.id                       support_id,
                                     DATE(t1.created_dt_part) AS date,
                                     LOWER(t4.order_id)          order_uuid
                     FROM indriver-e6e40.ods_customer_support.request t1
                              JOIN
                          indriver-bi.customer_service.tbl_customer_support_chats_just_detail t2
                          ON t1.id = t2.request_id
                              JOIN
                          dwh-storage-327422.ods_customer_support.chat_request_entry t4 ON t1.id = t4.request_id
                              JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                                   ON t4.city_id = geo.city_id
                              JOIN
                          (SELECT DISTINCT country_id, country_name, city_id FROM indriver-bi.heap.vw_geo_mapping) t3
                          ON t2.country_name = t3.country_name
                     WHERE DATE(t1.created_dt_part) BETWEEN '2025-05-01' AND '2025-07-01'
                       AND t4.created_dt_part BETWEEN '2025-05-01' AND '2025-07-01'
                       AND contact_category IS NOT NULL
                       AND contact_reason IS NOT NULL
                       AND who_contacts = 'Driver'
                       AND contact_category IN ('Complaints against Passenger', 'Safety')
                       AND geo.macroregion_name IN ('Latin America', 'Brazil', 'Africa')),
     support AS (SELECT orders_raw.user_id,
                        orders_raw.date,
                        COUNT(DISTINCT support_id) cnt_support
                 FROM orders_raw
                          JOIN
                      support_raw ON orders_raw.order_uuid = support_raw.order_uuid
                 GROUP BY 1, 2)
SELECT t1.metric_date_utc,
       t1.city_id,
       CASE
           WHEN city_id IN
                (4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568, 4257,
                 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
                 7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291,
                 5600, 4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243) THEN 'treatment'
           ELSE 'control'
           END                                  segment,
       t1.city_name,
       t1.country_name,
       COUNT(DISTINCT t1.user_id)            AS users,
       COALESCE(SUM(appeals.cnt_appeals), 0) AS appeals,
       COALESCE(SUM(reviews.cnt_reviews), 0) AS reviews,
       COALESCE(SUM(support.cnt_support), 0) AS support
FROM users t1
         LEFT JOIN
     appeals ON t1.user_id = appeals.user_id AND t1.metric_date_utc = appeals.date
         LEFT JOIN
     reviews ON t1.user_id = reviews.user_id AND t1.metric_date_utc = reviews.date
         LEFT JOIN
     support ON t1.user_id = support.user_id AND t1.metric_date_utc = support.date
WHERE city_id IN
      (4163, 4545, 4232, 4144, 4825, 4235, 5543, 19549, 23089, 4524, 4194, 4746, 4240, 4236, 4233, 21828, 4515, 5504,
       4532, 4195, 4258, 4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568,
       4257, 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
       7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291, 5600,
       4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243)
GROUP BY 1, 2, 3, 4, 5
""")

df_proxy.head()

Unnamed: 0,metric_date_utc,city_id,segment,city_name,country_name,users,appeals,reviews,support
0,2025-06-30,4257,treatment,Trujillo,Peru,71446,251,131,10
1,2025-05-20,4197,treatment,Bogota,Colombia,115832,345,153,56
2,2025-06-17,4225,treatment,Guadalajara,Mexico,24004,81,57,12
3,2025-06-04,4242,treatment,Medellin,Colombia,151398,614,269,44
4,2025-06-07,4199,treatment,Lima,Peru,423925,1802,1118,116


In [73]:
df_proxy['metric_date_utc'] = pd.to_datetime(df_proxy['metric_date_utc'])

df_proxy_data = df_proxy.groupby(['metric_date_utc', 'segment'], as_index=False)[['users', 'appeals', 'reviews', 'support']].sum()

df_proxy_data['composite'] = df_proxy_data['appeals']+df_proxy_data['reviews']+df_proxy_data['support']
df_proxy_data['composite_rel'] = df_proxy_data['composite']/df_proxy_data['users']*100
df_proxy_data['appeals_rel'] = df_proxy_data['appeals']/df_proxy_data['users']*100
df_proxy_data['reviews_rel'] = df_proxy_data['reviews']/df_proxy_data['users']*100
df_proxy_data['support_rel'] = df_proxy_data['support']/df_proxy_data['users']*100

df_proxy_data.head()

Unnamed: 0,metric_date_utc,segment,users,appeals,reviews,support,composite,composite_rel,appeals_rel,reviews_rel,support_rel
0,2025-05-01,control,411415,2095,1130,228,3453,0.84,0.51,0.27,0.06
1,2025-05-01,treatment,1579410,7445,4149,670,12264,0.78,0.47,0.26,0.04
2,2025-05-02,control,440320,2065,1151,259,3475,0.79,0.47,0.26,0.06
3,2025-05-02,treatment,1643242,7086,4052,659,11797,0.72,0.43,0.25,0.04
4,2025-05-03,control,455004,2122,1271,270,3663,0.81,0.47,0.28,0.06


In [74]:
from statsmodels.stats.weightstats import ztest as ztest

metric = ['composite_rel', 'appeals_rel', 'reviews_rel', 'support_rel']

for i in metric:
    print(i)
    z_stat, p_value = ztest(
        df_proxy_data.groupby(['metric_date_utc','segment'], as_index=False)[['composite_rel', 'appeals_rel',	'reviews_rel',	'support_rel']].mean().query("segment == 'control'")[i], 
        df_proxy_data.groupby(['metric_date_utc','segment'], as_index=False)[['composite_rel', 'appeals_rel',	'reviews_rel',	'support_rel']].mean().query("segment == 'treatment'")[i],
                            value=0)

    print(z_stat, p_value)

composite_rel
1.2356807144123687 0.21657726896230034
appeals_rel
1.1824918908473672 0.23701058124907504
reviews_rel
0.7998707131330633 0.42378570766696444
support_rel
3.9624770534737754 7.417613829954162e-05


In [75]:
fig = px.line(df_proxy_data.query("metric_date_utc<='2025-07-01'"), x="metric_date_utc", y="support_rel", color='segment', )
fig.show()

In [53]:
# df_incidents = read_bq("""
# SELECT redmine_id,
#        'driver'                        AS aggressor_mode,
#        driver_id                       AS aggressor_id,
#        pass_id,
#        driver_id,
#        city_id,
#        city_name,
#        country_name,
#        incident_date,
#        DATE_TRUNC(incident_date, WEEK) AS weekly,
#        incident_type,
#        incident_level,
#        information_status,
#        CASE
#            WHEN city_id IN
#                 (4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568, 4257,
#                  4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
#                  7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291,
#                  5600, 4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243) THEN 'treatment'
#            ELSE 'control'
#            END                            segment,
#        CASE
#            WHEN incident_level = 'Green' THEN 1
#            WHEN incident_level = 'Yellow' THEN 2
#            WHEN incident_level = 'Red' THEN 3
#            ELSE 0
#            END                            incident_level_digit
# FROM indriver-bi.safety.vw_safety_incidents_detail
# WHERE aggressor = 'Driver'
#   AND incident_date >= '2025-05-31'
#   AND city_id IN
#       (4163, 4545, 4232, 4144, 4825, 4235, 5543, 19549, 23089, 4524, 4194, 4746, 4240, 4236, 4233, 21828, 4515, 5504,
#        4532, 4195, 4258, 4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568,
#        4257, 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
#        7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291, 5600,
#        4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243)
# UNION
# DISTINCT
# SELECT redmine_id,
#        'pax'                           AS aggressor_mode,
#        pass_id                         AS aggressor_id,
#        pass_id,
#        driver_id,
#        city_id,
#        city_name,
#        country_name,
#        incident_date,
#        DATE_TRUNC(incident_date, WEEK) AS weekly,
#        incident_type,
#        incident_level,
#        information_status,
    #    CASE
    #        WHEN city_id IN
    #             (4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568, 4257,
    #              4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
    #              7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291,
    #              5600, 4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243) THEN 'treatment'
    #        ELSE 'control'
    #        END                            segment,
#        CASE
#            WHEN incident_level = 'Green' THEN 1
#            WHEN incident_level = 'Yellow' THEN 2
#            WHEN incident_level = 'Red' THEN 3
#            ELSE 0
#            END                            incident_level_digit
# FROM indriver-bi.safety.vw_safety_incidents_detail
# WHERE aggressor = 'Passenger'
#   AND incident_date >= '2025-05-31'
#   AND city_id IN
#       (4163, 4545, 4232, 4144, 4825, 4235, 5543, 19549, 23089, 4524, 4194, 4746, 4240, 4236, 4233, 21828, 4515, 5504,
#        4532, 4195, 4258, 4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568,
#        4257, 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
#        7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291, 5600,
#        4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243)

# """)

df_incident = read_bq("""
SELECT 
       CASE
           WHEN t1.city_id IN
                (4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568, 4257,
                 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
                 7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291,
                 5600, 4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243) THEN 'treatment'
           ELSE 'control'
           END                                                                       segment,
       t1.weekly,
       SUM(SAFE_DIVIDE(t1.incidents, t2.rides) * 100000) / COUNT(t1.city_id)      AS inc_rate,
       SUM(SAFE_DIVIDE(t1.conf_incidents, t2.rides) * 100000) / COUNT(t1.city_id) AS conf_inc_rate
FROM (SELECT incident_date                              AS weekly,
             t1.city_id,
             t1.city_name,
             COUNT(redmine_id)                                             AS incidents,
             COUNT(IF(information_status = 'Confirmed', redmine_id, NULL)) AS conf_incidents
      FROM indriver-bi.safety.vw_safety_incidents_detail t1
               JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                    ON
                        t1.city_id = t2.city_id
      WHERE incident_date >= '2025-05-29'
        AND t1.city_id IN
            (4163, 4545, 4232, 4144, 4825, 4235, 5543, 19549, 23089, 4524, 4194, 4746, 4240, 4236, 4233, 21828, 4515,
             5504,
             4532, 4195, 4258, 4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943,
             5568,
             4257, 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271,
             4599,
             7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291,
             5600,
             4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243)
      GROUP BY 1, 2, 3) t1
         LEFT JOIN (SELECT city_id,
                           metric_date AS weekly,
                           SUM(rides_count)               AS rides
                    FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                    WHERE user_type = 'pass'
                      AND metric_date >= '2025-05-29'
                      AND city_id IN
                          (4163, 4545, 4232, 4144, 4825, 4235, 5543, 19549, 23089, 4524, 4194, 4746, 4240, 4236, 4233,
                           21828, 4515, 5504,
                           4532, 4195, 4258, 4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404,
                           4228, 19943, 5568,
                           4257, 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227,
                           23233, 4271, 4599,
                           7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726,
                           23130, 5291, 5600,
                           4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243)
                    GROUP BY 1, 2) t2 ON t1.city_id = t2.city_id AND t1.weekly = t2.weekly
GROUP BY 1, 2
""")

df_incident.head()

Unnamed: 0,segment,weekly,inc_rate,conf_inc_rate
0,treatment,2025-07-07,34.34,5.03
1,treatment,2025-07-03,36.33,8.0
2,treatment,2025-06-04,38.39,12.02
3,treatment,2025-06-15,50.72,10.59
4,control,2025-06-24,32.72,6.46


In [59]:
df_incident.groupby(['segment'], as_index=False)[['inc_rate', 'conf_inc_rate']].mean()

Unnamed: 0,segment,inc_rate,conf_inc_rate
0,control,52.74,14.1
1,treatment,55.84,11.08


In [67]:
fig = px.line(df_incident.groupby(['weekly', 'segment'], as_index=False)['conf_inc_rate'].max(), x="weekly", y="conf_inc_rate", color='segment')
fig.show()

In [76]:
df_cat_incidents = read_bq("""
WITH bans AS (SELECT connected_account, COUNT(DISTINCT user_id) AS banned_users
              FROM (SELECT user_id,
                           CAST(REGEXP_EXTRACT(moderation_note, r"#(\d+)\.?") AS INT64) AS connected_account,
                           DATE(created_dttm)                                           AS created_dt
                    FROM dwh-storage-327422.ods_expel.tbl_ban
                    WHERE type_cd = 9
                      AND unban_applied_cd = 0
                      AND DATE(created_dttm) >= '2025-05-29'
                    QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_dttm ASC) = 1)
              GROUP BY 1),
     incidents AS (SELECT redmine_id,
                          'driver'                        AS aggressor_mode,
                          driver_id                       AS aggressor_id,
                          pass_id,
                          driver_id,
                          city_id,
                          city_name,
                          country_name,
                          incident_date,
                          DATE_TRUNC(incident_date, WEEK) AS weekly,
                          incident_type,
                          incident_level,
                          information_status,
                          CASE
                              WHEN incident_level = 'Green' THEN 1
                              WHEN incident_level = 'Yellow' THEN 2
                              WHEN incident_level = 'Red' THEN 3
                              ELSE 0
                              END                            incident_level_digit
                   FROM indriver-bi.safety.vw_safety_incidents_detail
                   WHERE aggressor = 'Driver'
                     AND information_status = 'Confirmed'
                     AND incident_date >= '2014-01-01'
                   UNION
                   DISTINCT
                   SELECT redmine_id,
                          'pax'                           AS aggressor_mode,
                          pass_id                         AS aggressor_id,
                          pass_id,
                          driver_id,
                          city_id,
                          city_name,
                          country_name,
                          incident_date,
                          DATE_TRUNC(incident_date, WEEK) AS weekly,
                          incident_type,
                          incident_level,
                          information_status,
                          CASE
                              WHEN incident_level = 'Green' THEN 1
                              WHEN incident_level = 'Yellow' THEN 2
                              WHEN incident_level = 'Red' THEN 3
                              ELSE 0
                              END                            incident_level_digit
                   FROM indriver-bi.safety.vw_safety_incidents_detail
                   WHERE aggressor = 'Passenger'
                     AND information_status = 'Confirmed'
                     AND incident_date >= '2014-01-01')
SELECT *,
       (prevented_accounts * 0.15)                                          AS potential_incidents,
       SUM(prevented_accounts) OVER ()                                      AS total_banned,
       ROUND(prevented_accounts / SUM(prevented_accounts) OVER () * 100, 2) AS share
FROM (SELECT t2.incident_level,
             information_status,
             incident_type,
             SUM(banned_users) AS prevented_accounts
      FROM bans t1
               LEFT JOIN incidents t2 ON t1.connected_account = t2.aggressor_id
      WHERE information_status IS NOT NULL
      GROUP BY 1, 2, 3)
QUALIFY ROW_NUMBER() OVER (PARTITION BY incident_level ORDER BY prevented_accounts DESC) IN (1, 2, 3, 4, 5)
""")

df_cat_incidents.head()

Unnamed: 0,incident_level,information_status,incident_type,prevented_accounts,potential_incidents,total_banned,share
0,Green,Confirmed,Attempted robbery,1120,168.0,4180,26.79
1,Green,Confirmed,Threat with weapons,121,18.15,4180,2.89
2,Green,Confirmed,Theft,90,13.5,4180,2.15
3,Green,Confirmed,Robbery,80,12.0,4180,1.91
4,Green,Confirmed,Fraud,39,5.85,4180,0.93


In [77]:
df_cat_incidents

Unnamed: 0,incident_level,information_status,incident_type,prevented_accounts,potential_incidents,total_banned,share
0,Green,Confirmed,Attempted robbery,1120,168.0,4180,26.79
1,Green,Confirmed,Threat with weapons,121,18.15,4180,2.89
2,Green,Confirmed,Theft,90,13.5,4180,2.15
3,Green,Confirmed,Robbery,80,12.0,4180,1.91
4,Green,Confirmed,Fraud,39,5.85,4180,0.93
5,Red,Confirmed,Car theft,332,49.8,4180,7.94
6,Red,Confirmed,Alcohol/narcotic intoxication of the driver,34,5.1,4180,0.81
7,Red,Confirmed,Robbery,7,1.05,4180,0.17
8,Red,Confirmed,Buying/transportation of drugs,3,0.45,4180,0.07
9,Red,Confirmed,An accident with the victims,2,0.3,4180,0.05


## Distribution by incident levels & Share of fraud lines

In [78]:
df_distr = read_bq("""
SELECT redmine_id,
       aggressor,
       driver_id,
       city_id,
       city_name,
       CASE
           WHEN city_id IN
                (4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568, 4257,
                 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
                 7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291,
                 5600, 4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243) THEN 'treatment'
           ELSE 'control'
           END                            segment,
       country_name,
       incident_date,
       DATE_TRUNC(incident_date, WEEK) AS weekly,
       incident_type,
       incident_level,
       information_status,
       CASE
           WHEN incident_level = 'Green' THEN 1
           WHEN incident_level = 'Yellow' THEN 2
           WHEN incident_level = 'Red' THEN 3
           ELSE 0
           END                            incident_level_digit
FROM indriver-bi.safety.vw_safety_incidents_detail
WHERE aggressor = 'Passenger'
  AND city_id IN
      (4163, 4545, 4232, 4144, 4825, 4235, 5543, 19549, 23089, 4524, 4194, 4746, 4240, 4236, 4233, 21828, 4515, 5504,
       4532, 4195, 4258, 4226, 4143, 5495, 22796, 4516, 4261, 4231, 4263, 23089, 4200, 5483, 4404, 4228, 19943, 5568,
       4257, 4196, 4267, 4242, 4225, 5513, 5573, 4272, 22654, 4155, 6083, 4266, 6117, 4397, 4227, 23233, 4271, 4599,
       7236, 4269, 22737, 5368, 4396, 4230, 4385, 4199, 4255, 5536, 5548, 5589, 4264, 4229, 4726, 23130, 5291, 5600,
       4198, 22817, 42833, 5535, 4234, 4197, 6587, 4243)
  AND incident_date >= '2025-01-01'
""")

df_distr['incident_date'] = pd.to_datetime(df_distr['incident_date'])
df_distr['weekly'] = pd.to_datetime(df_distr['weekly'])

df_distr.head()

Unnamed: 0,redmine_id,aggressor,driver_id,city_id,city_name,segment,country_name,incident_date,weekly,incident_type,incident_level,information_status,incident_level_digit
0,SQ-430855,Passenger,58438793,4233,Aguascalientes,control,Mexico,2025-01-03,2024-12-29,Buying/transportation of drugs,Green,Not confirmed,1
1,SQ-431962,Passenger,265816251,4155,Monterrey,treatment,Mexico,2025-01-03,2024-12-29,Robbery,Yellow,Confirmed,2
2,SQ-435132,Passenger,100305829,5573,Copiapó,treatment,Chile,2025-01-06,2025-01-05,Robbery,Yellow,Confirmed,2
3,SQ-441612,Passenger,226665611,5368,Kingston,treatment,Jamaica,2025-01-09,2025-01-05,Threats / insults before/after a trip,Green,Not confirmed,1
4,SQ-451310,Passenger,144035242,5573,Copiapó,treatment,Chile,2025-01-15,2025-01-12,Attempted robbery,,Automated ML decision,0


In [104]:
agg_data = df_distr[(df_distr['weekly']>='2025-04-01')&(df_distr['information_status']=='Confirmed')&(df_distr['segment']=='treatment')].groupby(['weekly', 'incident_level'], as_index=False)['redmine_id'].count()

agg_data['weekly_total'] = agg_data.groupby('weekly')['redmine_id'].transform('sum')

agg_data['share'] = agg_data['redmine_id'] / agg_data['weekly_total'] *100 

agg_data

Unnamed: 0,weekly,incident_level,redmine_id,weekly_total,share
0,2025-04-06,Green,174,495,35.15
1,2025-04-06,Red,43,495,8.69
2,2025-04-06,Yellow,278,495,56.16
3,2025-04-13,Green,176,478,36.82
4,2025-04-13,Red,35,478,7.32
5,2025-04-13,Yellow,267,478,55.86
6,2025-04-20,Green,213,521,40.88
7,2025-04-20,Red,30,521,5.76
8,2025-04-20,Yellow,278,521,53.36
9,2025-04-27,Green,223,558,39.96


In [115]:
    
groups = ['control', 'treatment']

for segment in groups:

    agg_data = df_distr[(df_distr['weekly']>='2025-04-01')&(df_distr['information_status']=='Confirmed')&(df_distr['segment']==segment)].groupby(['weekly', 'incident_level'], as_index=False)['redmine_id'].count()

    agg_data['weekly_total'] = agg_data.groupby('weekly')['redmine_id'].transform('sum')

    agg_data['share'] = agg_data['redmine_id'] / agg_data['weekly_total'] *100 

    fig = px.bar(
        agg_data.round(2),        
        x="weekly", 
        y="redmine_id", 
        color="incident_level",
        text_auto=True,
        title=f"Incidents, {segment}"
        )
    fig.show()

In [118]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Создаем subplot: 2 строки, 1 колонка
fig = make_subplots(
    rows=2, cols=1,
    shared_xaxes=True,
    subplot_titles=("Incidents, control", "Incidents, treatment")
)

groups = ['control', 'treatment']

for i, segment in enumerate(groups):

    # Фильтруем и агрегируем данные
    agg_data = df_distr[
        (df_distr['weekly'] >= '2025-04-01') &
        (df_distr['information_status'] == 'Confirmed') &
        (df_distr['segment'] == segment)
    ].groupby(['weekly', 'incident_level'], as_index=False)['redmine_id'].count()

    # Оконная сумма по неделе
    agg_data['weekly_total'] = agg_data.groupby('weekly')['redmine_id'].transform('sum')
    agg_data['share'] = agg_data['redmine_id'] / agg_data['weekly_total'] * 100

    # Создаем временный график
    temp_fig = px.bar(
        agg_data.round(2),
        x="weekly",
        y="share",
        color="incident_level",
        text_auto=True,
        color_discrete_sequence=px.colors.qualitative.Set2
    )

    # Добавляем трейсы на subplot
    for trace in temp_fig.data:
        fig.add_trace(trace, row=i+1, col=1)

# Финальные настройки
fig.update_layout(
    height=800,
    barmode='stack',
    title_text='Incidents: Control vs Treatment',
    showlegend=True
)

fig.show()

In [132]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

my_custom_colors = {
    "Red": "#cc2900",
    "Yellow": "#ffff00",
    "Green": "#248f24"
}

# Создаем subplot: 2 строки, 1 колонка
fig = make_subplots(
    rows=2, cols=1,
    shared_xaxes=True,
    subplot_titles=("Control group", "Treatment group"),
    vertical_spacing=0.1
)

groups = ['control', 'treatment']

for i, segment in enumerate(groups):

    # Фильтруем и агрегируем данные
    agg_data = df_distr[
        (df_distr['weekly'] >= '2025-04-01') &
        (df_distr['information_status'] == 'Confirmed') &
        (df_distr['segment'] == segment)
    ].groupby(['weekly', 'incident_level'], as_index=False)['redmine_id'].count()

    # Оконная сумма по неделе
    agg_data['weekly_total'] = agg_data.groupby('weekly')['redmine_id'].transform('sum')
    agg_data['share'] = agg_data['redmine_id'] / agg_data['weekly_total'] * 100

    # Создаем временный график
    temp_fig = px.bar(
        agg_data.round(2),
        x="weekly",
        y="share",
        color="incident_level",
        text_auto=True,
        color_discrete_map=my_custom_colors
    )

    for trace in temp_fig.data:
        # Показываем легенду только для первого сегмента
        trace.showlegend = (i == 0)
        fig.add_trace(trace, row=i+1, col=1)

# Финальные настройки
fig.update_layout(
    height=1000,
    barmode='stack',
    title_text='Incidents by levels. Control cities vs Treatment, %',
    showlegend=True
)

fig.show()

# Preparing for impact meeting 2025 on safety packages

In [464]:
df = read_bq("""
WITH users AS (SELECT t1.user_id,
                      t1.created_dt_part,
                      CASE
                          WHEN t1.group_id = 4535352 THEN 0
                          WHEN t1.group_id = 4535353 THEN 1
                          ELSE NULL
                          END                         group_name,
                      t1.city_id,
                      t2.city_name,
                      IF(t3.user_id IS NULL, 1, 0) AS user_with_rides_before
               FROM indriver-e6e40.ss_ab_platform_mart.markup_users t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                             ON
                                 t1.city_id = t2.city_id
                        LEFT JOIN (SELECT user_id, metric_date_utc
                                   FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                                   WHERE user_type = 'pass'
                                     AND rides_count > 0
                                     AND metric_date_utc >= '2023-01-01') t3
                                  ON t1.user_id = t3.user_id AND t3.metric_date_utc < t1.created_dt_part
               WHERE test_id = 2699),
     metric AS (SELECT user_id,
                       metric_date_utc,
                       SUM(rides_count)   AS rides,
                       SUM(orders_count)  AS orders,
                       SUM(gmv_clean_usd) AS gmv
                FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                WHERE user_type = 'pass'
                  AND metric_date_utc BETWEEN '2025-03-07' AND '2025-06-30'
                GROUP BY 1, 2)
SELECT user_id, COALESCE(SUM(rides), 0) AS ride
FROM (SELECT t1.user_id,
             t1.created_dt_part,
             t1.group_name,
             t1.user_with_rides_before,
             t2.user_id AS user_id_rides,
             t2.metric_date_utc,
             t2.rides
      FROM users t1
               LEFT JOIN metric t2 ON t1.user_id = t2.user_id AND
                                      t2.metric_date_utc BETWEEN t1.created_dt_part AND DATE_ADD(t1.created_dt_part, INTERVAL + 30 DAY))
WHERE group_name = 0
  AND user_with_rides_before = 1
GROUP BY 1
""")

df.head()

Unnamed: 0,user_id,ride
0,281537261,5
1,284279435,1
2,285304742,1
3,275905676,4
4,272870368,0


In [465]:
df_monthly = read_bq("""
WITH users AS (SELECT t1.user_id,
                      t1.created_dt_part,
                      CASE
                          WHEN t1.group_id = 4535352 THEN 0
                          WHEN t1.group_id = 4535353 THEN 1
                          ELSE NULL
                          END                         group_name,
                      t1.city_id,
                      t2.city_name,
                      IF(t3.user_id IS NULL, 1, 0) AS user_with_rides_before
               FROM indriver-e6e40.ss_ab_platform_mart.markup_users t1
                        JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                             ON
                                 t1.city_id = t2.city_id
                        LEFT JOIN (SELECT user_id, metric_date_utc
                                   FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                                   WHERE user_type = 'pass'
                                     AND rides_count > 0
                                     AND metric_date_utc >= '2023-01-01') t3
                                  ON t1.user_id = t3.user_id AND t3.metric_date_utc < t1.created_dt_part
               WHERE test_id = 2699),
     metric AS (SELECT user_id,
                       metric_date_utc,
                       SUM(rides_count)   AS rides,
                       SUM(orders_count)  AS orders,
                       SUM(gmv_clean_usd) AS gmv
                FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                WHERE user_type = 'pass'
                  AND metric_date_utc BETWEEN '2025-03-07' AND '2025-06-30'
                GROUP BY 1, 2)
SELECT user_id,
       DATE_TRUNC(metric_date_utc, MONTH) AS monthly,
       SUM(rides)                         AS rides
FROM (SELECT t1.user_id,
             t1.created_dt_part,
             t1.group_name,
             t1.user_with_rides_before,
             t2.user_id AS user_id_rides,
             t2.metric_date_utc,
             t2.rides
      FROM users t1
               LEFT JOIN metric t2 ON t1.user_id = t2.user_id AND
                                      t2.metric_date_utc >= t1.created_dt_part)
WHERE group_name = 0
  AND user_with_rides_before = 1
GROUP BY 1, 2
""")

df_monthly['monthly'] = pd.to_datetime(df_monthly['monthly'])
df_monthly.head()

Unnamed: 0,user_id,monthly,rides
0,285062531,NaT,
1,284946789,NaT,
2,179521730,2025-04-01,1.0
3,286832938,2025-03-01,6.0
4,251997414,2025-05-01,1.0


In [None]:
# df_month = read_bq("""
# SELECT 'liveness' as type, t1.monthly, t1.cut_users, t2.rides
# FROM (SELECT DATE_TRUNC(DATE(show), MONTH)    AS monthly,
#              COUNT(DISTINCT user_id)          AS users,
#              SUM(IF(show IS NULL, 1, 0))      AS show_users,
#              SUM(IF(confirmed IS NULL, 1, 0)) AS cut_users
#       FROM (SELECT user_id,
#                    MAX(IF(name = 'client.verification_start.click', client_time_ts, NULL)) AS show,
#                    MAX(IF(name = 'client.verification_flow_result_status.show' AND
#                           LOWER(JSON_EXTRACT_SCALAR(payload, '$.status')) = 'approve', client_time_ts,
#                           NULL))                                                           AS confirmed
#             FROM indriver-e6e40.emart.product_event t1
#                      JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
#                           ON
#                               t1.city_id = t2.city_id
#             WHERE 1 = 1
#               AND t2.macroregion_name IN ('Latin America', 'Brazil', 'Africa')
#               AND name IN (
#                            'client.verification_start.click', 'client.verification_flow_result_status.show'
#                 )
#               AND event_dt_part BETWEEN '2025-01-01' AND '2025-06-30'
#             GROUP BY 1)
#       GROUP BY 1) t1
#          LEFT JOIN (SELECT DATE_TRUNC(created_date_order_part, MONTH) AS monthly,
#                            COUNT(DISTINCT order_uuid)                 AS rides
#                     FROM indriver-e6e40.emart.incity_detail
#                     WHERE 1 = 1
#                       AND created_date_order_part BETWEEN '2025-01-01' AND '2025-06-30'
#                       AND driveraccept_timestamp IS NOT NULL
#                       AND status_order = 'done'
#                     GROUP BY 1) t2 ON t1.monthly = t2.monthly
# WHERE t1.monthly BETWEEN '2025-01-01' AND '2025-06-01'
# UNION ALL
# SELECT 'CPF' as type, t1.monthly, t1.cut_users, t2.rides
# FROM (SELECT monthly,
#              COUNT(DISTINCT user_id)          AS users,
#              SUM(IF(show IS NULL, 1, 0))      AS show_users,
#              SUM(IF(confirmed IS NULL, 1, 0)) AS cut_users
#       FROM (SELECT user_id,
#                    DATE_TRUNC(event_dt_part, MONTH)                                      AS monthly,
#                    MAX(IF(name = 'city.client.id_doc_check.show', client_time_ts, NULL)) AS show,
#                    MAX(IF(name = 'registration.cpf_confirmed', client_time_ts, NULL))    AS confirmed
#             FROM indriver-e6e40.emart.product_event t1
#             WHERE 1 = 1
#               AND name IN (
#                            'city.client.id_doc_check.show', 'registration.cpf_confirmed'
#                 )
#               AND event_dt_part BETWEEN '2025-01-01' AND '2025-06-30'
#             GROUP BY 1, 2)
#       GROUP BY 1) t1
#          LEFT JOIN (SELECT DATE_TRUNC(created_date_order_part, MONTH) AS monthly,
#                            COUNT(DISTINCT order_uuid)                 AS rides
#                     FROM indriver-e6e40.emart.incity_detail
#                     WHERE 1 = 1
#                       AND created_date_order_part BETWEEN '2025-01-01' AND '2025-06-30'
#                       AND driveraccept_timestamp IS NOT NULL
#                       AND status_order = 'done'
#                     GROUP BY 1) t2 ON t1.monthly = t2.monthly
# """)

df_month = read_bq("""
SELECT 'liveness' AS type, t1.monthly, t1.cut_users, t2.rides
FROM (SELECT DATE_TRUNC(DATE(show), MONTH)    AS monthly,
             COUNT(DISTINCT user_id)          AS users,
             SUM(IF(show IS NULL, 1, 0))      AS show_users,
             SUM(IF(confirmed IS NULL, 1, 0)) AS cut_users
      FROM (SELECT user_id,
                   MAX(IF(name = 'client.verification_start.click', client_time_ts, NULL)) AS show,
                   MAX(IF(name = 'client.verification_flow_result_status.show' AND
                          LOWER(JSON_EXTRACT_SCALAR(payload, '$.status')) = 'approve', client_time_ts,
                          NULL))                                                           AS confirmed
            FROM indriver-e6e40.emart.product_event t1
                     JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                          ON
                              t1.city_id = t2.city_id
            WHERE 1 = 1
              -- AND t2.macroregion_name IN ('Latin America', 'Brazil', 'Africa')
              AND t1.city_id IN
                  (6587, 4230, 5495, 4272, 4396, 4155, 4825, 5291, 4234, 4404, 5548, 4143, 4198, 4225, 4227, 4255, 4197,
                 4243, 5483, 4518, 4377, 4532, 4521, 4537, 4758, 4163, 4534, 4519, 4375)
              AND name IN (
                           'client.verification_start.click', 'client.verification_flow_result_status.show'
                )
              AND event_dt_part BETWEEN '2025-01-01' AND '2025-06-30'
            GROUP BY 1)
      GROUP BY 1) t1
         LEFT JOIN (SELECT DATE_TRUNC(created_date_order_part, MONTH) AS monthly,
                           COUNT(DISTINCT order_uuid)                 AS rides
                    FROM indriver-e6e40.emart.incity_detail
                    WHERE 1 = 1
                      AND created_date_order_part BETWEEN '2025-01-01' AND '2025-06-30'
                      AND driveraccept_timestamp IS NOT NULL
                      AND status_order = 'done'
                    GROUP BY 1) t2 ON t1.monthly = t2.monthly
WHERE t1.monthly BETWEEN '2025-01-01' AND '2025-08-01'
""")
df_month['monthly'] = pd.to_datetime(df_month['monthly'])
df_month.head()

Unnamed: 0,type,monthly,cut_users,rides
0,liveness,2025-04-01,34074,155881885
1,liveness,2025-05-01,51592,167409178
2,liveness,2025-02-01,8748,142850511
3,liveness,2025-01-01,1,143265585
4,liveness,2025-03-01,25048,155907445


In [469]:
res_table = pd.DataFrame()


months = ['2025-01-01', '2025-02-01', '2025-03-01', '2025-04-01', '2025-05-01', '2025-06-01']
users = [0]
coeff = [(4000/6490), (3706/4000), (3466/3706), (3466/3706)*1.01, (3466/3706)*1.013, (3466/3706)*1.0106]


for month, coeff in zip(months, coeff):

    res = pd.DataFrame()

    cut_users_by_mon = int(df_month[df_month['monthly']==month]['cut_users'].sum()+users[-1]*coeff)
    users.append(cut_users_by_mon)
    if month == '2025-01-01':
        df_agg = df_monthly.query(f"monthly == '2025-03-01'")[~df_monthly['monthly'].isna()].groupby(['rides'], as_index=False)['user_id'].count()
        # df_agg = df.groupby(['ride'], as_index=False)['user_id'].count()
        df_agg['total'] = df_agg['user_id'].sum()
        df_agg['share'] = df_agg['user_id'] / df_agg['total']
        df_agg['cut_users'] = cut_users_by_mon
        df_agg['cut_users_share'] = (df_agg['cut_users'] * df_agg['share']).astype(int)
        df_agg['lost_rides'] = (df_agg['cut_users_share'] * (df_agg['rides'])).astype(int)
        res['month'] = [month]
        res['cut_users'] = [cut_users_by_mon]
        res['lost_rides'] = [df_agg['lost_rides'].sum()+int((36890+5428+11201)/12)]
    elif month == '2025-02-01':
        df_agg = df_monthly.query(f"monthly == '2025-04-01'")[~df_monthly['monthly'].isna()].groupby(['rides'], as_index=False)['user_id'].count()
        # df_agg = df.groupby(['ride'], as_index=False)['user_id'].count()
        df_agg['total'] = df_agg['user_id'].sum()
        df_agg['share'] = df_agg['user_id'] / df_agg['total']
        df_agg['cut_users'] = cut_users_by_mon
        df_agg['cut_users_share'] = (df_agg['cut_users'] * df_agg['share']).astype(int)
        df_agg['lost_rides'] = (df_agg['cut_users_share'] * (df_agg['rides'])).astype(int)
        res['month'] = [month]
        res['cut_users'] = [cut_users_by_mon]
        res['lost_rides'] = [df_agg['lost_rides'].sum()+int((36890+5428+11201)/12)]
    else:
        df_agg = df_monthly.query(f"monthly == '{month}'")[~df_monthly['monthly'].isna()].groupby(['rides'], as_index=False)['user_id'].count()
        # df_agg = df.groupby(['ride'], as_index=False)['user_id'].count()
        df_agg['total'] = df_agg['user_id'].sum()
        df_agg['share'] = df_agg['user_id'] / df_agg['total']
        df_agg['cut_users'] = cut_users_by_mon
        df_agg['cut_users_share'] = (df_agg['cut_users'] * df_agg['share']).astype(int)
        df_agg['lost_rides'] = (df_agg['cut_users_share'] * (df_agg['rides'])).astype(int)
        res['month'] = [month]
        res['cut_users'] = [cut_users_by_mon]
        res['lost_rides'] = [df_agg['lost_rides'].sum()+int((36890+5428+11201)/12)]
        

    res_table = pd.concat([res_table, res])
    res_table['month'] = pd.to_datetime(res_table['month'])

res_table = res_table.merge(df_month.groupby('monthly', as_index=False)['rides'].max().rename(columns={'rides':'total_rides'}), left_on='month', right_on='monthly', how='inner')

res_table['monthly_impact_%'] = res_table['lost_rides'] / res_table['total_rides'] *100

res_table[['month', 'cut_users', 'lost_rides', 'total_rides', 'monthly_impact_%']]

Unnamed: 0,month,cut_users,lost_rides,total_rides,monthly_impact_%
0,2025-01-01,1,4459,143265585,0.003
1,2025-02-01,8748,36263,142850511,0.025
2,2025-03-01,33229,74057,155907445,0.048
3,2025-04-01,65461,245887,155881885,0.158
4,2025-05-01,113609,458941,167409178,0.274
5,2025-06-01,171881,659614,162857930,0.405


In [418]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy.optimize import curve_fit # For more complex non-linear fits
from numpy.polynomial.polynomial import polyfit # For polynomial fit

# --- 1. Load the provided data ---
# Data from your image_e14ee9.png
data = {
    'month': ['2025-01-01', '2025-02-01', '2025-03-01', '2025-04-01', '2025-05-01', '2025-06-01'],
    'lost_rides': [118378, 793412, 818982, 2147740, 3142556, 3924421],
    'total_rides': [143265585, 142850511, 155907445, 155881885, 167409178, 162857926],
    'monthly_impact_%': [0.083, 0.555, 0.525, 1.378, 1.877, 2.410]
}
df = res_table[['month',  'lost_rides', 'total_rides', 'monthly_impact_%']]

# Convert 'month' column to datetime objects
df['month'] = pd.to_datetime(df['month'])

# --- 2. Prepare data for non-linear regression (Polynomial Regression) ---
# We need numerical representation for months.
# Using the number of months since the start of 2025.
df['month_numeric'] = (df['month'].dt.year - 2025) * 12 + df['month'].dt.month - 1

# Define X (independent variable) and Y (dependent variable) for total_rides
X_train_total_rides = df['month_numeric']
y_train_total_rides = df['total_rides']

# Choose the degree of the polynomial. Let's start with degree 2 (quadratic).
# polyfit returns coefficients in increasing power: c_0, c_1, c_2 for c_0 + c_1*x + c_2*x^2
degree = 2
coeffs = polyfit(X_train_total_rides, y_train_total_rides, degree)

print(f"Polynomial Regression (Degree {degree}) Coefficients for total_rides:")
print(f"  Coefficients (c_0, c_1, ..., c_n): {coeffs}")
print("-" * 30)

# Define the polynomial function to use for prediction
def polynomial_function(x, *coeffs):
    return sum(c * (x**i) for i, c in enumerate(coeffs))

# --- 3. Generate Future Dates and Numeric Months ---
last_known_month_dt = df['month'].max() # This is '2025-06-01'

# Generate dates from the last known month up to December
future_months = pd.date_range(start=last_known_month_dt, end='2025-12-01', freq='MS')

df_future_total_rides = pd.DataFrame({
    'month': future_months
})
df_future_total_rides['month_numeric'] = (df_future_total_rides['month'].dt.year - 2025) * 12 + df_future_total_rides['month'].dt.month - 1

# --- 4. Extrapolate Values for total_rides ---
df_future_total_rides['total_rides_extrapolated'] = polynomial_function(df_future_total_rides['month_numeric'], *coeffs)

# Ensure extrapolated values are non-negative (total_rides shouldn't decrease below 0)
df_future_total_rides['total_rides_extrapolated'] = df_future_total_rides['total_rides_extrapolated'].apply(lambda x: max(0, x))

# --- 5. Prepare data for plotting ---
# Original data for plotting
df_original_plot = df.copy()

# Extrapolated data for plotting
df_extrapolated_plot = df_future_total_rides.copy()

# Round the values for display consistency (optional, but good for large numbers)
df_original_plot['total_rides_display'] = df_original_plot['total_rides'].apply(lambda x: f"{int(x):,}")
df_extrapolated_plot['total_rides_display'] = df_extrapolated_plot['total_rides_extrapolated'].apply(lambda x: f"{int(x):,}")


# --- 6. Plot the Extended Data for total_rides ---
fig = go.Figure()

# Add Original data trace
fig.add_trace(go.Scatter(
    x=df_original_plot['month'],
    y=df_original_plot['total_rides'],
    mode='lines+markers+text',
    name='Original Total Rides',
    line=dict(color='blue', width=2),
    marker=dict(color='blue', size=7),
    text=df_original_plot['total_rides_display'],
    textposition='top center',
    textfont_size=10
))

# Add Extrapolated data trace
fig.add_trace(go.Scatter(
    x=df_extrapolated_plot['month'],
    y=df_extrapolated_plot['total_rides_extrapolated'],
    mode='lines+markers+text',
    name='Extrapolated Total Rides',
    line=dict(color='orange', dash='dot', width=2), # Dotted line for extrapolated
    marker=dict(color='orange', size=7),
    # Only show text labels for the *new* extrapolated points (July onwards)
    text=[val if date > last_known_month_dt else '' for val, date in zip(df_extrapolated_plot['total_rides_display'], df_extrapolated_plot['month'])],
    textposition='top center',
    textfont_size=10
))

fig.update_layout(
    title='Total Rides in 2025 (Extended with Polynomial Approximation)',
    xaxis_title='Month',
    yaxis_title='Total Rides',
    showlegend=True,
    legend=dict(
        x=0.85,
        y=0.99,
        xanchor='right',
        yanchor='top',
        traceorder="normal",
        font=dict(size=12),
        bgcolor="rgba(255,255,255,0.7)",
        bordercolor="LightSteelBlue",
        borderwidth=1
    ),
    height=600, # Adjust chart height
    width=1000  # Adjust chart width
)

# Force Y-axis to start from 0 and ensure it covers the new range
max_y_value = max(df_original_plot['total_rides'].max(), df_extrapolated_plot['total_rides_extrapolated'].max())
fig.update_yaxes(range=[0, max_y_value * 1.1]) # Add some padding to the top

# Customize x-axis ticks to show month and year, and ensure continuity
fig.update_xaxes(
    tickformat="%b %Y", # Format as "Jan 2025"
    dtick="M1",        # Show a tick for every month
    ticklabelmode="period" # Centers tick labels for months
)

fig.show()

print("\nOriginal Data for Total Rides:")
print(df_original_plot[['month', 'total_rides']].to_string())
print("\nExtrapolated Data for Total Rides (including June for continuity):")
print(df_extrapolated_plot[['month', 'total_rides_extrapolated']].to_string())

Polynomial Regression (Degree 2) Coefficients for total_rides:
  Coefficients (c_0, c_1, ..., c_n): [ 1.40842206e+08  7.29601196e+06 -4.78561500e+05]
------------------------------



Original Data for Total Rides:
       month  total_rides
0 2025-01-01    143265585
1 2025-02-01    142850511
2 2025-03-01    155907445
3 2025-04-01    155881885
4 2025-05-01    167409178
5 2025-06-01    162857928

Extrapolated Data for Total Rides (including June for continuity):
       month  total_rides_extrapolated
0 2025-06-01           165,358,228.143
1 2025-07-01           167,390,063.600
2 2025-08-01           168,464,776.057
3 2025-09-01           168,582,365.514
4 2025-10-01           167,742,831.971
5 2025-11-01           165,946,175.429
6 2025-12-01           163,192,395.886


In [437]:
df_total = df_extrapolated_plot[['month', 'total_rides_extrapolated_new']].merge(
    df_future.query("month > '2025-06-01'")[['month', 'monthly_impact_%']],
    left_on='month',
    right_on='month',
    how='inner'
    )

df_total.rename(columns={'total_rides_extrapolated_new':'total_rides', 'monthly_impact_%':'lost_rides'}, inplace=True)

In [462]:
df_future['monthly_impact_%'] = df_future['monthly_impact_%'].astype(int)

df_res_25 = pd.concat(
    [
        df_total,
        res_table[['month',  'lost_rides', 'total_rides']]
        ]
    ).sort_values('month', ascending=True)


df_res_25['impact_%'] = df_res_25['lost_rides'] / df_res_25['total_rides'] *100
df_res_25['total_rides_cumsum'] = df_res_25['lost_rides'].cumsum()
df_res_25['lost_rides_cumsum'] = df_res_25['total_rides'].cumsum()
df_res_25['cumsum_impact_%'] = df_res_25['total_rides_cumsum'] / df_res_25['lost_rides_cumsum'] *100


df_res_25

# df_res_25['lost_rides'].sum(), df_res_25['total_rides'].sum()*100


Unnamed: 0,month,total_rides,lost_rides,impact_%,total_rides_cumsum,lost_rides_cumsum,cumsum_impact_%
0,2025-01-01,143265585,15852,0.011,15852,143265585,0.011
1,2025-02-01,142850511,96866,0.068,112718,286116096,0.039
2,2025-03-01,155907445,106012,0.068,218730,442023541,0.049
3,2025-04-01,155881885,298558,0.192,517288,597905426,0.087
4,2025-05-01,167409178,512372,0.306,1029660,765314604,0.135
5,2025-06-01,162857930,706128,0.434,1735788,928172534,0.187
0,2025-07-01,172411765,705319,0.409,2441107,1100584299,0.222
1,2025-08-01,173518719,830056,0.478,3271163,1274103018,0.257
2,2025-09-01,173639836,954792,0.55,4225955,1447742854,0.292
3,2025-10-01,172775116,1079529,0.625,5305484,1620517970,0.327


In [457]:
df_res_25['total_rides'].to_list()

[143265585,
 142850511,
 155907445,
 155881885,
 167409178,
 162857930,
 172411765,
 173518719,
 173639836,
 172775116,
 170924560,
 168088167]

In [422]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go # Explicitly import go
from scipy.stats import linregress

df = res_table[['month',  'lost_rides', 'total_rides', 'monthly_impact_%']]

# Convert 'month' column to datetime objects
df['month'] = pd.to_datetime(df['month'])

# --- 2. Prepare data for linear regression ---
df['month_numeric'] = (df['month'].dt.year - 2025) * 12 + df['month'].dt.month - 1

X_train = df['month_numeric']
y_train = df['lost_rides']

slope, intercept, r_value, p_value, std_err = linregress(X_train, y_train)

print(f"Linear Regression Results:")
print(f"  Slope (m): {slope:.4f}")
print(f"  Intercept (c): {intercept:.4f}")
print(f"  R-squared: {r_value**2:.4f}")
print("-" * 30)

# --- 3. Generate Future Dates including the last known month (June) ---
last_known_month_dt = df['month'].max() # This is '2025-06-01'

# Generate dates from the last known month up to December
future_months = pd.date_range(start=last_known_month_dt, end='2025-12-01', freq='MS')

df_future = pd.DataFrame({
    'month': future_months
})
df_future['month_numeric'] = (df_future['month'].dt.year - 2025) * 12 + df_future['month'].dt.month - 1

# --- 4. Extrapolate Values ---
df_future['monthly_impact_%'] = slope * df_future['month_numeric'] + intercept

# Round the values for display consistently
df['monthly_impact_%_display'] = df['monthly_impact_%'].round(2)
df_future['monthly_impact_%_display'] = df_future['monthly_impact_%'].round(2)



# --- 5. Plot the Extended Data ---
fig = go.Figure()

# Add Original data trace (all original points)
fig.add_trace(go.Scatter(
    x=df['month'],
    y=df['monthly_impact_%'],
    mode='lines+markers+text',
    name='Original',
    line=dict(color='blue', width=2),
    marker=dict(color='blue', size=7),
    text=df['monthly_impact_%_display'],
    textposition='top center',
    textfont_size=10
))

# Add Extrapolated data trace
# This trace will start from the last original point (June),
# but visually its line will only appear from July onwards,
# creating a seamless transition.
fig.add_trace(go.Scatter(
    x=df_future['month'],
    y=df_future['monthly_impact_%'],
    # Only show lines and markers for the extrapolated part
    # We still include the last original point (June) in this trace's data
    # so the line segment can start from it.
    mode='lines+markers+text',
    name='Extrapolated',
    line=dict(color='red', dash='dot', width=2), # Dotted line for extrapolated
    marker=dict(color='red', size=7),
    # Crucially, we want text labels ONLY for the *new* extrapolated points (July onwards)
    # and not for the common June point in the extrapolated series.
    # We'll use a mask for the text.
    text=[val if date > last_known_month_dt else '' for val, date in zip(df_future['monthly_impact_%_display'], df_future['month'])],
    textposition='top center',
    textfont_size=10
))

fig.update_layout(
    title='Incident rate in 2025, % (Extended with Linear Approximation)',
    xaxis_title='Month',
    yaxis_title='Monthly Impact (%)',
    showlegend=True, # Ensure legend is visible
    width=1400,  # New width
    height=600,
    legend=dict(
        x=0.85, # Adjusted x to move it slightly left
        y=0.99, # Adjusted y to move it slightly down
        xanchor='right', # Anchor from right for x positioning
        yanchor='top', # Anchor from top for y positioning
        traceorder="normal",
        font=dict(size=12),
        bgcolor="rgba(255,255,255,0.7)", # Slightly transparent background
        bordercolor="LightSteelBlue",
        borderwidth=1
    )
)

# Force Y-axis to start from 0 and ensure it covers the new range
max_y_value = max(df['monthly_impact_%'].max(), df_future['monthly_impact_%'].max())
fig.update_yaxes(range=[0, max_y_value * 1.1]) # Add some padding to the top

# Customize x-axis ticks to show month and year, and ensure continuity
fig.update_xaxes(
    tickformat="%b %Y", # Format as "Jan 2025"
    dtick="M1",        # Show a tick for every month
    ticklabelmode="period" # Centers tick labels for months
)

fig.show()

print("\nOriginal Data for Plotting:")
print(df[['month', 'monthly_impact_%', 'monthly_impact_%_display']].to_string())
print("\nExtrapolated Data for Plotting (including June for continuity):")
print(df_future[['month', 'monthly_impact_%', 'monthly_impact_%_display']].to_string())

Linear Regression Results:
  Slope (m): 124736.5429
  Intercept (c): -43099.5238
  R-squared: 0.9225
------------------------------



Original Data for Plotting:
       month  monthly_impact_%  monthly_impact_%_display
0 2025-01-01             0.016                     0.020
1 2025-02-01             0.071                     0.070
2 2025-03-01             0.065                     0.070
3 2025-04-01             0.177                     0.180
4 2025-05-01             0.278                     0.280
5 2025-06-01             0.395                     0.400

Extrapolated Data for Plotting (including June for continuity):
       month  monthly_impact_%  monthly_impact_%_display
0 2025-06-01       580,583.190               580,583.190
1 2025-07-01       705,319.733               705,319.730
2 2025-08-01       830,056.276               830,056.280
3 2025-09-01       954,792.819               954,792.820
4 2025-10-01     1,079,529.362             1,079,529.360
5 2025-11-01     1,204,265.905             1,204,265.900
6 2025-12-01     1,329,002.448             1,329,002.450


In [191]:
liveness_standard = int(794709)
cpf = int(315573)
total_rides = liveness_standard + cpf

df_agg = df.groupby(['ride'], as_index=False)['user_id'].count()
df_agg['total'] = df_agg['user_id'].sum()
df_agg['share'] = df_agg['user_id'] / df_agg['total']
df_agg['cut_users'] = total_rides
df_agg['cut_users_share'] = (df_agg['cut_users'] * df_agg['share']).astype(int)
df_agg['avg_rides'] = df_agg['cut_users_share'] * df_agg['ride']

result = np.round(df_agg['avg_rides'].sum()/920_000_000*100, 4)
result_liv = np.round(3165230/920_000_000*100, 4)
result_cpf = np.round(1253803/920_000_000*100, 4)
result_idg = np.round((36890+5428+11201)/920_000_000*100, 4)


print(f'Total result: {np.round(result+result_idg,4)}%')
print(f'Share of Liveness: {result_liv}%')
print(f'Share of CPF: {result_cpf}%')
print(f'Share of rides by ID Graph: {result_idg}%')


Total result: 0.4862%
Share of Liveness: 0.344%
Share of CPF: 0.1363%
Share of rides by ID Graph: 0.0058%


In [194]:
df_incidents = read_bq("""
SELECT t1.monthly,
       t1.incidents,
       t1.conf_incidents,
       t2.rides,
       (t1.conf_incidents / t1.incidents) * 100 AS share_of_confirmed,
       (t1.incidents / t2.rides) * 100000       AS incident_rate,
       (t1.conf_incidents / t2.rides) * 100000  AS conf_incident_rate
FROM (SELECT DATE_TRUNC(incident_date, WEEK)                               AS monthly,
             COUNT(redmine_id)                                             AS incidents,
             COUNT(IF(information_status = 'Confirmed', redmine_id, NULL)) AS conf_incidents
      FROM indriver-bi.safety.vw_safety_incidents_detail t1
               JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                    ON
                        t1.city_id = geo.city_id
      WHERE incident_date BETWEEN '2025-01-01' AND '2025-06-30'
        AND geo.macroregion_name IN ('Latin America', 'Brazil', 'Africa')
      GROUP BY 1) t1
         LEFT JOIN (SELECT DATE_TRUNC(created_date_order_part, WEEK) AS monthly,
                           COUNT(DISTINCT order_uuid)                AS rides
                    FROM indriver-e6e40.emart.incity_detail t1
                             JOIN indriver-e6e40.heap.vw_macroregion_mapping geo
                                  ON
                                      t1.city_id = geo.city_id
                    WHERE 1 = 1
                      AND created_date_order_part BETWEEN '2025-01-01' AND '2025-06-30'
                      AND driverdone_timestamp IS NOT NULL
                      AND status_order = 'done'
                      AND geo.macroregion_name IN ('Latin America', 'Brazil', 'Africa')
                    GROUP BY 1) t2 ON t1.monthly = t2.monthly
""")

df_incidents.head()

Unnamed: 0,monthly,incidents,conf_incidents,rides,share_of_confirmed,incident_rate,conf_incident_rate
0,2025-05-04,7631,1714,22202051,22.46,34.37,7.72
1,2025-01-05,7328,1695,17750075,23.13,41.28,9.55
2,2024-12-29,4198,886,9452521,21.11,44.41,9.37
3,2025-02-02,6248,1587,20452551,25.4,30.55,7.76
4,2025-04-27,7558,1712,21768050,22.65,34.72,7.86


In [216]:
fig = px.line(
    df_incidents.groupby(['monthly'], as_index=False)[['incident_rate', 'conf_incident_rate', 'conf_incidents', 'incidents', 'share_of_confirmed']].max().round(2), 
    x="monthly", 
    y='incidents', 
    title='Incident rate in 2025, %',
    text='incidents'
    )

fig.update_traces(mode='lines+markers+text', # Add 'text' to mode to show labels
                  textposition='top center'  # Position the text above the marker
                 )

fig.update_yaxes(rangemode="tozero")

fig.show()

In [316]:
df_monthly.groupby(['monthly'], as_index=False)['user_id'].count()

Unnamed: 0,monthly,user_id
0,2025-03-01,6490
1,2025-04-01,4000
2,2025-05-01,3706
3,2025-06-01,3466


In [336]:
coeff = [(4000/6490), (3706/4000), (3466/3706), (3466/3706)*1.01, (3466/3706)*1.013, (3466/3706)*1.0106]
coeff

[0.6163328197226502,
 0.9265,
 0.9352401511063141,
 0.9445925526173773,
 0.9473982730706961,
 0.945153696708041]

In [458]:
df_liveness = read_bq("""
WITH grouped AS (SELECT segment,
                        DATE_TRUNC(event_dt_part, MONTH) AS monthly,
                        COUNT(DISTINCT user_id)          AS users
                 FROM (SELECT t1.user_id,
                              t1.event_dt_part,
                              t1.name,
                              t1.city_id,
                              t2.city_name,
                              CASE
                                  WHEN t1.city_id IN
                                       (6587, 4230, 5495, 4272, 4396, 4155, 4825, 5291, 4234, 4404, 5548, 4143, 4198,
                                        4225, 4227, 4255,
                                        4197,
                                        4243, 5483, 4518, 4377, 4532, 4521, 4537, 4758, 4163, 4534, 4519, 4375)
                                      THEN 'SP 2025'
                                  ELSE 'all time SP'
                                  END segment,
                              t2.country_name
                       FROM indriver-e6e40.emart.product_event t1
                                JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                     ON
                                         t1.city_id = t2.city_id
                       WHERE 1 = 1
                         AND name IN (
                           'client.verification_start.click'
                           )
                         AND t1.event_dt_part BETWEEN '2024-09-01' AND CURRENT_DATE()
                         AND t2.macroregion_name IN ('Latin America', 'Brazil', 'Africa')
                       QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY client_time) =
                               1)
                 GROUP BY 1, 2)
SELECT *,
       SUM(users) OVER (PARTITION BY monthly)                         AS total_users,
       ROUND(users / SUM(users) OVER (PARTITION BY monthly) * 100, 2) AS share_of_checks
FROM grouped
ORDER BY monthly, segment
""")

df_liveness.head()

KeyboardInterrupt: 