# Packages

In [6]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.4f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from sklearn.linear_model import Ridge
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error
import AB_library
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
from math import ceil


# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """Run a SQL query in BigQuery and return the result as a DataFrame.

    Args:
        query: SQL text handed to ``Client.query``.
        project: Project used to build the ``bigquery.Client`` for this call.

    Returns:
        Whatever ``to_dataframe()`` yields for the submitted query job.
    """
    # A fresh client bound to `project` is created on every call.
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Args:
        *args: Objects exposing ``to_html()`` (e.g. pandas DataFrames). Each
            one is rendered as an inline table, in the order given.
    """
    # Imported locally: the notebook's visible import cell only brings in
    # `display` and `HTML`, so a bare `display_html` would raise NameError.
    from IPython.display import display_html

    html_str = ''.join(df.to_html() for df in args)
    display_html(
        # Only patch opening <table> tags; replacing the bare word 'table'
        # would also rewrite closing tags and any cell text containing it.
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True
    )

def cycle_sql(start, end, query, weeks=False):
    """
    Run `query` once per date between `start` and `end` and stack the results.

    The query text must contain a `{date}` placeholder; it is filled with each
    date of the cycle (formatted as YYYY-MM-DD) via `str.format`.

    Args:
        start: First date of the cycle, 'YYYY-MM-DD'.
        end: Last date of the cycle (inclusive), 'YYYY-MM-DD'.
        query: SQL template containing `{date}`.
        weeks: When truthy, step by 7 days from `start` instead of by 1 day.

    Returns:
        One DataFrame with the per-date results concatenated, the latest date
        first (original row indexes are kept). An empty DataFrame is returned
        when `end` is earlier than `start`.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    total_days = (date_end - date_start).days

    if weeks:
        step = timedelta(weeks=1)
        steps_count = total_days // 7 + 1  # whole weeks that fit, plus the start date
    else:
        step = timedelta(days=1)
        steps_count = total_days + 1
    daterange = [(date_start + step * i).strftime('%Y-%m-%d') for i in range(steps_count)]

    # Collect the pieces and concatenate once at the end: calling pd.concat
    # inside the loop re-copies everything accumulated so far on each pass.
    frames = []
    for counter, date in enumerate(daterange, start=1):
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date=date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        return pd.DataFrame()

    # Reversed so the most recent date comes first, as callers already expect.
    return pd.concat(frames[::-1])

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four DataFrames into a single workbook named ``{name}.xlsx``.

    Every dataset that is not None is stored on its own sheet called
    ``{position}-{name}`` (position is 1-4, matching the argument it was
    passed as). The DataFrame index is written too. A dataset is written
    even when an earlier argument was left as None.

    Args:
        name: Base name used for both the file and the sheet names.
        dataset1..dataset4: Optional objects exposing ``to_excel``.
    """
    datasets = (dataset1, dataset2, dataset3, dataset4)

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for position, dataset in enumerate(datasets, start=1):
            if dataset is None:
                continue
            dataset.to_excel(writer, sheet_name=f"{position}-{name}")

    # Reported only after the writer has been closed, i.e. the file was saved.
    print('DataFrame is written to Excel File successfully.')

# Analysis

In [13]:
import pandas_gbq

# Column names of the headerless CSV export, listed in file order.
connection_columns = [
    'uuid',
    'user_id_from',
    'user_id_to',
    'connection_type',
    'connection_deep_level',
    'connection_chain',
    'created_at',
    'updated_at',
    'confidence_level',
    'confirmation_status',
    'subtype',
]

df = pd.read_csv(
    '/Users/renatyunison/Desktop/VSC scripts/Working/user_connections_user_connections.csv',
    header=None,
)
# Positional labels 0..10 -> readable names
df = df.rename(columns=dict(enumerate(connection_columns)))

# 1. Cast every text column to object (noted as the most pyarrow-compatible dtype)
for text_col in ('uuid', 'connection_type', 'connection_chain', 'confirmation_status', 'subtype'):
    df[text_col] = df[text_col].astype('object')

# 2. Dates and numbers
for ts_col in ('created_at', 'updated_at'):
    # utc=True so the values match a BigQuery TIMESTAMP
    df[ts_col] = pd.to_datetime(df[ts_col], utc=True)

for int_col in ('user_id_from', 'user_id_to', 'connection_deep_level'):
    # Capital-I Int64 is the nullable integer dtype, so missing values are kept
    df[int_col] = df[int_col].astype('Int64')

df['confidence_level'] = df['confidence_level'].astype('float64')

# Append the prepared rows to the destination table
pandas_gbq.to_gbq(
    df,
    destination_table='renat_yunisov.drivers_connection',
    project_id='analytics-dev-333113',
    if_exists='append',
)
print('Dataframe was uploaded', datetime.today().strftime('%Y-%m-%d %H:%M:%S'))

100%|██████████| 1/1 [00:00<00:00, 7681.88it/s]

Dataframe was uploaded 2025-10-02 20:54:04





In [66]:
# Pull the connection pairs enriched with incident, ban and activity data.
# CTEs used by the query:
#   incidents - the most recent incident per aggressor (passenger or driver id,
#               depending on `aggressor`), confirmed or ML-decided, since 2020-10-01.
#   ban       - per user, the ban with the largest end-start span in days; only bans
#               longer than 3600 days, not unbanned, and with type_cd outside (7, 9).
#   rides     - per user_id totals of orders / tenders / rides for user_type 'driver',
#               summed over the cargo, incity, delivery (couriers) and intercity tables.
#   res       - rows of drivers_connection with confidence_level >= 0.99 whose
#               user_id_from has a ban that started within the last year, joined with
#               incidents and bans for both sides of the connection.
# The final SELECT adds the activity counters of user_id_to and keeps only rows where
# the user_id_to ban started within the last year (rows without such a ban are dropped,
# because a NULL start date fails the BETWEEN test).
# NOTE(review): `t4.type_cd NOT IN (7)` looks always true here, since the ban CTE
# already excludes type_cd 7 - confirm whether the OR condition is still needed.
df_anal = read_bq("""
WITH incidents AS (SELECT redmine_id,
                          incident_date,
                          incident_type,
                          incident_level,
                          pass_id,
                          driver_id,
                          aggressor,
                          CASE
                              WHEN aggressor = 'Passenger' THEN pass_id
                              WHEN aggressor = 'Driver' THEN driver_id
                              ELSE NULL END aggressor_id
                   FROM indriver-bi.safety.vw_safety_incidents_detail
                   WHERE 1 = 1
                     AND information_status IN ('Confirmed', 'Automated ML decision')
                     AND incident_date >= '2020-10-01'
                   QUALIFY ROW_NUMBER() OVER (PARTITION BY aggressor_id ORDER BY incident_date DESC) = 1),
     ban AS ((SELECT user_id,
                     type_cd,
                     start_dttm,
                     end_dttm,
                     REGEXP_EXTRACT(
                             moderation_note,
                             r'#(\d+)'
                     )                                    AS moderation_note,
                     DATE_DIFF(end_dttm, start_dttm, DAY) AS diff_days
              FROM dwh-storage-327422.ods_expel.tbl_ban
              WHERE unban_applied_cd = 0
                AND type_cd NOT IN (7, 9)
                AND DATE_DIFF(end_dttm
                        , start_dttm
                        , DAY)
                  > 3600
              QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY DATE_DIFF(end_dttm, start_dttm, DAY) DESC) = 1)),
     rides AS (SELECT user_id,
                      SUM(orders_count)  AS orders_count,
                      SUM(tenders_count) AS tenders_count,
                      SUM(rides_count)   AS rides_count
               FROM (SELECT user_id,
                            SUM(orders_count)  AS orders_count,
                            SUM(tenders_count) AS tenders_count,
                            SUM(rides_count)   AS rides_count,
                     FROM indriver-bi.cargo.tbl_cargo_growth_metrics_detail
                     WHERE user_type = 'driver'
                     GROUP BY 1
                     UNION ALL
                     SELECT user_id,
                            SUM(orders_count)  AS orders_count,
                            SUM(tenders_count) AS tenders_count,
                            SUM(rides_count)   AS rides_count,
                     FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                     WHERE user_type = 'driver'
                     GROUP BY 1
                     UNION ALL
                     SELECT user_id,
                            SUM(orders_count)  AS orders_count,
                            SUM(tenders_count) AS tenders_count,
                            SUM(rides_count)   AS rides_count,
                     FROM indriver-bi.delivery.tbl_couriers_growth_metrics_detail
                     WHERE user_type = 'driver'
                     GROUP BY 1
                     UNION ALL
                     SELECT user_id,
                            SUM(orders_count)  AS orders_count,
                            SUM(tenders_count) AS tenders_count,
                            SUM(rides_count)   AS rides_count,
                     FROM indriver-bi.intercity.tbl_intercity_growth_metrics_detail
                     WHERE user_type = 'driver'
                     GROUP BY 1)
               GROUP BY 1),
     res AS (SELECT t1.*,
                    t2.aggressor_id    AS user_id_from_aggressor,
                    t2.incident_level  AS user_id_from_incident_level,
                    t2.incident_type   AS user_id_from_incident_type,
                    t4.user_id         AS user_id_from_ban,
                    t4.start_dttm      AS user_id_from_start,
                    t4.end_dttm        AS user_id_from_end,
                    t4.type_cd         AS user_id_from_type_ban,
                    t4.moderation_note AS user_id_from_moder_nore,
                    t3.aggressor_id    AS user_id_to_aggressor,
                    t3.incident_level  AS user_id_to_incident_level,
                    t3.incident_type   AS user_id_to_incident_type,
                    t5.user_id         AS user_id_to_ban,
                    t5.start_dttm      AS user_id_to_start,
                    t5.end_dttm        AS user_id_to_end,
                    t5.type_cd         AS user_id_to_type_ban,
                    t5.moderation_note AS user_id_to_moder_nore
             FROM analytics-dev-333113.renat_yunisov.drivers_connection t1
                      LEFT JOIN incidents t2
                                ON t1.user_id_from = t2.aggressor_id
                      LEFT JOIN incidents t3 ON t1.user_id_to = t3.aggressor_id
                      LEFT JOIN ban t4 ON t1.user_id_from = t4.user_id
                      LEFT JOIN ban t5 ON t1.user_id_to = t5.user_id
             WHERE 1 = 1
               AND confidence_level >= 0.99
               AND t4.user_id IS NOT NULL
               AND DATE(t4.start_dttm) BETWEEN DATE_ADD(CURRENT_DATE(), INTERVAL - 1 YEAR) AND CURRENT_DATE()
               AND (t4.type_cd NOT IN (7) OR t2.aggressor_id IS NOT NULL))
SELECT *
FROM res t1
         LEFT JOIN rides t2 ON t1.user_id_to = t2.user_id
WHERE DATE(user_id_to_start) BETWEEN DATE_ADD(CURRENT_DATE(), INTERVAL - 1 YEAR) AND CURRENT_DATE()
""")

# Missing activity counters are replaced with zero.
for counter_col in ('orders_count', 'tenders_count', 'rides_count'):
    df_anal[counter_col] = df_anal[counter_col].fillna(0)

df_anal.head()

Unnamed: 0,uuid,user_id_from,user_id_to,connection_type,connection_deep_level,connection_chain,created_at,updated_at,confidence_level,confirmation_status,subtype,user_id_from_aggressor,user_id_from_incident_level,user_id_from_incident_type,user_id_from_ban,user_id_from_start,user_id_from_end,user_id_from_type_ban,user_id_from_moder_nore,user_id_to_aggressor,user_id_to_incident_level,user_id_to_incident_type,user_id_to_ban,user_id_to_start,user_id_to_end,user_id_to_type_ban,user_id_to_moder_nore,user_id,orders_count,tenders_count,rides_count
0,0199a10b-7406-70ce-a05a-70c187b55bb1,296524266,296528267,liveness,1,[296528267],2025-10-01 18:31:46+00:00,2025-10-01 18:31:46+00:00,1.0,,SUBTYPE_CONTRACTOR_PHOTO_CONTRACTOR_PHOTO,,,,296524266,2025-05-26 13:33:24+00:00,2125-05-26 13:33:24+00:00,3,,,,,296528267,2025-05-22 12:55:42+00:00,2125-05-22 12:55:42+00:00,3,,,0,0,0
1,0199a10b-7438-70ea-b634-53e70583b230,296379089,296528267,liveness,1,[296528267],2025-10-01 18:31:46+00:00,2025-10-01 18:31:46+00:00,1.0,,SUBTYPE_CONTRACTOR_PHOTO_CONTRACTOR_PHOTO,,,,296379089,2025-05-22 12:59:50+00:00,2125-05-22 12:59:50+00:00,3,,,,,296528267,2025-05-22 12:55:42+00:00,2125-05-22 12:55:42+00:00,3,,,0,0,0
2,0199a10b-7459-70ef-9dd1-56d3206c3019,296526122,296528267,liveness,1,[296528267],2025-10-01 18:31:47+00:00,2025-10-01 18:31:46+00:00,1.0,,SUBTYPE_CONTRACTOR_PHOTO_CONTRACTOR_PHOTO,,,,296526122,2025-05-23 14:27:08+00:00,2125-05-23 14:27:08+00:00,3,,,,,296528267,2025-05-22 12:55:42+00:00,2125-05-22 12:55:42+00:00,3,,,0,0,0
3,01999db7-d854-720d-9539-e7d0eeb2a5e1,261826605,266653091,liveness,1,[266653091],2025-10-01 03:01:35+00:00,2025-10-01 03:01:35+00:00,1.0,,SUBTYPE_CONTRACTOR_PHOTO_CUSTOMER_LIVENESS,,,,261826605,2024-11-22 12:20:38+00:00,2094-11-23 12:20:38+00:00,3,,,,,266653091,2024-11-25 06:06:56+00:00,2094-11-25 06:06:58+00:00,2,,,0,0,0
4,01999c2f-192f-7d9c-b627-79981fec7861,266374131,266860252,liveness,1,[266860252],2025-09-30 19:52:36+00:00,2025-09-30 19:52:36+00:00,0.9967,,SUBTYPE_CONTRACTOR_PHOTO_CUSTOMER_LIVENESS,,,,266374131,2024-11-25 06:06:56+00:00,2094-11-25 06:06:58+00:00,2,,,,,266860252,2024-12-11 10:44:44+00:00,2092-12-29 13:58:45+00:00,3,,266860252.0,3,3,3


In [67]:
# Rows where the source user has a ban: show the banned id together with the ban type.
has_from_ban = df_anal['user_id_from_ban'].notna()
df_anal.loc[has_from_ban, ['user_id_from_ban', 'user_id_from_type_ban']]

Unnamed: 0,user_id_from_ban,user_id_from_type_ban
0,296524266,3
1,296379089,3
2,296526122,3
3,261826605,3
4,266374131,2
5,266382238,2
6,266636194,2
7,266344455,2
8,266653091,2
9,261826605,3


In [68]:
# Among rows whose source user is an incident aggressor, count the distinct
# aggressor ids on each side of the connection.
from_is_aggressor = df_anal['user_id_from_aggressor'].notna()
df_anal.loc[from_is_aggressor, ['user_id_from_aggressor', 'user_id_to_aggressor']].nunique()

user_id_from_aggressor    17
user_id_to_aggressor       9
dtype: int64

In [69]:
# NOTE(review): ad-hoc ratio; where 29 and 540 come from is not shown in this
# notebook - TODO name the inputs or derive them from the data.
29 / 540

0.053703703703703705

In [70]:
# Share in percent. NOTE(review): 1522 and 620 are hard-coded; their source
# is not visible here - TODO confirm.
share_pct = 1522 / (1522 + 620) * 100
print(share_pct)

# Distinct source users per connection subtype.
by_subtype = df_anal.groupby('subtype', as_index=False)
by_subtype['user_id_from'].nunique()

71.05508870214753


Unnamed: 0,subtype,user_id_from
0,SUBTYPE_CONTRACTOR_PHOTO_CONTRACTOR_PHOTO,24
1,SUBTYPE_CONTRACTOR_PHOTO_CUSTOMER_LIVENESS,38


In [71]:
# Keep only the ban / incident / activity columns used for the segmentation.
# .copy() makes df_res an independent frame: new columns are added to it
# below, and assigning into a slice of df_anal triggers pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter).
df_res = df_anal[[
    'user_id_from_ban', 'user_id_from_type_ban', 'user_id_from_aggressor', 
    'user_id_from_incident_level', 'user_id_to_ban', 'user_id_to_type_ban', 'subtype',
    'user_id_to_aggressor', 'user_id_to_incident_level', 'orders_count', 'tenders_count', 'rides_count'
    ]].copy()

# Total activity: orders + tenders + rides.
df_res['composite'] = df_res['orders_count'] + df_res['tenders_count'] + df_res['rides_count']


def checker(row):
    """Label a value as '> 0' when it is strictly positive, otherwise as '0'."""
    return '> 0' if row > 0 else '0'
    
# Build a '0' / '> 0' segment label for each activity counter.
segment_sources = (
    ('segment_orders', 'orders_count'),
    ('segment_rides', 'rides_count'),
    ('segment_tenders', 'tenders_count'),
    ('segment_composite', 'composite'),
)
for segment_col, source_col in segment_sources:
    df_res[segment_col] = df_res[source_col].apply(checker)

df_res.head()

Unnamed: 0,user_id_from_ban,user_id_from_type_ban,user_id_from_aggressor,user_id_from_incident_level,user_id_to_ban,user_id_to_type_ban,subtype,user_id_to_aggressor,user_id_to_incident_level,orders_count,tenders_count,rides_count,composite,segment_orders,segment_rides,segment_tenders,segment_composite
0,296524266,3,,,296528267,3,SUBTYPE_CONTRACTOR_PHOTO_CONTRACTOR_PHOTO,,,0,0,0,0,0,0,0,0
1,296379089,3,,,296528267,3,SUBTYPE_CONTRACTOR_PHOTO_CONTRACTOR_PHOTO,,,0,0,0,0,0,0,0,0
2,296526122,3,,,296528267,3,SUBTYPE_CONTRACTOR_PHOTO_CONTRACTOR_PHOTO,,,0,0,0,0,0,0,0,0
3,261826605,3,,,266653091,2,SUBTYPE_CONTRACTOR_PHOTO_CUSTOMER_LIVENESS,,,0,0,0,0,0,0,0,0
4,266374131,2,,,266860252,3,SUBTYPE_CONTRACTOR_PHOTO_CUSTOMER_LIVENESS,,,3,3,3,9,> 0,> 0,> 0,> 0


In [72]:
# Distinct banned target users per ban type code.
grouped_by_ban_type = df_res.groupby('user_id_to_type_ban', as_index=False)
grouped_by_ban_type['user_id_to_ban'].nunique()

Unnamed: 0,user_id_to_type_ban,user_id_to_ban
0,1,3
1,2,8
2,3,41
3,8,1


In [73]:
# Ids of banned target users whose composite activity falls in the '0' segment.
no_activity = df_res['segment_composite'].eq('0')
has_to_ban = df_res['user_id_to_ban'].notna()
df_res.loc[no_activity & has_to_ban, 'user_id_to_ban'].unique()

<IntegerArray>
[296528267, 266653091, 266374131, 310040520, 296670915, 258117403, 288959314, 296526122, 286120590, 307129147, 266382238, 297001739, 267472912, 297927373, 301459778, 296524266, 266636194, 279365510, 309013222, 296673482, 296382516, 296669164, 266344455, 243519250, 269859589, 249809759, 296667541, 299190682]
Length: 28, dtype: Int64

In [74]:
# Distinct banned users on each side of the connection, per composite-activity segment.
ban_id_cols = ['user_id_from_ban', 'user_id_to_ban']
by_segment = df_res.groupby(['segment_composite'], as_index=False)
by_segment[ban_id_cols].nunique()

Unnamed: 0,segment_composite,user_id_from_ban,user_id_to_ban
0,0,31,28
1,> 0,35,25
