# Packages

In [None]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.4f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast
import json

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from sklearn.linear_model import Ridge
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error
import AB_library
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
from math import ceil


# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """
    Run a SQL query on BigQuery and return the result as a pandas DataFrame.

    A new ``bigquery.Client`` is created for ``project`` on every call.
    """
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """
    Render several DataFrames next to each other in the notebook output.

    Each positional argument must provide ``to_html()``. The generated tables
    get an inline display style so that they are laid out in a row instead of
    being stacked vertically.
    """
    # display_html is not among the names imported at the top of the file,
    # so it is imported here to keep the function self-contained.
    from IPython.display import display_html

    html_str = ''.join(df.to_html() for df in args)
    # Patch only the opening <table tag: a blanket replace of 'table' would also
    # rewrite the closing tags and any cell text that contains the word "table".
    display_html(
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True
    )

def cycle_sql(start, end, query, weeks=False):
    """
    Run ``query`` once per date between ``start`` and ``end`` and stack the results.

    You have to use {date} in your script: each cycle date is inserted into
    these brackets (via ``str.format``) as a 'YYYY-MM-DD' string.

    Parameters:
        start, end: inclusive bounds as 'YYYY-MM-DD' strings.
        query: SQL text containing the {date} placeholder.
        weeks: when truthy, step from ``start`` in 7-day increments instead of
            daily.

    Returns:
        DataFrame with the results of all runs concatenated, the latest cycle
        date first. An empty DataFrame is returned when the date range is empty
        (``end`` earlier than ``start``).

    Queries are executed through the module-level ``bigquery_client``.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    span_days = (date_end - date_start).days

    if weeks:
        # Whole weeks that fit into the span, plus the start date itself
        step, n_steps = timedelta(weeks=1), span_days // 7 + 1
    else:
        step, n_steps = timedelta(days=1), span_days + 1
    daterange = [(date_start + step * i).strftime('%Y-%m-%d') for i in range(n_steps)]

    # Collect the per-date frames and concatenate once at the end; concatenating
    # inside the loop copies the accumulated data again on every iteration.
    frames = []
    for counter, date in enumerate(daterange, start=1):
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date=date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        return pd.DataFrame()

    # Reversed so that the most recent date comes first, matching the ordering
    # this function has always produced.
    return pd.concat(frames[::-1])

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """
    Write up to four DataFrames into the sheets of a single ``{name}.xlsx`` file.

    Every dataset that is not None is written (index included) to the sheet
    ``"<position>-<name>"``, where position is 1 for ``dataset1``, 2 for
    ``dataset2`` and so on. A None in the middle no longer causes the datasets
    after it to be dropped.

    Parameters:
        name: output file name without extension; also used as sheet-name suffix.
        dataset1..dataset4: objects providing ``to_excel`` (DataFrames), or None.
    """
    datasets = (dataset1, dataset2, dataset3, dataset4)

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for position, dataset in enumerate(datasets, start=1):
            if dataset is not None:
                dataset.to_excel(writer, sheet_name=f"{position}-{name}")

    # Reported only after the writer has been closed, i.e. the file was saved.
    print('DataFrame is written to Excel File successfully.')

# Analysis

In [76]:
# Load driver complaint events together with the order they refer to and, when
# available, a later safety incident attributed to the same passenger.
#   complaints - 'client.driver_feed_complaint.show' / '.click' events; tags and
#                text are extracted from the JSON payload.
#   incidents  - safety incidents with aggressor = 'Passenger', reduced to the
#                latest incident_date per pass_id (QUALIFY ROW_NUMBER() = 1).
#   orders     - union of the incity, intercity and delivery order tables.
# Complaints are inner-joined to orders on order_uuid and on the complaining
# user being the order's driver; incidents are left-joined on the passenger and
# must have a timestamp later than the order timestamp.
# SELECT * returns order_uuid and driver_id more than once; in the resulting
# frame the repeats show up as order_uuid_1 / driver_id_1.
df = read_bq("""
WITH complaints AS (SELECT name,
                           user_id,
                           JSON_EXTRACT(payload, '$.tags') AS tags,
                           JSON_EXTRACT(payload, '$.text') AS text,
                           payload,
                           client_time_ts,
                           order_uuid
                    FROM indriver-e6e40.emart.product_event
                    WHERE name IN ('client.driver_feed_complaint.show',
                                   'client.driver_feed_complaint.click')
                      AND event_dt_part >= '2025-10-13'),
     incidents AS (SELECT redmine_id,
                          incident_date,
                          order_timestamp AS incident_ts,
                          driver_id,
                          pass_id         AS aggressor_id,
                          category,
                          incident_level,
                          information_status
                   FROM indriver-bi.safety.vw_safety_incidents_detail
                   WHERE 1 = 1
                     AND aggressor = 'Passenger'
                     AND incident_date >= '2025-10-10'
                   QUALIFY ROW_NUMBER() OVER (PARTITION BY pass_id ORDER BY incident_date DESC) = 1),
     orders AS (SELECT user_id AS pass_id, driver_id, order_uuid, order_id, order_timestamp
                FROM indriver-e6e40.emart.incity_detail
                WHERE created_date_order_part >= '2025-10-10'
                UNION ALL
                SELECT user_id AS pass_id, driver_id, order_uuid, order_id, order_timestamp
                FROM indriver-e6e40.emart.intercity_detail
                WHERE created_date_order_part >= '2025-10-10'
                UNION ALL
                SELECT user_id AS pass_id, driver_id, order_uuid, order_id, order_timestamp
                FROM indriver-e6e40.emart.delivery_detail
                WHERE created_date_order_part >= '2025-10-10')
SELECT *
FROM complaints t1
         JOIN orders t2 ON t1.order_uuid = t2.order_uuid AND t1.user_id = t2.driver_id
         LEFT JOIN incidents t3 ON t2.pass_id = t3.aggressor_id AND t3.incident_ts > t2.order_timestamp
             """)

df.head()

Unnamed: 0,name,user_id,tags,text,payload,client_time_ts,order_uuid,pass_id,driver_id,order_uuid_1,order_id,order_timestamp,redmine_id,incident_date,incident_ts,driver_id_1,aggressor_id,category,incident_level,information_status
0,client.driver_feed_complaint.click,52351915,"""[\""suspicious_activity\""]""","""""","{""appsflyer_device_id"":""1671345840397-2386201557952562867"",""city_id"":""4193"",""order_id"":""0199e518-24fc-800c-c050-ce60c0a32959"",""tags"":""[\""suspicious_activity\""]"",""text"":"""",""user_id"":""52351915""}",2025-10-14 23:41:57+00:00,0199e518-24fc-800c-c050-ce60c0a32959,14289337,52351915,0199e518-24fc-800c-c050-ce60c0a32959,2523204211556775479,2025-10-14 17:39:48+00:00,,NaT,NaT,,,,,
1,client.driver_feed_complaint.click,218384160,"""[\""low_price\""]""","""""","{""appsflyer_device_id"":""1739291837539-6039739127279417428"",""city_id"":""4143"",""order_id"":""0199e381-d114-800c-c07f-05613e23f7e8"",""tags"":""[\""low_price\""]"",""text"":"""",""user_id"":""218384160""}",2025-10-14 16:17:37+00:00,0199e381-d114-800c-c07f-05613e23f7e8,292419497,218384160,0199e381-d114-800c-c07f-05613e23f7e8,1238885464189313317,2025-10-14 10:15:59+00:00,,NaT,NaT,,,,,
2,client.driver_feed_complaint.click,119422529,"""[\""low_price\""]""","""""","{""appsflyer_device_id"":""1757685057344-7702484499022102842"",""city_id"":""5448"",""order_id"":""0199e1f6-6a72-803c-c074-144d197b7b35"",""tags"":""[\""low_price\""]"",""text"":"""",""user_id"":""119422529""}",2025-10-14 09:06:04+00:00,0199e1f6-6a72-803c-c074-144d197b7b35,300311853,119422529,0199e1f6-6a72-803c-c074-144d197b7b35,2179732976460348726,2025-10-14 14:04:06+00:00,,NaT,NaT,,,,,
3,client.driver_feed_complaint.click,176525247,"""[\""sexual_content\"",\""suspicious_activity\""]""","""""","{""appsflyer_device_id"":""1749733543451-6918949578580495983"",""city_id"":""4194"",""order_id"":""0199e509-e451-800c-c06f-ae52837acd73"",""tags"":""[\""sexual_content\"",\""suspicious_activity\""]"",""text"":"""",""user_id"":""176525247""}",2025-10-14 22:27:14+00:00,0199e509-e451-800c-c06f-ae52837acd73,14621968,176525247,0199e509-e451-800c-c06f-ae52837acd73,1209781689953842903,2025-10-14 17:24:14+00:00,,NaT,NaT,,,,,
4,client.driver_feed_complaint.click,159495584,"""[\""low_price\""]""","""""","{""appsflyer_device_id"":""1757368179023-6631326609715579276"",""city_id"":""4255"",""order_id"":""0199e35a-1903-8016-c053-4ce75e488064"",""tags"":""[\""low_price\""]"",""text"":"""",""user_id"":""159495584""}",2025-10-14 15:37:01+00:00,0199e35a-1903-8016-c053-4ce75e488064,281535936,159495584,0199e35a-1903-8016-c053-4ce75e488064,9153992113728156417,2025-10-14 10:32:36+00:00,,NaT,NaT,,,,,


In [77]:
def extract_tags_list(json_str):
    """
    Decode a JSON document and return the decoded value.

    Returns None when ``json_str`` is missing (None / NaN), is not text
    (``str`` / ``bytes`` / ``bytearray``) or cannot be decoded as JSON.
    """
    # The type check covers None and NaN, and it also keeps numbers or
    # list-like values from raising inside json.loads.
    if not isinstance(json_str, (str, bytes, bytearray)):
        return None
    try:
        return json.loads(json_str)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
        return None


def safe_literal_eval(val):
    """
    Turn a tag value into a list.

    * a list is returned as a (shallow-copied) list;
    * a string that looks like a list literal (``[...]``) is parsed with
      ``ast.literal_eval``; if parsing fails the raw string is wrapped in a list;
    * any other string is wrapped in a one-element list;
    * everything else (None, NaN, numbers, ...) becomes an empty list.
    """
    if isinstance(val, list):
        # Already parsed upstream - keep the tags instead of discarding them.
        return list(val)

    if not isinstance(val, str):
        return []

    if not (val.startswith('[') and val.endswith(']')):
        return [val]

    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        return [val]
    


# Two-step decode of the tags column: first the JSON layer, then the list
# literal that the decoded string contains (see the sample rows above, where
# tags looks like "[\"low_price\"]").
df['tags_list'] = df['tags'].apply(extract_tags_list).apply(safe_literal_eval)

# Explicit copy: later cells add columns to df_res, which must not be done on a
# slice of df (pandas would emit SettingWithCopyWarning).
df_res = df[['pass_id', 'driver_id', 'tags_list', 'text', 'redmine_id', 'category', 'incident_level']].copy()

df_res

Unnamed: 0,pass_id,driver_id,tags_list,text,redmine_id,category,incident_level
0,14289337,52351915,[suspicious_activity],"""""",,,
1,292419497,218384160,[low_price],"""""",,,
2,300311853,119422529,[low_price],"""""",,,
3,14621968,176525247,"[sexual_content, suspicious_activity]","""""",,,
4,281535936,159495584,[low_price],"""""",,,
...,...,...,...,...,...,...,...
72327,116004036,220237551,[drugs],"""""",,,
72328,138739862,278574417,[low_price],"""""",,,
72329,50141575,202138167,"[long_distance, low_price]","""""",,,
72330,48400140,253678349,"[low_price, long_distance]","""""",,,


In [109]:
# Number of complaint rows per passenger (count of non-null driver_id values);
# the count is stored in the 'driver_id' column.
df_res_1 = df_res.groupby('pass_id', as_index=False)['driver_id'].count()

# Attach the distinct incident ids (redmine_id) seen for each passenger.
df_res_2 = pd.merge(
    df_res_1,
    df_res[['pass_id', 'redmine_id']].drop_duplicates(),
    on='pass_id',
    how='left',
)

# For every complaint count: distinct passengers and distinct incidents.
df_me = (
    df_res_2
    .groupby('driver_id', as_index=False)[['pass_id', 'redmine_id']]
    .nunique()
    .rename(columns={
        'driver_id': 'threshold',
        'pass_id': 'total_users_at_threshold',
        'redmine_id': 'true_positives',
    })
)

TOTAL_POPULATION = df_me['total_users_at_threshold'].sum()
TOTAL_POSITIVES = df_me['true_positives'].sum()
TOTAL_NEGATIVES = TOTAL_POPULATION - TOTAL_POSITIVES

# NOTE(review): every row describes passengers with exactly `threshold`
# complaints, not "threshold or more" - confirm that this is the intended
# definition for the confusion-matrix metrics below.
# F1_Score is NaN where Precision + Recall is 0 (0 / 0).
df_me = df_me.assign(
    TP=lambda d: d['true_positives'],
    FN=lambda d: TOTAL_POSITIVES - d['TP'],
    FP=lambda d: d['total_users_at_threshold'] - d['TP'],
    TN=lambda d: TOTAL_NEGATIVES - d['FP'],
    Recall=lambda d: d['TP'] / TOTAL_POSITIVES * 100,
    Precision=lambda d: d['TP'] / (d['TP'] + d['FP']) * 100,
    FPR=lambda d: d['FP'] / TOTAL_NEGATIVES * 100,
    FNR=lambda d: d['FN'] / TOTAL_POSITIVES * 100,
    F1_Score=lambda d: 2 * (d['Precision'] * d['Recall']) / (d['Precision'] + d['Recall']),
)

df_me

Unnamed: 0,threshold,total_users_at_threshold,true_positives,TP,FN,FP,TN,Recall,Precision,FPR,FNR,F1_Score
0,1,53914,32,32,22,53882,7638,59.2593,0.0594,87.5845,40.7407,0.1186
1,2,5886,9,9,45,5877,55643,16.6667,0.1529,9.553,83.3333,0.303
2,3,1180,2,2,52,1178,60342,3.7037,0.1695,1.9148,96.2963,0.3241
3,4,337,1,1,53,336,61184,1.8519,0.2967,0.5462,98.1481,0.5115
4,5,125,3,3,51,122,61398,5.5556,2.4,0.1983,94.4444,3.352
5,6,63,3,3,51,60,61460,5.5556,4.7619,0.0975,94.4444,5.1282
6,7,23,2,2,52,21,61499,3.7037,8.6957,0.0341,96.2963,5.1948
7,8,6,0,0,54,6,61514,0.0,0.0,0.0098,100.0,
8,9,12,0,0,54,12,61508,0.0,0.0,0.0195,100.0,
9,10,9,0,0,54,9,61511,0.0,0.0,0.0146,100.0,


In [108]:
# Order-independent text key for each tag combination: sort the tags, then
# turn the sorted list into its string representation.
df_res['sorted_tags'] = df_res['tags_list'].map(lambda tags: sorted(tags)).astype(str)

# Top 10 tag combinations by number of distinct incidents.
# NOTE(review): 'reports' is the number of distinct pass_id values, not the
# number of complaint rows - confirm the label.
(
    df_res.query("sorted_tags != ['List']")
    .groupby('sorted_tags', as_index=False)[['driver_id', 'pass_id', 'redmine_id']]
    .nunique()
    .sort_values('redmine_id', ascending=False)
    .head(10)
    .rename(columns={
        'driver_id': 'drivers',
        'pass_id': 'reports',
        'redmine_id': 'incidents',
    })
)

Unnamed: 0,sorted_tags,drivers,reports,incidents
118,['suspicious_activity'],4059,4666,25
119,[],2196,3849,15
95,"['long_distance', 'low_price']",5102,12834,11
111,['low_price'],13313,29261,7
86,"['drugs', 'suspicious_activity']",197,226,7
87,['drugs'],929,1253,5
117,['sexual_content'],844,978,4
115,['other'],1279,1473,4
84,"['drugs', 'sexual_content', 'suspicious_activity']",132,193,2
114,"['other', 'suspicious_activity']",126,132,2


In [61]:
# One row per (complaint row, tag): multi-tag complaints are repeated once per tag.
df_exploded = df_res.explode('tags_list')

# Distinct drivers, passengers and incidents per single tag, most incidents first.
(
    df_exploded.query("tags_list != 'List'")
    .groupby('tags_list', as_index=False)[['driver_id', 'pass_id', 'redmine_id']]
    .nunique()
    .sort_values('redmine_id', ascending=False)
)

Unnamed: 0,tags_list,driver_id,pass_id,redmine_id
6,suspicious_activity,6489,9510,41
2,long_distance,7095,17045,21
3,low_price,17749,42787,21
1,drugs,2477,3988,16
4,other,2566,4020,10
0,advertisement,1677,2602,9
5,sexual_content,2236,3424,6


In [64]:
# Distinct free-text comments left on complaints tagged 'suspicious_activity'
df_exploded.loc[df_exploded['tags_list'] == 'suspicious_activity', 'text'].unique()

array(['"ladrones modo operativos "', '""',
       '"estafa Robo , destino siempre roban "', '"es un estafador"',
       '"roban"', '"raro sube y baja el precio "',
       '"es para llevar drogas "', '"fake rided"', '"robo"',
       '"fake account "', '"orderan palsu"', '"Orderan Fiktif "',
       '"orderan fiktif"',
       '"es una estafa cierren le la cuenta a este animal "',
       '"handa Pani uthaudai na"', '"membuat orderan palsu "', '"penipu"',
       '"fake ride "', '"robo intento de hurto"',
       '"la ves pasada me estafaron me quitaron 180.000 de mi cuenta "',
       '"fake ID laga raha hai"', '"facke ride "', '"estafadores "',
       '"posible estafa"', '"sospechoso "',
       '"customer froad hai ismein driver ko Gali Di Hai"',
       '"Estafadores "', '"estafa"',
       '"pone viajes para robar la gente se llama Ronald "',
       '"estafador pide dinero por nequi el maldito perro deberían tener más control con ese tipo de ratas "',
       '"es un cliente falso"',
       