# Packages

In [4]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def cycle_sql(start, end, query, weeks=False):
    """
    You have to use {date} in your script to add cycle date into this backets
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')

    if weeks == False:
        daterange = [(date_start + timedelta(days=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days)+1)]
    else:
        daterange = [(date_start + timedelta(weeks=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days//7)+1)] # weeks dividing days by 7

    total_df = pd.DataFrame()

    counter = 0

    for date in daterange:
        counter+=1
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date = date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty == True:
            print('Dataframe is empty')
        total_df = pd.concat([df_cycle, total_df])
    return total_df

def read_bq(query, project='analytics-dev-333113'):
    client = bigquery.Client(project=project)
    query_job = client.query(query)
    result_df = query_job.to_dataframe()
    return result_df

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(
        html_str.replace('table','table style="display:inline"'), 
        raw=True
    )

## Functions

In [5]:
def get_connections(df, max_level=3):

    """
    Library for gathering the whole connections between users, it uses BFS algorithm to search the users in network system (Graphs)
    """

    G = nx.Graph()
    edges = list(zip(df["user_id_from"], df["user_id_to"]))
    G.add_edges_from(edges)

    connections_data = []

    for user in G.nodes:
        user_connections = defaultdict(set)

        levels = {user: 0}
        queue = [(user, 0)]
        visited = set([user])

        while queue:
            current, level = queue.pop(0)

            if level >= max_level:
                continue

            for neighbor in G.neighbors(current):
                if neighbor not in visited:
                    visited.add(neighbor)
                    levels[neighbor] = level + 1
                    queue.append((neighbor, level + 1))
                    user_connections[level + 1].add(neighbor)

        sorted_connections = tuple(sorted(user_connections[lvl]) for lvl in sorted(user_connections))
        connections_data.append((user, sorted_connections))

    return pd.DataFrame(connections_data, columns=["user_id_from", "connections"])

def visualize_graph(df, output_file="graph.html"):

    """
    This library makes a rendering of graph networks as HTML file with interactive  
    """

    G = nx.Graph()
    
    for index, row in df.iterrows():
        user_from = row["user_id_from"]
        connections = row["connections"]
        
        for level, conn_tuple in enumerate(connections, start=1):
            for user_to in conn_tuple:
                G.add_edge(user_from, user_to, weight=level)
    
    net = Network(height="1000px", width="100%", bgcolor="#ffffff", font_color="black", notebook=True, cdn_resources='remote')
    net.force_atlas_2based(gravity=-100, central_gravity=0.005, spring_length=150, spring_strength=0.08, damping=0.4)
    
    # Добавляем узлы и рёбра
    for node in G.nodes():
        net.add_node(node, label=str(node), color="blue", physics=True)
    
    for edge in G.edges(data=True):
        weight = edge[2]['weight']
        color = "green" if weight == 1 else "#E0B228" if weight == 2 else "red"
        net.add_edge(edge[0], edge[1], color=color, width=3, physics=True, smooth=True)

    net.show(output_file)

def level_depth(row):
    return len(row['connections'])

def number_connections(row):
    connections = 0
    for i in row['connections']:
        connections += len(i)

    return connections

def number_of_1st_connections(row):
        if len(row['connections']) > 0:
            return len(row['connections'][0])
        else:
            return 0 

def number_of_2nd_connections(row):
        if len(row['connections']) > 1:
            return len(row['connections'][1])
        else:
            return 0  

def number_of_3rd_connections(row):
        if len(row['connections']) > 2:
            return len(row['connections'][2])
        else:
            return 0  
    
def segment_connections(row):
    if 1 <= row['connections_number'] <= 10:
        return '1 - 10'
    elif 11 <= row['connections_number'] <= 30:
        return '10 - 30'
    elif 31 <= row['connections_number'] <= 60:
        return '30 - 60'
    elif 61 <= row['connections_number'] <= 100:
        return '60 - 100'
    elif 101 <= row['connections_number'] <= 150:
        return '100 - 150'
    elif 151 <= row['connections_number'] <= 500:
        return '150 - 500'
    elif 501 <= row['connections_number']:
        return '500+'

def segment_lifetime(row):
    if 1 <= row['lifetime_days'] <= 10:
        return '1 - 10'
    elif 11 <= row['lifetime_days'] <= 30:
        return '10 - 30'
    elif 31 <= row['lifetime_days'] <= 60:
        return '30 - 60'
    elif 61 <= row['lifetime_days'] <= 100:
        return '60 - 100'
    elif 101 <= row['lifetime_days'] <= 150:
        return '100 - 150'
    elif 151 <= row['lifetime_days'] <= 500:
        return '150 - 500'
    elif 501 <= row['lifetime_days']:
        return '500+'

def liveness_checked(row):
    if row['status'] == 'success':
        return 1
    else:
        return 0

def has_incident(row):
    if row['incidents'] > 0:
        return 1
    else:
        return 0

def user_type(row):
    if row['user_type'] == 'driver, pass' or row['user_type'] == 'pass, driver':
        return 'pass, driver'
    if row['user_type'] == 'pass' or row['user_type'] == 'driver':
        return row['user_type']

# Merging with another tables 
### (We're considering the key countries which are [46, 25, 12, 43, 24, 11, 22, 10, 75])

### Plan 

1 iteration:
1) Добавить график с кол-во по уровням всего, без срезов
2) Добавить по банам, сколько людей, которые забанены, но у них еще есть незабанненые связи (Потенциальная угроза) - уровня связи
3) Поисследрвать инциденты, сколько людей совершили, сколько у них связей и есть ли какие-то активности по ним. Есть ли те, кто совершает и совершает инциденты в рамках связей
4) Посмотреть поездки и деньги: сколько уже ездят, а также сколько тратят - нужно ответить на вопрос, а сколько мы срежем поездок и денег, если сейчас забаним 10-30-50-70-100% связей 2 уровня
5) Посмотреть тех, кто прошел лайвнесс с разными статусами, проверить несколько кейсов в ручную через sumsub, сколько у них связей


2 iteration:
- Посмотреть, если у юзера есть связи по тем, кто совершает инциденты, то учесть это и чем больше совершено - тем он важнее
- Посмотреть тех, кто совершил ицнидент - есть ли у него связи с банами, инцидентами
- Посмотреть тех, у кого есть бан - сколько аккаунтов у них и есть ли те, кто уже забанен и сколько не забанено
- Посмотреть 2 уровни связей на поездки, сколько они совершили поездок + деньги

3 iteration:
- Выгрузить данные, которые обсудили с Настей
- Посчитать (важно!), сколько повторных инцидентов мы можем исключить при бане 2-3 уровней

4 iteration
- Посмотреть значения не в абсолюте, а относительно тотала за этот период по поездкам + GMV
- Посчитать деньги не всего за период, а от момента совершения инцидента (Подтянуть дату инцидента и считать по всем внутри с этого момента)

In [6]:
# Reading the file from Kirill with all 1st level of depth (User_id_from - User_id_to)
df = pd.read_csv('/Users/renatyunison/Downloads/user_connections_user_connections.csv', usecols=['id', 'user_id_from', 'user_id_to'])
df.style.set_table_attributes('class="dataframe"')

# BFS algorithm to collect the whole connections between users as a dataframe with 1 - key (user_id_from), 2 - values (List of tuples with all connections as part of a 3 level of depth)
df_graph = get_connections(df, 3)
df_graph['level_number'] = df_graph.apply(level_depth, axis=1)
df_graph['connections_number'] = df_graph.apply(number_connections, axis=1)

# Generating the HTML file with interactive visualisation to check the correctness
# visualize_graph(df_graph[df_graph['user_id_from'].isin([278439263, 183025070, 1, 278441507, 265608548])])

# SQL query with all additional features on users (Rides, liveness, bans..)
# df_total_data = read_bq("""
#   WITH mode AS (SELECT t1.id,
#                      t1.mode,
#                      DATE(t1.created) AS created,
#                      t1.city_id,
#                      t2.city_name,
#                      t2.country_id,
#                      t2.country_name
#               FROM dwh-storage-327422.personal_data.tbl_user_act t1
#                        JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
#                             ON t1.city_id = t2.city_id
#               WHERE t1.country_id IN (46, 25, 12, 43, 24, 11, 22, 10, 75)),
#      incidents AS (SELECT driver_id         AS aggressor_id,
#                           incident_level,
#                           'Driver'          AS aggressor,
#                           city_id,
#                           COUNT(redmine_id) AS incidents
#                    FROM indriver-bi.safety.vw_safety_incidents_detail
#                    WHERE incident_date >= '2024-11-01'
#                      AND information_status = 'Confirmed'
#                      AND aggressor = 'Driver'
#                    GROUP BY 1, 2, 3, 4
#                    UNION ALL
#                    SELECT pass_id           AS aggressor_id,
#                           incident_level,
#                           'Passenger'       AS aggressor,
#                           city_id,
#                           COUNT(redmine_id) AS incidents
#                    FROM indriver-bi.safety.vw_safety_incidents_detail
#                    WHERE incident_date >= '2024-11-01'
#                      AND information_status = 'Confirmed'
#                      AND aggressor = 'Passenger'
#                    GROUP BY 1, 2, 3, 4),
#      rides AS (SELECT user_id,
#                       STRING_AGG(DISTINCT user_type, ', ')             AS user_type,
#                       MAX(CASE WHEN rides_count > 0 THEN 1 ELSE 0 END) AS has_ride,
#                       SUM(rides_count)                                 AS rides,
#                       SUM(orders_count)                                AS orders,
#                       SUM(gmv_clean_usd)                               AS gmv
#                FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
#                WHERE metric_date_utc >= '2024-11-01'
#                  AND country_id IN (46, 25, 12, 43, 24, 11, 22, 10, 75)
#                GROUP BY 1),
#      bans AS (SELECT user_id,
#                      MAX(CASE WHEN sanction_type IS NOT NULL THEN 1 ELSE 0 END) AS has_ban,
#                      STRING_AGG(DISTINCT CASE
#                                              WHEN sanction_type = 'to_black_list' THEN 'black_listed'
#                                              WHEN sanction_type = 'ban_user_in_expel' THEN 'expel_ban'
#                                              ELSE 'other_sanctions'
#                          END)                                                   AS sanction_type
#               FROM indriver-e6e40.antifraud.rule_sanctions_log
#               WHERE rule_sanctions_log_dt_part >= '2024-11-01'
#               GROUP BY 1),
#      liveness AS (SELECT user_id, status, MAX(created_at) AS last_date
#                   FROM indriver-e6e40.ods_facechecker.user_liveness
#                   WHERE created_at >= '2024-11-01'
#                   GROUP BY 1, 2)
#        SELECT t1.id,
#        t1.created,
#        DATE_DIFF(CURRENT_DATE(), t1.created, DAY) AS lifetime_days,
#        t1.city_name,
#        t1.city_id,
#        t1.country_name,
#        t1.country_id,
#        t1.mode,
#        t1.city_id,
#        t2.incident_level,
#        t2.incidents,
#        t3.has_ride,
#        t3.user_type,
#        t3.rides,
#        t3.orders,
#        t3.gmv,
#        t4.sanction_type,
#        t4.has_ban,
#        t5.status
#        FROM mode t1
#          LEFT JOIN incidents t2 ON t1.id = t2.aggressor_id
#          LEFT JOIN rides t3 ON t1.id = t3.user_id
#          LEFT JOIN bans t4 ON t1.id = t4.user_id
#          LEFT JOIN liveness t5 ON t1.id = t5.user_id
# """)

df_features = pd.read_csv('Result_42.csv')


In [8]:
df_total = df_graph.merge(
    df_features, 
    how='inner', 
    left_on='user_id_from', 
    right_on='id'
    )

df_total['user_type_mode'] = df_total.apply(user_type, axis=1)
df_total['segment_connections'] = df_total.apply(segment_connections, axis=1)
df_total['segment_lifetime'] = df_total.apply(segment_lifetime, axis=1)
df_total['liveness_checked'] = df_total.apply(liveness_checked, axis=1)
df_total['number_of_1st_levels'] = df_total.apply(number_of_1st_connections, axis=1)
df_total['number_of_2nd_levels'] = df_total.apply(number_of_2nd_connections, axis=1)
df_total['number_of_3rd_levels'] = df_total.apply(number_of_3rd_connections, axis=1)
df_total['has_incident'] = df_total.apply(has_incident, axis=1)
df_total['has_ban'] = df_total['has_ban'].fillna(0)
df_total['has_ban'] = df_total['has_ban'].astype('int')


df_result_2nd_level = df_total[(df_total['level_number'].isin([2,3])) & (df_total['number_of_2nd_levels'] <= 10)]


df_total.head()

Unnamed: 0,user_id_from,connections,level_number,connections_number,id,created,lifetime_days,city_name,city_id,country_name,country_id,mode,city_id_1,incident_level,incidents,has_ride,user_type,rides,orders,gmv,sanction_type,has_ban,status,user_type_mode,segment_connections,segment_lifetime,liveness_checked,number_of_1st_levels,number_of_2nd_levels,number_of_3rd_levels,has_incident
0,259265530,"([221359], [158166513, 234715131, 244900491, 2...",2,8,259265530,2024-10-06,117,San Luis Potosi,4231,Mexico,12,client,4231,,,,,,,,,0,,,1 - 10,100 - 150,0,1,7,0,0
1,216655126,"([396600], [530567, 617936, 704032, 907782, 13...",3,401,216655126,2024-01-29,368,Mexico city,4143,Mexico,12,client,4143,,,,,,,,,0,,,150 - 500,150 - 500,0,1,44,356,0
2,256077851,"([469793, 1895532, 256048752], [975839, 154424...",3,1130,256077851,2024-09-18,135,East London,836,South Africa,10,client,836,,,,,,,,,0,,,500+,100 - 150,0,3,846,281,0
3,260513991,"([469793], [975839, 1544245, 1665261, 1934312,...",3,437,260513991,2024-10-14,109,Paragominas,5048,Brazil,11,client,5048,,,,,,,,black_listed,1,,,150 - 500,100 - 150,0,1,325,111,0
4,262939401,"([469793], [975839, 1544245, 1665261, 1934312,...",3,437,262939401,2024-10-29,94,Aracaju,4518,Brazil,11,driver,4518,,,,,,,,,0,,,150 - 500,60 - 100,0,1,325,111,0


# Network analysis

### How many users in each levels (Total)

In [9]:
df_total_pivot = df_total.pivot_table(
    columns=['country_name'],
    index=['level_number'],
    values=['user_id_from'],
    aggfunc='count'
    )

df_total_pivot_pct = df_total_pivot.pct_change()*100
df_total_pivot_pct = df_total_pivot_pct.fillna(100)

pivot_combined = df_total_pivot.copy()

for col in df_total_pivot.columns:
    pivot_combined[col] = df_total_pivot[col].astype(str) + " (" + df_total_pivot_pct[col].round(2).astype(str) + "%)"

pivot_combined

Unnamed: 0_level_0,user_id_from,user_id_from,user_id_from,user_id_from,user_id_from,user_id_from,user_id_from,user_id_from,user_id_from,user_id_from
country_name,Brazil,Chile,Colombia,Dominican Republic,Ecuador,Honduras,India,Mexico,Peru,South Africa
level_number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,329295.0 (100.0%),76037.0 (100.0%),309154.0 (100.0%),53631.0 (100.0%),81254.0 (100.0%),5345.0 (100.0%),nan (100.0%),279986.0 (100.0%),227544.0 (100.0%),80483.0 (100.0%)
2,89907.0 (-72.7%),24406.0 (-67.9%),72089.0 (-76.68%),15724.0 (-70.68%),21260.0 (-73.84%),940.0 (-82.41%),nan (100.0%),70140.0 (-74.95%),55245.0 (-75.72%),22686.0 (-71.81%)
3,20515.0 (-77.18%),15843.0 (-35.09%),23346.0 (-67.62%),9866.0 (-37.26%),6992.0 (-67.11%),182.0 (-80.64%),1.0 (100.0%),21706.0 (-69.05%),29721.0 (-46.2%),23130.0 (1.96%)


In [10]:
df_total.groupby('level_number', as_index=False)['user_id_from'].nunique()

Unnamed: 0,level_number,user_id_from
0,1,1438907
1,2,371050
2,3,150510


In [11]:
df_result_2nd_level_agg = df_result_2nd_level.groupby(['has_incident', 'has_ban', 'mode'], as_index=False)[['user_id_from', 'number_of_2nd_levels']].agg({'user_id_from':'count', 'number_of_2nd_levels':'sum'})

df_result_2nd_level_agg['avg_by_user'] = (df_result_2nd_level_agg['number_of_2nd_levels'] / df_result_2nd_level_agg['user_id_from']).round(2)

df_result_2nd_level_agg

Unnamed: 0,has_incident,has_ban,mode,user_id_from,number_of_2nd_levels,avg_by_user
0,0,0,client,320319,574709,1.79
1,0,0,driver,106175,280282,2.64
2,0,1,client,25216,49972,1.98
3,0,1,driver,23889,70409,2.95
4,1,0,client,812,2396,2.95
5,1,0,driver,128,323,2.52
6,1,1,client,163,538,3.3
7,1,1,driver,225,701,3.12


### Aggressors who committed the incidents

In [12]:
# Incidents in last 6 months
print('The first script is uploading. Approximately 70k rows')
df_all_incidents = read_bq("""
  WITH incidents AS (SELECT driver_id AS aggressor_id,
                            'driver'  AS mode,
                            incident_level,
                            city_id,
                            city_name,
                            country_name,
                    FROM indriver-bi.safety.vw_safety_incidents_detail
                    WHERE incident_date >= '2024-05-01'
                      AND information_status = 'Confirmed'
                      AND aggressor = 'Driver'
                    QUALIFY ROW_NUMBER() OVER (PARTITION BY aggressor_id ORDER BY incident_date DESC) = 1
                    UNION ALL
                    SELECT pass_id AS aggressor_id,
                            'pass'  AS mode,
                            incident_level,
                            city_id,
                            city_name,
                            country_name,
                    FROM indriver-bi.safety.vw_safety_incidents_detail
                    WHERE incident_date >= '2024-05-01'
                      AND information_status = 'Confirmed'
                      AND aggressor = 'Passenger'
                    QUALIFY ROW_NUMBER() OVER (PARTITION BY aggressor_id ORDER BY incident_date DESC) = 1),
      rides AS (SELECT user_id,
                        user_type,
                        MAX(CASE WHEN rides_count > 0 THEN 1 ELSE 0 END) AS has_ride,
                        SUM(rides_count)                                 AS rides,
                        SUM(orders_count)                                AS orders,
                        SUM(gmv_clean_usd)                               AS gmv
                FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                WHERE metric_date_utc >= '2024-01-01'
                GROUP BY 1, 2)
  SELECT t1.aggressor_id,
        t1.mode,
        t1.incident_level,
        t1.city_id,
        t1.city_name,
        t1.country_name,
        t2.has_ride,
        t2.rides,
        t2.orders,
        t2.gmv
  FROM incidents t1
          LEFT JOIN rides t2 ON t1.aggressor_id = t2.user_id AND t1.mode = t2.user_type
""")

df_all_incidents = df_all_incidents.fillna(0)
list_of_aggressors = df_all_incidents['aggressor_id'].to_list()

print('The second script is uploading. Keep in mind that query contains of more than 80 Mln rows')
df_rides = read_bq("""
    SELECT user_id,
        MAX(CASE WHEN rides_count > 0 THEN 1 ELSE 0 END) AS has_ride,
        SUM(rides_count)                                 AS rides,
        SUM(orders_count)                                AS orders,
        SUM(gmv_clean_usd)                               AS gmv
    FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
    WHERE metric_date_utc >= '2024-06-01'
    GROUP BY 1
""")

rides_dict = dict(zip(df_rides['user_id'], df_rides['rides']))
gmv_dict = dict(zip(df_rides['user_id'], df_rides['gmv']))
list_of_users_ride = df_rides['user_id'].to_list()
list_of_users_ride = set(list_of_users_ride)


The first script is uploading. Approximately 70k rows
The second script is uploading. Keep in mind that query contains of more than 80 Mln rows


In [13]:
df_aggressors_for_analysis = df_total[(df_total['incidents']>0)]

df_aggressors_for_analysis = df_aggressors_for_analysis.merge(df_all_incidents, left_on='user_id_from', right_on='aggressor_id', how='left')

df_aggressors_for_analysis = df_aggressors_for_analysis[[
    'user_id_from', 
    'has_incident',
    'connections', 
    'level_number',
    'connections_number',
    'lifetime_days',
    'segment_lifetime',
    'liveness_checked',
    'country_name_x',
    'mode_y',
    'incident_level_x',
    'number_of_1st_levels',
    'number_of_2nd_levels',
    'number_of_3rd_levels'
    ]]

# Deleting the users with more than 90-95% percentile treshold 
percentile_1st = np.percentile(df_aggressors_for_analysis['number_of_1st_levels'], 99)
percentile_2nd = np.percentile(df_aggressors_for_analysis['number_of_2nd_levels'], 95)
percentile_3rd = np.percentile(df_aggressors_for_analysis['number_of_3rd_levels'], 95)

drop_index = df_aggressors_for_analysis[(df_aggressors_for_analysis['number_of_2nd_levels'] > percentile_2nd) | (df_aggressors_for_analysis['number_of_3rd_levels'] > percentile_3rd)].index
df_aggressors_for_analysis = df_aggressors_for_analysis.drop(index=drop_index)

#### Reccuring incidents and GMV/Rides by connected users

In [215]:
def level_1st_aggressors(row): # Check everyone who committed the incidents in 1st level
        aggressors = []
        aggressor_numbers = 0
        result_dict = {}
        iterable = row['connections'][0]
        for user in iterable:
            if user in set(list_of_aggressors):
                aggressors.append(user)
                aggressor_numbers += 1
            else:
                continue

        result_dict[aggressor_numbers] = aggressors
        return result_dict

def level_2nd_aggressors(row): # if there are connections of over than 1st level, so check everyone in terms of current level

    if len(row['connections']) > 1:
        aggressors = []
        aggressor_numbers = 0
        result_dict = {}
        iterable = row['connections'][1]
        for user in iterable:
            if user in set(list_of_aggressors):
                aggressors.append(user)
                aggressor_numbers += 1
    
        result_dict[aggressor_numbers] = aggressors
        
        return result_dict
    else:
        return 0

def level_3rd_aggressors(row): # if there are connections of over than 2nd level, then check everyone in 3rd level


    if len(row['connections']) > 2:
        aggressors = []
        aggressor_numbers = 0
        result_dict = {}
        iterable = row['connections'][2]
        for user in iterable:
            if user in set(list_of_aggressors):
                aggressors.append(user)
                aggressor_numbers += 1
    
        result_dict[aggressor_numbers] = aggressors
        
        return result_dict
    else:
        return 0

def level_1st_aggressors_number(row):
        aggressor_numbers = 0
        iterable = row['connections'][0]
        for user in iterable:
            if user in list_of_aggressors:
                aggressor_numbers += 1
            else:
                continue

        return aggressor_numbers

def level_2nd_aggressors_number(row):
    if len(row['connections']) > 1:
        aggressor_numbers = 0
        iterable = row['connections'][1]
        for user in iterable:
            if user in set(list_of_aggressors):
                aggressor_numbers += 1
        
        return aggressor_numbers
    else:
        return 0

def level_3rd_aggressors_number(row):


    if len(row['connections']) > 2:
        aggressor_numbers = 0
        iterable = row['connections'][2]
        for user in iterable:
            if user in set(list_of_aggressors):
                aggressor_numbers += 1
        
        return aggressor_numbers
    else:
        return 0

def level_1_rides(row):

    return np.sum(rides_dict.get(user, 0) for user in row['connections'][0] if user in list_of_users_ride)

def level_1_gmv(row):
    
    return np.sum(gmv_dict.get(user, 0) for user in row['connections'][0] if user in list_of_users_ride)

def level_2_rides(row):

    if len(row['connections']) > 1:
        return np.sum(rides_dict.get(user, 0) for user in row['connections'][1] if user in list_of_users_ride)
    else:
        return 0

def level_2_gmv(row):

    if len(row['connections']) > 1:    
        return np.sum(gmv_dict.get(user, 0) for user in row['connections'][1] if user in list_of_users_ride)
    else:
        return 0

def level_3_rides(row):

    if len(row['connections']) > 2:
        return np.sum(rides_dict.get(user, 0) for user in row['connections'][2] if user in list_of_users_ride)
    else:
        return 0

def level_3_gmv(row):

    if len(row['connections']) > 2:
        return np.sum(gmv_dict.get(user, 0) for user in row['connections'][2] if user in list_of_users_ride)
    else:
        return 0

def incident_level_category(row):
    if row['incident_level_x'] == 'Green':
        return 1
    elif row['incident_level_x'] == 'Yellow':
        return 2
    else:
        return 3


In [16]:
df_aggressors_for_analysis['aggressors_1st_level'] = df_aggressors_for_analysis.apply(level_1st_aggressors, axis=1)
df_aggressors_for_analysis['aggressors_2nd_level'] = df_aggressors_for_analysis.apply(level_2nd_aggressors, axis=1)
df_aggressors_for_analysis['aggressors_3rd_level'] = df_aggressors_for_analysis.apply(level_3rd_aggressors, axis=1)

df_aggressors_for_analysis['level_1st_aggressors_number'] = df_aggressors_for_analysis.apply(level_1st_aggressors_number, axis=1)
df_aggressors_for_analysis['level_2nd_aggressors_number'] = df_aggressors_for_analysis.apply(level_2nd_aggressors_number, axis=1)
df_aggressors_for_analysis['level_3rd_aggressors_number'] = df_aggressors_for_analysis.apply(level_3rd_aggressors_number, axis=1)

df_aggressors_for_analysis['level_1st_rides'] = df_aggressors_for_analysis.progress_apply(level_1_rides, axis=1)
df_aggressors_for_analysis['level_1st_gmv'] = df_aggressors_for_analysis.progress_apply(level_1_gmv, axis=1)
df_aggressors_for_analysis['level_2nd_rides'] = df_aggressors_for_analysis.progress_apply(level_2_rides, axis=1)
df_aggressors_for_analysis['level_2nd_gmv'] = df_aggressors_for_analysis.progress_apply(level_2_gmv, axis=1)
df_aggressors_for_analysis['level_3rd_rides'] = df_aggressors_for_analysis.progress_apply(level_3_rides, axis=1)
df_aggressors_for_analysis['level_3rd_gmv'] = df_aggressors_for_analysis.progress_apply(level_3_gmv, axis=1)

df_aggressors_for_analysis['incident_level_category'] = df_aggressors_for_analysis.apply(incident_level_category, axis=1)


In [217]:
df_aggressors_for_analysis_total = df_aggressors_for_analysis.groupby([
    'user_id_from', 'has_incident', 'level_number', 'connections_number', 
    'lifetime_days', 'segment_lifetime', 'liveness_checked', 'country_name_x', 
    'mode_y', 'number_of_1st_levels', 'number_of_2nd_levels', 
    'number_of_3rd_levels', 'level_1st_rides', 'level_1st_gmv', 'level_2nd_gmv', 'level_2nd_rides', 'level_3rd_rides', 
    'level_3rd_gmv', 'level_1st_aggressors_number', 'level_2nd_aggressors_number', 'level_3rd_aggressors_number'], as_index=False)['incident_level_category'].max()

def incident_level_category_reverse(row):
    if row['incident_level_category'] == 1:
        return 'Green'
    elif row['incident_level_category'] == 2:
        return 'Yellow'
    else:
        return 'Red'

df_aggressors_for_analysis_total['incident_level'] = df_aggressors_for_analysis_total.apply(incident_level_category_reverse, axis=1)

In [335]:
df_aggressors_for_analysis_total_agg = df_aggressors_for_analysis_total.groupby([
    'mode_y', 
    # 'country_name_x', 
    'incident_level'
    ], as_index=False)[[
    'user_id_from', 
    'number_of_1st_levels', 
    'number_of_2nd_levels',
    'number_of_3rd_levels',
    'level_1st_aggressors_number',
    'level_2nd_aggressors_number',
    'level_3rd_aggressors_number',
    'level_1st_rides',
    'level_1st_gmv',
    'level_2nd_rides',
    'level_2nd_gmv',
    'level_3rd_rides',
    'level_3rd_gmv'
]].agg({
    'user_id_from':'count',
    'number_of_1st_levels':'sum',
    'number_of_2nd_levels':'sum',
    'number_of_3rd_levels':'sum',
    'level_1st_aggressors_number':'sum',
    'level_2nd_aggressors_number':'sum',
    'level_3rd_aggressors_number':'sum',
    'level_1st_rides':'sum',
    'level_1st_gmv':'sum',
    'level_2nd_rides':'sum',
    'level_2nd_gmv':'sum',
    'level_3rd_rides':'sum',
    'level_3rd_gmv':'sum'
    })

df_aggressors_for_analysis_total_agg

Unnamed: 0,mode_y,incident_level,user_id_from,number_of_1st_levels,number_of_2nd_levels,number_of_3rd_levels,level_1st_aggressors_number,level_2nd_aggressors_number,level_3rd_aggressors_number,level_1st_rides,level_1st_gmv,level_2nd_rides,level_2nd_gmv,level_3rd_rides,level_3rd_gmv
0,driver,Green,347,715,702,542,15,1,1,117633,418610.77,127515,404351.95,134348,415284.19
1,driver,Red,34,70,89,50,0,0,1,9537,38035.93,13276,48114.28,13580,48797.12
2,driver,Yellow,424,857,679,496,14,5,2,112456,387962.91,129493,448621.2,80459,317000.76
3,pass,Green,534,744,906,441,46,90,19,53888,207300.28,37882,135454.25,104688,355562.21
4,pass,Red,198,278,305,95,38,56,10,17370,77856.62,12526,46582.98,5264,32565.75
5,pass,Yellow,1035,1396,2089,758,155,305,55,78871,314014.88,60759,223161.74,83933,284680.98


In [336]:
df_aggressors_for_analysis_total_agg[[
    'mode_y', 'incident_level', 'user_id_from', 'number_of_1st_levels', 'number_of_2nd_levels', 'number_of_3rd_levels'
    ]].rename(columns={
        'user_id_from':'users_with_incidents',
        'mode_y':'mode'
        })

Unnamed: 0,mode,incident_level,users_with_incidents,number_of_1st_levels,number_of_2nd_levels,number_of_3rd_levels
0,driver,Green,347,715,702,542
1,driver,Red,34,70,89,50
2,driver,Yellow,424,857,679,496
3,pass,Green,534,744,906,441
4,pass,Red,198,278,305,95
5,pass,Yellow,1035,1396,2089,758


In [347]:
df_recurring_incidents = df_aggressors_for_analysis_total_agg[[
    'mode_y', 'incident_level', 'user_id_from', 'level_1st_aggressors_number', 'level_2nd_aggressors_number', 'level_3rd_aggressors_number'
    ]].rename(columns={'user_id_from':'users_with_incidents'})

df_recurring_incidents['level_1st_aggressors_number_share'] = df_recurring_incidents['level_1st_aggressors_number'] / df_recurring_incidents['users_with_incidents'] *100
df_recurring_incidents['level_2nd_aggressors_number_share'] = df_recurring_incidents['level_2nd_aggressors_number'] / df_recurring_incidents['users_with_incidents'] *100
df_recurring_incidents['level_3rd_aggressors_number_share'] = df_recurring_incidents['level_3rd_aggressors_number'] / df_recurring_incidents['users_with_incidents'] *100
df_recurring_incidents['total_recurring_incidents'] = (df_recurring_incidents['level_1st_aggressors_number'] + df_recurring_incidents['level_2nd_aggressors_number'] + df_recurring_incidents['level_3rd_aggressors_number'])
df_recurring_incidents['total_share_of_recurring_incidents'] = (df_recurring_incidents['level_1st_aggressors_number'] + df_recurring_incidents['level_2nd_aggressors_number'] + df_recurring_incidents['level_3rd_aggressors_number'])/df_recurring_incidents['users_with_incidents']*100


for col, col_share in zip(df_recurring_incidents[[
    'level_1st_aggressors_number', 
    'level_2nd_aggressors_number', 
    'level_3rd_aggressors_number', 
    'total_recurring_incidents'
    ]].columns, df_recurring_incidents[[
    'level_1st_aggressors_number_share', 
    'level_2nd_aggressors_number_share', 
    'level_3rd_aggressors_number_share', 
    'total_share_of_recurring_incidents'
    ]].columns):
    df_recurring_incidents[col] = df_recurring_incidents[col].astype(str) + " (" + df_recurring_incidents[col_share].round(2).astype(str) + "%)"

df_recurring_incidents[[
    'mode_y', 
    'incident_level',
    'users_with_incidents',
    'level_1st_aggressors_number',
    'level_2nd_aggressors_number',
    'level_3rd_aggressors_number',
    'total_recurring_incidents',
    ]]

Unnamed: 0,mode_y,incident_level,users_with_incidents,level_1st_aggressors_number,level_2nd_aggressors_number,level_3rd_aggressors_number,level_1st_aggressors_number_share,level_2nd_aggressors_number_share,level_3rd_aggressors_number_share,total_recurring_incidents,total_share_of_recurring_incidents
0,driver,Green,347,15,1,1,4.32,0.29,0.29,17,4.9
1,driver,Red,34,0,0,1,0.0,0.0,2.94,1,2.94
2,driver,Yellow,424,14,5,2,3.3,1.18,0.47,21,4.95
3,pass,Green,534,46,90,19,8.61,16.85,3.56,155,29.03
4,pass,Red,198,38,56,10,19.19,28.28,5.05,104,52.53
5,pass,Yellow,1035,155,305,55,14.98,29.47,5.31,515,49.76


In [305]:
df_aggressors_for_analysis_total_agg[[
    'mode_y', 'incident_level', 'user_id_from',
    'level_1st_rides', 'level_1st_gmv', 'level_2nd_rides', 'level_2nd_gmv', 'level_3rd_rides', 'level_3rd_gmv'
    ]].rename(columns={'user_id_from':'users_with_incidents'})

Unnamed: 0,mode_y,incident_level,users_with_incidents,level_1st_rides,level_1st_gmv,level_2nd_rides,level_2nd_gmv,level_3rd_rides,level_3rd_gmv
0,driver,Green,347,117633,418610.77,127515,404351.95,134348,415284.19
1,driver,Red,34,9537,38035.93,13276,48114.28,13580,48797.12
2,driver,Yellow,424,112456,387962.91,129493,448621.2,80459,317000.76
3,pass,Green,534,53888,207300.28,37882,135454.25,104688,355562.21
4,pass,Red,198,17370,77856.62,12526,46582.98,5264,32565.75
5,pass,Yellow,1035,78871,314014.88,60759,223161.74,83933,284680.98


In [354]:
share_of_reccuring_total = (df_aggressors_for_analysis_total_agg['level_2nd_aggressors_number'].sum() + df_aggressors_for_analysis_total_agg['level_3rd_aggressors_number'].sum()) / df_all_incidents[df_all_incidents['country_name'].isin(set(df_aggressors_for_analysis_total['country_name_x'].to_list()))]['aggressor_id'].nunique()*100
share_of_reccuring = (df_aggressors_for_analysis_total_agg['level_2nd_aggressors_number'].sum() + df_aggressors_for_analysis_total_agg['level_3rd_aggressors_number'].sum()) / df_aggressors_for_analysis_total_agg['user_id_from'].sum()*100
df_aggressors_for_analysis_total_rec_inc = df_aggressors_for_analysis_total[(df_aggressors_for_analysis_total['level_1st_aggressors_number']>0)|(df_aggressors_for_analysis_total['level_2nd_aggressors_number']>0)|(df_aggressors_for_analysis_total['level_3rd_aggressors_number']>0)]
total_rides_rec_inc = df_aggressors_for_analysis_total_rec_inc['level_2nd_rides'].sum() + df_aggressors_for_analysis_total_rec_inc['level_3rd_rides'].sum()
total_gmv_rec_inc = df_aggressors_for_analysis_total_rec_inc['level_2nd_gmv'].sum() + df_aggressors_for_analysis_total_rec_inc['level_3rd_gmv'].sum()



print(f'Total share of reccuring incidents 2-3 levels: {share_of_reccuring_total:.2f}%')
print(f'Total share of reccuring incidents 2-3 levels: {share_of_reccuring:.2f}%')
print(f'Total rides by aggressors 1-2-3 levels: {total_rides_rec_inc}')
print(f'Total GMV by aggressors 1-2-3 levels: {total_gmv_rec_inc:.2f}')

Total share of reccuring incidents 2-3 levels: 1.17%
Total share of reccuring incidents 2-3 levels: 21.19%
Total rides by aggressors 1-2-3 levels: 206685
Total GMV by aggressors 1-2-3 levels: 759879.40


#### Checking the certain cases

In [None]:
df_aggressors_for_analysis[df_aggressors_for_analysis['user_id_from']==274030202]

In [None]:
df_all_incidents[df_all_incidents['aggressor_id']==276454228]


In [None]:
df_total[df_total['user_id_from']==193373527]


In [None]:
df_rides[df_rides['user_id'].isin([193373527, 278320120])]

In [None]:
df_aggressors_for_analysis_total.query("connections_number < 11").sort_values(['level_1st_gmv', 'level_2nd_gmv', 'level_3rd_gmv'], ascending=False)

#### Total users by levels (rides, gmv, )

In [569]:
df_aggressors_for_analysis.groupby(['country_name_x', 'incident_level_x'], as_index=False)[['user_id_from', 'number_of_1st_levels', 'number_of_2nd_levels', 'number_of_3rd_levels']]\
                            .agg({
                                'user_id_from':'count',
                                'number_of_1st_levels':'sum',
                                'number_of_2nd_levels':'sum',
                                'number_of_3rd_levels':'sum',
                            })



Unnamed: 0,country_name_x,incident_level_x,user_id_from,number_of_1st_levels,number_of_2nd_levels,number_of_3rd_levels
0,Brazil,Green,259,419,351,202
1,Brazil,Red,83,117,85,14
2,Brazil,Yellow,362,534,468,219
3,Chile,Green,70,128,126,82
4,Chile,Red,18,23,18,5
5,Chile,Yellow,73,97,104,71
6,Colombia,Green,137,222,394,90
7,Colombia,Red,49,76,130,89
8,Colombia,Yellow,342,478,920,266
9,Dominican Republic,Green,34,49,70,34


#### By modes (Driver & Pax) and both with categories

In [581]:
df_aggressors_for_analysis.groupby(['mode_y', 'liveness_checked', 'incident_level_x'], as_index=False)[['user_id_from', 'number_of_1st_levels', 'number_of_2nd_levels', 'number_of_3rd_levels']]\
                            .agg({
                                'user_id_from':'count',
                                'number_of_1st_levels':'sum',
                                'number_of_2nd_levels':'sum',
                                'number_of_3rd_levels':'sum',
                            })



Unnamed: 0,mode_y,liveness_checked,incident_level_x,user_id_from,number_of_1st_levels,number_of_2nd_levels,number_of_3rd_levels
0,driver,0,Green,341,705,689,547
1,driver,0,Red,33,69,89,50
2,driver,0,Yellow,397,800,632,464
3,driver,1,Green,15,25,20,11
4,driver,1,Red,1,1,0,0
5,driver,1,Yellow,29,60,52,34
6,pass,0,Green,477,672,895,380
7,pass,0,Red,168,245,241,63
8,pass,0,Yellow,926,1261,1915,654
9,pass,1,Green,116,149,171,94


In [582]:
df_aggressors_for_analysis.groupby(['mode_y', 'segment_lifetime', 'incident_level_x'], as_index=False)[['user_id_from', 'number_of_1st_levels', 'number_of_2nd_levels', 'number_of_3rd_levels']]\
                            .agg({
                                'user_id_from':'count',
                                'number_of_1st_levels':'sum',
                                'number_of_2nd_levels':'sum',
                                'number_of_3rd_levels':'sum',
                            })

Unnamed: 0,mode_y,segment_lifetime,incident_level_x,user_id_from,number_of_1st_levels,number_of_2nd_levels,number_of_3rd_levels
0,driver,10 - 30,Green,2,2,2,0
1,driver,10 - 30,Red,1,1,2,0
2,driver,10 - 30,Yellow,3,3,1,0
3,driver,100 - 150,Green,14,20,51,39
4,driver,100 - 150,Yellow,15,18,38,35
5,driver,150 - 500,Green,90,156,201,128
6,driver,150 - 500,Red,7,14,18,5
7,driver,150 - 500,Yellow,74,152,145,136
8,driver,30 - 60,Green,4,6,9,0
9,driver,30 - 60,Red,1,1,6,0


#### Compared to total rides and gmv (Not just absolute values)

### Calculating the rides and gmv after incident date, and comparing it with total numbers (not absolute values)