# Packages

In [20]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network
import shap


# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast
from ydata_profiling import ProfileReport
import re
from typing import Set, List
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library
import random

# System library
import os
import ipywidgets
import warnings
import pandas_gbq
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')

import cv2
import json
import re
import glob
import numpy as np
import pandas as pd
import logging
import typing as t
import yaml
import json

from enum import Enum
from datetime import date, datetime, timedelta

from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, Image

from concurrent.futures import ThreadPoolExecutor, as_completed
from ratelimit import limits, sleep_and_retry


# Useful functions
def cycle_sql(start, end, query, weeks=False):
    """
    You have to use {date} in your script to add cycle date into this backets
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')

    if weeks == False:
        daterange = [(date_start + timedelta(days=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days)+1)]
    else:
        daterange = [(date_start + timedelta(weeks=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days//7)+1)] # weeks dividing days by 7

    total_df = pd.DataFrame()

    counter = 0

    for date in daterange:
        counter+=1
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date = date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty == True:
            print('Dataframe is empty')
        total_df = pd.concat([df_cycle, total_df])
    return total_df

def read_bq(query, project='analytics-dev-333113'):
    client = bigquery.Client(project=project)
    query_job = client.query(query)
    result_df = query_job.to_dataframe()
    return result_df

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(
        html_str.replace('table','table style="display:inline"'), 
        raw=True
    )

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    with pd.ExcelWriter(f"{name}.xlsx") as writer:

    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet

        if dataset1 is not None:
            if dataset2 is not None:
                if dataset3 is not None:
                    if dataset4 is not None:
                        dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                        #   index=False
                                            )
                        dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                        #   index=False
                                            )
                        dataset3.to_excel(writer, sheet_name=f"3-{name}", 
                                        #   index=False
                                            )
                        dataset4.to_excel(writer, sheet_name=f"4-{name}", 
                                        #   index=False
                                            )
                    else:
                        dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                        #   index=False
                                            )
                        dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                        #   index=False
                                            )
                        dataset3.to_excel(writer, sheet_name=f"3-{name}", 
                                        #   index=False
                                            )
                else:
                    dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                    #   index=False
                                        )
                    dataset2.to_excel(writer, sheet_name=f"2-{name}", 
                                    #   index=False
                                        )
            else:
                dataset1.to_excel(writer, sheet_name=f"1-{name}", 
                                #   index=False
                                    )

        print('DataFrame is written to Excel File successfully.') 


# Research

In [615]:
query = """
WITH last_incident AS (SELECT redmine_id     AS id_last_incident,
                              city_id,
                              city_name,
                              country_name,
                              macroregion_name,
                              from_latitude  AS main_from_latitude,
                              from_longitude AS main_from_longitude,
                              from_address   AS main_from_address
                       FROM indriver-bi.safety.vw_safety_incidents_detail
                       WHERE 1 = 1
                         AND incident_date BETWEEN DATE_ADD(CURRENT_DATE(), INTERVAL -15 DAY) AND DATE_ADD(CURRENT_DATE(), INTERVAL -1 DAY)
                         AND information_status IN ('Confirmed')
                       QUALIFY ROW_NUMBER() OVER (PARTITION BY city_id ORDER BY incident_date DESC) = 1),
     same_location_incidents AS (SELECT redmine_id,
                                        pass_id,
                                        driver_id,
                                        incident_date,
                                        information_status,
                                        incident_level,
                                        incident_type,
                                        city_id AS city_id_same_loc,
                                        from_latitude,
                                        from_longitude,
                                        from_address
                                 FROM indriver-bi.safety.vw_safety_incidents_detail
                                 WHERE 1 = 1
                                   AND incident_date BETWEEN DATE_ADD(CURRENT_DATE(), INTERVAL -30 DAY) AND DATE_ADD(CURRENT_DATE(), INTERVAL -1 DAY)
                                   AND information_status IN ('Not confirmed', 'Confirmed', 'Automated ML decision')
                                   AND (from_longitude IS NOT NULL AND from_latitude IS NOT NULL)),
     aggregated AS (SELECT *,
                           ST_DISTANCE(
                                   ST_GEOGPOINT(t1.main_from_longitude, t1.main_from_latitude),
                                   ST_GEOGPOINT(t2.from_longitude, t2.from_latitude)
                           ) AS              distance
                    FROM last_incident t1
                             LEFT JOIN same_location_incidents t2
                                       ON t1.city_id = t2.city_id_same_loc 
                    ORDER BY distance)
SELECT city_id,
       city_name,
       country_name,
       macroregion_name,
       id_last_incident                              AS main_incident_id,
       main_from_latitude,
       main_from_longitude,
       main_from_address,
       redmine_id                                    AS near_incident_id,
       incident_date                                 AS near_incident_date,
       CURRENT_DATE()                                AS current_date,
       DATE_DIFF(CURRENT_DATE(), incident_date, DAY) AS date_diff,
       from_latitude                                 AS near_from_latitude,
       from_longitude                                AS near_from_longitude,
       from_address                                  AS near_from_address,
       IF(information_status IN ('Confirmed', 'Automated ML decision'), 'Confirmed',
          'Not confirmed')                           AS information_status,
       incident_level                                AS incident_level,
       incident_type                                 AS incident_type,
       distance                                      AS distance
FROM aggregated
WHERE 1 = 1
  AND city_id_same_loc IS NOT NULL
  AND distance IS NOT NULL
  AND distance <= 1000
"""

In [616]:
df_raw = read_bq(query)
df_raw['near_incident_date'] = pd.to_datetime(df_raw['near_incident_date'])

df_raw_agg = df_raw[df_raw['date_diff']<=14].groupby([
    'city_id',
    'city_name',
    'country_name',
    'macroregion_name',
    'main_incident_id',
    'main_from_latitude',
    'main_from_longitude',
    'main_from_address',
    'information_status',
    'incident_level'
], as_index=False)[
    'near_incident_id'
].agg(list)

df_raw_agg['count_inc'] = df_raw_agg['near_incident_id'].apply(len)
df_raw_agg['near_incident_id'] = df_raw_agg['near_incident_id'].apply(tuple)

df_raw_agg_piv = df_raw_agg.pivot_table(
    index=[
        'city_id', 'city_name', 'country_name', 
        'macroregion_name', 'main_from_latitude', 'main_incident_id',
        'main_from_longitude', 'main_from_address', 'information_status', 
        ],
    columns='incident_level',
    values='count_inc',
    fill_value=0
).reset_index()

for column in df_raw_agg['incident_level'].unique():
    df_raw_agg_piv[column] = df_raw_agg_piv[column].astype(int)

total = 0
for column in df_raw_agg['incident_level'].unique():
    total += df_raw_agg_piv[column]

df_raw_agg_piv['total'] = total

df_raw_agg_piv['main_from_latitude'] = df_raw_agg_piv['main_from_latitude'].round(3).astype(str)
df_raw_agg_piv['main_from_longitude'] = df_raw_agg_piv['main_from_longitude'].round(3).astype(str)

df_raw_agg_piv['coordinates'] = df_raw_agg_piv['main_from_latitude'] + ' ' + df_raw_agg_piv['main_from_longitude']

df_raw_agg_piv.sort_values('total', ascending=False).head(10)


incident_level,city_id,city_name,country_name,macroregion_name,main_from_latitude,main_incident_id,main_from_longitude,main_from_address,information_status,Green,Red,Yellow,total,coordinates
123,4258,Arequipa,Peru,Latin America,-16.392,SQ-1041094,-71.54,Clínica Arequipa,Confirmed,23,0,9,32,-16.392 -71.54
82,4199,Lima,Peru,Latin America,-12.057,SQ-1040967,-77.001,Pje. Plutón 160,Confirmed,28,0,3,31,-12.057 -77.001
146,4300,Cape Town,South Africa,Africa,-33.991,SQ-1042637,18.567,Sokhanyo Public Primary School,Confirmed,20,1,4,25,-33.991 18.567
121,4257,Trujillo,Peru,Latin America,-8.124,SQ-1036866,-79.038,Mi Facultad Bar,Confirmed,12,0,6,18,-8.124 -79.038
108,4243,Barranquilla,Colombia,Latin America,10.983,SQ-1042334,-74.784,Baranoa,Confirmed,9,0,5,14,10.983 -74.784
212,4825,Santo Domingo,Dominican Republic,Latin America,18.482,SQ-1042097,-69.83,C. Cam. de Juan López 25,Confirmed,10,0,3,13,18.482 -69.83
139,4275,Piura,Peru,Latin America,-5.187,SQ-1030971,-80.623,Hospital Regional José Cayetano Heredia,Confirmed,8,0,5,13,-5.187 -80.623
256,5388,Lahore,Pakistan,SA,31.448,SQ-1040766,74.259,Ayesha Girls Hostel,Confirmed,9,0,4,13,31.448 74.259
292,5548,Panama,Panama,Latin America,8.953,SQ-1041961,-79.533,C. 3a Este,Confirmed,11,0,1,12,8.953 -79.533
276,5472,Bulawayo,Zimbabwe,Africa,-20.156,SQ-1022948,28.581,Chicken Slice,Not confirmed,5,0,7,12,-20.156 28.581


In [617]:
df_raw_agg_now = df_raw[df_raw['date_diff']<=14].groupby([
    'city_id',
    'city_name',
    'country_name',
    'macroregion_name',
    'main_incident_id',
    'main_from_latitude',
    'main_from_longitude',
    'main_from_address'
], as_index=False)[
    'near_incident_id'
].agg(list)

df_raw_agg_now['count_inc'] = df_raw_agg_now['near_incident_id'].apply(len)
df_raw_agg_now['near_incident_id'] = df_raw_agg_now['near_incident_id'].apply(tuple)

df_raw_agg_before = df_raw[df_raw['date_diff']>14].groupby([
    'city_id',
    'city_name',
    'country_name',
    'macroregion_name',
    'main_incident_id',
    'main_from_latitude',
    'main_from_longitude',
    'main_from_address'
], as_index=False)[
    'near_incident_id'
].agg(list)

df_raw_agg_before['count_inc'] = df_raw_agg_before['near_incident_id'].apply(len).astype(int)
df_raw_agg_before['near_incident_id'] = df_raw_agg_before['near_incident_id'].apply(tuple)

df_check_diff = df_raw_agg_now[['main_incident_id', 'count_inc']].merge(df_raw_agg_before[['main_incident_id', 'count_inc']], how='left', on='main_incident_id').rename(columns={
    'count_inc_x':'count_now',
    'count_inc_y':'count_before'  
})

df_check_diff['count_before'] = df_check_diff['count_before'].fillna(0).astype(int)

df_check_diff['diff'] = ((df_check_diff['count_now'] - df_check_diff['count_before'])/df_check_diff['count_before'] * 100).round(1)
df_check_diff['diff_perc'] = ((df_check_diff['count_now'] - df_check_diff['count_before'])/df_check_diff['count_before'] * 100).round(1).astype(str) + '%'


df_check_diff.query("diff_perc != 'inf%'").sort_values('diff', ascending=False).head()

Unnamed: 0,main_incident_id,count_now,count_before,diff,diff_perc
112,SQ-1002960,7,1,600.0,600.0%
169,SQ-1042420,5,1,400.0,400.0%
208,SQ-1039476,13,3,333.3,333.3%
260,SQ-1041352,4,1,300.0,300.0%
7,SQ-1042344,4,1,300.0,300.0%


In [619]:

def send_telegram_text(
    report_text: str, 
    chat_id: str, 
    token: str
) -> bool:

    try:
        url = f"https://api.telegram.org/bot{token}/sendMessage"
        payload = {
            'chat_id': chat_id,
            'text': report_text,
            'parse_mode': 'Markdown' 
        }
        
        response = requests.post(url, data=payload)
        
        if response.status_code == 200:
            print("Sent successfully")
            return True
        else:
            print(f"Error while sending. Code: {response.status_code}, Response: {response.text}")
            return False

    except Exception as e:
        print(f"There is an error: {e}")
        return False

def send_telegram_csv(
    df: pd.DataFrame, 
    file_name,
    chat_id, 
    token
) -> bool:

    
    file_path = f'./{file_name}'
    success = False
    
    try:
        df.to_csv(file_path, index=False, encoding='utf-8')
        print(f"File {file_name} saved locally")

        document_url = f"https://api.telegram.org/bot{token}/sendDocument"
        
        with open(file_path, 'rb') as f:
            document_payload = {
                'chat_id': chat_id
            }
            document_files = {
                'document': (file_name, f, 'text/csv')
            }
            
            document_response = requests.post(
                document_url, 
                data=document_payload, 
                files=document_files
            )

        if document_response.status_code == 200:
            print("CSV-File successfully has been sent to Telegram.")
            success = True
        else:
            print(f"Error while sending. Code: {document_response.status_code}, Response: {document_response.text}")

    except Exception as e:
        print(f"There is an error: {e}")
        
    finally:
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"Local file {file_name} was deleted.")
            
        return success

# df_raw = read_bq(query)

TELEGRAM_BOT_TOKEN = "8076954277:AAG34ytld2GrnUqh_Lgz2ATWoYADuBQ-e64"
TELEGRAM_CHAT_ID = "-1003078511881"

min_date = df_raw[df_raw['date_diff']<=15]['near_incident_date'].min()
max_date = df_raw[df_raw['date_diff']<=15]['near_incident_date'].max()
start_date = str(min_date.day) + ' ' +  min_date.strftime('%b')
end_date = str(max_date.day) + ' ' +  max_date.strftime('%b') + ' ' + str(df_raw[df_raw['date_diff']<=10]['near_incident_date'].max().year)
start_date_dig = df_raw[df_raw['date_diff']>15]['near_incident_date'].min().strftime('%d.%m')
end_date_dig = df_raw[df_raw['date_diff']>15]['near_incident_date'].max().strftime('%d.%m')

information_status = ['Confirmed', 'Not confirmed']
top_5 = df_raw_agg_piv.sort_values('total', ascending=False)['main_incident_id'].unique()[:5]
report_table = ""


for index, redmine in enumerate(top_5, 1):

    lat, lon = df_raw_agg_piv[(df_raw_agg_piv['main_incident_id'] == redmine)]['main_from_latitude'].unique()[0], df_raw_agg_piv[(df_raw_agg_piv['main_incident_id'] == redmine)]['main_from_longitude'].unique()[0]
    region = df_raw_agg_piv[(df_raw_agg_piv['main_incident_id'] == redmine)][['city_id', 'city_name', 'country_name', 'main_from_address']].iloc[0].to_list()
    city_name, city_id, country_name, address = region[1], region[0], region[2], region[3]
    change_20d = df_check_diff[df_check_diff['main_incident_id']==redmine]['diff'].item()


    if change_20d < 0:
        report_table += f'{index}. *{city_name}* (`{city_id}`), *{country_name}*   *(↓ {change_20d}% vs. {start_date_dig}-{end_date_dig}):*\n'
    else:
        report_table += f'{index}. *{city_name}* (`{city_id}`), *{country_name}*   *(↑ {change_20d}% vs. {start_date_dig}-{end_date_dig}):*\n'

    df_flat = df_raw_agg_piv[(df_raw_agg_piv['main_incident_id'] == redmine)].copy()
    levels = df_raw_agg[df_raw_agg['main_incident_id']==redmine]['incident_level'].unique()
    total = 0

    for column in levels:
        total += df_raw_agg_piv[column]

    df_flat['total_levels'] = total
    new_columns = list(levels)
    new_columns.append('total_levels')
    df_summary = df_flat[new_columns]

    for status in information_status:
        incident_cnt = df_raw_agg_piv[(df_raw_agg_piv['main_incident_id'] == redmine)&(df_raw_agg_piv['information_status'] == status)]['total'].sum()
        report_table += f"   - *{incident_cnt}* {str.lower(status)} incidents\n"

    report_table += f'   - Levels:\n'

    for column in levels:
        share =(df_summary[column].sum() / df_summary['total_levels'].sum()*100).round(1)
        absolute = df_summary[column].sum()
        report_table += f'          *{column} - {share}% ({absolute})*\n'
    report_table += f"   - Location: `{lat}`, `{lon}`\n"
    report_table += f"   - Address: __[{address}](https://geo-gis.console3.com/#mapHash=5.94/{lat}/{lon}&mapSettings=layers//filters/%7B%7D/baseMap//h3/)__\n\n"


message_template = f"""
*🚨 DAILY REPORT: DANGER ZONES 🚨* 

Have prepared a list of the locations with the highest number of incidents that occurred within *1 km*. These are our priority **"hot spots"** for immediate review.

Period: *{start_date} - {end_date}*

*📍 Top 5 Danger Locations:*

{report_table}
Please follow [the link](https://geo-gis.console3.com/) and indicate the potentially danger areas

*📊 Raw Data Access:*
The full dataset for all incidents is available via the CSV file below

*Thank you for your attention!*

"""

try:
    send_telegram_text(message_template, TELEGRAM_CHAT_ID, TELEGRAM_BOT_TOKEN)
    send_telegram_csv(df_raw[['city_id', 'city_name', 'country_name', 'macroregion_name', 'main_from_latitude', 'main_from_longitude', 
        'main_from_address', 'near_incident_id', 'near_incident_date', 'near_from_latitude', 'near_from_longitude', 'near_from_address', 
        'information_status', 'incident_level', 'incident_type', 'distance']]\
        .reset_index(drop=True)\
        .rename(columns={'distance':'distance_meters'})\
        .sort_values(['city_id', 'distance_meters']), 'Nearby incidents raw file.csv', TELEGRAM_CHAT_ID, TELEGRAM_BOT_TOKEN)
    send_telegram_csv(df_raw_agg_piv.sort_values('total', ascending=False).reset_index(drop=True), 'Nearby incidents aggregated file.csv', TELEGRAM_CHAT_ID, TELEGRAM_BOT_TOKEN)
except Exception as e:
    print(e)
    

Sent successfully
File Nearby incidents raw file.csv saved locally
CSV-File successfully has been sent to Telegram.
Local file Nearby incidents raw file.csv was deleted.
File Nearby incidents aggregated file.csv saved locally
CSV-File successfully has been sent to Telegram.
Local file Nearby incidents aggregated file.csv was deleted.
