In [1]:
# Install/upgrade the notebook's third-party dependencies into the kernel's
# environment. `-U` upgrades each package to its latest release (versions are
# not pinned, so results may drift between runs); `--quiet` trims pip's output.
%pip install xgboost scikit-lego scikit-learn pandas numpy pyarrow polars -U --quiet


Note: you may need to restart the kernel to use updated packages.




# Notes

CUDA is being used because I (Ethan) can. The notebook should still run even if you don't have an Nvidia GPU, but cupy will still be installed.

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import sqlite3
import misc_assist as ma
import polars as pl

from pathlib import Path
from tqdm import tqdm
from time import sleep

In [3]:
# Leads to ~\project_root\kafka-server\health_events.db
kafka_path = Path.cwd().parent.parent / 'kafka-server' / 'health_events.db'

# The database can be temporarily unavailable, so retry a bounded number of
# times (one second apart) before giving up.
max_tries = 10
last_error = None
for i in range(max_tries):
    try:
        con = sqlite3.connect(kafka_path)
        try:
            kafka_data = pd.read_sql_query("SELECT * FROM health_events", con)
        finally:
            # `with sqlite3.connect(...)` only commits/rolls back the
            # transaction; the connection must be closed explicitly.
            con.close()
        break
    except sqlite3.OperationalError as e:
        last_error = e
        print(f"Attempt {i+1}: {e}")
        # No point in waiting once the final attempt has failed.
        if i < max_tries - 1:
            sleep(1)
else:
    # Only reached when no attempt hit `break`, i.e. every attempt failed.
    print("Failed to connect to the database")
    raise Exception("Failed to connect to the database") from last_error

kafka_data.head()

Unnamed: 0,event_type,timestamp,location,severity,details
0,general_health_report,2024-04-09 18:39:33,Berlin,medium,This is a simulated general_health_report event.
1,routine_checkup,2024-04-09 18:39:41,London,low,This is a simulated routine_checkup event.
2,general_health_report,2024-04-09 18:39:49,Boston,medium,This is a simulated general_health_report event.
3,emergency_incident,2024-04-09 18:40:00,Paris,high,This is a simulated emergency_incident event.
4,general_health_report,2024-04-09 18:40:13,Paris,medium,This is a simulated general_health_report event.


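Make sure you have the `1m_health_events_dataset.csv` file in the `data` directory one level above the notebook's working directory (`../data/`), which is where the next cell reads it from. It is not part of the git/GitHub repository because of its large size.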

In [4]:
# The dataset folder is the `data` directory that sits beside the current
# working directory; the CSV is parsed with the pyarrow engine.
data_path = Path.cwd().parent / 'data'
data = pd.read_csv(data_path.joinpath('1m_health_events_dataset.csv'), engine='pyarrow')
data.head()

Unnamed: 0,EventType,Timestamp,Location,Severity,Details,Is_Anomaly
0,emergency_incident,2022-01-01 00:00:00,Boston,high,This is a simulated emergency_incident event.,0
1,health_mention,2022-01-01 00:01:00,Tokyo,low,This is a simulated health_mention event.,0
2,health_mention,2022-01-01 00:01:00,Tokyo,medium,This is a simulated health_mention event.,0
3,vaccination,2022-01-01 00:01:00,Boston,medium,This is a simulated vaccination event.,0
4,general_health_report,2022-01-01 00:03:00,Tokyo,medium,This is a simulated general_health_report event.,0


The next two cells are for finding the time until the next event of the same type and at the same location. There are two implementations, one using pandas and one using polars. The polars implementation is faster, and its output has been saved as a parquet file. The pandas implementation is kept for reference.

In [5]:
# Cache location for the frame augmented with the `TimeToNextEvent` column.
augmented_parquet = data_path / 'health_events_dataset.parquet'

if augmented_parquet.exists():
    # Reuse the previously computed result instead of recomputing it.
    pldata = pl.read_parquet(augmented_parquet)

else:
    pldata = pl.from_pandas(data)

    # For each row, take the Timestamp of the next row (in file order) that has
    # the same Location and EventType. Rows with no later matching event get
    # their own Timestamp as the "next" one, so the difference below is a zero
    # duration of the correct dtype rather than a null.
    next_timestamp = (
        pl.col("Timestamp")
        .shift(-1)
        .over(["Location", "EventType"])
        .fill_null(pl.col("Timestamp"))
    )
    new_col = pldata.select(
        (next_timestamp - pl.col("Timestamp")).alias("TimeToNextEvent")
    ).to_series()

    # add the new column to the data DataFrame (right after Location)
    pldata.insert_column(3, new_col)

    # display a preview of the modified data DataFrame
    display(pldata.head())

    pldata.write_parquet(augmented_parquet)

# Both branches now bind the same name; `pltdata` is kept as an alias because
# the cached branch used to bind that name instead.
pltdata = pldata

The process for encoding the days as a basis is pulled from [an article from Nvidia](https://developer.nvidia.com/blog/three-approaches-to-encoding-time-information-as-features-for-ml-models/).

In [6]:
# Add the repeating-basis time features through the project helper and rebind
# `data` to its return value.
# NOTE(review): judging by the displayed output, the helper adds an `nday`
# column plus one column per month (Jan..Dec); the actual computation lives in
# `misc_assist` — confirm there.
data = ma.repeating_basis_day(data)
data.head()

Unnamed: 0,EventType,Timestamp,Location,Severity,Details,Is_Anomaly,nday,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,emergency_incident,2022-01-01 00:00:00,Boston,high,This is a simulated emergency_incident event.,0,1,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,1.125352e-07,0.000123,0.018316,0.367879
1,health_mention,2022-01-01 00:01:00,Tokyo,low,This is a simulated health_mention event.,0,1,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,1.125352e-07,0.000123,0.018316,0.367879
2,health_mention,2022-01-01 00:01:00,Tokyo,medium,This is a simulated health_mention event.,0,1,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,1.125352e-07,0.000123,0.018316,0.367879
3,vaccination,2022-01-01 00:01:00,Boston,medium,This is a simulated vaccination event.,0,1,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,1.125352e-07,0.000123,0.018316,0.367879
4,general_health_report,2022-01-01 00:03:00,Tokyo,medium,This is a simulated general_health_report event.,0,1,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,1.125352e-07,0.000123,0.018316,0.367879


In [7]:
# Ordinal encoding for the severity labels.
severity_levels = {'low': 0, 'medium': 1, 'high': 2}
severity_codes = data['Severity'].map(severity_levels)

# `.map` turns any label missing from the mapping into NaN; fail with a clear
# message instead of a cryptic integer-cast error. This also triggers if the
# cell is re-run after Severity has already been encoded.
unmapped = severity_codes.isna()
if unmapped.any():
    unexpected = sorted(data.loc[unmapped, 'Severity'].astype(str).unique())
    raise ValueError(f"Unexpected Severity values: {unexpected}")

# Build a new frame rather than mutating the one displayed by the previous
# cell: replace Severity with its integer code and drop the columns that are
# not used as model features.
data = (
    data
    .assign(Severity=severity_codes.astype(int))
    .drop(columns=['Timestamp', 'nday', 'Details'])
)

# One-hot encode the remaining categorical columns via the project helper.
# NOTE(review): per the displayed output this yields EventType_* and
# Location_* indicator columns — confirm in `misc_assist`.
data = ma.make_dummies(data)
data.head()

Unnamed: 0,Severity,Is_Anomaly,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,...,EventType_routine_checkup,EventType_vaccination,Location_Berlin,Location_Bordeaux,Location_Boston,Location_Chicago,Location_Los Angeles,Location_New York,Location_Paris,Location_Tokyo
0,2,0,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,...,0,0,0,0,1,0,0,0,0,0
1,0,0,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,...,0,0,0,0,0,0,0,0,0,1
2,1,0,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,...,0,0,0,0,0,0,0,0,0,1
3,1,0,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,...,0,1,0,0,1,0,0,0,0,0
4,1,0,1.0,0.367879,0.018316,0.000123,1.125352e-07,1.388794e-11,2.319523e-16,1.388794e-11,...,0,0,0,0,0,0,0,0,0,1


In [8]:
# Correlation of every column with the target, strongest positive first.
corrs = (
    data.corr()
    .loc[:, ['Is_Anomaly']]
    .sort_values('Is_Anomaly', ascending=False)
)
corrs

Unnamed: 0,Is_Anomaly
Is_Anomaly,1.0
EventType_hospital_admission,0.031492
Location_Paris,0.005358
Severity,0.003403
Aug,0.001662
Jul,0.001312
Location_New York,0.001072
Location_Berlin,0.001064
Jun,0.000931
May,0.00047


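The correlations are absolute garbage: apart from `Is_Anomaly` itself, the strongest value shown is about 0.03 (`EventType_hospital_admission`) and everything else is close to zero, so no single feature is a useful linear predictor of anomalies.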

In [9]:
# Binary classifier with a logistic objective and the histogram tree builder.
model = xgb.XGBClassifier(
    tree_method='hist',
    objective='binary:logistic',
)
# Hand the model and the feature frame to the project helper.
# NOTE(review): per the output below it appears to fit the model and print
# error metrics and anomaly counts; how it splits the data is defined in
# `misc_assist` — confirm there.
ma.eval_model(model, data)

Mean Absolute Error: 0.00013
Mean Squared Error: 0.00013
R2 Score: 0.3808223536466465
Accuracy: 0.99987
Total number of anomalies: 21
True positives: 8 (38.10%)
False negatives: 13 (61.90%)
