## Imports 

In [5]:
from __future__ import annotations

import time

import pandas as pd; pd.set_option('display.float_format', lambda x: '%.5f' % x)
import numpy as np; np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt

# anomaly detection
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

## Overview
In `exploratory-data-analysis.ipynb`, we discover that only the `sensor_data` dataset contains any anomalies. Data cleansing will only be done on `sensor_data`. Afterwards, we merge the 3 datasets to create a data-target dataset, with data being anything from the datasets and the target being the `safety_labels['label']` column.

## Load Datasets

In [2]:
# read data from hdf files
driver_data = pd.read_hdf('../data/driver_data.h5')
safety_labels = pd.read_hdf('../data/safety_labels.h5')
sensor_data = pd.read_hdf('../data/sensor_data.h5')

## Anomaly Detection
- There are many anomaly detection algorithms, e.g.: `IsolationForest`, `LocalOutlierFactor`, etc.
- We instantiate and use a few models to generate a set of clean datasets
- Each clean dataset will pass through the same machine learning workflow, giving us a tree of paths that tell us which workflow gives us the best model from our raw data.

### Strategy
- Anomalies are very extreme but also very rare, a low contamination value of $0.025$ will be used.
- Anomaly detection will be run on each column
- Any row with 3 or more anomalies will be removed.

In [9]:
base_models = [IsolationForest(contamination=0.025),
               OneClassSVM(nu=0.025),
               EllipticEnvelope(contamination=0.025),
               LocalOutlierFactor()]
results_df = {'model': [], 'n_anomalies': [], 'anomaly_%': [], 'time': []}

for model in base_models:
    start = time.perf_counter()

    y_pred = model.fit_predict(sensor_data['second'].to_numpy().reshape(-1, 1))
    mask = y_pred != -1

    results_df['model'].append(model.__class__.__name__)
    results_df['n_anomalies'] = len(sensor_data['second'][mask])
    results_df['anomaly_%'] = len(sensor_data['second'][mask]) / len(sensor_data) * 100
    results_df['time'] = time.perf_counter() - start

    print(f'{model.__class__.__name__} DONE')

display(results_df)

In [7]:
IsolationForest(contamination=0.025).__class__.__name__

'IsolationForest'

In [4]:
mask = yhat != -1
len(sensor_data['second'][mask]) / len(sensor_data)

NameError: name 'yhat' is not defined

In [None]:
sensor_data['second'][mask].max()

1646

In [None]:

sensor_data['second'].sort_values().tolist()

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
