# DRIFT DETECTION EXPERIMENTS

##### To Do
- Evaluate influence of p value (do a graph)
- Compare results of differents librairies

## ALIBI DETECT

Experimenting with some open source drift detection algorithms a d tryting to implement a simple drift detection algorithm from scratch

In [26]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

import alibi
from alibi_detect.cd import TabularDrift

import evidently


import warnings
warnings.filterwarnings('ignore')

In [3]:
wine_data = load_wine()

### Data drift

In [3]:
def alibi_detect_drift (X_ref, X_test, p_val) : 
    '''
    Simple function to detect drift with alibi package
    '''

    # Initialise the drift detector with reference data and a p value for statistical significance
    cd = TabularDrift(x_ref=X_ref, p_val=p_val, categories_per_feature=None)
    
    # Predict if there is drift
    preds = cd.predict(X_test)

    # Format and return result
    labels = {1:'Drift detected', 0: 'No drift'}
    result = labels[preds['data']['is_drift']]
    return result

In [4]:
# Split the data between reference and test
feature_names = wine_data.feature_names
X,y = wine_data.data, wine_data.target
X_ref, X_test, y_ref, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

In [12]:
alibi_detect_drift(X_ref, X_test, p_val= 0.05)

'No drift'

In [6]:
# Simulate a drift
X_test_drifted = X_test + 100

In [7]:
alibi_detect_drift(X_ref, X_test_drifted, p_val= 0.05)

'Drift detected'

### Label drift

In [11]:
# Same thing but with on y_ref and y_test
alibi_detect_drift(y_ref, y_test, p_val= 0.05)

'No drift'

In [20]:
# Simulate a drift
y_test_drifted = y_test * 1.1
alibi_detect_drift(y_ref, y_test_drifted, p_val= 0.05)

'Drift detected'

### Concept drift

In [1]:
# todo
from alibi_detect.cd import MMDDriftOnline

In [5]:
ert = 50
window_size = 10
cd = MMDDriftOnline(X_ref, ert, window_size, backend='pytorch',n_bootstraps=2500)

No GPU detected, fall back on CPU.


Generating permutations of kernel matrix..


100%|██████████| 2500/2500 [00:00<00:00, 29397.90it/s]
Computing thresholds: 100%|██████████| 10/10 [00:01<00:00,  5.95it/s]


In [15]:
cd.predict(X_test[0])['data'] ['is_drift']

0

## EVIDENTLY

In [23]:
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

In [27]:
X_ref_df = pd.DataFrame(X_ref)

In [31]:
# Initialize the report with the desired metrics
data_drift_report = Report(metrics=[DataDriftPreset()])

# Run the report with reference data and current data
data_drift_report.run(reference_data=X_ref_df, current_data=X_ref_df)

In [30]:
data_drift_report

KeyError: '0'

<evidently.report.report.Report at 0x346b56650>

In [29]:
data_drift_report.save_html('data_drift_report.html')

KeyError: '0'

## SIMPLE DRIFT DETECTION ALGORITHM FROM SCRATCH

In [38]:
# todo
# choose 1 or 2 methods and implement from scratch in python