## Imports

In [None]:
import os
import json
import sys
sys.path.append("..")

import matplotlib.pyplot as plt
import mlflow
import pandas as pd

import tasks.analysis as analysis
import config.config as config

## Load environment variable overrides and configuration

In [None]:
# Load the `dotenv` IPython extension and run its `%dotenv` magic
# (presumably this reads environment overrides from a local .env file — confirm).
%load_ext dotenv
%dotenv
# Point MLflow at the tracking store located one directory above this notebook.
mlflow.set_tracking_uri('../mlruns')
# Read the analysis configuration; the resulting `conf` object is mutated and
# passed to analysis.main() by the cells below.
conf = config.read_config('../config/gemini.cfg')

## Dataset Drift Report (by year)

In [None]:
# Run the dataset drift analysis and generate an HTML report that compares
# two slices of the data by year: 2016 (reference) against 2017 (evaluation).
conf.html = True
conf.slice = 'year'
conf.data_ref = [2016]
conf.data_eval = [2017]

# Output file for the report (relative path).
conf.report_full_path = 'dataset_2016_2017.html'

# Uncomment to use data with noise added (set on `conf`, the configuration
# object passed to analysis.main, not on the imported `config` module):
# conf.input =  '/mnt/nfs/project/delirium/data/all_before_2018_with_noise.csv'

# NOTE: `conf` is shared notebook state; the attributes set here stay in
# effect for later cells unless they are overwritten.
response = analysis.main(conf)
print(response)

## Dataset Drift Report (by hospital)

In [None]:
# Re-run the analysis, this time slicing by hospital: hospital 3 is the
# reference slice and hospital 7 the evaluation slice. All other `conf`
# attributes (e.g. `html`) keep the values set in earlier cells.
conf.slice = 'hospital_id'
conf.data_ref = [3]
conf.data_eval = [7]
conf.report_full_path = 'dataset_hospitals.html'
response = analysis.main(conf)
print(response)

## Model Performance Comparison Report

In [None]:
# Run the analysis and generate an HTML report (this time a model performance comparison).
# It looks at the predictions of the model trained on all data before 2018 and
# compares 2017 (validation) with 2018 (test).
# NOTE(review): `conf` still carries `slice`, `data_ref` and `data_eval` from the
# previous cell; whether analysis.main reads them in 'performance' mode is not
# visible here — confirm.
conf.type = 'performance'
conf.reference= '/mnt/nfs/project/delirium/data/demo/results_2017.csv' 
conf.test = '/mnt/nfs/project/delirium/data/demo/results_2018.csv'
conf.report_full_path = 'dataset_performance_comparison.html'
# The value returned by analysis.main is printed as-is.
name = analysis.main(conf)
print(name)

## System Monitoring with MLflow

In [None]:
# MLFlow history for analysis
# Display dataset drift analysis runs

def get_dataset_metrics_list(data):
    """Summarise the dataset-drift metrics of one run as table cells.

    Args:
        data: Mapping that may contain a ``'metrics'`` mapping with the keys
            ``'dataset_drift'``, ``'n_features'`` and ``'n_drifted_features'``.

    Returns:
        A three-element list ``[drift, n_features, n_drifted_features]``.
        ``drift`` is ``'No'`` when ``dataset_drift`` equals 0 and ``'Yes'``
        otherwise. Every value that is not available (no metrics at all, or
        a single metric missing) is reported as ``'-'``.
    """
    placeholder = '-'
    # A missing, None or empty 'metrics' entry is treated as "nothing logged".
    metrics = data.get('metrics') or {}

    if 'dataset_drift' in metrics:
        drift = 'No' if metrics['dataset_drift'] == 0 else 'Yes'
    else:
        drift = placeholder

    # Partially logged runs yield placeholders instead of raising KeyError.
    return [
        drift,
        metrics.get('n_features', placeholder),
        metrics.get('n_drifted_features', placeholder),
    ]

# List all existing experiments
# NOTE(review): `mlflow.list_experiments` and `mlflow.list_run_infos` appear to be
# MLflow 1.x APIs — confirm against the installed MLflow version before upgrading.
from urllib.parse import urlparse
from urllib.request import url2pathname

all_experiments = mlflow.list_experiments()
exp_data = [[e.name, e.artifact_location, e.lifecycle_stage] for e in all_experiments]
exp_frame = pd.DataFrame(exp_data, columns = ['Name', 'Artifacts', 'Status'])
display(exp_frame)

# Collect one table row per recent run of the 'DatasetAnalysis' experiment.
exp = mlflow.get_experiment_by_name('DatasetAnalysis')
if exp is None:
    # No such experiment (e.g. no analysis has been run yet): show an empty
    # table instead of failing on `exp.experiment_id`.
    print("Experiment 'DatasetAnalysis' not found; no runs to display.")
    runs = []
else:
    runs = mlflow.list_run_infos(exp.experiment_id, max_results=5)

table = []
for r in runs:
    exp_run = mlflow.get_run(r.run_id).to_dictionary()
    # Turn the artifact URI into a local filesystem path by parsing it rather
    # than slicing off a fixed-length prefix; plain paths pass through as-is.
    artifact_uri = urlparse(exp_run['info']['artifact_uri'])
    if artifact_uri.scheme not in ('', 'file'):
        # Artifacts stored remotely cannot be read from the local filesystem.
        continue
    path = url2pathname(artifact_uri.path)
    config_file = os.path.join(path, 'config.json')
    if not os.path.isfile(config_file):
        # Runs without a saved configuration are skipped.
        continue
    with open(config_file) as f:
        data = json.load(f)
    row = [data['input'], data['slice'], data['data_ref'], data['data_eval']]
    row = row + get_dataset_metrics_list(exp_run['data'])
    table.append(row)
frame = pd.DataFrame(table, columns=['Input', 'Slice', 'Ref Slice', 'Eval Slice', 'Drift', 'Feat', 'Drift_Feat'])
print('------------------- Dataset Analysis ----------------------')
display(frame)

In [None]:
# get all the executions folders
executions_folder = '/mnt/nfs/project/delirium/data/demo/executions'
from os import listdir
from os.path import isfile, join
# Keep only sub-directories, sorted by name. The cells below open
# '<executions_folder>/<entry>/dataset_report.json', so a stray file in this
# folder would otherwise make them fail.
files = sorted(
    entry for entry in listdir(executions_folder)
    if os.path.isdir(join(executions_folder, entry))
)

## Continuous Pipeline Simulation and Analysis

In [None]:
# Bar chart of the number of drifted features per execution period, read from
# each execution's 'dataset_report.json'.

table = []
y = []
for period in files:
    report_path = os.path.join(executions_folder, period, 'dataset_report.json')
    with open(report_path) as report_file:
        drift_metrics = json.load(report_file)['data_drift']['data']['metrics']

    n_features = drift_metrics['n_features']
    drift_flag = 'Yes' if drift_metrics['dataset_drift'] else 'No'
    n_drifted = drift_metrics['n_drifted_features']

    # y holds the bar heights; table holds the human-readable summary rows.
    y.append(n_drifted)
    table.append([period, drift_flag, '/'.join([str(n_drifted), str(n_features)])])

summary_columns = ['Period', 'Drift Detected', 'Drifted Features / All Features']
frame = pd.DataFrame(table, columns=summary_columns)
print('                  Dataset Analysis Results: number of drifted features  ')
# print(frame)

fig = plt.figure()
ax = plt.axes()
fig.set_size_inches(16.5, 8.5)

# One bar per period, labelled vertically with the period name.
x = frame['Period'].values
ax.bar(x, y)
ax.xaxis.set_ticks(x)
ax.set_xticklabels(x, rotation='vertical', fontsize=11)
plt.show()

In [None]:
# Plot the model accuracy of every execution period ('current') together with
# the reference accuracy stored in the model reports.
# The period labels are taken from `files` directly so this cell does not rely
# on `x` / `frame` left behind by the previous cell.
periods = list(files)
y = []
baseline = None  # stays None when there are no executions to read
for dir_run in periods:
    with open(os.path.join(executions_folder, dir_run, 'model_report.json')) as f:
        data = json.load(f)
    results = data['classification_performance']['data']['metrics']
    # Every report carries a reference accuracy; the one from the last report
    # read is the one drawn below.
    baseline = results['reference']['accuracy']
    y.append(results['current']['accuracy'])

fig = plt.figure()
fig.set_size_inches(16.5, 8.5)
ax = plt.axes()

ax.plot(periods, y, '-', linewidth=3)
if baseline is not None:
    # Categories sit at positions 0..len(periods)-1, so span the line from the
    # first to the last period using numeric positions.
    ax.hlines(y=baseline, xmin=0, xmax=len(periods) - 1, linewidth=2, color='r')

# Set ticks labels for x-axis
ax.xaxis.set_ticks(periods)
ax.set_xticklabels(periods, rotation='vertical', fontsize=11)

plt.grid()
plt.show()