In [1]:
import requests
import datetime
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import QuantileTransformer

from joblib import load, dump
from tqdm import tqdm

In [2]:
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

In [3]:
data = pd.read_csv('healthcare-dataset.csv')

In [4]:
data.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1


In [5]:
imputer = SimpleImputer(strategy = 'mean')
data['bmi']=imputer.fit_transform(data[['bmi']])
encoded_data= data.copy()

features_to_scale=['age','bmi']
scaler = MinMaxScaler()
encoded_data[features_to_scale]=scaler.fit_transform(encoded_data[features_to_scale])

scaler = QuantileTransformer(output_distribution='uniform')

encoded_data['avg_glucose_level'] = scaler.fit_transform(encoded_data[['avg_glucose_level']])
df = encoded_data.copy()
categorical_features = ['Residence_type', 'work_type', 'smoking_status','ever_married','gender']

for column in categorical_features:
    encoded_column = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, encoded_column], axis=1)
    df = df.drop(columns=[column],axis=1)

df = df.astype(int)
df.drop('id',axis=1,inplace=True)

In [9]:
df.shape

(5110, 22)

In [27]:
## Split the data
X = df.drop('stroke',axis=1)
y = df['stroke']

X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size = 0.2,random_state=42) 

In [28]:
## Train the model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [29]:
X_train['prediction'] = y_train

In [30]:
## Predict the result
y_pred = logreg.predict(X_valid)

## Performance metrics
accuracy = accuracy_score(y_valid, y_pred)

In [31]:
X_valid['prediction'] = y_pred

In [32]:
accuracy

0.9393346379647749

Dump model

In [33]:
with open('models/logreg.bin', 'wb') as f_out:
    dump(logreg, f_out)

Evidently Report

In [39]:
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    #categorical_features=categorical_features
)

In [40]:
report = Report(metrics=[
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric()
]
)

In [41]:
report.run(reference_data=X_train, current_data=X_valid, column_mapping=column_mapping)


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



In [42]:
result = report.as_dict()
result

{'metrics': [{'metric': 'ColumnDriftMetric',
   'result': {'column_name': 'prediction',
    'column_type': 'cat',
    'stattest_name': 'Jensen-Shannon distance',
    'stattest_threshold': 0.1,
    'drift_score': np.float64(0.12696946258208358),
    'drift_detected': True,
    'current': {'small_distribution': {'x': [0, 1], 'y': [1022, 0]}},
    'reference': {'small_distribution': {'x': [0, 1], 'y': [3901, 187]}}}},
  {'metric': 'DatasetDriftMetric',
   'result': {'drift_share': 0.5,
    'number_of_columns': 22,
    'number_of_drifted_columns': 1,
    'share_of_drifted_columns': 0.045454545454545456,
    'dataset_drift': False}},
  {'metric': 'DatasetMissingValuesMetric',
   'result': {'current': {'different_missing_values': {'': 0,
      -inf: 0,
      None: 0,
      inf: 0},
     'number_of_different_missing_values': 0,
     'different_missing_values_by_column': {'age': {'': 0,
       -inf: 0,
       None: 0,
       inf: 0},
      'hypertension': {'': 0, -inf: 0, None: 0, inf: 0},
   

In [43]:
report.show(mode='inline')

In [48]:
result['metrics']

[{'metric': 'ColumnDriftMetric',
  'result': {'column_name': 'prediction',
   'column_type': 'cat',
   'stattest_name': 'Jensen-Shannon distance',
   'stattest_threshold': 0.1,
   'drift_score': np.float64(0.12696946258208358),
   'drift_detected': True,
   'current': {'small_distribution': {'x': [0, 1], 'y': [1022, 0]}},
   'reference': {'small_distribution': {'x': [0, 1], 'y': [3901, 187]}}}},
 {'metric': 'DatasetDriftMetric',
  'result': {'drift_share': 0.5,
   'number_of_columns': 22,
   'number_of_drifted_columns': 1,
   'share_of_drifted_columns': 0.045454545454545456,
   'dataset_drift': False}},
 {'metric': 'DatasetMissingValuesMetric',
  'result': {'current': {'different_missing_values': {'': 0,
     -inf: 0,
     None: 0,
     inf: 0},
    'number_of_different_missing_values': 0,
    'different_missing_values_by_column': {'age': {'': 0,
      -inf: 0,
      None: 0,
      inf: 0},
     'hypertension': {'': 0, -inf: 0, None: 0, inf: 0},
     'heart_disease': {'': 0, -inf: 0, N

In [44]:
#prediction drift
result['metrics'][0]['result']['drift_score']

np.float64(0.12696946258208358)

In [45]:
#number of drifted columns
result['metrics'][1]['result']['number_of_drifted_columns']

1

Evidently Dashboard

In [49]:
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

In [50]:
ws = Workspace("workspace")

In [51]:
project = ws.create_project("Health Stroke Prediction Project")
project.description = "My project description"
project.save()

Project(id=UUID('3807600f-15fd-4789-8b8e-3f39e0b26c5e'), name='Health Stroke Prediction Project', description='My project description', dashboard=DashboardConfig(name='Health Stroke Prediction Project', panels=[], tabs=[], tab_id_to_panel_ids={}), team_id=None, date_from=None, date_to=None, created_at=datetime.datetime(2024, 7, 2, 16, 30, 21, 556522))

In [53]:
regular_report = Report(
    metrics=[
        DataQualityPreset()
    ],
    timestamp=datetime.datetime(2024,7,2)
)

regular_report.run(reference_data=None,
                  current_data=X_valid,
                  column_mapping=column_mapping)

regular_report

In [54]:
ws.add_report(project.id, regular_report)

In [58]:
#configure the dashboard
project.dashboard.add_panel(
    DashboardPanelCounter(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        agg=CounterAgg.NONE,
        title="NYC taxi data dashboard"
    )
)

project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Inference Count",
        values=[
            PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_rows",
                legend="count"
            ),
        ],
        plot_type=PlotType.BAR,
        size=WidgetSize.HALF,
    ),
)
project.save()

Project(id=UUID('3807600f-15fd-4789-8b8e-3f39e0b26c5e'), name='Health Stroke Prediction Project', description='My project description', dashboard=DashboardConfig(name='Health Stroke Prediction Project', panels=[DashboardPanelCounter(type='evidently.ui.dashboards.reports.DashboardPanelCounter', id=UUID('c169361b-0a65-46f3-b445-9747e1cf9bc0'), title='NYC taxi data dashboard', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.FULL: 2>, agg=<CounterAgg.NONE: 'none'>, value=None, text=None), DashboardPanelPlot(type='evidently.ui.dashboards.reports.DashboardPanelPlot', id=UUID('25140314-af98-4ac9-9224-f70993ab1c0c'), title='Inference Count', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.HALF: 1>, values=[PanelValue(field_path='current.number_of_rows', metric_id='DatasetSummaryMetric', metric_fingerprint=None, metric_args={}, legend='count')], plot_type=<PlotType.BAR: 'bar'>), DashboardPane