# Modeling

In this section, we will set up a Docker environment with MLflow and Postgres to keep track of the trained models.  
I followed the instructions at https://github.com/aganse/docker_mlflow_db, with one change: a Jupyter notebook service was added to the Docker Compose configuration.

First, we will try to overfit a few models on a selection of features, as a sanity check on whether it is possible to train a model to a reasonable degree.  


## Import Libraries

In [1]:
import os
from datetime import datetime
import pickle

import logging
import warnings
warnings.filterwarnings("ignore")

# Working directory of the kernel at start-up.
_ABSOLUTE_DIR = os.getcwd()
# Timestamp captured once so that each notebook session writes its own log file.
_NOTEBOOK_START = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    filename=f'logs/heart_processing_{_NOTEBOOK_START}.log',
    encoding='utf-8',
    level=logging.DEBUG,
    format='%(levelname)s:%(asctime)s:%(message)s',
)

import mlflow

import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from xgboost import XGBClassifier
logging.info(f'Imports loaded')

In [2]:
# Switches that gate the experiment cells below: a cell's body only runs when
# its flag is True. With all four set to False those cells are skipped.
# Hyperparameter sweeps on the sampled subset.
_SUBSET_SVC_EXPERIMENT = False
_SUBSET_XGBOOST_EXPERIMENT = False

# Training runs on the full train split.
_TOTAL_SVC_EXPERIMENT = False
_TOTAL_XGBOOST_EXPERIMENT = False

## Import dataset


In [3]:
# Build the location of the processed dataset once, then load it and record
# where it came from.
heart_parquet_path = f'{_ABSOLUTE_DIR}/../artifacts/data/heart/heart_processed.parquet'
dfHeart_processed = pd.read_parquet(heart_parquet_path)
logging.info(f'Load dataset from {heart_parquet_path}')

In [4]:
dfHeart_processed

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,PhysicalActivity,GenHealth,...,age_80orolder_fe,AgeCategory_seq_fe,HeartDisease,FLAG_REAL,Set,BMI_scaled,PhysicalHealth_weeks,MentalHealth_weeks,Habits,PriorDiseases
0,16.600000,True,False,False,3,30,False,1,True,4,...,0,8,False,1,test,-1.942090,1,4,2,1
1,20.340000,False,False,True,0,0,False,1,True,4,...,1,13,False,1,train,-1.343988,0,0,3,1
2,26.580000,True,False,False,20,30,False,0,True,2,...,0,10,False,1,train,-0.346086,3,4,2,1
3,24.210000,False,False,False,0,0,False,1,False,3,...,0,12,False,1,train,-0.725097,0,0,2,1
4,23.710000,False,False,False,28,0,True,1,True,4,...,0,5,False,1,train,-0.805057,4,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582696,25.984777,True,False,False,0,1,False,1,True,4,...,0,11,True,0,train,-0.441274,0,1,2,0
582697,26.543700,False,False,False,0,0,False,0,True,4,...,0,10,True,0,test,-0.351891,0,0,3,1
582698,26.458366,True,False,True,30,0,True,0,True,2,...,1,13,True,0,train,-0.365538,4,0,2,1
582699,27.956522,True,False,True,15,30,True,0,True,1,...,0,9,True,0,train,-0.125952,2,4,2,1


In [5]:
# Feature set v1: includes the 'PriorDiseases' column.
columns_v1 = [
    'AgeCategory_seq_fe',
    'BMI_scaled',
    'GenHealth',
    'Smoking',
    'PriorDiseases',
    'diabetic_no_fe',
    'DiffWalking',
    'MentalHealth',
    'PhysicalHealth',
]

# Feature set v2: the v1 columns without 'PriorDiseases', plus the individual
# 'Stroke', 'SkinCancer', 'KidneyDisease' and 'Asthma' columns.
columns_v2 = [
    'AgeCategory_seq_fe',
    'BMI_scaled',
    'GenHealth',
    'Smoking',
    'diabetic_no_fe',
    'DiffWalking',
    'MentalHealth',
    'PhysicalHealth',
    'Stroke',
    'SkinCancer',
    'KidneyDisease',
    'Asthma',
]

In [6]:
# Partition the frame using the split label stored in its 'Set' column.
split_label = dfHeart_processed['Set']
train_data = dfHeart_processed[split_label == 'train']
test_data = dfHeart_processed[split_label == 'test']
val_data = dfHeart_processed[split_label == 'valid']

In [7]:
val_data

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,PhysicalActivity,GenHealth,...,age_80orolder_fe,AgeCategory_seq_fe,HeartDisease,FLAG_REAL,Set,BMI_scaled,PhysicalHealth_weeks,MentalHealth_weeks,Habits,PriorDiseases
10,34.300000,True,False,False,30,0,True,0,False,1,...,0,9,True,1,valid,0.888498,4,0,1,1
14,29.290000,True,False,False,0,30,True,1,False,3,...,0,9,False,1,valid,0.087298,0,4,1,0
18,29.860000,True,False,False,0,0,True,1,False,2,...,0,12,False,1,valid,0.178452,0,0,1,1
21,28.900000,False,False,False,2,5,False,1,False,4,...,0,11,False,1,valid,0.024929,1,1,2,0
38,21.930000,False,False,False,3,2,False,1,True,2,...,0,11,False,1,valid,-1.089715,1,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582607,27.911956,True,False,False,0,0,False,0,True,3,...,1,13,True,0,valid,-0.133079,0,0,2,0
582653,22.895875,True,False,True,30,0,True,0,False,1,...,0,12,True,0,valid,-0.935252,4,0,1,1
582664,24.465520,False,False,False,0,0,False,0,True,4,...,0,10,True,0,valid,-0.684234,0,0,3,1
582687,31.494174,False,False,False,0,0,False,0,True,4,...,0,9,True,0,valid,0.439790,0,0,3,0


## Test with subset 

Let's first test whether the models perform well on a subset of 100 observations.

### SVC

In [8]:
if _SUBSET_SVC_EXPERIMENT:
    # Sanity check: sweep C and kernel for an SVC fitted on a small random
    # sample, logging every fit as its own MLflow run.
    logging.info('START EXPERIMENT RUN ON SVC WITH SUBSET')

    # Look the experiment up first and create it only when it is missing.
    # Re-reading it through `experiment_id` after a failed create_experiment()
    # used an unbound (or stale) id whenever the experiment already existed.
    experiment = mlflow.get_experiment_by_name('SVC Subset')
    if experiment is None:
        experiment = mlflow.get_experiment(mlflow.create_experiment('SVC Subset'))
    experiment_id = experiment.experiment_id
    os.environ['MLFLOW_EXPERIMENT_NAME'] = 'SVC Subset'

    print("Name: {}".format(experiment.name))
    print("Experiment_id: {}".format(experiment.experiment_id))
    print("Artifact Location: {}".format(experiment.artifact_location))
    print("Tags: {}".format(experiment.tags))
    print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

    # Fixed seed so the same 100 train / 100 validation rows are drawn each time.
    train_subset = train_data.sample(n=100, random_state=42)
    val_subset = val_data.sample(n=100, random_state=42)
    # enable autologging
    mlflow.sklearn.autolog()

    subset_C_param = np.linspace(0.01, 10, num=40)
    subset_Kernel_param = ['linear', 'poly', 'rbf']

    # train a model for every (feature set, C, kernel) combination
    for dataset in ['v1', 'v2']:
        columns = columns_v1 if dataset == 'v1' else columns_v2
        X_train_subset = train_subset[columns]
        y_train_subset = train_subset['HeartDisease']
        X_val_subset = val_subset[columns]
        y_val_subset = val_subset['HeartDisease']
        for C_param in subset_C_param:
            for Kernel_param in subset_Kernel_param:
                # Pin the run to the experiment resolved above instead of
                # relying only on the environment variable.
                with mlflow.start_run(experiment_id=experiment_id) as run:
                    subset_SVC_model = SVC(C=C_param, kernel=Kernel_param)
                    subset_SVC_model.fit(X_train_subset, y_train_subset)
                    mlflow.log_param('features', columns)
                    mlflow.set_tag('Dataset version', dataset)
                    # NOTE(review): each sample above holds 100 rows; 200 presumably
                    # counts train + validation together -- confirm.
                    mlflow.set_tag('Sample Size', 200)
                    metrics = mlflow.sklearn.eval_and_log_metrics(subset_SVC_model, X_val_subset, y_val_subset, prefix="val_")
    logging.info('END EXPERIMENT RUN ON SVC WITH SUBSET')


### XGBoost

In [9]:
if _SUBSET_XGBOOST_EXPERIMENT:
    # Sanity check: sweep boosting rounds, tree depth and L1 regularisation for
    # an XGBoost classifier fitted on a small random sample, logging every fit
    # as its own MLflow run.
    logging.info('START EXPERIMENT RUN ON XGBOOST WITH SUBSET')
    os.environ['MLFLOW_EXPERIMENT_NAME'] = 'XGBoost Subset'

    # Look the experiment up first and create it only when it is missing. The
    # previous bare `except:` also swallowed unrelated failures and could leave
    # `experiment` as None.
    experiment = mlflow.get_experiment_by_name("XGBoost Subset")
    if experiment is None:
        experiment = mlflow.get_experiment(mlflow.create_experiment("XGBoost Subset"))
    experiment_id = experiment.experiment_id

    print("Name: {}".format(experiment.name))
    print("Experiment_id: {}".format(experiment.experiment_id))
    print("Artifact Location: {}".format(experiment.artifact_location))
    print("Tags: {}".format(experiment.tags))
    print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

    # Fixed seed so the same 100 train / 100 validation rows are drawn each time.
    train_subset = train_data.sample(n=100, random_state=42)
    val_subset = val_data.sample(n=100, random_state=42)
    # enable autologging
    mlflow.xgboost.autolog()

    subset_numRound_param = [100, 200, 300, 400, 500]
    subset_maxDepth_param = np.linspace(1, 4, num=4, dtype=int)
    subset_L1_param = np.linspace(0, 1, num=7)

    # train a model for every (feature set, rounds, depth, reg_alpha) combination
    for dataset in ['v1', 'v2']:
        columns = columns_v1 if dataset == 'v1' else columns_v2
        X_train_subset = train_subset[columns]
        y_train_subset = train_subset['HeartDisease']
        X_val_subset = val_subset[columns]
        y_val_subset = val_subset['HeartDisease']
        for numRound in subset_numRound_param:
            for maxDepth_param in subset_maxDepth_param:
                for L1_param in subset_L1_param:
                    # Pin the run to the experiment resolved above instead of
                    # relying only on the environment variable.
                    with mlflow.start_run(experiment_id=experiment_id) as run:
                        subset_XGBoost_model = XGBClassifier(max_depth=maxDepth_param, reg_alpha=L1_param, n_estimators=numRound)
                        subset_XGBoost_model.fit(X_train_subset, y_train_subset)
                        mlflow.log_param('features', columns)
                        mlflow.set_tag('Dataset version', dataset)
                        mlflow.set_tag('Sample Size', 100)
                        # Log train and validation metrics side by side so
                        # overfitting shows up in the tracking UI.
                        metrics_t = mlflow.sklearn.eval_and_log_metrics(subset_XGBoost_model, X_train_subset, y_train_subset, prefix="train_")
                        metrics_v = mlflow.sklearn.eval_and_log_metrics(subset_XGBoost_model, X_val_subset, y_val_subset, prefix="val_")

    logging.info('END EXPERIMENT RUN ON XGBOOST WITH SUBSET')

## Modeling

The best models that came out of these experiments were


* 1. SVC
    - Features: ['AgeCategory_seq_fe', 'BMI_scaled', 'GenHealth', 'Smoking', 'PriorDiseases', 'diabetic_no_fe', 'DiffWalking', 'MentalHealth', 'PhysicalHealth']
    - Kernel: linear
    - C: 5.6
* 2. SVC
    - Features: ['AgeCategory_seq_fe', 'BMI_scaled', 'GenHealth', 'Smoking', 'PriorDiseases', 'diabetic_no_fe', 'DiffWalking', 'MentalHealth', 'PhysicalHealth']
    - Kernel: linear
    - C: 1.1
* 1. XGBoost
    - Features: ['AgeCategory_seq_fe', 'BMI_scaled', 'GenHealth', 'Smoking', 'diabetic_no_fe', 'DiffWalking', 'MentalHealth', 'PhysicalHealth', 'Stroke', 'SkinCancer', 'KidneyDisease', 'Asthma']
    - max_depth: 4
    - reg_alpha: 0.0
    - num_boost_round: 100
* 2. XGBoost
    - Features: ['AgeCategory_seq_fe', 'BMI_scaled', 'GenHealth', 'Smoking', 'PriorDiseases', 'diabetic_no_fe', 'DiffWalking', 'MentalHealth', 'PhysicalHealth']
    - max_depth: 3
    - reg_alpha: 0.0
    - num_boost_round: 400
    
Let's train the models with these hyperparameters and check out the results.




In [10]:
# Targets for the train and validation splits.
y_train = train_data['HeartDisease']
y_val = val_data['HeartDisease']

# Inputs restricted to feature set v1.
X_train_v1 = train_data[columns_v1]
X_val_v1 = val_data[columns_v1]

# Inputs restricted to feature set v2.
X_train_v2 = train_data[columns_v2]
X_val_v2 = val_data[columns_v2]

In [11]:
if _TOTAL_SVC_EXPERIMENT:
    # Fit the two selected linear SVC configurations on the full train split
    # (feature set v1) and log validation metrics for each as its own MLflow run.
    logging.info('START EXPERIMENT RUN ON SVC')

    # Look the experiment up first and create it only when it is missing.
    # Re-reading it through `experiment_id` after a failed create_experiment()
    # used an unbound (or stale) id whenever the experiment already existed.
    experiment = mlflow.get_experiment_by_name("SVC Total Set")
    if experiment is None:
        experiment = mlflow.get_experiment(mlflow.create_experiment("SVC Total Set"))
    experiment_id = experiment.experiment_id
    os.environ['MLFLOW_EXPERIMENT_NAME'] = "SVC Total Set"

    print("Name: {}".format(experiment.name))
    print("Experiment_id: {}".format(experiment.experiment_id))
    print("Artifact Location: {}".format(experiment.artifact_location))
    print("Tags: {}".format(experiment.tags))
    print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

    # enable autologging
    mlflow.sklearn.autolog()

    # NOTE(review): LinearSVC is used here while the subset sweep used
    # SVC(kernel='linear'); the two are different implementations, so the
    # selected C values may not transfer exactly -- confirm.
    with mlflow.start_run(experiment_id=experiment_id) as run:
        SVC_model_1 = LinearSVC(C=5.6)
        SVC_model_1.fit(X_train_v1, y_train)
        mlflow.set_tag('Sample Size', 'All')
        metrics = mlflow.sklearn.eval_and_log_metrics(SVC_model_1, X_val_v1, y_val, prefix="val_")

    with mlflow.start_run(experiment_id=experiment_id) as run:
        SVC_model_2 = LinearSVC(C=1.1)
        SVC_model_2.fit(X_train_v1, y_train)
        mlflow.set_tag('Sample Size', 'All')
        metrics = mlflow.sklearn.eval_and_log_metrics(SVC_model_2, X_val_v1, y_val, prefix="val_")

    logging.info('END EXPERIMENT RUN ON SVC')


In [12]:
if _TOTAL_XGBOOST_EXPERIMENT:
    # Fit the two selected XGBoost configurations on the full train split and
    # log train and validation metrics for each as its own MLflow run.
    logging.info('START EXPERIMENT RUN ON XGBOOST')

    # Look the experiment up first and create it only when it is missing.
    # Re-reading it through `experiment_id` after a failed create_experiment()
    # used an unbound (or stale) id whenever the experiment already existed.
    experiment = mlflow.get_experiment_by_name("XGBOOST Total Set")
    if experiment is None:
        experiment = mlflow.get_experiment(mlflow.create_experiment("XGBOOST Total Set"))
    experiment_id = experiment.experiment_id
    os.environ['MLFLOW_EXPERIMENT_NAME'] = "XGBOOST Total Set"

    print("Name: {}".format(experiment.name))
    print("Experiment_id: {}".format(experiment.experiment_id))
    print("Artifact Location: {}".format(experiment.artifact_location))
    print("Tags: {}".format(experiment.tags))
    print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

    # enable autologging
    mlflow.xgboost.autolog()

    # The numeric suffix of each model name matches the feature set it is
    # trained on (v2 / v1).
    with mlflow.start_run(experiment_id=experiment_id) as run:
        XGBoost_model_2 = XGBClassifier(max_depth=4, reg_alpha=0, n_estimators=100)
        XGBoost_model_2.fit(X_train_v2, y_train)
        metrics_t = mlflow.sklearn.eval_and_log_metrics(XGBoost_model_2, X_train_v2, y_train, prefix="train_")
        metrics_v = mlflow.sklearn.eval_and_log_metrics(XGBoost_model_2, X_val_v2, y_val, prefix="val_")

    with mlflow.start_run(experiment_id=experiment_id) as run:
        XGBoost_model_1 = XGBClassifier(max_depth=3, reg_alpha=0, n_estimators=400)
        XGBoost_model_1.fit(X_train_v1, y_train)
        metrics_t = mlflow.sklearn.eval_and_log_metrics(XGBoost_model_1, X_train_v1, y_train, prefix="train_")
        metrics_v = mlflow.sklearn.eval_and_log_metrics(XGBoost_model_1, X_val_v1, y_val, prefix="val_")

    logging.info('END EXPERIMENT RUN ON XGBOOST')


## Load Colab Models

We had to train the models, especially the SVC, in Colab because of the long training time.

In [16]:
# Restore the previously trained estimators from their pickle files.
# NOTE(review): pickle.load can execute code embedded in the file, so only
# load artifacts that you produced yourself.
svc_pickle_path = '../artifacts/models/heart_svc_model_1.pickle'
with open(svc_pickle_path, 'rb') as model_fh:
    SVC_model_1 = pickle.load(model_fh)

xgb_pickle_path = '../artifacts/models/heart_xgb_model_1.pickle'
with open(xgb_pickle_path, 'rb') as model_fh:
    XGBoost_model_1 = pickle.load(model_fh)


In [20]:
# Evaluate the unpickled SVC on the v1 train and validation sets and attach
# the pickle file to the same MLflow run.
logging.info('START EXPERIMENT RUN ON SVC')
os.environ['MLFLOW_EXPERIMENT_NAME'] = "SVC Total Set"

# Resolve (or create) the experiment BEFORE opening the run. Previously the run
# was started first, so it was bound to whatever experiment happened to be
# active at that moment.
experiment = mlflow.get_experiment_by_name("SVC Total Set")
if experiment is None:
    experiment = mlflow.get_experiment(mlflow.create_experiment("SVC Total Set"))
experiment_id = experiment.experiment_id

# enable autologging
mlflow.sklearn.autolog()

# NOTE(review): the recorded output of this cell shows an MlflowException about
# a run in the 'deleted' state; pinning the run to the experiment is presumably
# part of the remedy, but confirm no stale or deleted run is being reused.
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_artifact('../artifacts/models/heart_svc_model_1.pickle')
    metrics_t = mlflow.sklearn.eval_and_log_metrics(SVC_model_1, X_train_v1, y_train, prefix="train_")
    metrics_v = mlflow.sklearn.eval_and_log_metrics(SVC_model_1, X_val_v1, y_val, prefix="val_")

logging.info('END EXPERIMENT RUN ON SVC')

MlflowException: The run c3cd210528fd4387ad2d0ddfd23e6445 must be in the 'active' state. Current state is deleted.

In [None]:
# Evaluate the unpickled XGBoost model on the v1 train and validation sets and
# attach the pickle file to the same MLflow run.
logging.info('START EXPERIMENT RUN ON XGBOOST')
os.environ['MLFLOW_EXPERIMENT_NAME'] = "XGBOOST Total Set"

# Resolve (or create) the experiment BEFORE opening the run. Previously the
# experiment was only created inside an already-open run, so that run could
# not be guaranteed to belong to it.
experiment = mlflow.get_experiment_by_name("XGBOOST Total Set")
if experiment is None:
    experiment = mlflow.get_experiment(mlflow.create_experiment("XGBOOST Total Set"))
experiment_id = experiment.experiment_id

mlflow.xgboost.autolog()

with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_artifact('../artifacts/models/heart_xgb_model_1.pickle')
    metrics_t = mlflow.sklearn.eval_and_log_metrics(XGBoost_model_1, X_train_v1, y_train, prefix="train_")
    metrics_v = mlflow.sklearn.eval_and_log_metrics(XGBoost_model_1, X_val_v1, y_val, prefix="val_")

logging.info('END EXPERIMENT RUN ON XGBOOST')