In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *

import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

import os
import hydra
from omegaconf import DictConfig
from hydra.experimental import compose, initialize_config_dir

In [2]:
!pip install hydra-core








In [3]:
def process_train(root_directory="C:/Github/MLOPs_WebApp/MLOps_Assignment",
                  config_name="process_Shermaine"):
    """Process the raw data, train/tune a classifier and register it in MLflow.

    Steps, in order:
      1. Compose the Hydra config ``<root_directory>/config/process/<config_name>.yaml``.
      2. Read the raw CSV (``config.data.raw``) and keep rows with ``resting_BP > 0.2``.
      3. Run PyCaret ``setup`` with the options under ``config.setup`` and write the
         transformed dataset to ``config.data.ppath``/medical_processed.csv.
      4. ``compare_models`` -> ``tune_model`` -> ``predict_model`` -> ``evaluate_model``.
      5. Finalize the model, save it as ``medical_pipeline_hydra`` and reload it.
      6. Compute hold-out metrics with scikit-learn and log/register in MLflow.

    Parameters
    ----------
    root_directory : str
        Root directory of the project that contains ``config/process``.
        Defaults to the original hard-coded location.
    config_name : str
        Name of the Hydra config (without extension) to compose.

    Returns
    -------
    None
    """
    config = _load_process_config(root_directory, config_name)
    print(f'All parameters in {config_name}.yaml: ' + str(config))

    # Load raw data
    medical = pd.read_csv(config.data.raw)

    # Keep only rows whose resting BP is above 0.2
    medical_clean = medical[medical['resting_BP'] > 0.2]

    # Perform PyCaret setup; every option comes from the Hydra config
    medical_setup = setup(
        data=medical_clean,
        target=config.setup.target,
        normalize=config.setup.norm,
        normalize_method=config.setup.norm_mtd,
        train_size=config.setup.trainsize,
        remove_outliers=config.setup.rmoutlier,
        bin_numeric_features=config.setup.bin,
        session_id=config.setup.session,
        log_experiment=config.setup.logexp,
        experiment_name=config.setup.expname,
        fold=config.setup.fold,
    )

    # Print features information
    print(f'Numeric features: {medical_setup._fxs["Numeric"]}')
    print(f'Categorical features: {medical_setup._fxs["Categorical"]}')
    print(f'Ordinal features: {medical_setup._fxs["Ordinal"]}')

    # Get transformed dataset
    df = medical_setup.get_config('dataset_transformed')

    # Save processed data; create the target directory first so to_csv cannot
    # fail merely because the folder does not exist yet.
    file_name = 'medical_processed.csv'
    os.makedirs(config.data.ppath, exist_ok=True)
    full_file = os.path.join(config.data.ppath, file_name)
    df.to_csv(full_file, index=False)

    print(f'Data was processed and written at {full_file}.')

    # Compare all available models (nothing is excluded here)
    best = compare_models()
    print(best)

    # Tune the best model; return_tuner=True yields a (model, tuner) tuple
    best_tuned_model, _tuner = tune_model(best, return_tuner=True)
    print(best_tuned_model)

    # Predict on the hold-out set
    best_predict = predict_model(best_tuned_model)
    print(best_predict)

    evaluate_model(best_tuned_model)

    # SAVE PIPELINE
    finalized = finalize_model(best_tuned_model)
    save_model(finalized, 'medical_pipeline_hydra')
    # Reload once to confirm the saved artifact can be read back
    load_model('medical_pipeline_hydra')

    # MODEL REGISTRATION
    # Use the configured target column instead of a hard-coded name so the
    # metrics stay in sync with what setup() was given.
    y_true = best_predict[config.setup.target]
    y_pred = best_predict['prediction_label']
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        # NOTE(review): AUC is computed from hard labels here, not from
        # probabilities, so it will not match the AUC PyCaret reports.
        # Confirm whether that is intended.
        'AUC': roc_auc_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
    }
    _register_with_mlflow(config, best_tuned_model, metrics)


def _load_process_config(root_directory, config_name):
    """Compose and return the Hydra config found under ``<root_directory>/config/process``."""
    # Hydra needs an absolute config directory; abspath also normalizes separators.
    config_dir = os.path.abspath(os.path.join(root_directory, "config", "process"))

    # Used as a context manager so Hydra's global state is restored on exit.
    # A bare call leaves it initialized, and a second call in the same kernel
    # then fails with "GlobalHydra is already initialized".
    with initialize_config_dir(config_dir=config_dir):
        return compose(config_name=config_name)


def _register_with_mlflow(config, model, metrics, registry_name='medical_final'):
    """Log run parameters and metrics to MLflow and register ``model`` under ``registry_name``."""
    # Close any run that is still active before starting a new one.
    mlflow.end_run()

    # The experiment must be selected BEFORE the run starts; selecting it
    # inside an active run does not move that run to the experiment.
    mlflow.set_experiment(registry_name)

    with mlflow.start_run() as run:
        # Log the values actually passed to setup() rather than hard-coded copies
        mlflow.log_param('train_size', config.setup.trainsize)
        # NOTE(review): setup() is never given a `transformation` argument;
        # confirm that logging True here is correct.
        mlflow.log_param('transformation', True)
        mlflow.log_param('normalize', config.setup.norm)
        mlflow.log_param('remove_outliers', config.setup.rmoutlier)
        mlflow.log_param('fold', config.setup.fold)

        # Log metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Log the experiment name and run ID
        mlflow.log_param('experiment_name', registry_name)
        mlflow.log_param('run_id', run.info.run_id)

        # NOTE(review): this registers the tuned estimator, not the finalized
        # object saved to disk. Confirm which one should be registered.
        mlflow.sklearn.log_model(sk_model=model, registered_model_name=registry_name, artifact_path='sk_model')
    


# Run the whole workflow defined above: load the Hydra config, preprocess the
# data, compare/tune models, save the model and register it with MLflow.
process_train()

All parameters in process_Shermaine.yaml: {'data': {'raw': '../../data/raw/02_medical_records.csv', 'ppath': '../../data/processed/'}, 'setup': {'target': 'cv_issue', 'trainsize': 0.8, 'norm': True, 'norm_mtd': 'minmax', 'session': 123, 'logexp': True, 'expname': 'medical', 'rmoutlier': True, 'fold': 10, 'bin': ['age', 'max_HR']}}


Unnamed: 0,Description,Value
0,Session id,123
1,Target,cv_issue
2,Target type,Binary
3,Original data shape,"(917, 12)"
4,Transformed data shape,"(880, 19)"
5,Transformed train set shape,"(696, 19)"
6,Transformed test set shape,"(184, 19)"
7,Ordinal features,2
8,Numeric features,6
9,Categorical features,5


Numeric features: ['age', 'resting_BP', 'cholesterol', 'fasting_BS', 'max_HR', 'old_peak']
Categorical features: ['gender', 'chest_pain', 'resting_ECG', 'exercise_angina', 'ST_slope']
Ordinal features: {'gender': ['F', 'M'], 'exercise_angina': ['N', 'Y']}
Data was processed and written at ../../data/processed/medical_processed.csv.


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8745,0.9185,0.899,0.8811,0.8882,0.7446,0.7484,1.588
rf,Random Forest Classifier,0.8731,0.9284,0.9088,0.8709,0.8883,0.7413,0.7447,3.479
lightgbm,Light Gradient Boosting Machine,0.8705,0.9285,0.904,0.8701,0.8851,0.7365,0.741,1.44
lda,Linear Discriminant Analysis,0.8664,0.922,0.9065,0.8608,0.882,0.7279,0.7319,1.435
ridge,Ridge Classifier,0.865,0.0,0.904,0.8605,0.8806,0.7252,0.7292,2.211
gbc,Gradient Boosting Classifier,0.865,0.9294,0.9015,0.8641,0.881,0.7248,0.7292,1.953
lr,Logistic Regression,0.8595,0.9233,0.9013,0.8552,0.8766,0.7135,0.7173,5.626
knn,K Neighbors Classifier,0.8582,0.8962,0.8892,0.8608,0.8737,0.7118,0.7149,3.685
svm,SVM - Linear Kernel,0.8487,0.0,0.8673,0.8631,0.8621,0.6942,0.7001,1.75
nb,Naive Bayes,0.8486,0.8996,0.8717,0.8599,0.8646,0.6925,0.6949,3.232


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=-1, oob_score=False,
                     random_state=123, verbose=0, warm_start=False)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8784,0.9416,0.9512,0.8478,0.8966,0.7502,0.7575
1,0.8243,0.8581,0.8293,0.85,0.8395,0.6455,0.6458
2,0.8514,0.9261,0.8537,0.875,0.8642,0.7001,0.7003
3,0.8767,0.9512,0.9268,0.8636,0.8941,0.747,0.7497
4,0.7945,0.846,0.9024,0.7708,0.8315,0.5724,0.5842
5,0.8904,0.9371,0.875,0.9211,0.8974,0.78,0.7811
6,0.8767,0.9477,0.9,0.878,0.8889,0.7505,0.7508
7,0.863,0.9379,0.95,0.8261,0.8837,0.719,0.7295
8,0.9315,0.9629,0.95,0.9268,0.9383,0.8614,0.8617
9,0.9178,0.9311,0.95,0.9048,0.9268,0.8332,0.8345


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=-1, oob_score=False,
                     random_state=123, verbose=0, warm_start=False)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.8696,0.9183,0.8529,0.9062,0.8788,0.7379,0.7395


     age gender chest_pain  resting_BP  cholesterol  fasting_BS resting_ECG  \
862   65      F        NAP         155          269           0      Normal   
161   49      M        ASY         128          212           0      Normal   
872   55      M        ASY         132          353           0      Normal   
47    50      M        ATA         140          216           0      Normal   
300   60      M        ASY         160            0           1      Normal   
..   ...    ...        ...         ...          ...         ...         ...   
333   40      M        ASY          95            0           1          ST   
629   57      F        ASY         128          303           0         LVH   
810   55      F        ATA         135          250           0         LVH   
306   55      M        ASY         115            0           1      Normal   
57    58      M        NAP         130          213           0          ST   

     max_HR exercise_angina  old_peak ST_slope  cv_

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


Registered model 'medical_final' already exists. Creating a new version of this model...
2023/08/21 18:56:38 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: medical_final, version 2
Created version '2' of model 'medical_final'.
