In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *

import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

import os
import hydra
from omegaconf import DictConfig
from hydra.experimental import compose, initialize_config_dir

In [2]:
def process_train(root_directory="C:/Github/MLOPs_WebApp/MLOps_Assignment"):
    """Run the end-to-end training workflow for the medical records dataset.

    Steps performed, in order:
      1. Load the Hydra config ``process_Shermaine`` from
         ``<root_directory>/config/process``.
      2. Read the raw CSV named by ``config.data.raw`` and keep only rows
         whose ``resting_BP`` is greater than 0.2.
      3. Run PyCaret ``setup`` with the options under ``config.setup`` and
         write the transformed dataset to
         ``<config.data.ppath>/medical_processed.csv``.
      4. Compare models, tune the best one, predict on the hold-out set and
         open PyCaret's ``evaluate_model`` view.
      5. Finalize the tuned model, save it as ``medical_pipeline_hydra`` and
         load it back.
      6. Log parameters and hold-out metrics to the MLflow experiment
         ``medical_final`` and register the pipeline as ``medical_final``.

    Args:
        root_directory: Root folder of the MLOps project; it must contain
            ``config/process``. Defaults to the original hard-coded location.

    Returns:
        None. Results are produced as side effects (printed output, files on
        disk, MLflow run and registered model).
    """
    # Hydra requires an absolute config directory; abspath also normalises it.
    config_dir = os.path.abspath(os.path.join(root_directory, "config", "process"))

    # Used as a context manager so Hydra's global state is restored on exit.
    # Calling it as a plain function leaves GlobalHydra initialised, which
    # makes a second execution of this cell fail.
    with initialize_config_dir(config_dir=config_dir):
        config = compose(config_name="process_Shermaine")
    print('All parameters in process_Shermaine.yaml: ' + str(config))

    setup_cfg = config.setup
    target_column = setup_cfg.target

    # Load raw data
    medical = pd.read_csv(config.data.raw)

    # Keep only rows with resting BP above 0.2
    medical_clean = medical[medical['resting_BP'] > 0.2]

    # Perform PyCaret setup, driven entirely by the Hydra config
    medical_setup = setup(
        data=medical_clean,
        target=target_column,
        normalize=setup_cfg.norm,
        normalize_method=setup_cfg.norm_mtd,
        train_size=setup_cfg.trainsize,
        remove_outliers=setup_cfg.rmoutlier,
        bin_numeric_features=setup_cfg.bin,
        session_id=setup_cfg.session,
        log_experiment=setup_cfg.logexp,
        experiment_name=setup_cfg.expname,
        fold=setup_cfg.fold,
    )

    # Print features information
    print(f'Numeric features: {medical_setup._fxs["Numeric"]}')
    print(f'Categorical features: {medical_setup._fxs["Categorical"]}')
    print(f'Ordinal features: {medical_setup._fxs["Ordinal"]}')

    # Get transformed dataset and save it
    df = medical_setup.get_config('dataset_transformed')
    file_name = 'medical_processed.csv'
    full_file = os.path.join(config.data.ppath, file_name)
    df.to_csv(full_file, index=False)

    print(f'Data was processed and written at {full_file}.')

    # Compare all candidate models (no models are excluded here)
    best = compare_models()
    print(best)

    # Tune the winner; with return_tuner=True the result is (model, tuner)
    tuned_best = tune_model(best, return_tuner=True)
    best_tuned_model = tuned_best[0]
    print(best_tuned_model)

    # Predict on the hold-out set
    best_predict = predict_model(best_tuned_model)
    print(best_predict)

    evaluate_model(best_tuned_model)

    # SAVE PIPELINE
    # Finalize the model, save the pipeline to disk, then load it back
    finalized = finalize_model(best_tuned_model)
    save_model(finalized, 'medical_pipeline_hydra')
    loaded_best_pipeline = load_model('medical_pipeline_hydra')

    # MODEL REGISTRATION
    # Hold-out metrics; the target column name comes from the config so it
    # stays in sync with what setup() was given.
    y_true = best_predict[target_column]
    y_pred = best_predict['prediction_label']
    accuracy = accuracy_score(y_true, y_pred)
    # NOTE(review): AUC is computed from hard labels, not class probabilities,
    # so it reflects a single operating point. Consider
    # predict_model(..., raw_score=True) once the class labels are confirmed.
    auc = roc_auc_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Close any run that is still active, then select the experiment BEFORE
    # starting the run. set_experiment() does not move an already-active run,
    # so calling it inside the run would file the run elsewhere.
    mlflow.end_run()
    mlflow.set_experiment('medical_final')
    with mlflow.start_run() as run:
        # Log the values actually used by setup() rather than literals
        mlflow.log_param('train_size', setup_cfg.trainsize)
        # setup() above is not given `transformation`, so PyCaret's default
        # (False) applies.
        mlflow.log_param('transformation', False)
        mlflow.log_param('normalize', setup_cfg.norm)
        mlflow.log_param('remove_outliers', setup_cfg.rmoutlier)
        mlflow.log_param('fold', setup_cfg.fold)

        # Log metrics
        mlflow.log_metric('Accuracy', accuracy)
        mlflow.log_metric('AUC', auc)
        mlflow.log_metric('Precision', precision)
        mlflow.log_metric('Recall', recall)
        mlflow.log_metric('F1-Score', f1)

        # Log the experiment name and run ID
        mlflow.log_param('experiment_name', 'medical_final')
        mlflow.log_param('run_id', run.info.run_id)

        # Register the full finalized pipeline (preprocessing + estimator),
        # not the bare tuned estimator, so the registered model accepts the
        # same raw columns the pipeline was trained on.
        mlflow.sklearn.log_model(sk_model=loaded_best_pipeline, registered_model_name='medical_final', artifact_path='sk_model')
    


# Execute the whole workflow (config load, preprocessing, model selection,
# saving and MLflow registration) when this cell runs.
process_train()

All parameters in process_Shermaine.yaml: {'data': {'raw': '../../data/raw/02_medical_records.csv', 'ppath': '../../data/processed/'}, 'setup': {'target': 'cv_issue', 'trainsize': 0.8, 'norm': True, 'norm_mtd': 'minmax', 'session': 123, 'logexp': True, 'expname': 'medical', 'rmoutlier': True, 'fold': 10, 'bin': ['age', 'max_HR']}}


Unnamed: 0,Description,Value
0,Session id,123
1,Target,cv_issue
2,Target type,Binary
3,Original data shape,"(917, 12)"
4,Transformed data shape,"(880, 19)"
5,Transformed train set shape,"(696, 19)"
6,Transformed test set shape,"(184, 19)"
7,Ordinal features,2
8,Numeric features,6
9,Categorical features,5


Numeric features: ['age', 'resting_BP', 'cholesterol', 'fasting_BS', 'max_HR', 'old_peak']
Categorical features: ['gender', 'chest_pain', 'resting_ECG', 'exercise_angina', 'ST_slope']
Ordinal features: {'gender': ['F', 'M'], 'exercise_angina': ['N', 'Y']}
Data was processed and written at ../../data/processed/medical_processed.csv.


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html