In [24]:
import pandas as pd
import mlflow
import mlflow.sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,train_test_split

In [10]:
# Loading the data
# Forward slashes keep the path portable across operating systems and avoid
# the invalid "\A" escape sequence that a backslash-separated literal contains.
# NOTE(review): the preview lists an "Unnamed: 0" column; if that is a saved
# row index rather than a real feature, consider index_col=0 so it is not fed
# to the models. Confirm against the CSV.
basic_data = pd.read_csv("Datasets/Analyzed/Analyzed_Health_Condition_Data_Base.csv")
basic_data.head()

Unnamed: 0,heart_rate,oxygen_saturation,healthcare_target
0,62.923262,97.661459,1
1,84.893731,95.516325,0
2,99.247113,95.862619,1
3,67.604306,97.839871,0
4,91.703805,96.481788,1


In [11]:
# Load the variant of the dataset that carries the extra engineered columns.
# Forward slashes keep the path portable across operating systems and avoid
# the invalid "\A" escape sequence that a backslash-separated literal contains.
enhanced_data = pd.read_csv("Datasets/Analyzed/Analyzed_Health_Condition_Data_added_features.csv")
enhanced_data.head()

Unnamed: 0,heart_rate,oxygen_saturation,healthcare_target,hr_o2_interaction,hr_o2_ratio
0,62.923262,97.661459,1,6145.177643,0.6443
1,84.893731,95.516325,0,8108.737165,0.888788
2,99.247113,95.862619,1,9514.088264,1.035306
3,67.604306,97.839871,0,6614.39661,0.690969
4,91.703805,96.481788,1,8847.747045,0.950478


### Running baseline LR with basic Data

In [12]:
# Separate the predictors from the label column
features = basic_data.drop(columns='healthcare_target')
X = features
y = basic_data['healthcare_target']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Select the shared experiment and switch on sklearn autologging
mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()

# Everything below is recorded under one named MLflow run
with mlflow.start_run(run_name="Baseline LR - without feature eng"):

    # Logistic regression with library defaults
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Write the text report to disk and attach it to the run
    with open("classification_report.txt", "w") as report_file:
        report_file.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Store the fitted model under its own artifact path
    mlflow.sklearn.log_model(model, "baseline_lr_without_feature_eng")


Accuracy: 0.6950
              precision    recall  f1-score   support

           0       0.72      0.72      0.72       110
           1       0.66      0.67      0.66        90

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.69       200
weighted avg       0.70      0.69      0.70       200





### Running baseline LR with added features Data

In [13]:
# Separate the predictors (including the added columns) from the label
features = enhanced_data.drop(columns='healthcare_target')
X = features
y = enhanced_data['healthcare_target']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Select the shared experiment and switch on sklearn autologging
mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()

# Everything below is recorded under one named MLflow run
with mlflow.start_run(run_name="Baseline LR - Enhanced Data - without feature eng"):

    # Logistic regression with library defaults
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Write the text report to disk and attach it to the run
    with open("classification_report.txt", "w") as report_file:
        report_file.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Store the fitted model under its own artifact path
    mlflow.sklearn.log_model(model, "basic_lr_enhanced_data_without_feature_eng")


Accuracy: 0.7000
              precision    recall  f1-score   support

           0       0.73      0.72      0.72       110
           1       0.66      0.68      0.67        90

    accuracy                           0.70       200
   macro avg       0.70      0.70      0.70       200
weighted avg       0.70      0.70      0.70       200





Since the enhanced data yields only a small improvement (accuracy 0.7000 vs. 0.6950), we decided to continue with the basic 3 features for the rest of the modelling.

### Feature Engineering of the dataset's given medical features

In [14]:
# Min-max scaling the data
# Fit the scaler on the training rows only, so the min/max of the held-out
# rows do not leak into preprocessing. The modelling cells split with
# test_size=0.2 and random_state=42; splitting the row positions with the same
# arguments picks the same training rows (there is no stratification, so the
# shuffle depends only on the row count and the seed).
scaled_columns = ['heart_rate', 'oxygen_saturation']
train_positions = train_test_split(
    np.arange(len(basic_data)), test_size=0.2, random_state=42
)[0]

scaler = MinMaxScaler()
scaler.fit(basic_data.iloc[train_positions][scaled_columns])

# Transform every row with the train-fitted scaler; held-out rows may land
# slightly outside [0, 1], which is expected.
basic_data[scaled_columns] = scaler.transform(basic_data[scaled_columns])

In [15]:
# Inspect the frame after heart_rate and oxygen_saturation were rescaled in place
basic_data

Unnamed: 0,heart_rate,oxygen_saturation,healthcare_target
0,0.073103,0.533478,1
1,0.622614,0.103470,0
2,0.981612,0.172888,1
3,0.190182,0.569242,0
4,0.792943,0.297004,1
...,...,...,...
995,0.067253,0.955552,1
996,0.595106,0.060334,0
997,0.739191,0.298300,0
998,0.032447,0.298717,0


### Checking the LR with scaled data and tuned hyperparameters

In [16]:
# Separate the predictors from the label column
features = basic_data.drop(columns='healthcare_target')
X = features
y = basic_data['healthcare_target']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate settings explored by the exhaustive search
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    max_iter=[100, 500, 1000],
)

# 5-fold cross-validated grid search, ranked by accuracy
log_reg = LogisticRegression()
grid_search = GridSearchCV(
    log_reg,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning combination
best_parameters = grid_search.best_params_
print("Best Parameters:", best_parameters)

# Select the shared experiment and switch on sklearn autologging
mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()

# Everything below is recorded under one named MLflow run
with mlflow.start_run(run_name="Parameter Tuned LR - Feature Engineered "):

    # Refit a fresh estimator using the selected hyperparameters
    model = LogisticRegression(**best_parameters)
    model.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Write the text report to disk and attach it to the run
    with open("classification_report.txt", "w") as report_file:
        report_file.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Store the fitted model under its own artifact path
    mlflow.sklearn.log_model(model, "param_tuned_lr_with_feature_eng")


2025/04/01 20:55:32 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7c750356c90a474a94ef5842b56dc073', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


2025/04/01 20:55:45 INFO mlflow.sklearn.utils: Logging the 5 best runs, 67 runs will be omitted.


Best Parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.7000
              precision    recall  f1-score   support

           0       0.73      0.72      0.72       110
           1       0.66      0.68      0.67        90

    accuracy                           0.70       200
   macro avg       0.70      0.70      0.70       200
weighted avg       0.70      0.70      0.70       200





As tuning brings no improvement for the LR, let's try other ML models on this data to see whether they perform better.

### Base - Decision Tree Performance Testing

In [17]:
# Separate the predictors from the label column
features = basic_data.drop(columns='healthcare_target')
X = features
y = basic_data['healthcare_target']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Select the shared experiment and switch on sklearn autologging
mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()

# Everything below is recorded under one named MLflow run
with mlflow.start_run(run_name="Baseline Decision Tree - Not Tuned"):

    # Decision tree with default settings and a fixed seed
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Write the text report to disk and attach it to the run
    with open("classification_report.txt", "w") as report_file:
        report_file.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Store the fitted model under its own artifact path
    mlflow.sklearn.log_model(model, "decision_trees_not_tuned")


Accuracy: 0.6800
              precision    recall  f1-score   support

           0       0.73      0.65      0.69       110
           1       0.63      0.71      0.67        90

    accuracy                           0.68       200
   macro avg       0.68      0.68      0.68       200
weighted avg       0.69      0.68      0.68       200





### Parameter Tuned - Decision Tree Performance Testing

In [18]:
# Separate the predictors from the label column
features = basic_data.drop(columns='healthcare_target')
X = features
y = basic_data['healthcare_target']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate settings explored by the exhaustive search
param_grid = dict(
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 10],
    max_features=[None, 'sqrt', 'log2'],
    criterion=['gini', 'entropy'],
)

# 5-fold cross-validated grid search, ranked by accuracy
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning combination
best_parameters = grid_search.best_params_
print(f"Best Hyperparameters: {best_parameters}")

# Select the shared experiment and switch on sklearn autologging
mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()

# Everything below is recorded under one named MLflow run
with mlflow.start_run(run_name="Decision Tree - Tuned"):

    # Refit a fresh, seeded tree using the selected hyperparameters
    model = DecisionTreeClassifier(random_state=42, **best_parameters)
    model.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Write the text report to disk and attach it to the run
    with open("classification_report.txt", "w") as report_file:
        report_file.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Store the fitted model under its own artifact path
    mlflow.sklearn.log_model(model, "decision_trees_tuned")


2025/04/01 20:56:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b7305f244c7e40d397ffbda30c76d502', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2025/04/01 20:56:13 INFO mlflow.sklearn.utils: Logging the 5 best runs, 571 runs will be omitted.


Best Hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
Accuracy: 0.7450
              precision    recall  f1-score   support

           0       0.74      0.82      0.78       110
           1       0.75      0.66      0.70        90

    accuracy                           0.74       200
   macro avg       0.75      0.74      0.74       200
weighted avg       0.75      0.74      0.74       200





### Base - Random Forest Performance Testing

In [19]:
# Separate the predictors from the label column
features = basic_data.drop(columns='healthcare_target')
X = features
y = basic_data['healthcare_target']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Select the shared experiment and switch on sklearn autologging
mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()

# Everything below is recorded under one named MLflow run
with mlflow.start_run(run_name="Baseline Random Forest - Not Tuned"):

    # Random forest with default settings and a fixed seed
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Write the text report to disk and attach it to the run
    with open("classification_report.txt", "w") as report_file:
        report_file.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Store the fitted model under its own artifact path
    mlflow.sklearn.log_model(model, "random_forest_not_tuned")


Accuracy: 0.6700
              precision    recall  f1-score   support

           0       0.71      0.68      0.69       110
           1       0.63      0.66      0.64        90

    accuracy                           0.67       200
   macro avg       0.67      0.67      0.67       200
weighted avg       0.67      0.67      0.67       200





### Parameter Tuned - Random Forest Performance Testing

In [20]:
# Features & Target
X = features = basic_data.drop('healthcare_target', axis=1)
y = basic_data['healthcare_target']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()  # For auto log metrics

# Distributions sampled by the randomized search
param_grid = {
    "n_estimators": [50, 100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

rf = RandomForestClassifier(random_state=42)

# Randomized search: 20 sampled candidates, 5-fold CV each
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# ML Flow for model tracking
with mlflow.start_run(run_name="Random Forest - Tuned"):

    # Fit model with hyperparameter search
    rf_search.fit(X_train, y_train)

    # Best candidate found by the search
    best_model = rf_search.best_estimator_

    # Predictions
    y_pred = best_model.predict(X_test)

    # Evaluate Model
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Model artifacts
    with open("classification_report.txt", "w") as f:
        f.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Save Model in MLflow.
    # Log the tuned estimator that produced the metrics above; the previous
    # code logged `model`, a leftover variable from an earlier cell.
    mlflow.sklearn.log_model(best_model, "random_forest_tuned")


Fitting 5 folds for each of 20 candidates, totalling 100 fits


2025/04/01 20:56:41 INFO mlflow.sklearn.utils: Logging the 5 best runs, 15 runs will be omitted.


Accuracy: 0.7300
              precision    recall  f1-score   support

           0       0.73      0.82      0.77       110
           1       0.74      0.62      0.67        90

    accuracy                           0.73       200
   macro avg       0.73      0.72      0.72       200
weighted avg       0.73      0.73      0.73       200





### Base - Support Vector Machine - Performance Testing

In [21]:
# Separate the predictors from the label column
features = basic_data.drop(columns='healthcare_target')
X = features
y = basic_data['healthcare_target']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Select the shared experiment and switch on sklearn autologging
mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()

# Everything below is recorded under one named MLflow run
with mlflow.start_run(run_name="Baseline SVM - Not Tuned"):

    # RBF-kernel SVM with probability estimates enabled
    model = SVC(
        C=1.0,
        kernel='rbf',
        gamma='scale',
        probability=True,
    )
    model.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Write the text report to disk and attach it to the run
    with open("classification_report.txt", "w") as report_file:
        report_file.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Store the fitted model under its own artifact path
    mlflow.sklearn.log_model(model, "SVM_not_tuned")


Accuracy: 0.8000
              precision    recall  f1-score   support

           0       0.75      0.96      0.84       110
           1       0.93      0.60      0.73        90

    accuracy                           0.80       200
   macro avg       0.84      0.78      0.79       200
weighted avg       0.83      0.80      0.79       200





In [22]:
# Features & Target
# Take both X and y from the same frame; the label was previously read from
# enhanced_data while the features came from basic_data.
X = basic_data.drop('healthcare_target', axis=1)
y = basic_data['healthcare_target']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.01, 0.1],
    'kernel': ['rbf', 'sigmoid']
}

# Perform Grid Search (5-fold CV, ranked by accuracy)
grid_search = GridSearchCV(SVC(probability=True), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_parameters = grid_search.best_params_

mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()  # Enable auto logging

# ML Flow for model tracking
with mlflow.start_run(run_name="SVM - Tuned"):

    # Refit a fresh SVM with the selected hyperparameters.
    # NOTE(review): unlike the baseline SVM and the search estimator, this
    # model is built without probability=True; confirm whether the logged
    # model is expected to expose predict_proba.
    model = SVC(**best_parameters)
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluate Model
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Save classification report
    with open("classification_report.txt", "w") as f:
        f.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Save Model in MLflow
    mlflow.sklearn.log_model(model, "SVM_tuned")


2025/04/01 20:56:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6f00e43d82ea4a54a2126cd033ad585b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2025/04/01 20:57:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, 19 runs will be omitted.


Accuracy: 0.8050
              precision    recall  f1-score   support

           0       0.74      0.99      0.85       110
           1       0.98      0.58      0.73        90

    accuracy                           0.81       200
   macro avg       0.86      0.78      0.79       200
weighted avg       0.85      0.81      0.79       200





### XG-Boost Performance Testing

In [25]:
# Features & Target
X = features = basic_data.drop('healthcare_target', axis=1)
y = basic_data['healthcare_target']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mlflow.set_experiment("Healthcare Condition Prediction")
mlflow.sklearn.autolog()  # Auto log metrics

# MLflow for model tracking
with mlflow.start_run(run_name="Baseline XGBoost - Not Tuned"):

    # Define XGBoost Model.
    # `use_label_encoder` was dropped: the installed XGBoost reports it as an
    # unused parameter, so passing it only produces a warning.
    model = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6
    )

    # Train Model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluate Model
    accuracy = accuracy_score(y_test, y_pred)
    report_text = classification_report(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(report_text)

    # Model artifacts
    with open("classification_report.txt", "w") as f:
        f.write(report_text)
    mlflow.log_artifact("classification_report.txt")

    # Log one importance metric per input column; cast to a plain float so
    # the logged value does not depend on the NumPy scalar type.
    importance = model.feature_importances_
    for col, score in zip(X.columns, importance):
        mlflow.log_metric(f"Feature_{col}_importance", float(score))

    # Save Model in MLflow
    mlflow.sklearn.log_model(model, "XGBoost_not_tuned")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6800
              precision    recall  f1-score   support

           0       0.71      0.72      0.71       110
           1       0.65      0.63      0.64        90

    accuracy                           0.68       200
   macro avg       0.68      0.68      0.68       200
weighted avg       0.68      0.68      0.68       200





##### Based on these results, and after comparing the parameters and scores in MLflow, we selected Random Forest classification and SVM as the best ML models for the given scenario. Therefore, they will be chosen for further enhancements.