# Modeling

In this notebook, we will build and train machine learning models using the processed data. We will explore various algorithms, tune hyperparameters, and evaluate model performance.

In [1]:
import sys
print(sys.executable)

c:\Users\siddu\healthcare-disease-prediction\healthcare-ml-env\Scripts\python.exe


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Load the processed dataset
data = pd.read_csv('../data/processed/processed_data.csv')

# Split the data into features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(probability=True, random_state=42)
}

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    # Save model
    joblib.dump(model, f'../models/{name}_model.pkl')
    print(f"{name} model saved to ../models/{name}_model.pkl")
    # Make predictions
    y_pred = model.predict(X_test)
    # Evaluate the model
    print(f"Results for {name}:")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))
    print("-" * 50)

ensemble_models = {
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42)
}

for name, model in ensemble_models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f'../models/{name}_model.pkl')
    print(f"{name} model saved to ../models/{name}_model.pkl")

RandomForest model saved to ../models/RandomForest_model.pkl
Results for RandomForest:
Confusion Matrix:
[[71 10]
 [18 24]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84        81
           1       0.71      0.57      0.63        42

    accuracy                           0.77       123
   macro avg       0.75      0.72      0.73       123
weighted avg       0.77      0.77      0.77       123

Accuracy Score: 0.7723577235772358
--------------------------------------------------
DecisionTree model saved to ../models/DecisionTree_model.pkl
Results for DecisionTree:
Confusion Matrix:
[[62 19]
 [18 24]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.77      0.77        81
           1       0.56      0.57      0.56        42

    accuracy                           0.70       123
   macro avg       0.67      0.67      0.67       123
weighted avg       0.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost model saved to ../models/XGBoost_model.pkl
[LightGBM] [Info] Number of positive: 171, number of negative: 320
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 578
[LightGBM] [Info] Number of data points in the train set: 491, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348269 -> initscore=-0.626657
[LightGBM] [Info] Start training from score -0.626657
LightGBM model saved to ../models/LightGBM_model.pkl
CatBoost model saved to ../models/CatBoost_model.pkl
