In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import xgboost as xgb
from colorama import init, Fore, Style
import files  # Import the files module for dataset paths

In [22]:
# Initialize colorama for colored outputs
init(autoreset=True)

# Set random seed for reproducibility
np.random.seed(42)

# Print a welcome message with project details
print(Fore.CYAN + Style.BRIGHT + "===========================================")
print(Fore.CYAN + Style.BRIGHT + " PPG Blood Glucose - JB Implementation")
print(Fore.CYAN + Style.BRIGHT + " Training Logistic Regression and XGBoost")
print(Fore.CYAN + Style.BRIGHT + "===========================================\n")

# Step 1: Load and merge the datasets
print(Fore.MAGENTA + "Step 1: Loading and merging datasets...")
try:
    # Load datasets using paths from files module
    ppg_data = pd.read_csv(files.PPG_MY_OWN)
    metadata = pd.read_csv(files.METADATA_PATH)
    print(Fore.GREEN + "Datasets loaded successfully!")
except FileNotFoundError as e:
    print(Fore.RED + f"Error: {e}")
    print(Fore.RED + "Please ensure the dataset files exist in the 'datasets/' directory.")
    raise Exception("File loading failed")

# Inspect column names for debugging
print(Fore.YELLOW + "PPG_MY_OWN columns: " + str(ppg_data.columns.tolist()))
print(Fore.YELLOW + "METADATA_PATH columns: " + str(metadata.columns.tolist()))

# Define the merge column (update this based on your dataset)
merge_column = 'subject_ID'  # Adjust this based on the actual column name

# Check if the merge column exists
if merge_column not in ppg_data.columns or merge_column not in metadata.columns:
    print(Fore.RED + f"Error: '{merge_column}' column not found in one or both datasets.")
    print(Fore.RED + "Please check the column names above and update the 'merge_column' variable.")
    raise Exception("Merge column not found")

# Merge datasets
try:
    data = pd.merge(ppg_data, metadata, on=merge_column, how='inner')
    print(Fore.GREEN + f"Merged dataset shape: {data.shape}")
except KeyError as e:
    print(Fore.RED + f"Error: {e}")
    print(Fore.RED + f"Ensure both datasets have a '{merge_column}' column for merging.")
    raise Exception("Merge failed")

# Step 2: Preprocess the data
print(Fore.MAGENTA + "\nStep 2: Preprocessing the data...")
# Define the target column (update this based on your dataset)
target_column = 'diabetes_label'  # Adjust this if the column name is different

# Check if the target column exists
if target_column not in data.columns:
    print(Fore.RED + f"Error: '{target_column}' column not found in the merged dataset.")
    print(Fore.RED + "Please check the column names above and update the 'target_column' variable.")
    raise Exception("Target column not found")

# Define the features to use
required_features = ['mean_ibi', 'sdnn', 'rmssd', 'length_to_max_ratio', 'spectral_entropy']

# Check if all required features exist in the dataset
missing_features = [feat for feat in required_features if feat not in data.columns]
if missing_features:
    print(Fore.RED + f"Error: The following required features are missing: {missing_features}")
    print(Fore.RED + "Please check the column names above and ensure these features are present.")
    raise Exception("Required features missing")

# Separate features and target, using only the specified features
try:
    X = data[required_features]  # Select only the specified features
    y = data[target_column]
except KeyError as e:
    print(Fore.RED + f"Error: {e}")
    print(Fore.RED + f"Ensure the '{target_column}' column exists in the dataset.")
    raise Exception("Column selection failed")

# Encode the target labels ('Yes'/'No' to 1/0)
print(Fore.YELLOW + "Encoding target labels...")
try:
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    print(Fore.GREEN + "Target labels encoded: 'No' -> 0, 'Yes' -> 1")
except ValueError as e:
    print(Fore.RED + f"Error during label encoding: {e}")
    print(Fore.RED + "Ensure the target column contains valid categorical values (e.g., 'Yes'/'No').")
    raise Exception("Label encoding failed")

# Handle missing values by filling with the mean
print(Fore.YELLOW + "Handling missing values...")
X = X.fillna(X.mean())
if X.isnull().sum().sum() == 0:
    print(Fore.GREEN + "No missing values remain after preprocessing.")
else:
    print(Fore.RED + "Warning: Some missing values still exist after preprocessing!")

# Scale the features
print(Fore.YELLOW + "Scaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(Fore.GREEN + "Features scaled successfully!")

# Step 3: Set up stratified k-fold cross-validation
n_folds = 5
print(Fore.MAGENTA + f"\nStep 3: Setting up {n_folds}-fold stratified cross-validation...")
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize models
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Lists to store results
logistic_cv_scores = []
xgb_cv_scores = []
logistic_roc_scores = []
xgb_roc_scores = []
logistic_conf_matrices = []
xgb_conf_matrices = []

# Step 4: Train and evaluate models using cross-validation
print(Fore.MAGENTA + "\nStep 4: Training and evaluating models...")
fold = 1
for train_idx, test_idx in skf.split(X_scaled, y):
    print(Fore.CYAN + f"Processing fold {fold}/{n_folds}...", end="")
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Logistic Regression
    logistic_model.fit(X_train, y_train)
    logistic_pred = logistic_model.predict(X_test)
    logistic_prob = logistic_model.predict_proba(X_test)[:, 1]
    logistic_cv_scores.append(logistic_model.score(X_test, y_test))
    logistic_roc_scores.append(roc_auc_score(y_test, logistic_prob))
    logistic_conf_matrices.append(confusion_matrix(y_test, logistic_pred))

    # XGBoost
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict(X_test)
    xgb_prob = xgb_model.predict_proba(X_test)[:, 1]
    xgb_cv_scores.append(xgb_model.score(X_test, y_test))
    xgb_roc_scores.append(roc_auc_score(y_test, xgb_prob))
    xgb_conf_matrices.append(confusion_matrix(y_test, xgb_pred))

    print(Fore.GREEN + " Done!")
    fold += 1

# Step 5: Visualize results using matplotlib
print(Fore.MAGENTA + "\nStep 5: Visualizing results...")

# Plot 1: Bar chart for CV Accuracy and ROC AUC
plt.figure(figsize=(8, 6))
models = ['Logistic Regression', 'XGBoost']
cv_accuracies = [np.mean(logistic_cv_scores), np.mean(xgb_cv_scores)]
roc_aucs = [np.mean(logistic_roc_scores), np.mean(xgb_roc_scores)]

x = np.arange(len(models))
width = 0.35

plt.bar(x - width/2, cv_accuracies, width, label='CV Accuracy', color='skyblue')
plt.bar(x + width/2, roc_aucs, width, label='ROC AUC', color='lightcoral')
plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('Model Performance Comparison')
plt.xticks(x, models)
plt.legend()
plt.tight_layout()
plt.savefig('model_performance_comparison.png')
plt.close()

# Plot 2: Confusion Matrix Heatmap for Logistic Regression
plt.figure(figsize=(6, 5))
sns.heatmap(logistic_conf_matrices[-1], annot=True, fmt='d', cmap='Blues', 
            xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix - Logistic Regression (Last Fold)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.savefig('confusion_matrix_logistic.png')
plt.close()

# Plot 3: Confusion Matrix Heatmap for XGBoost
plt.figure(figsize=(6, 5))
sns.heatmap(xgb_conf_matrices[-1], annot=True, fmt='d', cmap='Greens', 
            xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix - XGBoost (Last Fold)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.savefig('confusion_matrix_xgboost.png')
plt.close()

# Step 6: Feature Importance for XGBoost
print(Fore.MAGENTA + "\nStep 6: Analyzing feature importance for XGBoost...")
feature_names = X.columns
feature_importance = xgb_model.feature_importances_
sorted_idx = np.argsort(feature_importance)[::-1]
top_n = min(5, len(feature_names))  # Already limited to 5 features

# Plot feature importance
plt.figure(figsize=(8, 6))
plt.barh(range(top_n), feature_importance[sorted_idx[:top_n]], align='center', color='lightgreen')
plt.yticks(range(top_n), [feature_names[i] for i in sorted_idx[:top_n]])
plt.xlabel('Feature Importance')
plt.title('Feature Importance - XGBoost')
plt.tight_layout()
plt.savefig('feature_importance_xgboost.png')
plt.close()

# Print a message about where to find the plots
print(Fore.CYAN + Style.BRIGHT + "\n===========================================")
print(Fore.GREEN + Style.BRIGHT + "Training and evaluation completed successfully!")
print(Fore.GREEN + "Plots saved in the 'outputs/' directory:")
print(Fore.GREEN + "- Model Performance: model_performance_comparison.png")
print(Fore.GREEN + "- Confusion Matrix (Logistic): confusion_matrix_logistic.png")
print(Fore.GREEN + "- Confusion Matrix (XGBoost): confusion_matrix_xgboost.png")
print(Fore.GREEN + "- Feature Importance (XGBoost): feature_importance_xgboost.png")
print(Fore.CYAN + Style.BRIGHT + "===========================================")

 PPG Blood Glucose - JB Implementation


 Training Logistic Regression and XGBoost

Step 1: Loading and merging datasets...
Datasets loaded successfully!
PPG_MY_OWN columns: ['subject_ID', 'num_segments', 'mean_ibi', 'sdnn', 'rmssd', 'length_to_max_ratio', 'spectral_entropy']
METADATA_PATH columns: ['subject_ID', 'Sex', 'Age', 'BMI', 'Heart Rate', 'diabetes_label']
Merged dataset shape: (219, 12)

Step 2: Preprocessing the data...
Encoding target labels...
Target labels encoded: 'No' -> 0, 'Yes' -> 1
Handling missing values...
No missing values remain after preprocessing.
Scaling features...
Features scaled successfully!

Step 3: Setting up 5-fold stratified cross-validation...

Step 4: Training and evaluating models...
Processing fold 1/5...

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Done!
Processing fold 2/5...

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Done!
Processing fold 3/5...

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Done!
Processing fold 4/5...

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Done!
Processing fold 5/5...

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Done!

Step 5: Visualizing results...

Step 6: Analyzing feature importance for XGBoost...

Training and evaluation completed successfully!
Plots saved in the 'outputs/' directory:
- Model Performance: model_performance_comparison.png
- Confusion Matrix (Logistic): confusion_matrix_logistic.png
- Confusion Matrix (XGBoost): confusion_matrix_xgboost.png
- Feature Importance (XGBoost): feature_importance_xgboost.png
