In [13]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, Dense, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score
import joblib  # For saving scaler
import matplotlib.pyplot as plt

In [14]:
# Paths
processed_folder = "processed_2Wavelenghts"  # Folder containing CSV files
subject_info_path = "subject_info.csv"  # Subject info file
model_save_path = "hemoglobin_cnn_model.h5"  # Path to save the trained model
scaler_save_path = "scaler.pkl"  # Path to save the scaler
predictions_save_path = "predictions.csv"  # Path to save predictions

In [15]:
# Load subject data
subjects_df = pd.read_csv(subject_info_path)

# Convert Gender to numeric (Male=1, Female=0)
label_encoder = LabelEncoder()
subjects_df["Gender"] = label_encoder.fit_transform(subjects_df["Gender"])

# Initialize dataset list
all_data = []

# Regular expression to extract ID from filename (assumes filenames like "1234_data.csv")
id_pattern = re.compile(r"(\d+)")  # Looks for numbers in the filename

# Process each CSV file
for file in os.listdir(processed_folder):
    if file.endswith(".csv"):
        file_path = os.path.join(processed_folder, file)

        # Extract ID from filename
        match = id_pattern.search(file)
        if match:
            file_id = int(match.group(1))  # Convert ID to integer
            print(f"Processing file: {file} (ID: {file_id})")  # Debugging line

            # Read the processed CSV file
            df = pd.read_csv(file_path)

            # Find the corresponding subject data
            subject_row = subjects_df[subjects_df["id"] == file_id]
            
            if not subject_row.empty:
                # Merge subject data with CSV data
                subject_repeated = pd.concat([subject_row] * len(df), ignore_index=True)
                merged_df = pd.concat([df.reset_index(drop=True), subject_repeated.reset_index(drop=True)], axis=1)
                all_data.append(merged_df)
            else:
                print(f"⚠️ Skipping {file}: No matching ID in subject info!")

# Ensure data exists before concatenation
if not all_data:
    raise ValueError("❌ No valid CSV files found! Check file formats and ID matching.")

# Combine all data
data = pd.concat(all_data, ignore_index=True)


Processing file: processed_50_PPGdata.csv (ID: 50)
Processing file: processed_9_PPGdata.csv (ID: 9)
Processing file: processed_20_PPGdata.csv (ID: 20)
Processing file: processed_12_PPGdata.csv (ID: 12)
Processing file: processed_29_PPGdata.csv (ID: 29)
Processing file: processed_45_PPGdata.csv (ID: 45)
Processing file: processed_35_PPGdata.csv (ID: 35)
Processing file: processed_5_PPGdata.csv (ID: 5)
Processing file: processed_30_PPGdata.csv (ID: 30)
Processing file: processed_40_PPGdata.csv (ID: 40)
Processing file: processed_25_PPGdata.csv (ID: 25)
Processing file: processed_55_PPGdata.csv (ID: 55)
Processing file: processed_49_PPGdata.csv (ID: 49)
Processing file: processed_39_PPGdata.csv (ID: 39)
Processing file: processed_17_PPGdata.csv (ID: 17)
Processing file: processed_14_PPGdata.csv (ID: 14)
Processing file: processed_26_PPGdata.csv (ID: 26)
Processing file: processed_56_PPGdata.csv (ID: 56)
Processing file: processed_33_PPGdata.csv (ID: 33)
Processing file: processed_43_PPGda

In [16]:
# Define features and target
X = data.drop(columns=["id", "Hemoglobin(g/L)"])  # Drop ID and target
y = data["Hemoglobin(g/L)"]  # Target variable

# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler for future predictions
joblib.dump(scaler, scaler_save_path)
print(f"✅ Scaler saved to {scaler_save_path}")

# Reshape X for CNN: (samples, timesteps, features)
X_reshaped = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

# Build the 1D CNN Model
model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation="relu", input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Flatten(),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(1)  # Regression output for Hemoglobin prediction
])

# Compile the model
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=8, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Save the trained model
model.save(model_save_path)
print(f"✅ Model saved to {model_save_path}")

# Evaluate the model
loss, mae = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)

# Print performance metrics
print(f"Test MAE: {mae:.4f}")
print(f"Test R^2: {r2:.4f}")

# Save predictions to a CSV file
predictions_df = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred.flatten()})
predictions_df.to_csv(predictions_save_path, index=False)
print(f"✅ Predictions saved to {predictions_save_path}")

✅ Scaler saved to scaler.pkl
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m69600/69600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m684s[0m 10ms/step - loss: 3090.4607 - mae: 23.6060 - val_loss: 548.3335 - val_mae: 18.5431
Epoch 2/50
[1m69600/69600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m685s[0m 10ms/step - loss: 27419.0605 - mae: 75.7523 - val_loss: 25137.5254 - val_mae: 136.3159
Epoch 3/50
[1m69600/69600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m703s[0m 10ms/step - loss: 58352.6016 - mae: 105.2540 - val_loss: 4485.9204 - val_mae: 53.3385
Epoch 4/50
[1m69600/69600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m690s[0m 10ms/step - loss: 97203.5312 - mae: 133.8696 - val_loss: 42060.0703 - val_mae: 181.5327
Epoch 5/50
[1m69600/69600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m724s[0m 10ms/step - loss: 123508.9688 - mae: 148.8688 - val_loss: 31310.7676 - val_mae: 152.6255
Epoch 6/50
[1m45679/69600[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m4:12[0m 11ms/step - loss: 167025.8281 - mae: 173.8214

KeyboardInterrupt: 

In [None]:

# Plot Actual vs Predicted
plt.figure(figsize=(8, 5))
plt.scatter(y_test, y_pred, alpha=0.7, label="Predictions")
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle="dashed", color="red", label="Perfect Fit")
plt.xlabel("Actual Hemoglobin (g/L)")
plt.ylabel("Predicted Hemoglobin (g/L)")
plt.title(f"Actual vs Predicted Hemoglobin (R²={r2:.4f})")
plt.legend()
plt.show()

# Display predictions
import ace_tools as tools
tools.display_dataframe_to_user(name="Hemoglobin Predictions", dataframe=predictions_df)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-03-14 14:40:32.624873: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2025-03-14 14:40:32.624895: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-03-14 14:40:32.624907: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-03-14 14:40:32.625121: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-03-14 14:40:32.625132: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/50


2025-03-14 14:40:33.926533: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m69600/69600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m704s[0m 10ms/step - loss: 2388.9751 - mae: 24.8233 - val_loss: 3977.9172 - val_mae: 49.9807
Epoch 2/50
[1m 2480/69600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10:28[0m 9ms/step - loss: 20921.9277 - mae: 84.5153

KeyboardInterrupt: 