# In [None]:
# Files created/modified during execution:
#    1. cnn_model.pt (PyTorch CNN weights)
#    2. xgboost_model.json (Trained XGBoost model)
#    3. combined_embeddings.csv (Optional CSV storing extracted embeddings;
#       NOTE: no code in this script currently writes this file)

import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

###############################################################################
# 1. DATA LOADING AND PREPROCESSING
###############################################################################

class HeartImageDataset(Dataset):
    """
    Dataset that lazily loads (image, label) pairs from disk.

    Args:
        image_paths: Sequence of file paths to images.
        labels: Sequence of labels (0 or 1, for example), aligned
            index-for-index with ``image_paths``.
        transform: Optional callable (e.g. a torchvision transform) applied
            to the RGB PIL image before it is returned.

    Raises:
        ValueError: If ``image_paths`` and ``labels`` differ in length.
    """
    def __init__(self, image_paths, labels, transform=None):
        # Fail fast: a length mismatch would otherwise only surface as an
        # IndexError in the middle of an epoch, or silently drop trailing labels.
        if len(image_paths) != len(labels):
            raise ValueError(
                "image_paths and labels must have the same length "
                f"(got {len(image_paths)} and {len(labels)})"
            )
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of (image, label) pairs."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the image at ``idx`` as RGB, apply the transform, return (image, label)."""
        img_path = self.image_paths[idx]
        # The context manager closes the underlying file deterministically;
        # convert() returns an independent in-memory image, so it stays valid
        # after the file has been closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        label = self.labels[idx]

        if self.transform is not None:
            image = self.transform(image)

        return image, label

def load_structured_data(csv_path):
    """
    Read the tabular patient data (e.g., age, cholesterol, blood pressure)
    from a CSV file and return it as a pandas DataFrame.

    The rest of the pipeline expects columns such as:
       'patient_id', 'age', 'bp', 'cholesterol', 'label', ...
    No column validation is performed here; the file is parsed with
    pandas' default ``read_csv`` settings.
    """
    return pd.read_csv(csv_path)

###############################################################################
# 2. DEFINE A SIMPLE CNN FOR IMAGE EMBEDDING
###############################################################################

class SimpleCNN(nn.Module):
    """
    Compact convolutional encoder that maps a 3-channel image batch to a
    fixed-size, non-negative embedding vector per image.

    Architecture: three 3x3 conv + ReLU stages (3->16->32->64 channels); the
    first two are followed by 2x2 max pooling, the last by adaptive average
    pooling to 4x4. The flattened 64*4*4 features go through a linear layer
    and a ReLU to produce ``num_embedding_features`` values.
    """

    def __init__(self, num_embedding_features=128):
        super().__init__()

        # (in_channels, out_channels) of each convolutional stage, in order.
        channel_plan = [(3, 16), (16, 32), (32, 64)]
        final_stage = len(channel_plan) - 1

        stages = []
        for stage_idx, (c_in, c_out) in enumerate(channel_plan):
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1))
            stages.append(nn.ReLU())
            if stage_idx == final_stage:
                # Fixed 4x4 output keeps the linear layer's input size constant.
                stages.append(nn.AdaptiveAvgPool2d((4, 4)))  # adjustable
            else:
                stages.append(nn.MaxPool2d(2))
        self.conv_layers = nn.Sequential(*stages)

        # Projection from the flattened feature maps to the embedding.
        self.fc = nn.Sequential(
            nn.Linear(64 * 4 * 4, num_embedding_features),
            nn.ReLU(),
        )

    def forward(self, x):
        """Return embeddings of shape [batch, num_embedding_features]."""
        feature_maps = self.conv_layers(x)
        flattened = feature_maps.view(feature_maps.size(0), -1)
        return self.fc(flattened)

###############################################################################
# 3. TRAIN THE CNN TO EXTRACT IMAGE EMBEDDINGS
###############################################################################

def train_cnn(cnn_model, dataloader, num_epochs=2, lr=1e-3, num_classes=None):
    """
    Trains the CNN on the labeled images to learn relevant features.

    The model is moved to CUDA when available (CPU otherwise), optimised with
    Adam against a cross-entropy loss, and its ``state_dict`` is written to
    "cnn_model.pt" when training finishes.

    Args:
        cnn_model: Module mapping an image batch to [batch, embedding_size].
        dataloader: Iterable yielding ``(images, labels)`` tensor batches.
        num_epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
        num_classes: Optional number of target classes. When given, a
            temporary ``nn.Linear(embedding_size, num_classes)`` head is
            trained jointly with the CNN and the loss is computed on the
            head's output; the head is discarded afterwards and is NOT part
            of the saved weights. When ``None`` (default, original behavior)
            the embedding itself is used as the class logits, so every label
            must be smaller than the embedding size.

    Returns:
        None. The model is updated in place.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    cnn_model = cnn_model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(cnn_model.parameters(), lr=lr)
    # Created lazily on the first batch, once the embedding width is known.
    head = None

    cnn_model.train()
    for epoch in range(num_epochs):
        loss_total = 0.0
        samples_seen = 0
        for images, labels in dataloader:
            images = images.to(device)
            # CrossEntropyLoss needs int64 class indices; labels that arrive
            # as float or int32 would otherwise raise.
            labels = labels.to(device=device, dtype=torch.long)

            # Forward pass
            embeddings = cnn_model(images)
            if num_classes is None:
                # Legacy objective: the embedding doubles as the logits.
                logits = embeddings
            else:
                if head is None:
                    head = nn.Linear(embeddings.size(1), num_classes).to(device)
                    optimizer.add_param_group({"params": list(head.parameters())})
                logits = head(embeddings)
            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch does not skew the mean.
            batch_size = labels.size(0)
            loss_total += loss.item() * batch_size
            samples_seen += batch_size

        if samples_seen:
            print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {loss_total / samples_seen:.4f}")
        else:
            # Previously an empty dataloader crashed here on an unbound `loss`.
            print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: n/a (dataloader produced no batches)")

    # Save CNN weights (optional)
    torch.save(cnn_model.state_dict(), "cnn_model.pt")

###############################################################################
# 4. EXTRACT EMBEDDINGS FROM IMAGES AND COMBINE WITH TABULAR DATA
###############################################################################

def extract_embeddings(cnn_model, dataloader):
    """
    Run every batch of ``dataloader`` through ``cnn_model`` in eval mode with
    gradients disabled, and collect the results.

    Returns a tuple ``(embeddings, labels)`` of numpy arrays: the embeddings
    have dimension [num_images, embedding_size] and the labels are the
    batch labels concatenated in the order the dataloader produced them.
    The model is moved to CUDA when available and is left in eval mode.
    """
    if torch.cuda.is_available():
        target = torch.device('cuda')
    else:
        target = torch.device('cpu')
    cnn_model = cnn_model.to(target)
    cnn_model.eval()

    embedding_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch_images, batch_labels in dataloader:
            batch_output = cnn_model(batch_images.to(target))
            # Bring results back to host memory before converting to numpy.
            embedding_chunks.append(batch_output.cpu().numpy())
            label_chunks.append(batch_labels.numpy())

    stacked_embeddings = np.concatenate(embedding_chunks, axis=0)
    stacked_labels = np.concatenate(label_chunks, axis=0)
    return stacked_embeddings, stacked_labels

###############################################################################
# 5. TRAIN XGBOOST ON THE COMBINED FEATURES
###############################################################################

def train_xgboost(tabular_data, image_embeddings, labels):
    """
    Combines tabular features and image embeddings, trains an XGBoost classifier.

    The two feature blocks are concatenated column-wise, split 80/20 into
    train/validation sets (random_state=42), and the classifier is trained
    with early stopping (10 rounds, logloss) on the validation split.
    Validation accuracy is printed and the model is saved to
    "xgboost_model.json".

    Expects:
       tabular_data: 2D array or DataFrame of shape [num_samples, num_tabular_features].
       image_embeddings: 2D array of shape [num_samples, embedding_size].
       labels: 1D array of length num_samples (0 or 1).

    Returns the trained XGBoost model.

    NOTE(review): the same validation split drives early stopping and the
    reported accuracy, so that number is an optimistic estimate.
    """
    # Local import: only needed for the XGBoost API check below.
    import inspect

    # Combine with image embeddings
    combined_features = np.hstack([tabular_data, image_embeddings])

    X_train, X_val, y_train, y_val = train_test_split(
        combined_features, labels, test_size=0.2, random_state=42
    )

    model_kwargs = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 6,
        "random_state": 42,
    }
    fit_kwargs = {"eval_set": [(X_val, y_val)], "verbose": False}
    stopping_kwargs = {"early_stopping_rounds": 10, "eval_metric": 'logloss'}

    # `early_stopping_rounds` / `eval_metric` as fit() arguments were
    # deprecated in XGBoost 1.6 and removed from fit() in later releases,
    # where they must be passed to the constructor instead. Inspect the
    # installed API so the function works on both sides of that change.
    fit_parameters = inspect.signature(xgb.XGBClassifier.fit).parameters
    if "early_stopping_rounds" in fit_parameters:
        # Older API: keep the original call shape, including the
        # `use_label_encoder` flag those releases expect.
        model_kwargs["use_label_encoder"] = False
        fit_kwargs.update(stopping_kwargs)
    else:
        # Newer API: early-stopping settings live on the estimator, and the
        # obsolete `use_label_encoder` flag is omitted.
        model_kwargs.update(stopping_kwargs)

    xgb_model = xgb.XGBClassifier(**model_kwargs)
    xgb_model.fit(X_train, y_train, **fit_kwargs)

    # Evaluate
    y_pred = xgb_model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    print(f"Validation Accuracy: {acc:.4f}")

    # Save the trained XGBoost model (optional)
    xgb_model.save_model("xgboost_model.json")

    return xgb_model

###############################################################################
# 6. MAIN EXECUTION (DEMO)
###############################################################################

def main():
    """
    End-to-end demo of the multi-modal pipeline:
      1) Read the patient CSV (tabular features, labels, image paths)
      2) Wrap the images in a dataset / dataloader
      3) Train the CNN, then extract one embedding per image
      4) Fit an XGBoost classifier on tabular features + embeddings
    NOTE: The CSV file name, column names, label definition, and
          hyperparameters below are placeholders; adapt them to your dataset.
    """

    # --- A. Tabular data -------------------------------------------------
    # Assumed columns: 'patient_id', 'age', 'bp', 'cholesterol', 'label',
    # plus 'image_path' pointing at each patient's heart-related image.
    patients = load_structured_data("kegel_heart_patients.csv")
    tabular_columns = ["age", "bp", "cholesterol"]
    X_tabular = patients[tabular_columns].values
    y = patients["label"].values  # 0 or 1 for heart disease presence/absence

    # --- B. Image paths --------------------------------------------------
    image_paths = patients["image_path"].values

    # --- C. Dataset and training dataloader ------------------------------
    preprocessing = T.Compose([T.Resize((64, 64)), T.ToTensor()])
    dataset = HeartImageDataset(image_paths, y, transform=preprocessing)
    loader_options = {"batch_size": 8, "num_workers": 2}
    dataloader = DataLoader(dataset, shuffle=True, **loader_options)

    # --- D. Train the CNN (only a couple of epochs for the demo) ---------
    cnn_model = SimpleCNN(num_embedding_features=64)
    train_cnn(cnn_model, dataloader, num_epochs=2, lr=1e-3)

    # --- E. Extract embeddings -------------------------------------------
    # Unshuffled loader so embedding rows line up with the rows of X_tabular.
    inference_dataloader = DataLoader(dataset, shuffle=False, **loader_options)
    image_embeddings, all_labels = extract_embeddings(cnn_model, inference_dataloader)

    # --- F. XGBoost on the combined features -----------------------------
    train_xgboost(X_tabular, image_embeddings, all_labels)

    print("Multi-modal heart disease detection pipeline complete.")

# Run the demo pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()