In [262]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.nn import Module, LSTM, Dropout, Linear, Sigmoid, BCEWithLogitsLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, auc, precision_recall_curve, classification_report


In [263]:
# Constants
DATASET_PATH = "./hbp_dataset.csv"
TARGET_NAME = DATASET_PATH.split("/")[-1].split("_")[0]
TARGET_CLASS = "class_" + TARGET_NAME + "_w8"

LEARNING_RATE = 0.001
MAX_EPOCHS = 100
BATCH_SIZE = 32
BIDIRECTIONAL = True
DROPOUT = 0.5
LSTM_UNITS = 100
DENSE_UNITS = 1
ACTIVATION = 'sigmoid'
LOSS = BCEWithLogitsLoss()
METRICS = ['accuracy']
VALIDATION_SPLIT = 0.1
PATIENCE = 10
NUM_LAYERS = 2
TEST_SIZE = 0.1

print(f"Dataset: {DATASET_PATH}")
print(f"Target: {TARGET_NAME}")
print(f"Target class: {TARGET_CLASS}")

Dataset: ./hbp_dataset.csv
Target: hbp
Target class: class_hbp_w8


In [265]:
data = pd.read_csv(DATASET_PATH)
data.replace('?', np.nan, inplace=True)
data = data.apply(pd.to_numeric, errors='coerce')
data.fillna(0, inplace=True)

In [266]:
target_wave_suffix = TARGET_CLASS.split('_')[-1]  # This will give you "w6"
target_wave_number = int(target_wave_suffix[1:])  # Extract the wave number as an integer
print(f"Target wave number: {target_wave_number}")
print(f"Target wave suffix: {target_wave_suffix}")

# Remove class variables except the target wave
class_vars_to_remove = [col for col in data.columns if col.startswith("class_") and TARGET_CLASS not in col]
features_to_remove = [col for col in data.columns if any(col.endswith(f'w{wave}') for wave in range(target_wave_number + 1, 9))]

# Combine lists to remove both class variables and features from later waves
columns_to_remove = class_vars_to_remove + features_to_remove

data_copy = data.drop(columns=columns_to_remove)
print(f"Removed class variables: {class_vars_to_remove}")
print(f"Removed columns: {columns_to_remove}")

print(data_copy.head())


Target wave number: 8
Target wave suffix: w8
Removed class variables: ['class_hbp_w3', 'class_hbp_w4', 'class_hbp_w5', 'class_hbp_w6', 'class_hbp_w7']
Removed columns: ['class_hbp_w3', 'class_hbp_w4', 'class_hbp_w5', 'class_hbp_w6', 'class_hbp_w7']
   indager_wave8  sex  apoe_wave2  bmiobe_w2  bmiobe_w4  bmiobe_w6   cfib_w2  \
0           0.38    0    0.135755      0.667      0.667      0.667  0.352429   
1           0.72    0    0.135755      0.667      0.667      0.667  0.345429   
2           0.34    1    0.135755      1.000      1.000      1.000  0.345286   
3           0.56    0    0.135755      0.667      0.667      0.667  0.338000   
4           0.00    1    0.135755      0.000      1.000      1.000  0.345286   

    cfib_w4   cfib_w6   cfib_w8  ...    wbc_w8  whval_w2  whval_w4  wstval_w2  \
0  0.503143  0.485429  0.497000  ...  0.026000  0.468286  0.400000   0.385000   
1  0.370000  0.294000  0.294000  ...  0.188000  0.468429  0.430000   0.390571   
2  0.500000  0.495429  0.49

In [268]:
# Separate features and target variable
X = data_copy.drop(columns=[TARGET_CLASS])
y = data_copy[TARGET_CLASS]
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (7097, 140)
y shape: (7097,)


In [269]:
# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [270]:
# Identify features by wave
column_names = X.columns
wave_identifiers = sorted(set(col.split('_')[-1] for col in column_names if col != 'sex' and col != 'indager_wave8' and col != 'dheas_wave4' and col != 'apoe_wave2'))
non_longitudinal_features = [col for col in column_names if not col.endswith('w1') and not col.endswith('w2') and not col.endswith('w3') and not col.endswith('w4') and not col.endswith('w5') and not col.endswith('w6') and not col.endswith('w7') and not col.endswith('w8')]

print(f"Wave identifiers: {wave_identifiers}")
print(non_longitudinal_features)

Wave identifiers: ['w2', 'w4', 'w6', 'w8']
['indager_wave8', 'sex', 'apoe_wave2', 'dheas_wave4']


In [None]:
# Group features by waves
features_by_wave = {wave: [] for wave in wave_identifiers}
for col in column_names:
    if col not in non_longitudinal_features:
        wave = col.split('_')[-1]
        features_by_wave[wave].append(col)
        print(wave, col)
        

In [272]:
# Prepare data for RNN
n_samples = X_scaled.shape[0]
n_timesteps = len(wave_identifiers)
n_features_per_wave = {wave: len(features) for wave, features in features_by_wave.items()}
max_features = max(n_features_per_wave.values())

print(f"Number of samples: {n_samples}")
print(f"Number of waves: {n_timesteps}")
print(f"Wave identifiers: {wave_identifiers}")
print(f"Number of features per wave: {n_features_per_wave}")

Number of samples: 7097
Number of waves: 4
Wave identifiers: ['w2', 'w4', 'w6', 'w8']
Number of features per wave: {'w2': 37, 'w4': 40, 'w6': 38, 'w8': 21}


In [None]:
# Reshape data without non-longitudinal features
X_reshaped = np.zeros((n_samples, n_timesteps, max_features))
for i, wave in enumerate(wave_identifiers):
    wave_features = features_by_wave[wave]
    indices = [column_names.get_loc(f) for f in wave_features]
    X_reshaped[:, i, :len(indices)] = X_scaled[:, indices]
    print(f"Wave {wave} features: {wave_features}")
print(f"X reshaped shape: {X_reshaped.shape}")

In [276]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=TEST_SIZE, random_state=42)

# PyTorch Dataset and DataLoader
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [277]:
print(f"X shape: {X_train.shape}")
print(f"y shape: {y_train.shape}")
print(f"Wave identifiers: {wave_identifiers}")
print(f"Number of features per wave: {n_features_per_wave}")

X shape: (6387, 4, 40)
y shape: (6387,)
Wave identifiers: ['w2', 'w4', 'w6', 'w8']
Number of features per wave: {'w2': 37, 'w4': 40, 'w6': 38, 'w8': 21}


In [278]:
class LSTMModel(Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional=False, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.lstm = LSTM(input_dim, hidden_dim, n_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout)
        self.dropout = Dropout(dropout)
        direction_factor = 2 if bidirectional else 1
        self.fc = Linear(hidden_dim * direction_factor, output_dim)
        self.sigmoid = Sigmoid()

    def forward(self, x):
        h_lstm, _ = self.lstm(x)
        h_lstm = self.dropout(h_lstm[:, -1, :])  # Get the output of the last LSTM cell
        out = self.fc(h_lstm)
        out = self.sigmoid(out)
        return out

In [279]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train.shape[2]
hidden_dim = LSTM_UNITS
output_dim = DENSE_UNITS
n_layers = NUM_LAYERS

model = LSTMModel(input_dim, hidden_dim, output_dim, n_layers, BIDIRECTIONAL, DROPOUT).to(device)
criterion = LOSS
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [280]:
# Training function
def train_model(model, train_loader, criterion, optimizer, n_epochs, device):
    model.train()
    for epoch in range(n_epochs):
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * X_batch.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss:.4f}")

In [281]:
# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    
    with torch.no_grad(): 
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_true.extend(y_batch.cpu().numpy())
            y_pred_batch = model(X_batch)
            y_pred.extend(y_pred_batch.cpu().numpy())
    print(y_true)

    y_pred = np.array(y_pred).squeeze()
    
    # Convert predictions to binary values
    y_pred_binary = (y_pred > 0.5).astype(int)

    print(y_pred_binary)
    
    # Calculate metrics using binary predictions
    precision = precision_score(y_true, y_pred_binary)
    recall = recall_score(y_true, y_pred_binary)
    f1 = f1_score(y_true, y_pred_binary)
    
    # ROC-AUC and Precision-Recall AUC use the continuous y_pred
    roc_auc = roc_auc_score(y_true, y_pred)
    precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_pred)
    auprc = auc(recall_curve, precision_curve)
    
    report = classification_report(y_true, y_pred_binary)
    conf_matrix = confusion_matrix(y_true, y_pred_binary)

    print(f'Test Precision: {precision:.4f}')
    print(f'Test Recall: {recall:.4f}')
    print(f'Test F1 Score: {f1:.4f}')
    print(f'Test ROC-AUC Score: {roc_auc:.4f}')
    print(f'Test AUPRC Score: {auprc:.4f}')
    print(f'Confusion Matrix:\n{conf_matrix}')
    print(f'Classification Report:\n{report}')

In [282]:
# Training the model
train_model(model, train_loader, criterion, optimizer, 100, device)
evaluate_model(model, test_loader, device)

Epoch 1/100, Loss: 0.6913
Epoch 2/100, Loss: 0.6768
Epoch 3/100, Loss: 0.6742
Epoch 4/100, Loss: 0.6713
Epoch 5/100, Loss: 0.6697
Epoch 6/100, Loss: 0.6665
Epoch 7/100, Loss: 0.6633
Epoch 8/100, Loss: 0.6604
Epoch 9/100, Loss: 0.6578
Epoch 10/100, Loss: 0.6551
Epoch 11/100, Loss: 0.6517
Epoch 12/100, Loss: 0.6509
Epoch 13/100, Loss: 0.6491
Epoch 14/100, Loss: 0.6452
Epoch 15/100, Loss: 0.6408
Epoch 16/100, Loss: 0.6403
Epoch 17/100, Loss: 0.6406
Epoch 18/100, Loss: 0.6369
Epoch 19/100, Loss: 0.6358
Epoch 20/100, Loss: 0.6332
Epoch 21/100, Loss: 0.6333
Epoch 22/100, Loss: 0.6308
Epoch 23/100, Loss: 0.6276
Epoch 24/100, Loss: 0.6273
Epoch 25/100, Loss: 0.6270
Epoch 26/100, Loss: 0.6242
Epoch 27/100, Loss: 0.6248
Epoch 28/100, Loss: 0.6223
Epoch 29/100, Loss: 0.6225
Epoch 30/100, Loss: 0.6212
Epoch 31/100, Loss: 0.6208
Epoch 32/100, Loss: 0.6214
Epoch 33/100, Loss: 0.6209
Epoch 34/100, Loss: 0.6214
Epoch 35/100, Loss: 0.6188
Epoch 36/100, Loss: 0.6182
Epoch 37/100, Loss: 0.6174
Epoch 38/1