In [186]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.nn import Module, LSTM, Dropout, Linear, Sigmoid, BCEWithLogitsLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, auc, precision_recall_curve, classification_report


In [187]:
# Constants
DATASET_PATH = "./vehicle_dataset.csv"
TARGET_NAME = DATASET_PATH.split("/")[-1].split("_")[0]
TARGET_CLASS = "class_" + TARGET_NAME + "_w8"

LEARNING_RATE = 0.001
MAX_EPOCHS = 100
BATCH_SIZE = 64
BIDIRECTIONAL = True
DROPOUT = 0.5
LSTM_UNITS = 100
DENSE_UNITS = 1
ACTIVATION = 'sigmoid'
LOSS = BCEWithLogitsLoss()
METRICS = ['accuracy']
VALIDATION_SPLIT = 0.1
PATIENCE = 10
NUM_LAYERS = 2
TEST_SIZE = 0.1

print(f"Dataset: {DATASET_PATH}")
print(f"Target: {TARGET_NAME}")
print(f"Target class: {TARGET_CLASS}")

Dataset: ./arthritis_dataset.csv
Target: arthritis
Target class: class_arthritis_w8


In [188]:
data = pd.read_csv(DATASET_PATH)
data.replace('?', np.nan, inplace=True)
data = data.apply(pd.to_numeric, errors='coerce')
data.fillna(data.mean(), inplace=True)

In [189]:
# Remove class variables except the target for wave 8
class_vars_to_remove = [col for col in data.columns if f"class_{TARGET_NAME}_w" in col and TARGET_CLASS not in col]
data_copy = data.drop(columns=class_vars_to_remove)
print(f"Removed class variables: {class_vars_to_remove}")

Removed class variables: ['class_arthritis_w3', 'class_arthritis_w4', 'class_arthritis_w5', 'class_arthritis_w6', 'class_arthritis_w7']


In [190]:
# Separate features and target variable
X = data_copy.drop(columns=[TARGET_CLASS])
y = data_copy[TARGET_CLASS]
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (7097, 140)
y shape: (7097,)


In [191]:
# Normalize the data
# scaler = StandardScaler()
# X = data_copy.drop(columns=[TARGET_CLASS])
# y = data_copy[TARGET_CLASS]
# X_scaled = scaler.fit_transform(X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [192]:
# Identify features by wave
column_names = X.columns
wave_identifiers = sorted(set(col.split('_')[-1] for col in column_names if col != 'sex' and col != 'indager_wave8' and col != 'dheas_wave4' and col != 'apoe_wave2'))
non_longitudinal_features = ['sex', 'indager_wave8', 'dheas_wave4', 'apoe_wave2']

print(f"Wave identifiers: {wave_identifiers}")
print(non_longitudinal_features)

Wave identifiers: ['w2', 'w4', 'w6', 'w8']
['sex', 'indager_wave8', 'dheas_wave4', 'apoe_wave2']


In [193]:
# Group features by waves
features_by_wave = {wave: [] for wave in wave_identifiers}
for col in column_names:
    if col not in non_longitudinal_features:
        wave = col.split('_')[-1]
        features_by_wave[wave].append(col)
        print(wave, col)

w2 bmiobe_w2
w4 bmiobe_w4
w6 bmiobe_w6
w2 cfib_w2
w4 cfib_w4
w6 cfib_w6
w8 cfib_w8
w2 chestin_w2
w4 chestin_w4
w6 chestin_w6
w2 chol_w2
w4 chol_w4
w6 chol_w6
w8 chol_w8
w2 clotb_w2
w4 clotb_w4
w6 clotb_w6
w8 clotb_w8
w2 diaval_w2
w4 diaval_w4
w6 diaval_w6
w8 diaval_w8
w2 eyesurg_w2
w4 eyesurg_w4
w6 eyesurg_w6
w2 fglu_w2
w4 fglu_w4
w6 fglu_w6
w8 fglu_w8
w2 hastro_w2
w4 hastro_w4
w6 hastro_w6
w2 hasurg_w2
w4 hasurg_w4
w6 hasurg_w6
w2 hba1c_w2
w4 hba1c_w4
w6 hba1c_w6
w8 hba1c_w8
w2 hdl_w2
w4 hdl_w4
w6 hdl_w6
w8 hdl_w8
w2 hgb_w2
w4 hgb_w4
w6 hgb_w6
w8 hgb_w8
w2 hipval_w2
w4 hipval_w4
w2 hscrp_w2
w4 hscrp_w4
w6 hscrp_w6
w8 hscrp_w8
w2 htfev_w2
w4 htfev_w4
w6 htfev_w6
w2 htfvc_w2
w4 htfvc_w4
w6 htfvc_w6
w2 htpf_w2
w4 htpf_w4
w2 htval_w2
w4 htval_w4
w6 htval_w6
w4 igf1_w4
w6 igf1_w6
w8 igf1_w8
w2 ldl_w2
w4 ldl_w4
w6 ldl_w6
w8 ldl_w8
w2 mapval_w2
w4 mapval_w4
w6 mapval_w6
w8 mapval_w8
w4 mch_w4
w6 mch_w6
w8 mch_w8
w2 mmcrre_w2
w4 mmcrre_w4
w6 mmcrre_w6
w2 mmftre2_w2
w4 mmftre2_w4
w6 mmftre2_w6

In [194]:
# Prepare data for RNN
n_samples = X_scaled.shape[0]
n_timesteps = len(wave_identifiers)
n_features_per_wave = {wave: len(features) for wave, features in features_by_wave.items()}
max_features = max(n_features_per_wave.values())

print(f"Number of samples: {n_samples}")
print(f"Number of waves: {n_timesteps}")
print(f"Wave identifiers: {wave_identifiers}")
print(f"Number of features per wave: {n_features_per_wave}")

Number of samples: 7097
Number of waves: 4
Wave identifiers: ['w2', 'w4', 'w6', 'w8']
Number of features per wave: {'w2': 37, 'w4': 40, 'w6': 38, 'w8': 21}


In [195]:
# Reshape data without non-longitudinal features
X_reshaped = np.zeros((n_samples, n_timesteps, max_features))
for i, wave in enumerate(wave_identifiers):
    wave_features = features_by_wave[wave]
    indices = [column_names.get_loc(f) for f in wave_features]
    X_reshaped[:, i, :len(indices)] = X_scaled[:, indices]
print(f"X reshaped shape: {X_reshaped.shape}")

X reshaped shape: (7097, 4, 40)


In [196]:
# Add non-longitudinal features to every timestep
for feature in non_longitudinal_features:
    feature_index = column_names.get_loc(feature)
    expanded_features = np.repeat(X_scaled[:, feature_index][:, np.newaxis], n_timesteps, axis=1)
    X_reshaped[:, :, -len(non_longitudinal_features) + non_longitudinal_features.index(feature)] = expanded_features


In [197]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=TEST_SIZE, random_state=42)

# PyTorch Dataset and DataLoader
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [198]:
print(f"X shape: {X_train.shape}")
print(f"y shape: {y_train.shape}")
print(f"Wave identifiers: {wave_identifiers}")
print(f"Number of features per wave: {n_features_per_wave}")

X shape: (6387, 4, 40)
y shape: (6387,)
Wave identifiers: ['w2', 'w4', 'w6', 'w8']
Number of features per wave: {'w2': 37, 'w4': 40, 'w6': 38, 'w8': 21}


In [199]:
class LSTMModel(Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional=False, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.lstm = LSTM(input_dim, hidden_dim, n_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout)
        self.dropout = Dropout(dropout)
        direction_factor = 2 if bidirectional else 1
        self.fc = Linear(hidden_dim * direction_factor, output_dim)
        self.sigmoid = Sigmoid()

    def forward(self, x):
        h_lstm, _ = self.lstm(x)
        h_lstm = self.dropout(h_lstm[:, -1, :])  # Get the output of the last LSTM cell
        out = self.fc(h_lstm)
        out = self.sigmoid(out)
        return out

In [200]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train.shape[2]
hidden_dim = LSTM_UNITS
output_dim = DENSE_UNITS
n_layers = NUM_LAYERS

model = LSTMModel(input_dim, hidden_dim, output_dim, n_layers, BIDIRECTIONAL, DROPOUT).to(device)
criterion = LOSS
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [201]:
# Training function
def train_model(model, train_loader, criterion, optimizer, n_epochs, device):
    model.train()
    for epoch in range(n_epochs):
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * X_batch.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss:.4f}")

In [202]:
# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_true.extend(y_batch.cpu().numpy())
            y_pred_batch = model(X_batch)
            y_pred.extend(y_pred_batch.cpu().numpy())

    y_pred = np.array(y_pred).squeeze()
    y_pred_binary = (y_pred > 0.5).astype(int)
    
    precision = precision_score(y_true, y_pred_binary)
    recall = recall_score(y_true, y_pred_binary)
    f1 = f1_score(y_true, y_pred_binary)
    roc_auc = roc_auc_score(y_true, y_pred)
    precision_curve, recall_curve, thresholds = precision_recall_curve(y_true, y_pred)
    auprc = auc(recall_curve, precision_curve)
    report = classification_report(y_true, y_pred_binary)
    conf_matrix = confusion_matrix(y_true, y_pred_binary)

    print(f'Test Precision: {precision:.4f}')
    print(f'Test Recall: {recall:.4f}')
    print(f'Test F1 Score: {f1:.4f}')
    print(f'Test ROC-AUC Score: {roc_auc:.4f}')
    print(f'Test AUPRC Score: {auprc:.4f}')
    print(f'Confusion Matrix:\n{conf_matrix}')
    print(f'Classification Report:\n{report}')

In [203]:
# Training the model
train_model(model, train_loader, criterion, optimizer, MAX_EPOCHS, device)
evaluate_model(model, test_loader, device)

Epoch 1/100, Loss: 0.6891
Epoch 2/100, Loss: 0.6821
Epoch 3/100, Loss: 0.6766
Epoch 4/100, Loss: 0.6764
Epoch 5/100, Loss: 0.6735
Epoch 6/100, Loss: 0.6698
Epoch 7/100, Loss: 0.6669
Epoch 8/100, Loss: 0.6653
Epoch 9/100, Loss: 0.6644
Epoch 10/100, Loss: 0.6623
Epoch 11/100, Loss: 0.6610
Epoch 12/100, Loss: 0.6588
Epoch 13/100, Loss: 0.6604
Epoch 14/100, Loss: 0.6554
Epoch 15/100, Loss: 0.6532
Epoch 16/100, Loss: 0.6530
Epoch 17/100, Loss: 0.6518
Epoch 18/100, Loss: 0.6519
Epoch 19/100, Loss: 0.6472
Epoch 20/100, Loss: 0.6441
Epoch 21/100, Loss: 0.6469
Epoch 22/100, Loss: 0.6455
Epoch 23/100, Loss: 0.6412
Epoch 24/100, Loss: 0.6409
Epoch 25/100, Loss: 0.6388
Epoch 26/100, Loss: 0.6373
Epoch 27/100, Loss: 0.6388
Epoch 28/100, Loss: 0.6379
Epoch 29/100, Loss: 0.6378
Epoch 30/100, Loss: 0.6379
Epoch 31/100, Loss: 0.6356
Epoch 32/100, Loss: 0.6347
Epoch 33/100, Loss: 0.6322
Epoch 34/100, Loss: 0.6336
Epoch 35/100, Loss: 0.6330
Epoch 36/100, Loss: 0.6326
Epoch 37/100, Loss: 0.6304
Epoch 38/1