In [3]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, TensorDataset
import re
import string
from sklearn.preprocessing import StandardScaler

In [4]:
import pandas as pd

# Load the raw delivery-mode extract; `df` stays untouched as the pristine copy.
df = pd.read_csv('resfe_df_delivery_mode_2024-02-23.csv')

# Work on an independent deep copy with the 'elective_emergency' column removed.
data_cleaned = df.copy(deep=True).drop(columns='elective_emergency')

In [5]:
def preprocess_text(text):
    """Normalise a free-text value: lower-case it and strip ASCII punctuation.

    Parameters
    ----------
    text : str or None or float
        The raw text. ``None`` and NaN (which pandas uses for empty text
        cells read from a CSV) are treated as missing.

    Returns
    -------
    str
        The lower-cased text with every character in ``string.punctuation``
        removed; ``''`` when the input is missing.
    """
    # Missing cells would otherwise crash on `.lower()`; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()
    # re.escape keeps characters such as ']' or '\' literal inside the class.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    return text


In [6]:
# Clean the free-text diagnosis column into a new 'preprocessed_text' column.
data_cleaned['preprocessed_text'] = data_cleaned['diagnosis'].apply(preprocess_text)

# Target is 'delivery_mode_NVD'; the raw 'diagnosis' text is dropped from the
# features because its cleaned version is kept instead.
y = data_cleaned['delivery_mode_NVD']
X = data_cleaned.drop(columns=['delivery_mode_NVD', 'diagnosis'])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Everything except the text and the patient identifier is kept as the
# tabular feature frame (adjust the excluded columns as necessary).
non_tabular_columns = ['preprocessed_text', 'patient_id']
numerical_features = X_train.drop(columns=non_tabular_columns)
numerical_features_test = X_test.drop(columns=non_tabular_columns)

In [7]:
# Columns that are standardised; the remaining columns of the feature frames
# are left as they are.
int_cols = [
    'age', 'hb', 'ga_weeks', 'kg_upd', 'height_upd', 'bmi', 'abortion',
    'living_children', 'parity', 'gravida', 'upd_cervix_length', 'upd_afi',
    'efw_upd',
]

# Learn mean/variance from the training rows only, so that no statistics of
# the held-out rows leak into the scaling.
scaler = StandardScaler()
scaler.fit(numerical_features[int_cols])

# Apply the same fitted scaler to both splits.
numerical_features[int_cols] = scaler.transform(numerical_features[int_cols])
numerical_features_test[int_cols] = scaler.transform(numerical_features_test[int_cols])

In [13]:

# Text preprocessing helper. This cell re-defines `preprocess_text`, so it must
# behave like the cleaning applied earlier in the notebook (lower-case + strip
# punctuation) instead of being a placeholder that raises NameError.
def preprocess_text(text):
    """Lower-case ``text`` and remove all ASCII punctuation.

    Parameters
    ----------
    text : str or None or float
        Raw text; ``None`` and NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` for a missing value.
    """
    import re
    import string

    # NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    processed_text = re.sub(f'[{re.escape(string.punctuation)}]', '', text.lower())
    return processed_text

bert_model = "dmis-lab/biobert-v1.1"
max_seq_length = 30  # every text is padded / truncated to exactly this many tokens
n_last_layers = 5    # number of final hidden layers concatenated per token

print(f"Processing {bert_model}...")
tokenizer = AutoTokenizer.from_pretrained(bert_model)
model = AutoModel.from_pretrained(bert_model, output_hidden_states=True)
model.eval()  # feature extraction only: make sure dropout is disabled


def _embed_texts(texts, n_batches, tokenizer, encoder, seq_length, n_layers):
    """Return per-token embeddings for ``texts`` from a frozen encoder.

    Parameters
    ----------
    texts : sequence of str
        Texts to embed (e.g. a pandas Series of cleaned strings).
    n_batches : int
        Number of roughly equal chunks the texts are split into, to bound
        memory use during the forward pass.
    tokenizer, encoder
        Tokenizer and model; the encoder must expose ``hidden_states`` on its
        output (i.e. be loaded with ``output_hidden_states=True``).
    seq_length : int
        Fixed token length; shorter texts are padded, longer ones truncated.
    n_layers : int
        How many of the last hidden layers to concatenate on the feature axis.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``[len(texts), seq_length, n_layers * hidden_size]``.
    """
    chunks = []
    for batch in np.array_split(np.asarray(texts), n_batches):
        if batch.size == 0:
            # array_split yields empty chunks when n_batches > len(texts).
            continue
        inputs = tokenizer(
            batch.tolist(),
            return_tensors="pt",
            # Pad to a FIXED length. Padding only to the longest text of each
            # batch gives batches of different sequence lengths, which cannot
            # be concatenated along the batch dimension afterwards.
            padding="max_length",
            truncation=True,
            max_length=seq_length,
        )
        with torch.no_grad():
            outputs = encoder(**inputs)
        # hidden_states is a tuple with one [batch, seq, hidden] tensor per
        # layer; join the last `n_layers` of them feature-wise.
        chunks.append(torch.cat(outputs.hidden_states[-n_layers:], dim=-1))
    return torch.cat(chunks, dim=0)


def _append_numeric_features(token_states, numeric_frame):
    """Attach each row's tabular features to every token position.

    Parameters
    ----------
    token_states : torch.Tensor
        ``[n_rows, seq_length, dim]`` token embeddings.
    numeric_frame : pandas.DataFrame
        ``n_rows`` rows of numeric features, in the same row order.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        The ``[n_rows, n_features]`` float32 feature tensor and the fused
        ``[n_rows, seq_length, dim + n_features]`` tensor.
    """
    numeric = torch.tensor(numeric_frame.values, dtype=torch.float32)
    # Take the sequence length from the embeddings rather than hard-coding it.
    per_token = numeric.unsqueeze(1).expand(-1, token_states.size(1), -1)
    return numeric, torch.cat([token_states, per_token], dim=-1)


# --- training split --------------------------------------------------------
train_hidden_states = _embed_texts(
    X_train['preprocessed_text'], 10, tokenizer, model, max_seq_length, n_last_layers
)
tensor, train_hidden_states_upd = _append_numeric_features(
    train_hidden_states, numerical_features
)
print(train_hidden_states_upd.shape)

# --- test split ------------------------------------------------------------
test_hidden_states = _embed_texts(
    X_test['preprocessed_text'], 20, tokenizer, model, max_seq_length, n_last_layers
)
tensor_test, test_hidden_states_upd = _append_numeric_features(
    test_hidden_states, numerical_features_test
)
print(test_hidden_states_upd.shape)

# --- datasets / loaders ----------------------------------------------------
# Labels are class indices, hence torch.long (as CrossEntropyLoss expects).
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

train_dataset = TensorDataset(train_hidden_states_upd, y_train_tensor)
test_dataset = TensorDataset(test_hidden_states_upd, y_test_tensor)

batch_size = 32  # adjust based on preference and system capability
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


Processing dmis-lab/biobert-v1.1...
torch.Size([255, 30, 3878])
torch.Size([64, 30, 3878])


In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class CustomCNN(nn.Module):
    """Multi-kernel text CNN over per-token feature vectors.

    One ``Conv2d`` per entry of ``filter_sizes`` slides over the token axis;
    each kernel spans the full feature width, so every filter sees ``size``
    consecutive tokens at once. Each filter's response is max-pooled over the
    whole sequence and the pooled values of all filters feed one linear layer.

    Parameters
    ----------
    num_filters : int
        Number of output channels per kernel size.
    filter_sizes : sequence of int
        Kernel heights, i.e. how many consecutive tokens each filter covers.
    num_classes : int
        Number of output logits.
    feature_size : int
        Width of each token's feature vector (last dimension of the input).
        Defaults to 3878, the width this notebook's fused tensors have
        (presumably 5 x 768 BioBERT hidden units + 38 tabular columns --
        TODO confirm).
    """

    def __init__(self, num_filters=32, filter_sizes=(1, 2, 3, 4, 5), num_classes=2,
                 feature_size=3878):
        super().__init__()
        # Materialise once: accepts any iterable and avoids sharing a mutable
        # default between instances.
        filter_sizes = tuple(filter_sizes)
        self.conv_layers = nn.ModuleList(
            nn.Conv2d(in_channels=1,
                      out_channels=num_filters,
                      # Kernel covers `size` tokens x the whole feature width,
                      # so the convolution output has width 1.
                      kernel_size=(size, feature_size),
                      stride=(1, feature_size))
            for size in filter_sizes
        )
        self.linear = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, x):
        """Compute class logits.

        Parameters
        ----------
        x : torch.Tensor
            ``[batch_size, seq_length, feature_size]``; ``seq_length`` must be
            at least the largest kernel height.

        Returns
        -------
        torch.Tensor
            ``[batch_size, num_classes]`` raw (un-normalised) logits.
        """
        # Add the channel dimension: [B, L, D] -> [B, 1, L, D].
        x = x.unsqueeze(1)
        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv_out = F.relu(conv_layer(x))  # [B, F, L - size + 1, 1]
            # The convolution shortens the "height"; pool over all of it.
            conv_out = F.max_pool2d(conv_out, kernel_size=(conv_out.size(2), 1))
            conv_outputs.append(conv_out.squeeze(2))  # [B, F, 1]
        concat_out = torch.cat(conv_outputs, dim=1)
        flat_out = torch.flatten(concat_out, start_dim=1)
        output = self.linear(flat_out)
        return output
    
 

In [18]:
   
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import roc_auc_score
import numpy as np

# Fix the RNG so weight initialisation and the shuffled batch order are the
# same on every run of this cell.
torch.manual_seed(42)

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: this rebinds `model`; from here on it is the CNN classifier.
# Move the model to its device BEFORE building the optimizer over its parameters.
model = CustomCNN().to(device)
print(model)

optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()  # expects raw logits; 2 output units here
num_epochs = 30

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for features, labels in train_loader:
        inputs, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # epoch average stays correct when the last batch is smaller.
        train_loss += loss.item() * inputs.size(0)
    train_loss = train_loss / len(train_loader.dataset)

    # ---- evaluation pass on the held-out split ----
    model.eval()
    test_loss = 0.0
    correct_predictions = 0
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for features, labels in test_loader:
            inputs, labels = features.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item() * inputs.size(0)
            predicted = outputs.argmax(dim=1)  # index of the largest logit
            correct_predictions += (predicted == labels).sum().item()
            # ROC AUC needs a score for the positive class: P(class 1).
            probs = torch.softmax(outputs, dim=1)[:, 1]
            all_labels.extend(labels.cpu().tolist())
            all_predictions.extend(probs.cpu().tolist())
    test_loss = test_loss / len(test_loader.dataset)
    accuracy = correct_predictions / len(test_loader.dataset)
    auc_roc_score = roc_auc_score(all_labels, all_predictions)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}, AUC ROC: {auc_roc_score:.4f}')

CustomCNN(
  (conv_layers): ModuleList(
    (0): Conv2d(1, 32, kernel_size=(1, 3878), stride=(1, 3878))
    (1): Conv2d(1, 32, kernel_size=(2, 3878), stride=(1, 3878))
    (2): Conv2d(1, 32, kernel_size=(3, 3878), stride=(1, 3878))
    (3): Conv2d(1, 32, kernel_size=(4, 3878), stride=(1, 3878))
    (4): Conv2d(1, 32, kernel_size=(5, 3878), stride=(1, 3878))
  )
  (linear): Linear(in_features=160, out_features=2, bias=True)
)
Epoch [1/30], Train Loss: 0.5614, Test Loss: 0.5310, Test Accuracy: 0.7344, AUC ROC: 0.7635
Epoch [2/30], Train Loss: 0.4667, Test Loss: 0.4912, Test Accuracy: 0.7344, AUC ROC: 0.7810
Epoch [3/30], Train Loss: 0.4135, Test Loss: 0.4781, Test Accuracy: 0.7812, AUC ROC: 0.7985
Epoch [4/30], Train Loss: 0.3756, Test Loss: 0.4595, Test Accuracy: 0.7969, AUC ROC: 0.8073
Epoch [5/30], Train Loss: 0.3496, Test Loss: 0.4478, Test Accuracy: 0.7969, AUC ROC: 0.8210
Epoch [6/30], Train Loss: 0.3325, Test Loss: 0.4487, Test Accuracy: 0.7969, AUC ROC: 0.8198
Epoch [7/30], Train

In [97]:
 # Debug cell: print the third element of every batch of `merged_train_loader`.
 # NOTE(review): `merged_train_loader` is not defined in any cell visible
 # here and this cell's execution count (97) is far out of sequence, so it will
 # presumably fail on Restart & Run All -- define the loader above or delete
 # this cell.
 for inputs, labels,x in merged_train_loader:
        print(x)

dataframe_values
dataframe_values
dataframe_values
dataframe_values
dataframe_values
dataframe_values
dataframe_values
dataframe_values
