In [1]:
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import optuna
from tqdm import tqdm
from generateDataset import class_weights, ProgesteroneDataset
from sklearn.metrics import confusion_matrix

# Tokenizer of the pretrained ChemBERTa checkpoint; its vocabulary size is
# what sizes the model's embedding table further down.
CHEMBERTA_CHECKPOINT = 'DeepChem/ChemBERTa-10M-MLM'
tokenizer = AutoTokenizer.from_pretrained(CHEMBERTA_CHECKPOINT)

# Define the LSTM model with dropout
class LSTMModel(nn.Module):
    """Sequence classifier: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Width of both the token embeddings and the LSTM hidden
            state.
        output_dim: Number of logits produced per sequence.
        dropout_rate: Dropout probability used between stacked LSTM layers
            and on the features fed to the final linear layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate, num_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        # nn.LSTM applies dropout only *between* layers, so with a single
        # layer it is a no-op that merely triggers a UserWarning; pass 0 then.
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            hidden_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for a batch of token ids.

        Args:
            x: Integer tensor of token ids, indexed as (batch, sequence).

        Returns:
            Tensor of unnormalised class scores, one row per sequence.
        """
        x = self.embedding(x)
        # With no explicit state, nn.LSTM starts from zero hidden/cell states
        # that already match the input's device and dtype, so the model keeps
        # working after .to(device), .double(), .half(), etc.
        out, _ = self.lstm(x)
        # Classify from the output at the final time step.
        # NOTE(review): if batches are right-padded, this step may correspond
        # to a padding token rather than the last real token - confirm
        # against the dataset's padding scheme.
        return self.fc(self.dropout(out[:, -1, :]))

# Load the saved datasets
def _load_pickle(path):
    """Return the object unpickled from the binary file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): presumably pickled ProgesteroneDataset instances, which is why
# that class has to be importable here - confirm against generateDataset.
train_dataset = _load_pickle(r'C:\Users\igorh\Documents\Progesterone\data\customDataset\train_dataset.pkl')
test_dataset = _load_pickle(r'C:\Users\igorh\Documents\Progesterone\data\customDataset\test_dataset.pkl')

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Batched iterators: training batches are reshuffled, test order stays fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Model sizes: one embedding row per tokenizer vocabulary entry, two logits out
input_dim = tokenizer.vocab_size
hidden_dim = 128
output_dim = 2
model = LSTMModel(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    dropout_rate=0.1,
)
model = model.to(device)

# Cross-entropy loss scaled per class by the imported class weights
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
loss_fn = loss_fn.to(device)

# Adam with a small learning rate and light L2 regularisation
optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-6)

# Number of full passes over the training data
num_epochs = 3

# Training loop: one optimizer step per mini-batch, for num_epochs passes
for epoch in range(num_epochs):
    # Enable training-time behaviour such as dropout
    model.train()
    progress_label = f"Training Epoch {epoch+1}/{num_epochs}"
    for batch in tqdm(train_loader, desc=progress_label):
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Drop gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and loss for this batch
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

# Validation loop: collect true and predicted labels over the test set
model.eval()
y_true, y_pred = [], []
# Gradients are not needed for inference
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Validating"):
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(inputs)
        # Predicted class = index of the largest logit in each row
        predicted = outputs.argmax(dim=1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())


# Calculate confusion matrix.
# The label set is pinned to the two classes so the matrix is always 2x2;
# without it, a run where only one class appears in y_true/y_pred yields a
# 1x1 matrix and the four-way unpacking below raises ValueError.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

# Calculate sensitivity (recall on class 1) and specificity (recall on
# class 0). A class with no true samples has no defined recall, so report NaN
# explicitly instead of relying on a divide-by-zero warning.
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float("nan")
specificity = tn / (tn + fp) if (tn + fp) > 0 else float("nan")

# Calculate balanced accuracy: the mean of the two per-class recalls
balanced_accuracy = (sensitivity + specificity) / 2


  from .autonotebook import tqdm as notebook_tqdm
Training Epoch 1/3: 100%|██████████| 278/278 [00:01<00:00, 211.09it/s]
Training Epoch 2/3: 100%|██████████| 278/278 [00:01<00:00, 242.58it/s]
Training Epoch 3/3: 100%|██████████| 278/278 [00:01<00:00, 243.85it/s]
Validating: 100%|██████████| 70/70 [00:00<00:00, 530.43it/s]
