In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv(
    "../../../Datasets/RegExpPURE/PURE_train.csv",
    usecols=["Requirement", "Req/Not Req"],
)

test_df = pd.read_csv(
    "../../../Datasets/RegExpPURE/PURE_test.csv",
    usecols=["Requirement", "Req/Not Req"],
)

valid_df = pd.read_csv(
    "../../../Datasets/RegExpPURE/PURE_valid.csv",
    usecols=["Requirement", "Req/Not Req"],
)

In [3]:
train_X = train_df["Requirement"].values
train_y = train_df["Req/Not Req"].map({"Req": 1, "Not_Req": 0}).values

test_X = test_df["Requirement"].values
test_y = test_df["Req/Not Req"].map({"Req": 1, "Not_Req": 0}).values

valid_X = valid_df["Requirement"].values
valid_y = valid_df["Req/Not Req"].map({"Req": 1, "Not_Req": 0}).values

## DistilBERT for Sequence Classification

In [4]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Custom Dataset Class

In [5]:
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_length):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = self.X[idx]
        label = self.y[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": torch.tensor(label),
        }

In [6]:
# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased").to(
    device
)

# Define hyperparameters
batch_size = 16
max_length = 128
learning_rate = 2e-5
num_epochs = 10

# Create dataloaders for training, validation, and testing
train_dataset = CustomDataset(train_X, train_y, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = CustomDataset(valid_X, valid_y, tokenizer, max_length)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)

test_dataset = CustomDataset(test_X, test_y, tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training Loop

In [8]:
# Training loop
total_accuracy = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)

    # Validation loop
    model.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted_labels = torch.max(logits, dim=1)

            total_correct += (predicted_labels == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    total_accuracy += accuracy

    print(
        f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f} - Validation Accuracy: {accuracy:.4f}"
    )

print(f"Average Validation Accuracy: {total_accuracy/num_epochs:.4f}")

Epoch 1/10 - Loss: 0.3092 - Validation Accuracy: 0.7724
Epoch 2/10 - Loss: 0.1500 - Validation Accuracy: 0.7779
Epoch 3/10 - Loss: 0.0775 - Validation Accuracy: 0.7657
Epoch 4/10 - Loss: 0.0433 - Validation Accuracy: 0.7425
Epoch 5/10 - Loss: 0.0311 - Validation Accuracy: 0.7536
Epoch 6/10 - Loss: 0.0215 - Validation Accuracy: 0.7238
Epoch 7/10 - Loss: 0.0154 - Validation Accuracy: 0.7735
Epoch 8/10 - Loss: 0.0066 - Validation Accuracy: 0.7757
Epoch 9/10 - Loss: 0.0165 - Validation Accuracy: 0.7558
Epoch 10/10 - Loss: 0.0098 - Validation Accuracy: 0.7492
Average Validation Accuracy: 0.7590


### Validation Loop

In [9]:
# Testing loop
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, dim=1)

        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples

print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.7627


In [10]:
# Save the model

model.save_pretrained("../../../Models/DistilBERT_for_Sequence_Classification")