## Custom Training of DistilBERT

In this notebook, we will custom train the DistilBERT model for squence classsification task and then use the trained model to make predictions on the test data.


## Dataset loading and preprocessing

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
train_df = pd.read_csv("../../../Datasets/pure/PURE_train.csv", engine="pyarrow", usecols=["Requirement", "label"])
test_df = pd.read_csv("../../../Datasets/pure/PURE_test.csv", engine="pyarrow", usecols=["Requirement", "label"])
valid_df = pd.read_csv("../../../Datasets/pure/PURE_valid.csv", engine="pyarrow", usecols=["Requirement", "label"])

In [3]:
train_X = train_df["Requirement"].to_numpy()
train_y = train_df["label"].to_numpy()

test_X = test_df["Requirement"].to_numpy()
test_y = test_df["label"].to_numpy()

valid_X = valid_df["Requirement"].to_numpy()
valid_y = valid_df["label"].to_numpy()

## Converting To Torch Dataset

In [4]:
class RequirementsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)


## Defining the Model and Tokenizer

In [5]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader




In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [7]:
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenizing the text

In [8]:
train_encodings = tokenizer(train_X.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_X.tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_X.tolist(), truncation=True, padding=True)

In [9]:
train_encodings[:5]

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [10]:
train_req_dataset = RequirementsDataset(train_encodings, train_y)
test_req_dataset = RequirementsDataset(test_encodings, test_y)
valid_req_dataset = RequirementsDataset(valid_encodings, valid_y)

In [11]:
train_req_dataset[0]

{'input_ids': tensor([  101,  1996,  5576,  2323,  3073,  6851,  6123,  1011,  7591,  2393,
          3430,  2005,  2035,  1996,  2825,  4506,  1998, 16820,  2006,  2035,
          5310, 19706,  1999,  1996,  4646,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

### Training the Model (With Pytorch and CUDA)

In [12]:
MY_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MY_DEVICE

device(type='cuda')

In [14]:
distilbert_model.to(MY_DEVICE)
distilbert_model.train()

train_loader = DataLoader(train_req_dataset, batch_size=16, shuffle=True)
optimizer = AdamW(distilbert_model.parameters(), lr=5e-5)

# Train for 3 epochs
for epoch in range(3):
    for batch in train_loader:
        # print(batch)
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(MY_DEVICE)
        attention_mask = batch['attention_mask'].to(MY_DEVICE)
        labels = batch['labels'].to(MY_DEVICE)
        outputs = distilbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch} Loss: {loss.item()}")

distilbert_model.eval()



Epoch 0 Loss: 0.7053360939025879
Epoch 0 Loss: 0.6753596663475037
Epoch 0 Loss: 0.6532688140869141
Epoch 0 Loss: 0.6961690187454224
Epoch 0 Loss: 0.6172416806221008
Epoch 0 Loss: 0.5248299837112427
Epoch 0 Loss: 0.5229129195213318
Epoch 0 Loss: 0.6888538599014282
Epoch 0 Loss: 0.6065130829811096
Epoch 0 Loss: 0.649664044380188
Epoch 0 Loss: 0.5263355374336243
Epoch 0 Loss: 0.4570524990558624
Epoch 0 Loss: 0.6019513010978699
Epoch 0 Loss: 0.4622551202774048
Epoch 0 Loss: 0.6873756051063538
Epoch 0 Loss: 0.4956197142601013
Epoch 0 Loss: 0.4668370187282562
Epoch 0 Loss: 0.3469903767108917
Epoch 0 Loss: 0.35774239897727966
Epoch 0 Loss: 0.5159806609153748
Epoch 0 Loss: 0.4887233078479767
Epoch 0 Loss: 0.331891268491745
Epoch 0 Loss: 0.5228985548019409
Epoch 0 Loss: 0.5415565371513367
Epoch 0 Loss: 0.3013581931591034
Epoch 0 Loss: 0.5354827046394348
Epoch 0 Loss: 0.4932573735713959
Epoch 0 Loss: 0.23166301846504211
Epoch 0 Loss: 0.31125345826148987
Epoch 0 Loss: 0.4800143837928772
Epoch 0 L

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [15]:
distilbert_model.save_pretrained("../../../Models/requirement_relevancy_experiment/NLP_models/custom_trained_distilbert_1")

## Training the Model (With Trainer)

In [12]:
training_args = TrainingArguments(
    output_dir='../../../Models/requirement_relevancy_experiment/NLP_models/custom_trained_distilbert_2',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay           # directory for storing logs
)

In [13]:
optimizer = torch.optim.Adam(distilbert_model.parameters(), lr=5e-5)
optimizer_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

In [14]:
trainer = Trainer(
    model=distilbert_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_req_dataset,         # training dataset
    eval_dataset=valid_req_dataset,             # evaluation dataset
    optimizers=(optimizer, optimizer_scheduler)


trainer.train()

  0%|          | 0/996 [00:00<?, ?it/s]

{'loss': 0.6803, 'learning_rate': 0.0, 'epoch': 1.51}
{'train_runtime': 11983.6845, 'train_samples_per_second': 1.328, 'train_steps_per_second': 0.083, 'train_loss': 0.6805132823775571, 'epoch': 3.0}


TrainOutput(global_step=996, training_loss=0.6805132823775571, metrics={'train_runtime': 11983.6845, 'train_samples_per_second': 1.328, 'train_steps_per_second': 0.083, 'train_loss': 0.6805132823775571, 'epoch': 3.0})

In [15]:
distilbert_model.save_pretrained("../../../Models/requirement_relevancy_experiment/NLP_models/custom_trained_distilbert_2")

## Evaluate the Model