<a href="https://colab.research.google.com/github/Twahaaa/training-a-bert-model/blob/main/bert_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import shutil
import sys

In [28]:
df = pd.read_csv('/content/train(1).csv')

In [29]:
df

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
20967,20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0
20968,20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0
20969,20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0
20970,20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0


In [30]:
df['context'] = df['TITLE'] + "- " + df['ABSTRACT']

In [31]:
df.drop(['ID','TITLE', 'ABSTRACT'], axis=1, inplace=True)

In [32]:
df

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,context
0,1,0,0,0,0,0,Reconstructing Subject-Specific Effect Maps- ...
1,1,0,0,0,0,0,Rotation Invariance Neural Network- Rotation...
2,0,0,1,0,0,0,Spherical polyharmonics and Poisson kernels fo...
3,0,0,1,0,0,0,A finite element approximation for the stochas...
4,1,0,0,1,0,0,Comparative study of Discrete Wavelet Transfor...
...,...,...,...,...,...,...,...
20967,1,1,0,0,0,0,Contemporary machine learning: a guide for pra...
20968,0,1,0,0,0,0,Uniform diamond coatings on WC-Co hard alloy c...
20969,1,0,0,0,0,0,Analysing Soccer Games with Clustering and Con...
20970,0,0,1,1,0,0,On the Efficient Simulation of the Left-Tail o...


In [33]:
df.columns

Index(['Computer Science', 'Physics', 'Mathematics', 'Statistics',
       'Quantitative Biology', 'Quantitative Finance', 'context'],
      dtype='object')

In [34]:
df = df[['context','Computer Science', 'Physics', 'Mathematics', 'Statistics','Quantitative Biology', 'Quantitative Finance']]
df

Unnamed: 0,context,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,Reconstructing Subject-Specific Effect Maps- ...,1,0,0,0,0,0
1,Rotation Invariance Neural Network- Rotation...,1,0,0,0,0,0
2,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0
3,A finite element approximation for the stochas...,0,0,1,0,0,0
4,Comparative study of Discrete Wavelet Transfor...,1,0,0,1,0,0
...,...,...,...,...,...,...,...
20967,Contemporary machine learning: a guide for pra...,1,1,0,0,0,0
20968,Uniform diamond coatings on WC-Co hard alloy c...,0,1,0,0,0,0
20969,Analysing Soccer Games with Clustering and Con...,1,0,0,0,0,0
20970,On the Efficient Simulation of the Left-Tail o...,0,0,1,1,0,0


In [35]:
target_list = ['Computer Science', 'Physics', 'Mathematics', 'Statistics','Quantitative Biology', 'Quantitative Finance']

In [36]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 2
LEARNING_RATE = 1e-05

In [37]:
from transformers import BertTokenizer, BertModel

In [38]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [39]:
class CustomDataset(torch.utils.data.Dataset):
  def __init__(self, df, tokenizer, max_len):
    self.tokenizer = tokenizer
    self.df = df
    self.max_len = max_len
    self.title = self.df['context']
    self.targets = self.df[target_list].values

  def __len__(self):
    return len(self.title)

  def __getitem__(self,index):
    title = str(self.title[index])
    title = " ".join(title.split())

    inputs = self.tokenizer.encode_plus(
        title,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    return {
        'input_ids': inputs['input_ids'].flatten(),
        'attention_mask': inputs['attention_mask'].flatten(),
        'token_type_ids': inputs['token_type_ids'].flatten(),
        'targets': torch.FloatTensor(self.targets[index])
    }


In [40]:
train_size = 0.8
train_df = df.sample(frac=train_size,random_state=200).reset_index(drop=True)
valid_df = df.drop(train_df.index).reset_index(drop=True)


In [41]:
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(valid_df, tokenizer, MAX_LEN)


In [42]:
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=0
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    shuffle=False,
    batch_size=VALID_BATCH_SIZE,
    num_workers=0
)

In [43]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [44]:
def load_ckp(checkpoint_fpath, model, optimizer):
  checkpoint = torch.load(checkpoint_fpath)
  model.load_state_dict(checkpoint['state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer'])
  valid_loss_min = checkpoint['valid_loss_min']
  return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

In [45]:
def save_ckp(state, is_best, checkpoint_path, best_model_path):
  f_path = checkpoint_path
  torch.save(state, f_path)
  if is_best:
    best_fpath = best_model_path
    shutil.copyfile(f_path, best_fpath)


In [46]:
class BERTClass(nn.Module):
  def __init__(self):
    super(BERTClass, self).__init__()
    self.bertmodel = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
    self.dropout = nn.Dropout(0.3)
    self.linear = nn.Linear(768, 6)

  def forward(self, input_ids, attention_mask, token_type_ids):
    output_1 = self.bertmodel(input_ids, attention_mask, token_type_ids)
    output_dropout = self.dropout(output_1.pooler_output)
    output = self.linear(output_dropout)
    return output

In [47]:
model = BERTClass()
model.to(device)


BERTClass(
  (bertmodel): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [48]:
def loss_fn(outputs, targets):
  return nn.BCEWithLogitsLoss()(outputs, targets)


optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


In [52]:
def train_model(
    n_epochs,
    training_loader,
    validation_loader,
    model,
    optimizer,
    checkpoint_path,
    best_model_path
):
    valid_loss_min = np.inf

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        model.train()
        for index, batch in enumerate(training_loader):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        with torch.no_grad():
            for index, batch in enumerate(validation_loader):
                input_ids = batch['input_ids'].to(device, dtype=torch.long)
                attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
                targets = batch['targets'].to(device, dtype=torch.float)

                outputs = model(input_ids, attention_mask, token_type_ids)
                loss = loss_fn(outputs, targets)

                valid_loss += loss.item()

        avg_train_loss = train_loss / len(training_loader)
        avg_valid_loss = valid_loss / len(validation_loader)

        print(f"Epoch {epoch}/{n_epochs} => Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}")

        checkpoint = {
            'epoch': epoch,
            'valid_loss_min': avg_valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)

        if avg_valid_loss < valid_loss_min:
            print(f"Validation loss decreased ({valid_loss_min:.4f} --> {avg_valid_loss:.4f}). Saving best model...")
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = avg_valid_loss

    return model


In [53]:
trained_model = train_model(EPOCHS, train_data_loader,valid_data_loader,model, optimizer, '/curr_ckpt','/best.pt')

Epoch 1/2 => Train Loss: 0.2722, Valid Loss: 0.1991
Validation loss decreased (inf --> 0.1991). Saving best model...
Epoch 2/2 => Train Loss: 0.1785, Valid Loss: 0.1541
Validation loss decreased (0.1991 --> 0.1541). Saving best model...


1.input_ids
2.attention_masks
3.token_ids


In [56]:
test_df = pd.read_csv('/content/train(1).csv')

In [57]:
example = test_df['ABSTRACT'][2912]
example

'  The recent success of Deep Neural Networks (DNNs) has drastically improved\nthe state of the art for many application domains. While achieving high\naccuracy performance, deploying state-of-the-art DNNs is a challenge since they\ntypically require billions of expensive arithmetic computations. In addition,\nDNNs are typically deployed in ensemble to boost accuracy performance, which\nfurther exacerbates the system requirements. This computational overhead is an\nissue for many platforms, e.g. data centers and embedded systems, with tight\nlatency and energy budgets. In this article, we introduce flexible DNNs\nensemble processing technique, which achieves large reduction in average\ninference latency while incurring small to negligible accuracy drop. Our\ntechnique is flexible in that it allows for dynamic adaptation between quality\nof results (QoR) and execution runtime. We demonstrate the effectiveness of the\ntechnique on AlexNet and ResNet-50 using the ImageNet dataset. This te

In [58]:
train_df.columns[1:].tolist()

['Computer Science',
 'Physics',
 'Mathematics',
 'Statistics',
 'Quantitative Biology',
 'Quantitative Finance']

In [62]:
encodings = tokenizer.encode_plus(
        example,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

Computer Science
[[0.8010447025299072, 0.029943207278847694, 0.012100431136786938, 0.43875959515571594, 0.01064415741711855, 0.004451499320566654]]


  print(train_df.columns[1:].tolist()[int(np.argmax(final_output,axis=1))])


In [63]:
trained_model.eval()
with torch.no_grad():
  input_ids = encodings['input_ids'].to(device, dtype=torch.long)
  attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
  token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
  output = trained_model(input_ids, attention_mask, token_type_ids)
  final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
  print(train_df.columns[1:].tolist()[int(np.argmax(final_output,axis=1))])
  print(final_output)

Computer Science
[[0.8010447025299072, 0.029943207278847694, 0.012100431136786938, 0.43875959515571594, 0.01064415741711855, 0.004451499320566654]]


  print(train_df.columns[1:].tolist()[int(np.argmax(final_output,axis=1))])
