<a href="https://colab.research.google.com/github/ShwetaBaranwal/BERT/blob/master/BERT_from_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ee/fc/bd726a15ab2c66dc09306689d04da07a3770dad724f0883f0a4bfb745087/transformers-2.4.1-py3-none-any.whl (475kB)
[K     |▊                               | 10kB 31.8MB/s eta 0:00:01[K     |█▍                              | 20kB 36.2MB/s eta 0:00:01[K     |██                              | 30kB 37.9MB/s eta 0:00:01[K     |██▊                             | 40kB 41.7MB/s eta 0:00:01[K     |███▍                            | 51kB 44.6MB/s eta 0:00:01[K     |████▏                           | 61kB 48.4MB/s eta 0:00:01[K     |████▉                           | 71kB 48.9MB/s eta 0:00:01[K     |█████▌                          | 81kB 49.5MB/s eta 0:00:01[K     |██████▏                         | 92kB 51.0MB/s eta 0:00:01[K     |██████▉                         | 102kB 51.0MB/s eta 0:00:01[K     |███████▋                        | 112kB 51.0MB/s eta 0:00:01[K     |████████▎                       | 

In [2]:
from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import glue_compute_metrics as compute_metrics
from transformers.optimization import AdamW
import pandas as pd
import numpy as np
from tqdm import trange, tqdm_notebook
from sklearn.metrics import confusion_matrix

Bert Tokenization:


In [3]:
#tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
sent = 'This is my dog. His name is Jack.'
sent2 = 'He loves playing.'
inputs = tokenizer.encode_plus(text = sent, text_pair = sent2, add_special_tokens=True, max_length= 15)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [4]:
inputs

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
  2023,
  2003,
  2026,
  3899,
  1012,
  2010,
  2171,
  2003,
  102,
  2002,
  7459,
  2652,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]}

In [5]:
tokenizer.decode([101,
  2023,
  2003,
  2026,
  3899,
  1012,
  2010,
  2171,
  2003,
  102,
  2002,
  7459,
  2652,
  1012,
  102])

'[CLS] this is my dog. his name is [SEP] he loves playing. [SEP]'

Model class:

In [0]:
class BERTBASECLASSIFIER(nn.Module):
  def __init__(self, bert_type, num_labels):
    super(BERTBASECLASSIFIER, self).__init__()
    self.bert_type = bert_type
    self.num_labels = num_labels
    self.bert = transformers.BertForSequenceClassification.from_pretrained(
                      self.bert_type, 
                      num_labels=self.num_labels)

  def forward(self, ids, mask_ids, token_ids, label):
    _, logits = self.bert(
                      input_ids = ids, 
                      attention_mask = mask_ids, 
                      token_type_ids = token_ids,
                      labels = label)
    return logits


Dataset class:

In [0]:
class BertDatasetModule(Dataset):
  def __init__(self, tokenizer, input_sent, max_length, target):
    self.input_seq = input_sent
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.target = target 
  
  def __len__(self):
        return len(self.input_seq)
  
  def __getitem__(self, idx):
    input_ = self.input_seq[idx]
    inputs = self.tokenizer.encode_plus(text = input_, add_special_tokens=True, max_length= self.max_length)
    ids = inputs['input_ids']
    mask_ids = inputs['attention_mask']
    token_ids = inputs['token_type_ids']

    padding_len = self.max_length - len(ids)
    ids = ids + ([0]*padding_len)
    mask_ids = mask_ids + ([0]*padding_len)
    token_ids = token_ids  + ([0]*padding_len)
 
    return {'ids': torch.tensor(ids, dtype = torch.long),
            'mask': torch.tensor(mask_ids, dtype = torch.long),
            'token_type_ids': torch.tensor(token_ids, dtype = torch.long),
            'target':  torch.tensor(self.target[idx], dtype = torch.int16)}

Defining loss:

In [0]:
def loss_func(outputs, targets):
  return nn.CrossEntropyLoss()(outputs, targets)

Model training:

In [0]:
def train_loop(dataloader, model, optimizer, device, max_grad_norm, scheduler=None):
  model.train()
  for bi, d in enumerate(tqdm_notebook(dataloader, desc="Iteration")):
    ids = d['ids']
    mask_ids = d['mask']
    token_ids = d['token_type_ids']
    target = d['target']

    ids = ids.to(device, dtype = torch.long)
    mask_ids = mask_ids.to(device, dtype = torch.long)
    token_ids = token_ids.to(device, dtype = torch.long)
    target = target.to(device, dtype = torch.long)

    optimizer.zero_grad()
    output = model(ids, mask_ids, token_ids, target)
    loss = loss_func(output, target)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
      scheduler.step()
    if bi%100==0:
      print (f"bi: {bi}, loss: {loss}")


Model evaluation:

In [0]:
def eval_loop(dataloader, model, device):
  model.eval()
  preds = None
  out_label_ids = None
  eval_loss = 0.0
  eval_steps = 0

  for bi, d in enumerate(dataloader):
    ids = d['ids']
    mask_ids = d['mask']
    token_ids = d['token_type_ids']
    target = d['target']

    ids = ids.to(device, dtype = torch.long)
    mask_ids = mask_ids.to(device, dtype = torch.long)
    token_ids = token_ids.to(device, dtype = torch.long)
    target = target.to(device, dtype = torch.long)
    with torch.no_grad():
      output = model(ids, mask_ids, token_ids, target)
      loss = loss_func(output, target)
      eval_loss += loss.mean().item()
    
    eval_steps += 1
    if preds is None:
      preds = output.detach().cpu().numpy()
      out_label_ids = target.detach().cpu().numpy()
    else:
      preds = np.append(preds, output.detach().cpu().numpy(), axis=0)
      out_label_ids = np.append(out_label_ids, target.detach().cpu().numpy(), axis=0)

  eval_loss = eval_loss/eval_steps
  preds = np.argmax(preds, axis=1)
  
  conf_matrix = confusion_matrix(out_label_ids, preds)
  print("Confusion Matrix:")
  print(conf_matrix)

  tn, fp, fn, tp = conf_matrix.ravel()
  print(f'tn:{tn}, fp:{fp}, fn:{fn}, tp:{tp}')

  return eval_loss
    


In [0]:
def dataset_details(df):
  print("Dataset preview")
  print(df.head(5))
  print("label count:")
  print(df.groupby([0]).count())


In [0]:
def run():
  MAX_SEQ_LENGTH = 128
  TRAIN_BATCH_SIZE = 32
  EVAL_BATCH_SIZE = 32
  LEARNING_RATE = 1e-5
  NUM_TRAIN_EPOCHS = 1
  NUM_LABELS = 2
  BERT_TYPE = "bert-base-uncased"
  max_grad_norm = 1.0

  train_df = pd.read_csv('train.csv', header=None)
  test_df = pd.read_csv('test.csv', header=None)
  train_df[0] = (train_df[0] == 2).astype(int)
  test_df[0] = (test_df[0] == 2).astype(int)

  dataset_details(train_df)
  dataset_details(test_df)

  tokenizer = transformers.BertTokenizer.from_pretrained(BERT_TYPE)
  train_dataset = BertDatasetModule(
      tokenizer = tokenizer,
      input_sent = train_df[1],
      max_length = MAX_SEQ_LENGTH,
      target = train_df[0]
  )

  train_dataloader = DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE, shuffle=True)

  eval_dataset = BertDatasetModule(
      tokenizer = tokenizer,
      input_sent = test_df[1],
      max_length = MAX_SEQ_LENGTH,
      target = test_df[0]
  ) 

  eval_dataloader = DataLoader(eval_dataset, batch_size = EVAL_BATCH_SIZE, shuffle=False)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model = BERTBASECLASSIFIER(BERT_TYPE, NUM_LABELS).to(device)

  optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)

  NUM_TRAIN_STEPS = int(len(train_dataset)/TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS) 
  scheduler = transformers.get_constant_schedule_with_warmup(
                  optimizer, 
                  num_warmup_steps=100,
                  # num_training_steps=NUM_TRAIN_STEPS,
                  last_epoch=-1)
  
  for epoch in trange(NUM_TRAIN_EPOCHS):
    train_loop(train_dataloader, model, optimizer, device, max_grad_norm, scheduler)
  
  res = eval_loop(eval_dataloader, model, device)
  print(res)
  


In [13]:
if __name__ == '__main__':
  run()

   0                                                  1
0  0  Unfortunately, the frustration of being Dr. Go...
1  1  Been going to Dr. Goldberg for over 10 years. ...
2  0  I don't know what Dr. Goldberg was like before...
3  0  I'm writing this review to give you a heads up...
4  1  All the food is great here. But the best thing...
total rows:
0    50001
1    50001
dtype: int64
label count:
       1
0       
0  26832
1  23169
   0                                                  1
0  1  Contrary to other reviews, I have zero complai...
1  0  Last summer I had an appointment to get new ti...
2  1  Friendly staff, same starbucks fair you get an...
3  0  The food is good. Unfortunately the service is...
4  1  Even when we didn't have a car Filene's Baseme...
total rows:
0    38000
1    38000
dtype: int64
label count:
       1
0       
0  19000
1  19000


HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




  0%|          | 0/1 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, description='Iteration', max=1563, style=ProgressStyle(description_width='…

bi: 0, loss: 0.7565804719924927
bi: 100, loss: 0.27455559372901917
bi: 200, loss: 0.2796914577484131
bi: 300, loss: 0.2654933035373688
bi: 400, loss: 0.1789671927690506
bi: 500, loss: 0.3040599822998047
bi: 600, loss: 0.26138177514076233
bi: 700, loss: 0.22351501882076263
bi: 800, loss: 0.08943471312522888
bi: 900, loss: 0.06939338892698288
bi: 1000, loss: 0.05412149801850319
bi: 1100, loss: 0.08004928380250931
bi: 1200, loss: 0.19310957193374634
bi: 1300, loss: 0.12269952893257141
bi: 1400, loss: 0.10538876056671143
bi: 1500, loss: 0.09364955127239227


100%|██████████| 1/1 [24:56<00:00, 1496.55s/it]







Confusion Matrix:
[[17984  1016]
 [ 1335 17665]]
tn:17984, fp:1016, fn:1335, tp:17665
0.15527574553370777
