<a href="https://colab.research.google.com/github/ShwetaBaranwal/BERT-for-Classification/blob/master/XLNet_for_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/22/97/7db72a0beef1825f82188a4b923e62a146271ac2ced7928baa4d47ef2467/transformers-2.9.1-py3-none-any.whl (641kB)
[K     |████████████████████████████████| 645kB 6.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 43.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/3b/88/49e772d686088e1278766ad68a463513642a2a877487decbd691dec02955/sentencepiece-0.1.90-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 23.9MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████

In [0]:
from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import glue_compute_metrics as compute_metrics
from transformers.optimization import AdamW
import pandas as pd
import numpy as np
from tqdm import trange, tqdm_notebook
from sklearn.metrics import confusion_matrix

In [0]:
#tokenizer
tokenizer = transformers.XLNetTokenizer.from_pretrained('xlnet-base-cased')
sent = 'This is my dog. His name is Jack.'
sent2 = 'He loves playing.'
inputs = tokenizer.encode_plus(text = sent, text_pair = sent2, add_special_tokens=True, max_length= 20)

In [0]:
inputs

{'input_ids': [122, 27, 94, 2288, 9, 329, 304, 27, 2187, 9, 4, 69, 9465, 945, 9, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [0]:
tokenizer.decode(inputs.input_ids)

'This is my dog. His name is Jack.<sep> He loves playing.<sep><cls>'

In [0]:
class XLNETBASECLASSIFIER(nn.Module):
  def __init__(self, xlnet_type, num_labels):
    super(XLNETBASECLASSIFIER, self).__init__()
    self.xlnet_type = xlnet_type
    self.num_labels = num_labels
    self.xlnet = transformers.XLNetForSequenceClassification.from_pretrained(
                      self.xlnet_type, 
                      num_labels=self.num_labels)

  def forward(self, ids, mask_ids, token_ids, label):
    _, logits = self.xlnet(
                      input_ids = ids, 
                      attention_mask = mask_ids, 
                      token_type_ids = token_ids,
                      labels = label)
    return logits

In [0]:
class XLNetDatasetModule(Dataset):
  def __init__(self, tokenizer, input_sent, max_length, target):
    self.input_seq = input_sent
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.target = target 
  
  def __len__(self):
        return len(self.input_seq)
  
  def __getitem__(self, idx):
    input_ = self.input_seq[idx]
    inputs = self.tokenizer.encode_plus(text = input_, add_special_tokens=True, max_length= self.max_length)
    ids = inputs['input_ids']
    mask_ids = inputs['attention_mask']
    token_ids = inputs['token_type_ids']

    padding_len = self.max_length - len(ids)
    ids = ids + ([0]*padding_len)
    mask_ids = mask_ids + ([0]*padding_len)
    token_ids = token_ids  + ([0]*padding_len)
 
    return {'ids': torch.tensor(ids, dtype = torch.long),
            'mask': torch.tensor(mask_ids, dtype = torch.long),
            'token_type_ids': torch.tensor(token_ids, dtype = torch.long),
            'target':  torch.tensor(self.target[idx], dtype = torch.int16)}

In [0]:
def loss_func(outputs, targets):
  return nn.CrossEntropyLoss()(outputs, targets)

In [0]:
def train_loop(dataloader, model, optimizer, device, max_grad_norm, scheduler=None):
  model.train()
  for bi, d in enumerate(tqdm_notebook(dataloader, desc="Iteration")):
    ids = d['ids']
    mask_ids = d['mask']
    token_ids = d['token_type_ids']
    target = d['target']

    ids = ids.to(device, dtype = torch.long)
    mask_ids = mask_ids.to(device, dtype = torch.long)
    token_ids = token_ids.to(device, dtype = torch.long)
    target = target.to(device, dtype = torch.long)

    optimizer.zero_grad()
    output = model(ids, mask_ids, token_ids, target)
    loss = loss_func(output, target)
    # torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
      scheduler.step()
    if bi%100==0:
      print (f"bi: {bi}, loss: {loss}")


In [0]:
def eval_loop(dataloader, model, device):
  model.eval()
  preds = None
  out_label_ids = None
  eval_loss = 0.0
  eval_steps = 0

  for bi, d in enumerate(dataloader):
    ids = d['ids']
    mask_ids = d['mask']
    token_ids = d['token_type_ids']
    target = d['target']

    ids = ids.to(device, dtype = torch.long)
    mask_ids = mask_ids.to(device, dtype = torch.long)
    token_ids = token_ids.to(device, dtype = torch.long)
    target = target.to(device, dtype = torch.long)
    with torch.no_grad():
      output = model(ids, mask_ids, token_ids, target)
      loss = loss_func(output, target)
      eval_loss += loss.mean().item()
    
    eval_steps += 1
    if preds is None:
      preds = output.detach().cpu().numpy()
      out_label_ids = target.detach().cpu().numpy()
    else:
      preds = np.append(preds, output.detach().cpu().numpy(), axis=0)
      out_label_ids = np.append(out_label_ids, target.detach().cpu().numpy(), axis=0)

  eval_loss = eval_loss/eval_steps
  preds = np.argmax(preds, axis=1)
  
  conf_matrix = confusion_matrix(out_label_ids, preds)
  print("Confusion Matrix:")
  print(conf_matrix)

  tn, fp, fn, tp = conf_matrix.ravel()
  print(f'tn:{tn}, fp:{fp}, fn:{fn}, tp:{tp}')

  return eval_loss
    

In [0]:
def dataset_details(df):
  print("Dataset preview")
  print(df.head(5))
  print("label count:")
  print(df.groupby([0]).count())

In [0]:
def run():
  MAX_SEQ_LENGTH = 128
  TRAIN_BATCH_SIZE = 32
  EVAL_BATCH_SIZE = 32
  LEARNING_RATE = 1e-5
  NUM_TRAIN_EPOCHS = 1
  NUM_LABELS = 2
  XLNET_TYPE = "xlnet-base-cased"
  max_grad_norm = 1.0

  train_df = pd.read_csv('train.csv', header=None)
  test_df = pd.read_csv('test.csv', header=None)
  train_df[0] = (train_df[0] == 2).astype(int)
  test_df[0] = (test_df[0] == 2).astype(int)

  dataset_details(train_df)
  dataset_details(test_df)

  tokenizer = transformers.XLNetTokenizer.from_pretrained(XLNET_TYPE)
  train_dataset = XLNetDatasetModule(
      tokenizer = tokenizer,
      input_sent = train_df[1],
      max_length = MAX_SEQ_LENGTH,
      target = train_df[0]
  )

  train_dataloader = DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE, shuffle=True)

  eval_dataset = XLNetDatasetModule(
      tokenizer = tokenizer,
      input_sent = test_df[1],
      max_length = MAX_SEQ_LENGTH,
      target = test_df[0]
  ) 

  eval_dataloader = DataLoader(eval_dataset, batch_size = EVAL_BATCH_SIZE, shuffle=False)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model = XLNETBASECLASSIFIER(XLNET_TYPE, NUM_LABELS).to(device)

  optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)

  NUM_TRAIN_STEPS = int(len(train_dataset)/TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS) 
  scheduler = transformers.get_constant_schedule_with_warmup(
                  optimizer, 
                  num_warmup_steps=100,
                  # num_training_steps=NUM_TRAIN_STEPS,
                  last_epoch=-1)
  
  for epoch in trange(NUM_TRAIN_EPOCHS):
    train_loop(train_dataloader, model, optimizer, device, max_grad_norm, scheduler)
  
  res = eval_loop(eval_dataloader, model, device)
  print(res)
  


In [0]:
if __name__ == '__main__':
  run()

Dataset preview
   0                                                  1
0  0  Unfortunately, the frustration of being Dr. Go...
1  1  Been going to Dr. Goldberg for over 10 years. ...
2  0  I don't know what Dr. Goldberg was like before...
3  0  I'm writing this review to give you a heads up...
4  1  All the food is great here. But the best thing...
label count:
       1
0       
0  26832
1  23169
Dataset preview
   0                                                  1
0  1  Contrary to other reviews, I have zero complai...
1  0  Last summer I had an appointment to get new ti...
2  1  Friendly staff, same starbucks fair you get an...
3  0  The food is good. Unfortunately the service is...
4  1  Even when we didn't have a car Filene's Baseme...
label count:
       1
0       
0  19000
1  19000


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1563.0, style=ProgressStyle(description_w…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


bi: 0, loss: 0.6796261072158813
bi: 100, loss: 0.2555306553840637
bi: 200, loss: 0.21070092916488647
bi: 300, loss: 0.3804832696914673
bi: 400, loss: 0.015034101903438568
bi: 500, loss: 0.25997301936149597
bi: 600, loss: 0.09383919090032578
bi: 700, loss: 0.2007942497730255
bi: 800, loss: 0.19002646207809448
bi: 900, loss: 0.129128098487854
bi: 1000, loss: 0.2381821870803833
bi: 1100, loss: 0.08555006980895996
bi: 1200, loss: 0.058050736784935
bi: 1300, loss: 0.2508467435836792
bi: 1400, loss: 0.1753806620836258
bi: 1500, loss: 0.06408591568470001


100%|██████████| 1/1 [49:02<00:00, 2942.73s/it]







Confusion Matrix:
[[18107   893]
 [ 1048 17952]]
tn:18107, fp:893, fn:1048, tp:17952
0.13184613703126055
