## Installing necessary dependencies

In [0]:
pip install transformers

## Importing the Libraries

In [0]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn import metrics
from sklearn import model_selection
from tqdm.auto import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


## Download the dataset from here: [link](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

## Setting the configurations for the project

In [0]:
# --- Tokenisation ---
# Fixed sequence length: the dataset passes it as max_length and pads up to it.
MAX_LEN = 512
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# --- Batching / schedule ---
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
# Intended number of batches between optimizer steps during training.
ACCUMULATION = 2

# --- Files ---
# NOTE(review): BERT_PATH is not referenced by the visible code; the model and
# tokenizer are loaded by name ('bert-base-uncased') instead.
BERT_PATH = './'
MODEL_PATH = 'model.bin'      # where the best model weights are saved
TRAINING_FILE = 'imdb.csv'    # CSV read by the training entry point

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




## Making the model

In [0]:
class BertBaseUncased(nn.Module):
  """BERT encoder followed by dropout and a linear layer producing one logit.

  The head returns raw logits (no sigmoid is applied here).
  """

  def __init__(self):
    super(BertBaseUncased, self).__init__()
    self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
    self.bert_drop = nn.Dropout(0.3)
    self.out = nn.Linear(768, 1)

  def forward(self, ids, mask, token_type_ids):
    """Compute one logit per example.

    Args:
      ids: token id tensor passed to BERT as its first positional input.
      mask: attention mask, forwarded as ``attention_mask``.
      token_type_ids: segment ids, forwarded as ``token_type_ids``.

    Returns:
      The output of the linear head applied to the (dropped-out) pooled
      BERT output.
    """
    bert_outputs = self.bert(
        ids,
        attention_mask=mask,
        token_type_ids=token_type_ids
    )
    # Position 0 is the per-token hidden states, position 1 the pooler output
    # for the first token. Index rather than tuple-unpack so this works whether
    # the model returns a plain tuple or a dict-style output object.
    pooled = bert_outputs[1]
    dropped = self.bert_drop(pooled)
    return self.out(dropped)

## Setting up the dataset

In [0]:
class BERT_Dataset:
  """Map-style dataset that tokenizes one review per index for BERT.

  Args:
    review: indexable collection of review texts.
    target: indexable collection of labels, aligned with ``review``.
  """

  def __init__(self, review, target):
    self.review = review
    self.target = target
    self.tokenizer = TOKENIZER
    self.max_len = MAX_LEN

  def __len__(self):
    """Number of reviews."""
    return len(self.review)

  def __getitem__(self, item):
    """Tokenize review ``item`` and pad it to ``self.max_len``.

    Returns:
      dict with long tensors ``ids``, ``mask``, ``token_type_ids`` (each of
      length ``self.max_len``) and a float tensor ``targets``.
    """
    # Index the single review; stringifying the whole collection would feed
    # the same text for every item.
    review = " ".join(str(self.review[item]).split())

    inputs = self.tokenizer.encode_plus(
        review,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        # Without explicit truncation a long review can exceed max_len, which
        # makes the padding length negative and yields ragged examples.
        truncation=True
    )

    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs['token_type_ids']

    # Right-pad all three sequences with zeros up to the fixed length.
    padding = [0] * (self.max_len - len(ids))
    ids = ids + padding
    mask = mask + padding
    token_type_ids = token_type_ids + padding

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        # Key is 'targets' because that is what train_fn / eval_fn read.
        'targets': torch.tensor(self.target[item], dtype=torch.float)
        }


## Setting the loss function

In [0]:
def loss_fn(outputs, targets):
  """Binary cross-entropy on raw logits.

  ``targets`` is reshaped to a single column before being compared with
  ``outputs``.
  """
  column_targets = targets.view(-1, 1)
  criterion = nn.BCEWithLogitsLoss()
  return criterion(outputs, column_targets)

## Setting up training and evaluation pipeline

In [0]:
def train_fn(data_loader, model, optimizer, device, scheduler, accumulation_steps=None):
  """Run one training epoch with gradient accumulation.

  Args:
    data_loader: sized iterable of batches; each batch is a dict with the keys
      'ids', 'token_type_ids', 'mask' and 'targets'.
    model: module called as ``model(ids=..., mask=..., token_type_ids=...)``.
    optimizer: optimizer whose ``step``/``zero_grad`` are driven here.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler, stepped once per optimizer step.
    accumulation_steps: number of batches whose gradients are accumulated
      before each optimizer step. Defaults to the module-level ACCUMULATION.
  """
  if accumulation_steps is None:
    accumulation_steps = ACCUMULATION

  model.train()
  num_batches = len(data_loader)

  # Gradients are cleared only at window boundaries; clearing them on every
  # batch would throw away what has been accumulated so far.
  optimizer.zero_grad()

  for bi, d in tqdm(enumerate(data_loader), total=num_batches):
    ids = d['ids'].to(device, dtype=torch.long)
    token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
    mask = d['mask'].to(device, dtype=torch.long)
    targets = d['targets'].to(device, dtype=torch.float)

    outputs = model(
        ids=ids,
        mask=mask,
        token_type_ids=token_type_ids
    )

    # Scale so the summed gradient of a full window is the mean over its batches.
    loss = loss_fn(outputs, targets) / accumulation_steps
    loss.backward()

    # Step at the end of each window, and once more on the final batch so a
    # trailing partial window is not silently dropped.
    if (bi + 1) % accumulation_steps == 0 or (bi + 1) == num_batches:
      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()



def eval_fn(data_loader, model, device):
  """Run the model over ``data_loader`` without gradient tracking.

  Args:
    data_loader: sized iterable of batches; each batch is a dict with the keys
      'ids', 'token_type_ids', 'mask' and 'targets'.
    model: module called as ``model(ids=..., mask=..., token_type_ids=...)``.
    device: device the batch tensors are moved to.

  Returns:
    ``(fin_outputs, fin_targets)``: sigmoid of the model outputs and the
    corresponding targets, both as nested Python lists in batch order.
  """
  model.eval()
  fin_targets = []
  fin_outputs = []

  with torch.no_grad():
    for d in tqdm(data_loader, total=len(data_loader)):
      ids = d['ids'].to(device, dtype=torch.long)
      token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
      mask = d['mask'].to(device, dtype=torch.long)
      targets = d['targets'].to(device, dtype=torch.float)

      # No optimizer is involved in evaluation, so nothing is zeroed or stepped.
      outputs = model(
          ids=ids,
          mask=mask,
          token_type_ids=token_type_ids
      )

      fin_targets.extend(targets.detach().cpu().tolist())
      # The model emits logits; convert them to probabilities here.
      fin_outputs.extend(torch.sigmoid(outputs).detach().cpu().tolist())

  return fin_outputs, fin_targets


In [0]:
def run():
  """Fine-tune the BERT sentiment classifier and keep the best checkpoint.

  Reads TRAINING_FILE, encodes the ``sentiment`` column as 1 for "positive"
  and 0 otherwise, holds out a stratified validation split, trains
  BertBaseUncased for EPOCHS epochs, prints validation accuracy after each
  epoch and saves the model's state dict to MODEL_PATH whenever that accuracy
  improves.
  """
  dfx = pd.read_csv(TRAINING_FILE).fillna("none")
  dfx['sentiment'] = (dfx['sentiment'] == "positive").astype(int)

  # test_size is a fraction here: 10% of the rows. An integer 10 would hold
  # out only ten reviews, which makes the accuracy estimate meaningless.
  df_train, df_valid = model_selection.train_test_split(
      dfx, test_size=0.1, random_state=42,
      stratify=dfx.sentiment.values
    )

  df_train = df_train.reset_index(drop=True)
  df_valid = df_valid.reset_index(drop=True)

  train_dataset = BERT_Dataset(
      review=df_train.review.values,
      target=df_train.sentiment.values
  )

  train_data_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=TRAIN_BATCH_SIZE,
      shuffle=True,  # present the training data in a new order every epoch
      num_workers=4
  )

  valid_dataset = BERT_Dataset(
      review=df_valid.review.values,
      target=df_valid.sentiment.values
  )

  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size=VALID_BATCH_SIZE,
      num_workers=4
  )

  # Fall back to the CPU instead of crashing when no GPU is available.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = BertBaseUncased()
  model.to(device)

  named_params = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

  def _skips_decay(name):
    # True for parameters whose name matches one of the no_decay patterns.
    return any(nd in name for nd in no_decay)

  # Biases and LayerNorm parameters are exempt from weight decay; everything
  # else is regularized.
  optimizer_parameters = [
      {
          'params': [p for n, p in named_params if not _skips_decay(n)],
          'weight_decay': 0.001
      },
      {
          'params': [p for n, p in named_params if _skips_decay(n)],
          'weight_decay': 0.0
      }
  ]

  # The scheduler advances once per optimizer step, and the optimizer steps
  # once per ACCUMULATION batches (rounded up for a trailing partial window).
  steps_per_epoch = (len(train_data_loader) + ACCUMULATION - 1) // ACCUMULATION
  num_training_steps = steps_per_epoch * EPOCHS

  optimizer = AdamW(optimizer_parameters, lr=3e-5)
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps
  )

  best_accuracy = 0

  for epoch in range(EPOCHS):
    train_fn(train_data_loader, model, optimizer, device, scheduler)
    outputs, targets = eval_fn(valid_data_loader, model, device)
    # eval_fn returns probabilities; threshold them at 0.5 to get class labels.
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)

    print(f"Accuracy Score={accuracy}")

    if accuracy > best_accuracy:
      torch.save(model.state_dict(), MODEL_PATH)
      best_accuracy = accuracy