In [None]:
# Install dependencies
!pip uninstall -y tensorflow
!pip install transformers

Uninstalling tensorflow-2.2.0:
  Successfully uninstalled tensorflow-2.2.0
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.7MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 13.7MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e

In [None]:
# Mount Google Drive at /content/drive. The dataset CSVs, the loss/accuracy
# logs and the per-epoch checkpoints used later in this notebook all live
# under '/content/drive/My Drive/SentiHood/...'.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.optim import lr_scheduler

import logging
# Root logger threshold: only records at ERROR level or above are emitted.
logging.basicConfig(level=logging.ERROR)

import warnings
# NOTE(review): this suppresses *every* Python warning for the rest of the
# session, including deprecation notices from torch/transformers — consider
# narrowing the filter if API drift needs to be noticed.
warnings.filterwarnings("ignore")

In [None]:
class SentimentClassifier(nn.Module):
  """
  Sentiment classifier: a pre-trained BERT encoder whose pooled output is
  passed through dropout and one fully-connected layer.

  Args:
    BERT_MODEL: Model name or path handed to `BertModel.from_pretrained`.
    num_classes: Width of the output layer (number of classes). Defaults to 3.
  """

  def __init__(self, BERT_MODEL, num_classes=3):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

  def forward(self, ids, mask, token_type_ids):
    """
    Run the encoder and classify its pooled output.

    Args:
      ids: Token id tensor passed to BERT as the input ids.
      mask: Attention mask tensor.
      token_type_ids: Segment id tensor.

    Returns:
      The output of the final linear layer (one score per class); no softmax
      is applied here.
    """
    bert_outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)

    # Take the pooled output by position instead of tuple-unpacking the result.
    # Older transformers releases return a plain tuple, newer ones a dict-like
    # output object whose iteration yields field names (strings); integer
    # indexing works for both, unpacking does not.
    pooled_output = bert_outputs[1]

    output = self.drop(pooled_output)
    return self.out(output)

In [None]:
class SentiHood:
  """
  Map-style dataset that encodes each (text, auxiliary sentence) pair with the
  supplied BERT tokenizer and returns the resulting tensors with the target.

  Args:
    text: Indexable collection of input sentences.
    auxiliary_sentence: Indexable collection of auxiliary sentences, aligned
      with `text`.
    targets: Indexable collection of integer class labels, aligned with `text`.
    tokenizer: Object exposing a Hugging Face style `encode_plus` method.
    max_len: Fixed length every encoded example is padded/truncated to.
  """

  def __init__(self, text, auxiliary_sentence, targets, tokenizer, max_len):
    self.text = text
    self.auxiliary_sentence = auxiliary_sentence
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.targets = targets

  def __len__(self):
    """Number of examples, taken from the length of `targets`."""
    return len(self.targets)

  def __getitem__(self, item):
    """
    Encode example `item`.

    Returns:
      dict with long tensors under the keys "ids", "mask", "token_type_ids"
      and "targets".
    """
    text = str(self.text[item])
    auxiliary_sentence = str(self.auxiliary_sentence[item])
    targets = self.targets[item]

    # Encode as a genuine sentence pair rather than one concatenated string.
    # Concatenating with a space produced a single segment: no separator
    # between the two sentences and token_type_ids that were all zero, so the
    # segment ids fed to the model carried no information.
    inputs = self.tokenizer.encode_plus(
        text,
        auxiliary_sentence,
        add_special_tokens = True,
        max_length = self.max_len,
        # Explicit padding/truncation so every example has exactly max_len
        # tokens; `pad_to_max_length` is the deprecated spelling of this.
        padding = 'max_length',
        truncation = True
    )

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    return {
        "ids": torch.tensor(ids, dtype=torch.long),
        "mask": torch.tensor(mask, dtype=torch.long),
        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        "targets": torch.tensor(targets, dtype=torch.long)
    }

In [None]:
def loss_function(outputs, targets):
  """
  Training objective: cross-entropy between the model outputs and the target
  class indices, averaged over the batch.
  """
  return nn.functional.cross_entropy(outputs, targets, reduction='mean')


In [None]:
def train_loop_function(data_loader, model, optimizer, device,
                        log_path='/content/drive/My Drive/SentiHood/Bert-pair/NLI-M/Models/loss.txt',
                        log_every=10):
  """
  Run one pass of training over `data_loader`.

  Args:
    data_loader: Iterable of batches; each batch is a dict with the keys
      "ids", "mask", "token_type_ids" and "targets".
    model: Module called as `model(ids=..., mask=..., token_type_ids=...)`.
    optimizer: Optimizer whose `zero_grad()`/`step()` are called every batch.
    device: Device the batch tensors are moved to.
    log_path: File the running-loss lines are appended to.
    log_every: Report whenever the batch index is a non-zero multiple of this
      value. Must be a positive integer.
  """

  model.train()

  running_loss = 0.0
  batches_in_window = 0
  for bi, d in enumerate(data_loader):
    ids = d["ids"].to(device, dtype=torch.long)
    mask = d["mask"].to(device, dtype=torch.long)
    token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
    targets = d["targets"].to(device, dtype=torch.long)

    optimizer.zero_grad()

    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    loss = loss_function(outputs, targets)

    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    batches_in_window += 1
    if bi % log_every == 0 and bi != 0:
      # Divide by the number of batches actually accumulated: the first
      # window covers batches 0..log_every (one more than later windows), so
      # dividing by a constant overstated the first reported mean.
      temp = f'Batch index = {bi}\tLoss = {running_loss/batches_in_window}'
      print(temp)

      # Context manager guarantees the handle is closed even if write() fails.
      with open(log_path, 'a+') as f1:
        f1.write(temp + '\n')

      running_loss = 0.0
      batches_in_window = 0

In [None]:
def eval_loop_function(data_loader, model, device,
                       log_path='/content/drive/My Drive/SentiHood/Bert-pair/NLI-M/Models/accuracy.txt'):
  """
  Evaluate `model` on every batch of `data_loader` and report accuracy.

  Args:
    data_loader: Iterable of batches; each batch is a dict with the keys
      "ids", "mask", "token_type_ids" and "targets".
    model: Module called as `model(ids=..., mask=..., token_type_ids=...)`.
    device: Device the batch tensors are moved to.
    log_path: File a one-line summary (corrects, total, accuracy) is appended to.

  Returns:
    Accuracy as a percentage in [0, 100]; 0.0 when the loader yields no samples.
  """

  model.eval()

  corrects = 0
  total = 0
  # Inference only: disabling autograd avoids building a graph for every
  # forward pass, which wastes memory and time during evaluation.
  with torch.no_grad():
    for bi, d in enumerate(data_loader):
      ids = d["ids"].to(device, dtype=torch.long)
      mask = d["mask"].to(device, dtype=torch.long)
      token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
      targets = d["targets"].to(device, dtype=torch.long)

      outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

      # Predicted class = index of the largest score along dim 1.
      _, predicted = torch.max(outputs, 1)
      total = total + targets.size(0)
      corrects = corrects + (predicted==targets).sum().item()

      print(f"bi: {bi}\tPredicted: {predicted}\tTargets: {targets}")

  # Guard against an empty loader instead of raising ZeroDivisionError.
  accuracy = corrects / total * 100 if total else 0.0

  # Context manager guarantees the handle is closed even if write() fails.
  with open(log_path, 'a+') as f1:
    f1.write(f"Corrects: {corrects}\tTotal: {total}\tAccuracy: {accuracy}\n")

  return accuracy

In [None]:
def _load_labelled_split(csv_path, sentiment_mapping):
  """Read one dataset CSV and replace its `sentiment` column with integer class ids."""
  df = pd.read_csv(csv_path)

  # Depending on the pandas version, read_csv may parse the literal label
  # 'None' as a missing value (NaN). Restore it before mapping so that class
  # is not silently lost.
  labels = df['sentiment'].fillna('None').map(sentiment_mapping)
  if labels.isna().any():
    raise ValueError(f"Unmapped sentiment labels found in {csv_path}")

  df['sentiment'] = labels.astype(int)
  return df.reset_index(drop=True)


def _make_balanced_sampler(labels, num_classes):
  """Build a WeightedRandomSampler that draws each class with equal total weight; also returns the class counts."""
  class_counts = np.bincount(labels, minlength=num_classes)
  num_samples = len(labels)

  # Weight = num_samples / class_count; classes with no samples keep weight 0.
  class_weights = np.zeros(len(class_counts), dtype=np.float64)
  present = class_counts > 0
  class_weights[present] = num_samples / class_counts[present]

  sample_weights = torch.as_tensor(class_weights[labels], dtype=torch.double)
  sampler = torch.utils.data.sampler.WeightedRandomSampler(sample_weights, num_samples)
  return sampler, class_counts.tolist()


def run():
  """
  Define the hyperparameters, load the training and validation CSVs, build the
  data loaders, model, optimizer and LR scheduler, then train for EPOCHS
  epochs. After every epoch the model is evaluated on the validation set and
  saved to the Models directory on Drive.
  """

  TRAIN_MAX_LEN = 160
  VALID_MAX_LEN = 160
  TRAIN_BATCH_SIZE = 16
  VALID_BATCH_SIZE = 16
  EPOCHS = 10
  BERT_MODEL = 'bert-base-uncased'
  LEARNING_RATE = 3e-5
  NUM_CLASSES = 3

  training_set_path = '/content/drive/My Drive/SentiHood/Bert-pair/NLI-M/Datasets/training_set.csv'
  validation_set_path = '/content/drive/My Drive/SentiHood/Bert-pair/NLI-M/Datasets/validation_set.csv'
  model_dir = '/content/drive/My Drive/SentiHood/Bert-pair/NLI-M/Models/'

  sentiment_mapping = {
      'Positive': 0,
      'Negative': 1,
      'None': 2
  }
  df_train = _load_labelled_split(training_set_path, sentiment_mapping)
  df_valid = _load_labelled_split(validation_set_path, sentiment_mapping)

  tokenizer = transformers.BertTokenizer.from_pretrained(BERT_MODEL)

  train_dataset = SentiHood(
      text = df_train['text'].values,
      auxiliary_sentence = df_train['auxiliary_sentence'].values,
      targets = df_train['sentiment'].values,
      tokenizer = tokenizer,
      max_len = TRAIN_MAX_LEN
  )
  print(f"Training Set: {len(train_dataset)}")

  # Custom sampler to compensate class imbalance in the dataset
  sampler, class_counts = _make_balanced_sampler(df_train['sentiment'].values, NUM_CLASSES)
  print(f"Class Counts: {class_counts}")

  train_data_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size = TRAIN_BATCH_SIZE,
      shuffle = False,  # ordering is decided by the sampler
      sampler = sampler
  )

  valid_dataset = SentiHood(
      text = df_valid['text'].values,
      # Must come from the validation frame: using the training frame here
      # paired validation texts with unrelated auxiliary sentences.
      auxiliary_sentence = df_valid['auxiliary_sentence'].values,
      targets = df_valid['sentiment'].values,
      tokenizer = tokenizer,
      max_len = VALID_MAX_LEN
  )
  print(f"Validation Set: {len(valid_dataset)}")

  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size = VALID_BATCH_SIZE,
      shuffle = False
  )

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")

  model = SentimentClassifier(BERT_MODEL)
  model = model.to(device)

  optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

  # Multiply the learning rate by 0.8 after every epoch.
  scheduler = lr_scheduler.StepLR(
      optimizer,
      step_size = 1,
      gamma = 0.8
  )

  for epoch in range(EPOCHS):
    train_loop_function(data_loader=train_data_loader, model=model, optimizer=optimizer, device=device)
    accuracy = eval_loop_function(data_loader=valid_data_loader, model=model, device=device)

    print(f"\nEpoch = {epoch}\tAccuracy Score = {accuracy}")
    # get_last_lr() reports the rate the optimizer actually used this epoch;
    # calling get_lr() outside of step() applies gamma again for StepLR and
    # so misreports the value after the first epoch.
    print(f"Learning Rate = {scheduler.get_last_lr()[0]}\n")

    scheduler.step()

    # NOTE(review): this pickles the whole module rather than its state_dict;
    # kept unchanged because existing loaders presumably expect this format.
    torch.save(model, model_dir + str(epoch) + '.bin')

# Entry point: start training when this cell/script is executed directly
# (the captured output below this cell shows the run starting).
if __name__ == "__main__":
  run()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Training Set: 45024
Class Counts: [2474, 921, 41629]
Validation Set: 11244
Device: cuda:0


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


Batch index = 10	Loss = 1.2579200267791748
Batch index = 20	Loss = 1.0927840650081635
Batch index = 30	Loss = 1.0368426203727723
Batch index = 40	Loss = 0.9834808886051178
