<a href="https://colab.research.google.com/github/Nix07/Utilizing-BERT-for-Aspect-Based-Sentiment-Analysis/blob/master/Bert-single_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip uninstall -y tensorflow
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 7.1MB/s 
[?25hCollecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 23.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 58.6MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

In [None]:
# Mount Google Drive at /content/drive; the training/validation CSVs and every
# output file used later in this notebook live under '/content/drive/My Drive/'.
# NOTE(review): Colab-only — `google.colab` is presumably unavailable elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.optim import lr_scheduler

import logging
# Configure the root logger so that only records at ERROR level or above are emitted.
logging.basicConfig(level=logging.ERROR)

In [None]:
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a single linear classification layer.

  Args:
    BERT_MODEL: Model name or path passed to ``BertModel.from_pretrained``.
    num_classes: Width of the output layer. Defaults to 3, the number of
      sentiment classes used by this notebook (Positive / Negative / None).
  """

  def __init__(self, BERT_MODEL, num_classes=3):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

  def forward(self, ids, mask, token_type_ids, device=None, tokenizer=None):
    """Return the classification layer's output for one batch.

    Args:
      ids: Token ids, forwarded to the encoder as its first argument.
      mask: Forwarded to the encoder as ``attention_mask``.
      token_type_ids: Forwarded to the encoder as ``token_type_ids``.
      device: Unused; kept so existing call sites keep working.
      tokenizer: Unused; kept so existing call sites keep working.
    """
    encoder_outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
    # Take the pooled output by position instead of tuple-unpacking: unpacking
    # only works when the encoder returns exactly two items, and iterating a
    # dict-like model output yields its keys rather than its tensors.
    pooled_output = encoder_outputs[1]
    return self.out(self.drop(pooled_output))

In [None]:
class SentiHood:
  """Map-style dataset that tokenizes one text per item and pairs it with its label.

  Args:
    text: Indexable collection of texts; each entry is converted with ``str``.
    targets: Indexable collection of labels, aligned with ``text``.
    tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer in this notebook).
    max_len: Length every encoded sequence is padded or truncated to.
  """

  def __init__(self, text, targets, tokenizer, max_len):
    self.text = text
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.targets = targets

  def __len__(self):
    return len(self.targets)

  def __getitem__(self, item):
    """Return a dict of long tensors: ids, mask, token_type_ids and targets."""
    text = str(self.text[item])
    targets = self.targets[item]

    # Ask for padding AND truncation explicitly. `pad_to_max_length` is
    # deprecated, and without an explicit truncation request the tokenizer is
    # not obliged to cut long texts, which would give items of different
    # lengths that cannot be stacked into a batch.
    inputs = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        padding="max_length",
        truncation=True
    )

    return {
        "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
        "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
        "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        "targets": torch.tensor(targets, dtype=torch.long)
    }

In [None]:
# def load_data(training_set_path, validation_set_path, location, aspect):
#   df_train = pd.read_csv(training_set_path)
#   df_valid = pd.read_csv(validation_set_path)

#   # Extract LOCATION1 and aspect related data
#   df_train_input = df_train[(df_train['target_entity']==location) & (df_train['aspect']==aspect)]
#   df_valid_input = df_valid[(df_valid['target_entity']==location) & (df_valid['aspect']==aspect)]
  
#   # Add "None" related data
#   df_train_temp = df_train[(df_train['target_entity']!=location) | (df_train['aspect']!=aspect)]
#   df_train_temp = df_train_temp.assign(sentiment='None')
#   df_valid_temp = df_valid[(df_valid['target_entity']!=location) | (df_valid['aspect']!=aspect)]
#   df_valid_temp = df_valid_temp.assign(sentiment='None')

#   # Merging the "None" sentiment data
#   df_train_input = pd.concat([df_train_input, df_train_temp])
#   df_valid_input = pd.concat([df_valid_input, df_valid_temp])

#   sentiment_mapping = {
#       'Positive': 0,
#       'Negative': 1,
#       'None': 2
#   }

#   df_train_input['sentiment'] = df_train_input['sentiment'].map(sentiment_mapping)
#   df_valid_input['sentiment'] = df_valid_input['sentiment'].map(sentiment_mapping)

#   df_train_input = df_train_input.reset_index(drop=True)
#   df_valid_input = df_valid_input.reset_index(drop=True)

#   return (df_train_input, df_valid_input)

In [None]:
def loss_function(outputs, targets, verbose=True):
  """Mean cross-entropy between the model outputs and the integer targets.

  The loss itself is NOT class-weighted; no weight tensor is passed to the
  criterion here.

  Args:
    outputs: Model outputs, one row of class scores per sample.
    targets: Integer class index per sample.
    verbose: When True (the default, matching the previous behaviour) the
      arg-max prediction and the targets of the batch are printed. Pass
      False to keep long training runs from flooding the cell output.

  Returns:
    The scalar loss tensor.
  """
  if verbose:
    _, predicted = torch.max(outputs, 1)
    print(f"Predicted = {predicted.cpu().detach().numpy()}\nTargets = {targets}")

  return nn.functional.cross_entropy(outputs, targets, reduction='mean')


In [None]:
def train_loop_function(data_loader, model, optimizer, device, location, aspect, tokenizer=None, scheduler=None):
  """Train ``model`` for one pass over ``data_loader``.

  Every batch is moved to ``device``, run through the model, scored with
  ``loss_function`` and used for one optimizer step. At every 10th batch index
  (excluding 0) the mean loss since the previous report is printed and appended
  to ``loss.txt`` in the folder for this location/aspect pair.

  Args:
    data_loader: Iterable of dicts with "ids", "mask", "token_type_ids", "targets".
    model: Module called with ids/mask/token_type_ids/device/tokenizer keywords.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    location: Location name; part of the log folder name.
    aspect: Aspect name; part of the log folder name.
    tokenizer: Forwarded to the model unchanged.
    scheduler: Accepted for interface compatibility; not used inside this function.
  """
  import os

  log_dir = '/content/drive/My Drive/SentiHood/LocationAspectPairs(NEW)/' + str(location) + str(aspect)
  log_path = log_dir + '/loss.txt'

  model.train()

  running_loss = 0.0
  batches_in_window = 0
  for bi, d in enumerate(data_loader):
    ids = d["ids"].to(device, dtype=torch.long)
    mask = d["mask"].to(device, dtype=torch.long)
    token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
    targets = d["targets"].to(device, dtype=torch.long)

    optimizer.zero_grad()

    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids, device=device, tokenizer=tokenizer)
    loss = loss_function(outputs, targets)

    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    batches_in_window += 1
    if bi % 10 == 0 and bi != 0:
      # Divide by the number of batches actually accumulated: the first window
      # spans batch indices 0..10 (11 batches), later windows span 10.
      temp = f'Batch index = {bi}\tLoss = {running_loss/batches_in_window}'
      print(temp)

      # Create the folder on demand so a missing directory does not abort training.
      os.makedirs(log_dir, exist_ok=True)
      with open(log_path, 'a+') as f1:
        f1.write(temp + '\n')

      running_loss = 0.0
      batches_in_window = 0

In [None]:
def eval_loop_function(data_loader, model, device, tokenizer, location, aspect):
  """Evaluate ``model`` on ``data_loader`` and return the accuracy in percent.

  The arg-max of the model output is compared with the targets of every batch.
  A summary line (corrects, total, accuracy) is appended to ``accuracy.txt`` in
  the folder for this location/aspect pair.

  Args:
    data_loader: Iterable of dicts with "ids", "mask", "token_type_ids", "targets".
    model: Module called with ids/mask/token_type_ids/device/tokenizer keywords.
    device: Device the batch tensors are moved to.
    tokenizer: Forwarded to the model unchanged.
    location: Location name; part of the log folder name.
    aspect: Aspect name; part of the log folder name.

  Returns:
    Accuracy as a percentage; 0.0 when the loader yields no samples.
  """
  import os

  log_dir = '/content/drive/My Drive/SentiHood/LocationAspectPairs(NEW)/' + str(location) + str(aspect)

  model.eval()

  corrects = 0
  total = 0
  # No gradients are needed for evaluation; disabling them avoids building
  # the autograd graph for every validation batch.
  with torch.no_grad():
    for bi, d in enumerate(data_loader):
      ids = d["ids"].to(device, dtype=torch.long)
      mask = d["mask"].to(device, dtype=torch.long)
      token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
      targets = d["targets"].to(device, dtype=torch.long)

      outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids, device=device, tokenizer=tokenizer)

      _, predicted = torch.max(outputs, 1)
      total = total + targets.size(0)
      corrects = corrects + (predicted==targets).sum().item()

      print(f"bi: {bi}\tPredicted: {predicted}\tTargets: {targets}")

  # Guard against an empty loader instead of raising ZeroDivisionError.
  accuracy = corrects / total * 100 if total else 0.0

  os.makedirs(log_dir, exist_ok=True)
  with open(log_dir + '/accuracy.txt', 'a+') as f1:
    f1.write(f"Corrects: {corrects}\tTotal: {total}\tAccuracy: {accuracy}\n")

  return accuracy

In [None]:
def _load_labelled_split(csv_path, sentiment_mapping):
  """Read one CSV split and replace its 'sentiment' labels with integer class ids."""
  # keep_default_na=False: the literal label 'None' is a real class here and
  # must not be parsed as a missing value by read_csv.
  df = pd.read_csv(csv_path, keep_default_na=False)
  df['sentiment'] = df['sentiment'].map(sentiment_mapping)
  if df['sentiment'].isnull().any():
    # Fail early with a clear message instead of feeding NaN targets to torch.
    raise ValueError(f"{csv_path}: 'sentiment' has values outside {list(sentiment_mapping)}")
  df['sentiment'] = df['sentiment'].astype('int64')
  return df.reset_index(drop=True)


def _make_balanced_sampler(labels, num_classes):
  """Build a WeightedRandomSampler that weights each sample by the inverse frequency of its class.

  Returns the sampler together with the per-class counts (as a list).
  """
  class_counts = np.bincount(labels, minlength=num_classes)
  num_samples = int(class_counts.sum())

  # Classes that never occur keep weight 0, as before.
  class_weights = np.zeros(len(class_counts), dtype=np.float64)
  present = class_counts > 0
  class_weights[present] = num_samples / class_counts[present]

  sample_weights = torch.as_tensor(class_weights[labels], dtype=torch.double)
  sampler = torch.utils.data.sampler.WeightedRandomSampler(sample_weights, num_samples)
  return sampler, class_counts.tolist()


def run():
  """Train and evaluate one classifier per (location, aspect) pair.

  For every pair the matching training/validation CSVs are read from Google
  Drive, a fresh SentimentClassifier is trained for EPOCHS epochs with a
  class-balancing sampler, validation accuracy is reported after each epoch,
  and the model is saved once per epoch into the pair's output folder.
  """
  import os

  TRAIN_MAX_LEN = 140
  VALID_MAX_LEN = 140
  TRAIN_BATCH_SIZE = 16
  VALID_BATCH_SIZE = 16
  EPOCHS = 10
  BERT_MODEL = 'bert-base-uncased'
  LEARNING_RATE = 3e-5

  locations = ['LOCATION1', 'LOCATION2']
  aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']

  sentiment_mapping = {
      'Positive': 0,
      'Negative': 1,
      'None': 2
  }

  # Loop-invariant: the tokenizer and the device are the same for every pair.
  tokenizer = transformers.BertTokenizer.from_pretrained(BERT_MODEL)
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  for location in locations:
    for aspect in aspects:
      print(f"Starting {location} {aspect}...")
      training_set_path = '/content/drive/My Drive/SentiHood/TrainingData/' + str(location) + str(aspect) + '.csv'
      validation_set_path = '/content/drive/My Drive/SentiHood/ValidationData/' + str(location) + str(aspect) + '.csv'
      output_dir = '/content/drive/My Drive/SentiHood/LocationAspectPairs(NEW)/' + str(location) + str(aspect)
      # torch.save below needs the folder to exist.
      os.makedirs(output_dir, exist_ok=True)

      # Loading and processing Training & Validation sets
      df_train = _load_labelled_split(training_set_path, sentiment_mapping)
      df_valid = _load_labelled_split(validation_set_path, sentiment_mapping)

      train_dataset = SentiHood(
          text = df_train['text'].values,
          targets = df_train['sentiment'].values,
          tokenizer = tokenizer,
          max_len = TRAIN_MAX_LEN
      )
      print(f"Training Set: {len(train_dataset)}")

      # Oversample rare classes: each sample is drawn with probability
      # proportional to the inverse frequency of its class.
      sampler, class_counts = _make_balanced_sampler(df_train['sentiment'].values, len(sentiment_mapping))
      print(f"Class Counts: {class_counts}")

      train_data_loader = torch.utils.data.DataLoader(
          train_dataset,
          batch_size = TRAIN_BATCH_SIZE,
          shuffle = False,
          sampler = sampler
      )

      valid_dataset = SentiHood(
          text = df_valid['text'].values,
          targets = df_valid['sentiment'].values,
          tokenizer = tokenizer,
          max_len = VALID_MAX_LEN
      )
      print(f"Validation Set: {len(valid_dataset)}")

      valid_data_loader = torch.utils.data.DataLoader(
          valid_dataset,
          batch_size = VALID_BATCH_SIZE,
          shuffle = False
      )

      print(f"Device: {device}")

      model = SentimentClassifier(BERT_MODEL)
      model = model.to(device)

      optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

      # Multiply the learning rate by 0.8 after every epoch.
      scheduler = lr_scheduler.StepLR(
          optimizer,
          step_size = 1,
          gamma = 0.8
      )

      for epoch in range(EPOCHS):
        train_loop_function(data_loader=train_data_loader, model=model, optimizer=optimizer, device=device, location=location, aspect=aspect, tokenizer=tokenizer, scheduler=scheduler)
        accuracy = eval_loop_function(data_loader=valid_data_loader, model=model, device=device, tokenizer=tokenizer, location=location, aspect=aspect)

        print(f"\nEpoch = {epoch}\tAccuracy Score = {accuracy}")
        # get_last_lr() reports the rate that was actually in effect;
        # get_lr() is only meaningful while the scheduler itself is stepping.
        print(f"Learning Rate = {scheduler.get_last_lr()[0]}\n")

        scheduler.step()

        # NOTE(review): this pickles the whole module once per epoch; saving
        # model.state_dict() would be smaller, but it changes what downstream
        # loaders receive, so the format is left as is.
        torch.save(model, output_dir + '/' + str(epoch) + '.bin')

      # Drop this pair's model before the next one is built so two models
      # are never resident on the device at once.
      del model, optimizer, scheduler
      if torch.cuda.is_available():
        torch.cuda.empty_cache()

if __name__ == "__main__":
  run()

Starting LOCATION2 dining...
Training Set: 775
Class Counts: [13, 0, 762]
Validation Set: 190
Device: cuda:0
Predicted = [1 1 1 1 1 2 2 2 1 2 0 2 0 2 1 2]
Targets = tensor([2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Predicted = [2 0 0 2 0 2 1 2 0 2 1 2 2 2 0 2]
Targets = tensor([2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2], device='cuda:0')
Predicted = [0 2 0 0 0 0 0 0 0 0 2 0 0 2 2 1]
Targets = tensor([0, 2, 0, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2], device='cuda:0')
Predicted = [0 0 0 0 0 2 0 0 2 0 0 0 2 2 0 0]
Targets = tensor([2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2], device='cuda:0')
Predicted = [0 0 0 0 0 0 0 2 2 0 0 2 0 0 2 2]
Targets = tensor([0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2], device='cuda:0')
Predicted = [0 0 0 0 0 2 0 0 2 0 2 2 0 2 0 0]
Targets = tensor([0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2], device='cuda:0')
Predicted = [2 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0]
Targets = tensor([2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0]

  "type " + obj.__name__ + ". It won't be checked "


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Targets = tensor([0, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2], device='cuda:0')
Predicted = [1 2 0 0 0 0 1 0 2 0 1 0 0 1 0 2]
Targets = tensor([1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 1, 0, 2], device='cuda:0')
Predicted = [0 0 1 2 0 2 1 2 2 0 2 0 2 2 1 1]
Targets = tensor([0, 0, 1, 2, 0, 2, 1, 2, 2, 0, 2, 0, 2, 2, 1, 1], device='cuda:0')
Predicted = [1 0 2 1 0 0 2]
Targets = tensor([1, 0, 2, 1, 0, 0, 2], device='cuda:0')
bi: 0	Predicted: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')	Targets: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')
bi: 1	Predicted: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')	Targets: tensor([2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')
bi: 2	Predicted: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')	Targets: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2], d