<a href="https://colab.research.google.com/github/PurvaChiniya/Aspect-based-sentiment-analysis/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies
!pip uninstall -y tensorflow
!pip install transformers



In [2]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim


import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import StepLR
import logging
logging.basicConfig(level=logging.ERROR)
from transformers import logging
logging.set_verbosity_warning()
import warnings
warnings.filterwarnings("ignore")
# declare parameters
max_length = 160
batch_size = 32
epochs = 2
lr  = 0.01
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']
label_to_int = {
    'Positive': 0,
    'Negative': 1,
    'None': 2
}
# load data
train_data = pd.read_csv('/content/training_set.csv')[:1000]
val_data = pd.read_csv('/content/validation_set.csv')[:600]
#convert text sentiment to int labels
train_data["sentiment"]=train_data.apply(lambda row:label_to_int[row.sentiment],axis=1)
val_data["sentiment"]=val_data.apply(lambda row:label_to_int[row.sentiment],axis=1)
# drop values
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased',do_lowercase=True)
labels = train_data['sentiment'].values



In [3]:
print(train_data)

       id  ... sentiment
0    1430  ...         2
1    1430  ...         2
2    1430  ...         2
3    1430  ...         2
4    1430  ...         2
..    ...  ...       ...
995   760  ...         2
996  1605  ...         2
997  1605  ...         0
998  1605  ...         2
999  1605  ...         2

[1000 rows x 4 columns]


In [4]:
class SentiHood:
  """
  This class tokenizes the input text using the pre-trained BERT tokenizer 
  (wordpiece) and returns the corresponding tensors.
  """
  
  def __init__(self, text, auxiliary_sentence, targets, tokenizer, max_len):
    self.text = text
    self.auxiliary_sentence = auxiliary_sentence
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.targets = targets

  def __len__(self):
    return len(self.targets)

  def __getitem__(self, item):
    text = str(self.text[item])
    auxiliary_sentence = str(self.auxiliary_sentence[item])
    targets = self.targets[item]
    text = text + ' ' + auxiliary_sentence

    inputs = self.tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        max_length = self.max_len,
        pad_to_max_length = True
    )

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    return {"token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        "targets": torch.tensor(targets, dtype=torch.long),
        "ids": torch.tensor(ids, dtype=torch.long),
        "mask": torch.tensor(mask, dtype=torch.long),
        
    }


In [5]:
# define training dataloader
train_dataset = SentiHood(
    text = train_data['text'].values,
    auxiliary_sentence = train_data['auxiliary_sentence'],
    targets = train_data['sentiment'].values,
    tokenizer = tokenizer,
    max_len = max_length
)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size = batch_size,
    shuffle = False
)
print(len(train_dataset))
print(len(train_data_loader))

1000
32


In [6]:
# define val dataloader
val_dataset = SentiHood(
    text = val_data['text'].values,
    auxiliary_sentence = train_data['auxiliary_sentence'],
    targets = val_data['sentiment'].values,
    tokenizer = tokenizer,
    max_len = max_length
)
val_data_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size = batch_size,
    shuffle = False
)
print(len(val_dataset))
print(len(val_data_loader))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")



600
19
Device: cpu


In [7]:
class Model(nn.Module):

  def __init__(self, bert_model):
    super(Model, self).__init__()
    self.bert = BertModel.from_pretrained(bert_model, return_dict=False)
    self.drop = nn.Dropout(p=0.5)
    self.out = nn.Linear(self.bert.config.hidden_size, 3) # Number of output classes = 3

  def forward(self, ids, mask, token_type_ids):
    last_hidden_state, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
    output = self.drop(pooled_output)
    return self.out(output)
model = Model("bert-base-uncased")
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# to initalize weights in proportion to imbalance in classes
class_counts = []
for i in range(3):
  class_counts.append(train_data[train_data['sentiment']==i].shape[0])
print(f"Class Counts: {class_counts}")
num_train_steps = int(len(train_dataset) / batch_size * epochs)
optimizer=optim.Adagrad(model.parameters(),lr=lr)
scheduler =StepLR(
    optimizer,
    gamma=0.8,
    step_size = 1
)
print(num_train_steps)

Class Counts: [74, 30, 896]
62


In [10]:

len(train_data)

1000

In [20]:
def train_function(data_loader, model, optimizer, device):
  """
  This function defines the training loop over the entire training set.
  """

  model.train()

  running_loss = 0.0
  for bi, d in enumerate(data_loader):
    ids = d["ids"]
    mask = d["mask"]
    token_type_ids = d["token_type_ids"]
    targets = d["targets"]

    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    targets = targets.to(device, dtype=torch.long)

    optimizer.zero_grad()
    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    summed = sum(class_counts)
    weight = torch.tensor(class_counts) / summed
    loss = nn.CrossEntropyLoss(weight=weight)(outputs, targets)

    
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if bi % 10 == 0 and bi!=0:
      temp = f'Batch index = {bi}\tLoss = {running_loss/10}'
      print(temp)

      f1 = open('/content/loss.txt', 'a+')
      temp = temp + '\n'
      f1.write(temp)
      f1.close()

      running_loss = 0.0

In [11]:
def eval_function(data_loader, model, device):
  """
  This function defines the evaluation loop over the entire validation set.
  It also computes accuracy of the trained model, which is used to select the 
  best model.
  """
  
  model.eval()

  corrects = 0
  total = 0
  for bi, d in enumerate(data_loader):
    ids = d["ids"]
    mask = d["mask"]
    token_type_ids = d["token_type_ids"]
    targets = d["targets"]

    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    targets = targets.to(device, dtype=torch.long)

    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

    _, predicted = torch.max(outputs, 1)
    total = total + targets.size(0)
    corrects = corrects + (predicted==targets).sum().item()

    print(f"bi: {bi}\tPredicted: {predicted}\tTargets: {targets}")

  accuracy = corrects / total * 100
  f1 = open('/content/accuracy.txt', 'a+')
  temp = f"Corrects: {corrects}\tTotal: {total}\tAccuracy: {accuracy}\n"
  f1.write(temp)
  f1.close()

  return accuracy

# Training

In [15]:
for epoch in range(epochs):
  train_function(data_loader=train_data_loader, model=model, optimizer=optimizer, device=device)
  accuracy = eval_function(data_loader=val_data_loader, model=model, device=device)

  print(f"\nEpoch = {epoch}\tAccuracy Score = {accuracy}")
  print(f"Learning Rate = {scheduler.get_lr()[0]}\n")

  scheduler.step()

  torch.save(model, '/content/' + str(epoch) + '.bin')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Batch index = 10	Loss = 1.2250111788511275
Batch index = 20	Loss = 0.13874215632677078
Batch index = 30	Loss = 0.11299104765057563
bi: 0	Predicted: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])	Targets: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])
bi: 1	Predicted: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])	Targets: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])
bi: 2	Predicted: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])	Targets: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])
bi: 3	Predicted: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2,