# **Task 1**

In [1]:
!pip install tqdm
!pip install transformers
!conda install pytorch torchvision torchaudio cpuonly -c pytorch
!pip install -U scikit-learn
# In Google collab, change runtime-GPU
import os
import requests
os.makedirs("data/semeval2014-task10", exist_ok=True)
# download train data and save it to disk
resp = requests.get("https://raw.githubusercontent.com/pedrobalage/SemevalAspectBasedSentimentAnalysis/master/semeval_data/Restaurants_Train.xml")
if resp.status_code != 200:
    raise RuntimeError("Error while downloading dataset! (status-code = %i)" % reps.status_code)
with open("data/semeval2014-task10/restaurants-train.xml", "wb+") as f:
    f.write(resp.content)
# download test data and save it to disk
resp = requests.get("https://raw.githubusercontent.com/HSLCY/ABSA-BERT-pair/master/data/semeval2014/Restaurants_Test_Gold.xml")
if resp.status_code != 200:
    raise RuntimeError("Error while downloading dataset! (status-code = %i)" % reps.status_code)
with open("data/semeval2014-task10/restaurants-trial.xml", "wb+") as f:
    f.write(resp.content)

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp38-cp38-win_amd64.whl (3.3 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.4.0 sacremoses-0.0.47 tokenizers-0.11.5 transformers-4.16.2
^C
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp38-cp38-win_amd64.whl (7.2 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'c:\\programdata\\anaconda3\\lib\\site-packages\\scikit_learn-0.24.1.dist-info\\COPYING'
Consider using the `--user` option or check the permissions.



In [2]:
!ls data/semeval2014-task10

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from tqdm.notebook import tqdm
import xml.etree.ElementTree as ET

DATA_PATH = "data/semeval2014-task10"
assert os.path.isdir(DATA_PATH)

ModuleNotFoundError: No module named 'torch'

In [None]:
# set device
device = 'cuda:0' # 'cpu'
# pretrained bert model (provided by transformers package)
pretrained_bert_model = "bert-base-uncased"
# training parameters
epochs = 3
batch_size = 12
max_seq_length = 64
learning_rate = 1e-5

In [None]:
# full list of labels to consider for prediction task
label_list = ['positive', 'neutral', 'negative', 'conflict']

## Create the Tokenizer, Model and Optimizer

In [None]:
# load the pretrained bert model and push it to the device
# this might take a while as the model needs to be downloaded on the first execution
model = transformers.BertForSequenceClassification.from_pretrained(pretrained_bert_model, num_labels=len(label_list))
model = model.to(device)
# create the tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_bert_model)
# use the default adam optimizer
optim = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# some statistics on the model and tokenizer
# count number of parameters
number_of_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print statistics
print("Vocab size:\t\t\t", model.config.vocab_size)
print("Number of layers:\t\t", model.config.num_hidden_layers)
print("Number of attention heads:\t", model.config.num_attention_heads)
print("Hidden Layer Dimension:\t\t", model.config.hidden_size)
print("Number of parameters:\t\t", number_of_params)

# **Task 2**

## Preprocess Train and Test Datasets

In [None]:
def get_input_features(text, aspect, tokenizer, max_length=None):
    enc_out = tokenizer.encode_plus(
        text=text,
        text_pair=aspect,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_attention_mask=False,
        return_token_type_ids=True
    )
    return enc_out.input_ids, enc_out.token_type_ids
    
def build_dataset(fpath, tokenizer):
    all_input_ids, all_token_type_ids, all_label_ids = [], [], []
    # parse xml-file
    tree = ET.parse(fpath)    
    for sent in tree.iter("sentence"):
        # get sentence text
        text = sent.find("text").text
        # loop over all aspects
        for aspect in sent.iter("aspectTerm"):
            term = aspect.attrib['term']
            polarity = aspect.attrib['polarity']
            if polarity not in label_list:
                continue
            # build input features
            input_ids, token_type_ids = get_input_features(text, term, tokenizer, max_length=max_seq_length)
            all_input_ids.append(input_ids)
            all_token_type_ids.append(token_type_ids)
            all_label_ids.append(label_list.index(polarity))

        # loop over all aspect categories
        for aspect in sent.iter("aspectCategory"):
            term = aspect.attrib['category']
            polarity = aspect.attrib['polarity']
            if polarity not in label_list:
                continue
            # build input features
            input_ids, token_type_ids = get_input_features(text, term, tokenizer, max_length=max_seq_length)
            all_input_ids.append(input_ids)
            all_token_type_ids.append(token_type_ids)
            all_label_ids.append(label_list.index(polarity))

    return torch.LongTensor(all_input_ids), torch.LongTensor(all_token_type_ids), torch.LongTensor(all_label_ids)

In [None]:
# full paths to train and test files
train_fpath = os.path.join(DATA_PATH, "restaurants-train.xml")
test_fpath = os.path.join(DATA_PATH, "restaurants-trial.xml")
# create train and test datasets
train_data = torch.utils.data.TensorDataset(*build_dataset(train_fpath, tokenizer))
test_data = torch.utils.data.TensorDataset(*build_dataset(test_fpath, tokenizer))
# create dataloaders from the datasets
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)
# some basic stats
print("Train Size:", len(train_data))
print("Test Size: ", len(test_data))

## stats on dataset

In [None]:
# count the number of occurances of each relation in both datasets
unique_train_label_ids, train_counts = train_data.tensors[2].unique(return_counts=True)
unique_test_label_ids, test_counts = test_data.tensors[2].unique(return_counts=True)
# order counts by labels
train_counts = train_counts[torch.argsort(unique_train_label_ids)].tolist()
test_counts = test_counts[torch.argsort(unique_test_label_ids)].tolist()
# plot
fig, ax = plt.subplots(1, 1)
ax.bar(label_list, train_counts)
ax.bar(label_list, test_counts)
ax.set_xticklabels(label_list, rotation='vertical')
ax.set(ylabel="#Occurances", title="Number of Occurances per Label")
ax.legend(['train', 'test'])
plt.show()

In [None]:
# compute length of each sentence
train_lengths = (train_data.tensors[0] != tokenizer.pad_token_id).sum(dim=1)
test_lengths = (test_data.tensors[0] != tokenizer.pad_token_id).sum(dim=1)
# count occurances of different lengths
train_lengths, train_counts = train_lengths.unique(return_counts=True)
test_lengths, test_counts = test_lengths.unique(return_counts=True)
# plot
fig, ax = plt.subplots(1, 1)
ax.bar(train_lengths, train_counts)
ax.bar(test_lengths, test_counts)
ax.set(xlabel="#Tokens", ylabel="#Occurances", title="Histogram of input lengths")
ax.legend(['train', 'test'])
plt.show()

# **Task 3**

## Finetuning the Model

In [None]:
train_losses, test_losses = [], []
f1_scores = []
# train loop
for e in range(1, 1+epochs):
    # train the model for one epoch
    model.train()
    running_loss = 0
    tbar = tqdm(train_dataloader, desc="Epoch %i" % e, leave=False)
    for i, (input_ids, token_type_ids, labels) in enumerate(tbar, 1):
        # move to device
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)
        # create attention mask and predict logits
        attention_mask = input_ids != tokenizer.pad_token_id
        logits = model.forward(
            input_ids=input_ids, 
            token_type_ids=token_type_ids, 
            attention_mask=attention_mask
        ).logits
        # compute loss
        loss = F.cross_entropy(logits, labels)
        running_loss += loss.item()
        # update parameters
        optim.zero_grad()
        loss.backward()
        optim.step()
        # update progress bar
        tbar.set_postfix({'loss': running_loss/i})
        
    # add train loss to list
    train_losses.append(running_loss / len(train_dataloader))
        
    # evaulate model
    with torch.no_grad():
        model.eval()
        running_loss = 0
        true_labels, pred_labels = [], []
        for input_ids, token_type_ids, labels in tqdm(test_dataloader, leave=False, desc="Evaluating"):
            # move to device
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            # create attention mask and predict logits
            attention_mask = input_ids != tokenizer.pad_token_id
            logits = model.forward(
                input_ids=input_ids, 
                token_type_ids=token_type_ids, 
                attention_mask=attention_mask
            ).logits
            # compute loss
            running_loss += F.cross_entropy(logits, labels.to(device)).item()
            # save target labels and predictions in list
            true_labels += labels.tolist()
            pred_labels += logits.max(1).indices.cpu().tolist()
            
    # compute fscores for each label
    f1_scores.append(f1_score(true_labels, pred_labels, average=None))
    # add test loss to list
    test_losses.append(running_loss / len(test_dataloader))

    # compute micro and macro f1-scores
    micro_f1 = f1_score(true_labels, pred_labels, average='micro')
    macro_f1 = f1_score(true_labels, pred_labels, average='macro')
    
    print("Epoch: %i - Train Loss: %.04f - Test Loss: %.04f - Micro F1-Score: %.04f - Macro F1-Score: %.04f" % (
        e, train_losses[-1], test_losses[-1], micro_f1, macro_f1))
        

## Observation:
   As the epoch value increased there is a increase in F1-Score

## Evaluation Metrics and Confusion Matrix

In [None]:
fig, ax = plt.subplots(1, 1)
ax.plot(train_losses)
ax.plot(test_losses)
ax.legend(["train", "test"])
ax.set(xlabel="Epoch", ylabel="Loss", title="Train and Test Loss")
plt.show()

In [None]:
fig, ax = plt.subplots(1, 1)
ax.plot(f1_scores)
ax.legend(label_list)
ax.set(xlabel="Epoch", ylabel="F1-Score", title="F1-Scores per label")
plt.show()
# print final scores
for label, score in zip(label_list, f1_scores[-1]):
    print("Final F1-Score for %s:\t%.04f" % (label, score))

In [None]:
cm = confusion_matrix(pred_labels, true_labels, labels=range(len(label_list)), normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_list)
disp.plot()
disp.ax_.set_title("Confusion Matrix")
disp.ax_.set_xticklabels(label_list, rotation='vertical')
disp.figure_.set_figwidth(8)
disp.figure_.set_figheight(8)
plt.show()

## Applications or Examples 

In [None]:
text = "I was tempted to buy this product as I really like its design, but its price is not very good"
aspects = ["design", "price"]

# get input features from text
in_feats_zipped = [get_input_features(text, aspect, tokenizer) for aspect in aspects]
input_ids, token_type_ids = zip(*in_feats_zipped)
# convert to tensors and push to device
# also compute attention mask
input_ids = torch.LongTensor(input_ids).to(device)
token_type_ids = torch.LongTensor(token_type_ids).to(device)
attention_mask = input_ids != tokenizer.pad_token_id
# pass through model and get prediction
model.eval()
with torch.no_grad():
    logits = model.forward(
        input_ids=input_ids, 
        token_type_ids=token_type_ids, 
        attention_mask=attention_mask
    ).logits
    label_ids = logits.max(dim=1).indices.cpu()
# get labels from ids
predicted_labels = [label_list[i] for i in label_ids]
# print predictions
print(text)
for aspect, label in zip(aspects, predicted_labels):
    print("Predicted Sentiment for {0}:\t{1}".format(aspect, label))

In [None]:
# input sentence
text = "This is the nice place to drink but coffee is bad."
aspects = ["place", "coffee"]

# get input features from text
in_feats_zipped = [get_input_features(text, aspect, tokenizer) for aspect in aspects]
input_ids, token_type_ids = zip(*in_feats_zipped)
# convert to tensors and push to device
# also compute attention mask
input_ids = torch.LongTensor(input_ids).to(device)
token_type_ids = torch.LongTensor(token_type_ids).to(device)
attention_mask = input_ids != tokenizer.pad_token_id
# pass through model and get prediction
model.eval()
with torch.no_grad():
    logits = model.forward(
        input_ids=input_ids, 
        token_type_ids=token_type_ids, 
        attention_mask=attention_mask
    ).logits
    label_ids = logits.max(dim=1).indices.cpu()
# get labels from ids
predicted_labels = [label_list[i] for i in label_ids]
# print predictions
print(text)
for aspect, label in zip(aspects, predicted_labels):
    print("Predicted Sentiment for {0}:\t{1}".format(aspect, label))

In [None]:
# input sentence
text = "The dress was good but not worth for price."
aspects = ["dress", "price"]

# get input features from text
in_feats_zipped = [get_input_features(text, aspect, tokenizer) for aspect in aspects]
input_ids, token_type_ids = zip(*in_feats_zipped)
# convert to tensors and push to device
# also compute attention mask
input_ids = torch.LongTensor(input_ids).to(device)
token_type_ids = torch.LongTensor(token_type_ids).to(device)
attention_mask = input_ids != tokenizer.pad_token_id
# pass through model and get prediction
model.eval()
with torch.no_grad():
    logits = model.forward(
        input_ids=input_ids, 
        token_type_ids=token_type_ids, 
        attention_mask=attention_mask
    ).logits
    label_ids = logits.max(dim=1).indices.cpu()
# get labels from ids
predicted_labels = [label_list[i] for i in label_ids]
# print predictions
print(text)
for aspect, label in zip(aspects, predicted_labels):
    print("Predicted Sentiment for {0}:\t{1}".format(aspect, label))