# **NLU Final Project: *Target/Aspect Based Sentiment Analysis (T/ABSA)***

### Author info: 

    Student name: Simone Caldarella
    Student Number: 224434
    Email: simone.caldarella@studenti.unitn.it

### Other info:

    Dataset: https://github.com/lixin4ever/E2E-TBSA
    Paper: https://aclanthology.org/P19-1051.pdf

# **CODE**

In [63]:
try:
    import torch
    from torch.utils.data import Dataset
    from torch.utils.data import DataLoader
    import numpy as np
    import matplotlib.pyplot as plt
    import time
    import os
    import pprint
    import matplotlib.pyplot as plt
    import tqdm
    from datetime import datetime
except:
    print("Plase install all the modules required")
    exit()

# Code for mounting drive if force remount failed

__COLAB__ = True # Use True if you are using this in colab

__BASEPATH__ = '/content/drive/MyDrive/NLU_Project'
__DATA__ = 'Data'

if __COLAB__:
    from google.colab import drive
    if os.path.isdir('/content/drive') is False:
        drive.mount('/content/drive', force_remount=False)

try:
    import transformers
except:
    if __COLAB__:
        !pip install transformers

    else:
        print("Please install transformers library before starting")
        exit()
        
from transformers import DistilBertModel, BertTokenizer, BertForQuestionAnswering, DistilBertTokenizer, DistilBertForQuestionAnswering, DistilBertConfig, BertConfig, BertModel
from transformers.optimization import get_linear_schedule_with_warmup

## PRE-PROCESSING and TEMPLATES

### Library Used for reproducibility of multi-target extraction

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import json
import math
import six
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

In [3]:
def gelu(x):
    """Implementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    
class BERTLayerNorm(torch.nn.Module):
    def __init__(self, config, variance_epsilon=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root).
        """
        super(BERTLayerNorm, self).__init__()
        self.gamma = torch.nn.Parameter(torch.ones(config.hidden_size))
        self.beta = torch.nn.Parameter(torch.zeros(config.hidden_size))
        self.variance_epsilon = variance_epsilon

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.gamma * x + self.beta


class BERTEmbeddings(torch.nn.Module):
    def __init__(self, config):
        super(BERTEmbeddings, self).__init__()
        """Construct the embedding module from word, position and token_type embeddings.
        """
        self.word_embeddings = torch.nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = torch.nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = torch.nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BERTLayerNorm(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BERTSelfAttention(torch.nn.Module):
    def __init__(self, config):
        super(BERTSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = torch.nn.Linear(config.hidden_size, self.all_head_size)
        self.key = torch.nn.Linear(config.hidden_size, self.all_head_size)
        self.value = torch.nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = torch.nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        mixed_query_layer = self.query(hidden_states)   # [N, L, H]
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)  # [N, K, L, H//K]
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))   # [N, K, L, L]
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)  # [N, K, L, H//K]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # [N, L, K, H//K]
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)    # [N, L, H]
        return context_layer


class BERTSelfOutput(torch.nn.Module):
    def __init__(self, config):
        super(BERTSelfOutput, self).__init__()
        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BERTLayerNorm(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BERTAttention(torch.nn.Module):
    def __init__(self, config):
        super(BERTAttention, self).__init__()
        self.self = BERTSelfAttention(config)
        self.output = BERTSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        self_output = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output


class BERTIntermediate(torch.nn.Module):
    def __init__(self, config):
        super(BERTIntermediate, self).__init__()
        self.dense = torch.nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = gelu

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BERTOutput(torch.nn.Module):
    def __init__(self, config):
        super(BERTOutput, self).__init__()
        self.dense = torch.nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BERTLayerNorm(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

In [4]:
class BERTLayer(torch.nn.Module):
    def __init__(self, config):
        super(BERTLayer, self).__init__()
        self.attention = BERTAttention(config)
        self.intermediate = BERTIntermediate(config)
        self.output = BERTOutput(config)

    def forward(self, hidden_states, attention_mask):
        attention_output = self.attention(hidden_states, attention_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BERTEncoder(torch.nn.Module):
    def __init__(self, config):
        super(BERTEncoder, self).__init__()
        layer = BERTLayer(config)
        self.layer = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])    

    def forward(self, hidden_states, attention_mask):
        all_encoder_layers = []
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, attention_mask)
            all_encoder_layers.append(hidden_states)
        return all_encoder_layers


class BERTPooler(torch.nn.Module):
    def __init__(self, config):
        super(BERTPooler, self).__init__()
        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = torch.nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

In [5]:
class BertModelOlder(torch.nn.Module):
    """BERT model ("Bidirectional Embedding Representations from a Transformer").
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
    config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
        num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
    model = modeling.BertModel(config=config)
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config: BertConfig):
        """Constructor for BertModel.
        Args:
            config: `BertConfig` instance.
        """
        super(BertModelOlder, self).__init__()
        self.embeddings = BERTEmbeddings(config)
        self.encoder = BERTEncoder(config)
        self.pooler = BERTPooler(config)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self.embeddings(input_ids, token_type_ids)
        all_encoder_layers = self.encoder(embedding_output, extended_attention_mask)
        sequence_output = all_encoder_layers[-1]
        pooled_output = self.pooler(sequence_output)
        return all_encoder_layers, pooled_output


In [6]:
class BERTLayerNorm(torch.nn.Module):
    def __init__(self, config, variance_epsilon=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root).
        """
        super(BERTLayerNorm, self).__init__()
        self.gamma = torch.nn.Parameter(torch.ones(config.hidden_size))
        self.beta = torch.nn.Parameter(torch.zeros(config.hidden_size))
        self.variance_epsilon = variance_epsilon

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.gamma * x + self.beta

In [7]:
class BertForSpanAspectExtraction(torch.nn.Module):
    """BERT model for Question Answering (span extraction).
    This module is composed of the BERT model with a linear layer on top of
    the sequence output that computes start_logits and end_logits
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
    config = BertConfig(vocab_size=32000, hidden_size=512,
        num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
    model = BertForQuestionAnswering(config)
    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config):
        super(BertForSpanAspectExtraction, self).__init__()
        self.bert = BertModelOlder(config).to(get_device())
        self.qa_outputs = torch.nn.Linear(config.hidden_size, 2).to(get_device())
        self.loss = TargetExtractionLoss().to(get_device()) 
        self.M = 20
        self.K = 10
        self.gamma = 8.5

        def init_weights(module):
            if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            elif isinstance(module, BERTLayerNorm):
                module.beta.data.normal_(mean=0.0, std=config.initializer_range)
                module.gamma.data.normal_(mean=0.0, std=config.initializer_range)
            if isinstance(module, torch.nn.Linear):
                module.bias.data.zero_()
        self.apply(init_weights)

    def forward(self, input):
        all_encoder_layers, _ = self.bert(**input)
        sequence_output = all_encoder_layers[-1]
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

    def compute_loss(self, data):
        sentence,x,y,polarized_targets = data
        yp_s, yp_e = self.forward(x)
        loss = self.loss(yp_s, yp_e, y)

        return loss

    def inference(self, input, debug=False):
        '''Non-Max Suppression heuristic algorithm to extract multiple targets'''

        # Incomplete forward pass
        output = self.forward(input)

        g_s = output[0].squeeze(0)
        g_e = output[1].squeeze(0)
        
        # Heuristic multi-span decoding

        R = []
        U = []
        O = []

        _, S = torch.topk(g_s, self.M)
        _, E = torch.topk(g_e, self.M)


        for s in S:
            for e in E:
                
                if s<=e and g_s[s]+g_e[e]>=self.gamma:
                    u = g_s[s]+g_e[e] - (e-s+1)
                    r = (s,e)
                    R.append(r)
                    U.append(u)

        if debug:
            out = zip(R, U)
            [print(e) for e in out]

        while len(R)>0 and len(O)<self.K:
            u = max(U)
            l = U.index(u)
            r = R[l]
            O.append(r)
            R.remove(r)
            U.remove(u)
            i = 0

            while i<len(R):
                # Overlapping checked as intersection over lists in range of start and end
                l_r = list(range(r[0], r[1]+1))
                rc = R[i]
                l_rc = list(range(rc[0], rc[1]+1))

                if (len(set.intersection(set(l_r), set(l_rc)))) > 0:
                    R.remove(rc)
                    U.remove(U[i])
                    i += -1
                i += 1

        return O   

In [8]:
def bert_load_state_dict(model, state_dict):
    missing_keys = []
    unexpected_keys = []
    error_msgs = []

    # copy state_dict so _load_from_state_dict can modify it
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    def load(module, prefix=''):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        module._load_from_state_dict(
            state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(model, prefix='' if hasattr(model, 'bert') else 'bert.')

    if len(missing_keys) > 0:
        print("Weights of {} not initialized from pretrained model: {}".format(
            model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        print("Weights from pretrained model not used in {}: {}".format(
            model.__class__.__name__, unexpected_keys))
    return model

In [9]:
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""

import math
import torch
from torch.optim import Optimizer
from torch.nn.utils import clip_grad_norm_

def warmup_cosine(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 0.5 * (1.0 + torch.cos(math.pi * x))

def warmup_constant(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0

def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

SCHEDULES = {
    'warmup_cosine':warmup_cosine,
    'warmup_constant':warmup_constant,
    'warmup_linear':warmup_linear,
}


class BERTAdam(Optimizer):
    """Implements BERT version of Adam algorithm with weight decay fix (and no ).
    Params:
        lr: learning rate
        warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
        t_total: total number of training steps for the learning
            rate schedule, -1  means constant learning rate. Default: -1
        schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
        b1: Adams b1. Default: 0.9
        b2: Adams b2. Default: 0.999
        e: Adams epsilon. Default: 1e-6
        weight_decay_rate: Weight decay. Default: 0.01
        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
    """
    def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
                 b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
                 max_grad_norm=1.0):
        if not lr >= 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0.0 <= warmup < 1.0 and not warmup == -1:
            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
        if not e >= 0.0:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
                        b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
                        max_grad_norm=max_grad_norm)
        super(BERTAdam, self).__init__(params, defaults)

    def get_lr(self):
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    return [0]
                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']
                lr.append(lr_scheduled)
        return lr

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['next_m'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['next_v'] = torch.zeros_like(p.data)

                next_m, next_v = state['next_m'], state['next_v']
                beta1, beta2 = group['b1'], group['b2']

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time
                next_m.mul_(beta1).add_(1 - beta1, grad)
                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                update = next_m / (next_v.sqrt() + group['e'])

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want ot decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if group['weight_decay_rate'] > 0.0:
                    update += group['weight_decay_rate'] * p.data

                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']

                update_with_lr = lr_scheduled * update
                p.data.add_(-update_with_lr)

                state['step'] += 1

                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
                # bias_correction1 = 1 - beta1 ** state['step']
                # bias_correction2 = 1 - beta2 ** state['step']

        return loss

In [10]:
def prepare_bertadam(lr, wr, steps, model):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=lr,
                         warmup=wr,
                         t_total=steps)
    return optimizer

### Utilities

In [11]:
def get_device():
    '''
    Dinamically get the current device
    '''
    
    try:
        torch.cuda.current_device()
        return 'cuda'
    except:
        return 'cpu'

In [12]:
def get_BERT(model_class, pretrained_weights, layers, all_params_grad, qa=True, reproducibility=False):
    '''
    Import pretrained Bert and load it to the device used
    '''

    if reproducibility:
        bert_config = BertConfig.from_json_file(os.path.join(__BASEPATH__, __DATA__, 'bert-base-uncased', 'bert_config.json'))
        model = BertForSpanAspectExtraction(bert_config)

        model = bert_load_state_dict(model, torch.load(os.path.join(__BASEPATH__, __DATA__, 'bert-base-uncased', 'pytorch_model.bin'), map_location='cpu'))

    else:
        model = model_class.from_pretrained(pretrained_weights)

        if qa==True:
            try:
                for param in model.qa_outputs.parameters():
                    param.requires_grad = True

                for l in layers:
                    for param in model.distilbert.transformer.layer[l].parameters():
                        param.requires_grad = layers[l]
            except:
                print("This is not Bert for Question answering")
        
        else:
            for l in layers:
                    for param in model.transformer.layer[l].parameters():
                        param.requires_grad = layers[l]

    model = model.to(get_device())

    
    print("###################################################")
    print("-------------------Model Choosed-------------------")
    print(model)

    return model

In [13]:
def my_collate(batch):
    '''
    Custom collate function for dataloader to handle the variability of targets
    '''
    encoding = {} # Used as support for the custom collate
    texts = [sample[0] for sample in batch]
    encoding['input_ids'] = torch.stack([sample[1]['input_ids'].to(get_device()) for sample in batch]).squeeze(1)

    # Comment this if you are using distilled bert
    #encoding['token_type_ids'] = torch.stack([sample[1]['token_type_ids'].to(get_device()) for sample in batch]).squeeze(1)

    encoding['attention_mask'] = torch.stack([sample[1]['attention_mask'].to(get_device()) for sample in batch]).squeeze(1)
    targets = [sample[2] for sample in batch]
    pol_targets = [sample[3] for sample in batch]
    return [texts, encoding, torch.stack(targets).to(get_device()), pol_targets]

### Download and parsing dataset

In [14]:
def import_data(tokenizer_class, pretrained_weights, path, max_length):

    data = []

    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

    POL = {
            'POS':torch.Tensor([1, 0, 0]),
            'NEG':torch.Tensor([0, 0, 1]),
            'NEU':torch.Tensor([0, 1, 0])
        }

    # Read file
    with open(path) as f:
        lines = [line.rstrip() for line in f]

    # Parse the file line by line
    for line in lines:

        polarity_targets = []
        s = torch.zeros(max_length)
        e = torch.zeros(max_length) 

        labs = line.split('####')[1].split(' ')
        nsentence = line.split('####')[0]
        sentence = [t.split('=')[0] for t in labs]
        # targets = torch.zeros(max_length)
        t_support = [0]*len(sentence)
        for i, tag in enumerate(labs):
            if tag[-1] != 'O' and tag[-5] == 'T':
                t_support[i] = (1, tag[-3:])
                # targets[i] = 1
    
        i = -1

        while i < len(t_support):
            i += 1

            if i == len(t_support):
                break

            if t_support[i] != 0:
                start = i
                pol = t_support[i][1]

                while i+1 < len(t_support) and t_support[i+1] != 0:
                    i += 1

                end = i
            
                tgt = [start, end, POL[pol]]
                polarity_targets.append(tgt)

                # len+2 used in order to try question answering on context
                s[start+1] = 1
                e[end+1] = 1 
            
        targets = torch.stack([s, e])

        encoded_sentence = tokenizer.encode_plus(nsentence, return_attention_mask = True, return_tensors = "pt", add_special_tokens = True, truncation = True, padding='max_length', max_length=max_length)
        data.append([sentence, encoded_sentence, targets, polarity_targets])
    
    newdata = [el for el in data if el[3] != []]

    return newdata

In [15]:
class SemEval2014(Dataset):
    '''
    Custom dataset module
    '''
    def __init__(self, tokenizer_class, pretrained_weights_tokenizer, dataset_path, max_size):
        self.samples = import_data(tokenizer_class, pretrained_weights_tokenizer, dataset_path, max_size)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        # Text, Encoding, Targets, Pol_Targets
        return sample[0], sample[1], sample[2], sample[3]

### Optimizers

In [16]:
def get_optimizer(parameters, optim, lr, momentum):
    '''
    Get optimizer dynamically
    '''

    if optim == 'SGD':
        optimizer = torch.optim.SGD(parameters, lr=lr, momentum=momentum)

    elif optim == 'Adam':
        optimizer = torch.optim.Adam(parameters, lr=lr)

    elif optim == 'AdamW':
        optimizer = torch.optim.AdamW(parameters, lr=lr)
        
    return optimizer

### Early stopping

In [17]:
class EarlyStopping:
    def __init__(self, min_delta=0, patience=0):
        self.min_delta = min_delta
        self.patience = patience
        self.wait = 0
        self.stopped_epoch = 0
        self.best = np.Inf
        self.stop_training = False
    def on_epoch_end(self, epoch, current_value):
        if np.greater(self.best, (current_value - self.min_delta)):
            self.best = current_value
            self.wait = 0
        else:
            self.wait += 1
            if self.wait > self.patience:
                self.stopped_epoch = epoch
                self.stop_training = True
        return self.stop_training

### Training and Eval templates

In [18]:
def training(net, optimizer, dataloader, batch_size, sched=None, pc=False):
    '''
    Training function that correspond to one train pass
    '''

    pbar = tqdm.tqdm(position=0, leave=True)
    pbar.reset(total=int(dataloader.dataset.__len__()/batch_size))
    
    net.train()
    epoch_loss = []
    accuracy = []
    if pc:
        loss = torch.zeros(16)

    for i, data in enumerate(dataloader):
        # sentence,x,y,polarized_targets <-- data
        optimizer.zero_grad()

        if pc:
            l = net.compute_loss(data)
            loss[int(i%16)] = l
            if i%16 == 0:
                loss = torch.mean(loss)
                epoch_loss.append(loss)
                loss.backward()
                optimizer.step()
                loss = torch.zeros(16)
                if sched is not None:
                    sched.step()

        else:
            loss = net.compute_loss(data)
            if loss != 0:
                epoch_loss.append(loss)
                loss.backward()
                optimizer.step()
                if sched is not None:
                    sched.step()
        pbar.update()
    
    epoch_loss = torch.tensor(epoch_loss).mean().item()
    pbar.close()

    return epoch_loss

In [19]:
def validation(net, dataloader, batch_size):

    pbar = tqdm.tqdm(position=0, leave=True)
    pbar.reset(total=int(dataloader.dataset.__len__()/batch_size))
    
    epoch_loss = []
    accuracy = []

    net.eval()
    with torch.no_grad():

        for data in dataloader:
            loss = net.compute_loss(data)
            if loss != 0:
                epoch_loss.append(loss)
            pbar.update()
    
    epoch_loss = torch.tensor(epoch_loss).mean().item()
    pbar.close()

    return epoch_loss

### Train loop

In [20]:
def train_loop(net, optim, train_dataloader, validation_dataloader, warmup, steps, lr, momentum, epochs, save_path, sched=None, reproducibility=False, pc=False):
    '''
    Standard training loop with training step and validation step at each epoch
    '''
    if reproducibility:
        optimizer = prepare_bertadam(lr, warmup, steps, net)
        sched = None
    
    else:
        optimizer = get_optimizer(net.parameters(), optim, lr, momentum)
        if sched == 'linear':
            sched = get_linear_schedule_with_warmup(optimizer, int(warmup*steps*epochs), steps, last_epoch=-1)
    loss_history_train = {'loss':[]}
    loss_history_validation = {'loss':[]}
    early_stopping = EarlyStopping(patience=4)
    early_stopping.stop_training = False

    for epoch in range(epochs):

        print('========================== Epoch {} ====================== \n'.format(epoch))

        epoch_loss_train = training(net, optimizer, train_dataloader, train_dataloader.batch_size, sched, pc=pc)
        epoch_loss_validation = validation(net, validation_dataloader, validation_dataloader.batch_size)
        early_stopping.on_epoch_end(epoch = epoch, current_value = round(epoch_loss_validation,5))
        if early_stopping.wait == 0: 
            bestModel = net
            torch.save(bestModel, save_path)
            print('New best model saved with loss:', epoch_loss_validation)

        loss_history_train['loss'].append(epoch_loss_train)
        loss_history_validation['loss'].append(epoch_loss_validation)

        print('\n Results:')
        print('     - Training loss:', epoch_loss_train)
        print('     - Validation loss:', epoch_loss_validation)
        print('\n')

    final_path = save_path[:-4]+"_final.pth"
    torch.save(net, final_path)

    return loss_history_train, loss_history_validation

## TARGET EXTRACTION

### Custom Loss function for target extraction

In [35]:
class TargetExtractionLoss(torch.nn.Module):
    '''
    Reproduction of the loss in the paper, with some possible modifications
    '''
    
    def __init__(self):
        super(TargetExtractionLoss,self).__init__()
        
    
    def forward(self, yp_s, yp_e, y):
        
        y_s = y[:,0,:] # starts label
        y_e = y[:,1,:] # ends label

        log_softmax = torch.nn.LogSoftmax(dim=-1)
        log_probs_s = log_softmax(yp_s)
        log_probs_e = log_softmax(yp_e)
        
        if torch.sum(y_s.to(dtype=log_probs_s.dtype)) == 0:
            loss = 0

        else:
            loss_s = -1 * torch.mean(torch.sum(y_s.to(dtype=log_probs_s.dtype) * log_probs_s, dim=-1) / torch.sum(y_s.to(dtype=log_probs_s.dtype), dim=-1))
            loss_e = -1 * torch.mean(torch.sum(y_e.to(dtype=log_probs_e.dtype) * log_probs_e, dim=-1) / torch.sum(y_e.to(dtype=log_probs_e.dtype), dim=-1))
            
            loss = (loss_s+loss_e)/2
            print(loss)
        
        return loss

### Target extraction model

In [36]:
class TargetExtractionNet(torch.nn.Module):
    '''
    Model based on the paper, composed by a backbone and 2 linear layers, one for the
    distribution of the starts and the other for the distribution of the ends
    '''

    def __init__(self, backbone, gamma, K=10, M=20, qa=False):
        super(TargetExtractionNet, self).__init__()
        self.gamma = gamma
        self.K = K
        self.M = M
        self.qa = qa
        self.backbone = backbone
        
        # First order parameters

        if not self.qa:
            # self.w_s = torch.nn.Parameter(torch.rand(768), requires_grad=True).to(get_device())
            # self.w_e = torch.nn.Parameter(torch.rand(768), requires_grad=True).to(get_device())
            self.w_s = torch.nn.Linear(768, 1, bias=True, device=get_device())
            self.w_e = torch.nn.Linear(768, 1, bias=True, device=get_device())
        
        # Activation
        #self.softmax = torch.nn.Softmax(dim=-1).to(get_device())

        # Loss function
        self.loss = TargetExtractionLoss().to(get_device())

    def forward(self, input):

        # Backbone pass
        output = self.backbone(**input)

        if not self.qa:
            output = output[0] # size = (batch_size, sentence_max_size, bert_hidden_size=768)

            # Start distribution
            # output_s = torch.matmul(output, self.w_s)
            output_s = self.w_s(output).squeeze()

            # End distribution
            # output_e = torch.matmul(output, self.w_e)
            output_e = self.w_e(output).squeeze()
        
        else:

            output_s = output.start_logits
            output_e = output.end_logits
    
        output = [output_s, output_e] # Aggregate output

        return output

    def compute_loss(self, data):
        sentence,x,y,polarized_targets = data
        yp_s, yp_e = self.forward(x)
        loss = self.loss(yp_s, yp_e, y)

        return loss

    def inference(self, input, debug=False):
        '''Non-Max Suppression heuristic algorithm to extract multiple targets'''

        # Incomplete forward pass
        output = self.backbone(**input)

        if not self.qa:
            output = output[0]
            # g_s = torch.matmul(output, self.w_s)[0,:]
            # g_e = torch.matmul(output, self.w_e)[0,:]

            g_s = self.w_s(output).squeeze()
            g_e = self.w_e(output).squeeze()

        else:

            g_s = output.start_logits.squeeze(0)
            g_e = output.end_logits.squeeze(0)
        
        # Heuristic multi-span decoding

        R = []
        U = []
        O = []

        _, S = torch.topk(g_s, self.M)
        _, E = torch.topk(g_e, self.M)


        for s in S:
            for e in E:
                
                if s<=e and g_s[s]+g_e[e]>=self.gamma:
                    u = g_s[s]+g_e[e] - (e-s+1)
                    r = (s,e)
                    R.append(r)
                    U.append(u)

        if debug:
            out = zip(R, U)
            [print(e) for e in out]

        while len(R)>0 and len(O)<self.K:
            u = max(U)
            l = U.index(u)
            r = R[l]
            O.append(r)
            R.remove(r)
            U.remove(u)
            i = 0

            while i<len(R):
                # Overlapping checked as intersection over lists in range of start and end
                l_r = list(range(r[0], r[1]+1))
                rc = R[i]
                l_rc = list(range(rc[0], rc[1]+1))

                if (len(set.intersection(set(l_r), set(l_rc)))) > 0:
                    R.remove(rc)
                    U.remove(U[i])
                    i += -1
                i += 1

        return O

In [37]:
def init_target_extraction_model(model, reproducibility=False):
    
    backbone = get_BERT(model['model_class'], model['pretrained_weights_model'], model['layers'], model['all_params_grad'], model['qa'], reproducibility)
    
    if not reproducibility:
        net = TargetExtractionNet(backbone, model['gamma'], model['K'], model['M'], model['qa'])
    else:
        net = backbone

    return net

### Target extractor main loop

In [38]:
def te_main_loop(training):   
    training_dataset = SemEval2014(training['data']['tokenizer_class'], training['data']['pretrained_weights_tokenizer'], training['path']['train_path'], training['data']['max_length'])
    train_size = int(training_dataset.__len__()*training['data']['train_split'])
    validation_size = training_dataset.__len__() - train_size
    train_set, validation_set = torch.utils.data.random_split(training_dataset, [train_size, validation_size])
    steps = training['epochs']*int(training_dataset.__len__()/training['batch_size'])

    train_dataloader = DataLoader(train_set, batch_size=training['batch_size'], collate_fn=my_collate, shuffle=True)
    validation_dataloader = DataLoader(validation_set, batch_size=training['batch_size'], collate_fn=my_collate, shuffle=False)

    te_net = init_target_extraction_model(training['model'], training['reproducibility'])

    train_loss, valid_loss = train_loop(te_net, training['optimizer']['name'], train_dataloader, 
                                        validation_dataloader, training['optimizer']['warmup'], steps, 
                                        training['optimizer']['lr'], training['optimizer']['momentum'], 
                                        training['epochs'], training['path']['te_net_save_path'], 
                                        training['optimizer']['scheduler'], training['reproducibility']
                                        )   
    
    return train_loss, valid_loss, te_net

### Target extraction only evaluation

In [39]:
def target_extraction_eval(evaluation):
    test_set = SemEval2014(evaluation['data']['tokenizer_class'], evaluation['data']['pretrained_weights_tokenizer'], 
                           evaluation['path']['test_path'], evaluation['data']['max_length'])
    test_dataloader = DataLoader(test_set, batch_size=1, collate_fn=my_collate, shuffle=False)

    f1s = {}

    step = 1 # fixed
    gamma_range = evaluation['gamma_range']
    gamma_space=np.arange(0,gamma_range[1])*step+gamma_range[0]

    for gamma in gamma_space:
        f1s[gamma] = []

    pbar = tqdm.tqdm(position=0, leave=True)

    for te_net_path in os.listdir(evaluation['path']['base_path_te_eval']):
        if te_net_path[-3:] == 'pth':
            net = torch.load(os.path.join(evaluation['path']['base_path_te_eval'],te_net_path), map_location='cpu').to(get_device())

            net.eval()
            print("---------------------")
            print("\n Eval started")

            for gamma in gamma_space:

                net.gamma = gamma
                conf_matrix = torch.zeros(2, 2)
            
                with torch.no_grad():
                    print('\n ---------------------')
                    print('\n Gamma: {} \n'.format(gamma))

                    pbar.reset(total=int(len(test_dataloader)))

                    for i, element in enumerate(test_dataloader):
                        o = net.inference(element[1], False)
                        pr = [tuple([int(el[0])-1,int(el[1])-1]) for el in o]
                        gt_raw = element[3][0]
                        gt = []
                        for c in gt_raw:
                            gt.append((c[0],c[1]))

                        
                        for p in pr:
                            if p in gt: 
                                # True Positive
                                conf_matrix[0,0] += 1
                            else:
                                # False Positive
                                conf_matrix[1,0] += 1
                        
                        for g in gt:
                            if g not in pr:
                                conf_matrix[0,1] += 1
                        
                        pbar.update()
                        
                    
                    p = conf_matrix[0,0]/(conf_matrix[0,0] + conf_matrix[1,0])
                    r = conf_matrix[0,0]/(conf_matrix[0,1] + conf_matrix[0,0])
                    f1 = (2 * p * r) / (p + r)

                    retrieved = conf_matrix[0,0] + conf_matrix[1,0]
                    common = conf_matrix[0,0]

                f1s[gamma].append(f1)

                print("\n \n Precision: {}, Recall: {}, F1: {}, Retrieved: {}, Common: {}".format(p, r, f1, retrieved, common))
        
    pbar.close()
    return f1s

## POLARITY CLASSIFICATION

### Custom loss for polarity classification

In [40]:
class PolarityClassificationLoss(torch.nn.Module):

    def __init__(self):
        super(PolarityClassificationLoss, self).__init__()
        
    def forward(self, p, pol):
        cross = - torch.sum(pol.to(get_device())*torch.log(p))
        return cross

### Polarity classification model

In [103]:
class PolarityClassificationNet(torch.nn.Module):

    def __init__(self, backbone):
        super(PolarityClassificationNet, self).__init__()
        self.backbone = backbone.to(get_device())
        
        # First order parameters
        self.w_a = torch.nn.Parameter(torch.rand(768), requires_grad=True).to(get_device())
        self.W_v = torch.nn.Parameter(torch.rand(768, 768), requires_grad=True).to(get_device())
        self.W_p = torch.nn.Parameter(torch.rand(3, 768), requires_grad=True).to(get_device())
        
        # Activation
        self.softmax = torch.nn.Softmax(dim=-1).to(get_device())

        # Loss function
        self.loss = PolarityClassificationLoss().to(get_device())
        

    def forward(self, x, span):

        # Forward should be done for each target in a sentence
        
        h = self.backbone(**x)[0]
        # One Target, batch doesn't taken in account
        h_t = h[0, span[0]+1:span[1]+2]  # +1 and +2 added due to the special tokens and due to the last element included
        alpha = torch.matmul(h_t.to(get_device()), self.w_a.to(get_device()))
        alpha = self.softmax(alpha)

        # Transpose needed for coefficients multiplication
        v = torch.sum(h_t.transpose(0, 1)*alpha, dim=1) 
        
        temp =  torch.tanh(torch.matmul(self.W_v.to(get_device()), v))
        g = torch.matmul(self.W_p.to(get_device()), temp) # Likelihood for each sentiment class
        p = self.softmax(g)

        return p  

    def compute_loss(self, data):

        sentence,x,y,polarity_targets = data
        loss = []
        if polarity_targets[0] != []:
            for i, pol in enumerate(polarity_targets[0]):
                p = self.forward(x, pol)
                loss_target = self.loss(p, pol[2])
                loss.append(loss_target)
            
            loss = sum(loss)/(i+1)
        
        else:
            loss = 0

        return loss


In [104]:
def init_polarity_classification_model(model):
    
    backbone = get_BERT(model['model_class'], model['pretrained_weights_model'], model['layers'], model['all_params_grad'], model['qa'])
    net = PolarityClassificationNet(backbone)

    return net

### Polarity Classification main loop

In [105]:
def pc_main_loop(training):   
    training_dataset = SemEval2014(training['data']['tokenizer_class'], training['data']['pretrained_weights_tokenizer'], training['path']['train_path'], training['data']['max_length'])
    train_size = int(training_dataset.__len__()*training['data']['train_split'])
    validation_size = training_dataset.__len__() - train_size
    train_set, validation_set = torch.utils.data.random_split(training_dataset, [train_size, validation_size])
    steps = training['epochs']*int(training_dataset.__len__()/training['batch_size'])

    train_dataloader = DataLoader(train_set, batch_size=training['batch_size'], collate_fn=my_collate, shuffle=True)
    validation_dataloader = DataLoader(validation_set, batch_size=training['batch_size'], collate_fn=my_collate, shuffle=False)

    el = next(iter(train_dataloader))
    pc_net = init_polarity_classification_model(training['model'])

    train_loss, valid_loss = train_loop(pc_net, training['optimizer']['name'], train_dataloader, 
                                        validation_dataloader, training['optimizer']['warmup'], steps, 
                                        training['optimizer']['lr'], training['optimizer']['momentum'], 
                                        training['epochs'], training['path']['pc_net_save_path'], 
                                        sched=training['optimizer']['scheduler'], pc=True)
    
    return train_loss, valid_loss, pc_net

### Polarity classification only evaluation

In [106]:
def polarity_classification_eval(evaluation):
    test_set = SemEval2014(evaluation['data']['tokenizer_class'], evaluation['data']['pretrained_weights_tokenizer'], 
                           evaluation['path']['test_path'], evaluation['data']['max_length'])
    test_dataloader = DataLoader(test_set, batch_size=1, collate_fn=my_collate, shuffle=False)

    pbar = tqdm.tqdm(position=0, leave=True)

    for pc_net_path in os.listdir(evaluation['path']['base_path_pc_eval']):
        if pc_net_path[-3:] == 'pth':
            net = torch.load(os.join.path(evaluation['path']['base_path_pc_eval'],pc_net_path)).to(get_device())

            net.eval()
            print("---------------------")
            print("\n Eval started")

            
            conf_matrix = torch.zeros(3, 3)

                
            #     pos neu neg --> Predicted
            #     ___ ___ ___
            # pos|___|___|___|
            # neu|___|___|___|
            # neg|___|___|___|

            
            with torch.no_grad():

                pbar.reset(total=int(len(test_dataloader)))
                correct = 0
                total = 0

                for data in test_dataloader:
                    sentence,x,y,polarity_targets = data
                    if polarity_targets[0] != []:
                        for pol in polarity_targets[0]:
                            p = net.forward(x, pol)
                            pred = torch.argmax(p)
                            corr = torch.argmax(pol[2])

                            if pred == corr:
                                correct+=1
                                total+=1
                            else:
                                total+=1

                    pbar.update()
                        
            print("Accuracy:", correct/total)

    pbar.close()

## WORKSPACES

### Target extraction train

In [107]:
def te_train(restart):
    path = {}
    path['train_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014', 'laptop14_train.txt')
    path['test_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014','laptop14_test.txt')

    dir = os.path.join(__BASEPATH__, __DATA__, 'Previous_trainings', datetime.now().strftime('%Y_%m_%d_%H:%M:%S'))
    os.mkdir(dir)

    for res in range(restart):
        path['te_net_save_path'] = os.path.join(dir,'target_extraction_'+str(res)+'.pth')

        optim = {}
        optim['name'] = 'AdamW'
        optim['lr'] = 3e-4
        optim['momentum'] = 0.9
        optim['scheduler'] = 'linear'
        optim['warmup'] = 0.1

        model = {}
        model['gamma'] = 9
        model['K'] = 10
        model['M'] = 20
        model['model_class'] = BertModel
        model['pretrained_weights_model'] = 'bert-base-uncased'
        model['layers'] = {} # Manually set grad True or False --> Example: {1:True, 2:False}
        model['all_params_grad'] = True

        if model['model_class'] == DistilBertForQuestionAnswering or model['model_class'] == BertForQuestionAnswering:
            model['qa'] = True
        else:
            model['qa'] = False

        data = {}
        data['max_length'] = 96
        data['train_split'] = 1
        data['pretrained_weights_tokenizer'] = 'bert-base-uncased'
        data['tokenizer_class'] = BertTokenizer
        
        training = {}
        training['batch_size'] = 32
        training['epochs'] = 5
        training['optimizer'] = optim
        training['model'] = model
        training['data'] = data
        training['path'] = path
        training['reproducibility'] = False # Flag that enable the use of the older version of bert

        pprint.pprint(training)

        # Target extraction task and result plots
        te_train_loss, te_valid_loss, te_net = te_main_loop(training)
        plt.plot(range(training['epochs']), te_valid_loss['loss'])
        plt.show()
        
        # Aggregate all the statistics
        stats = [te_train_loss, te_valid_loss]

### Target extraction evaluation

In [108]:
def te_eval(base_path_te_eval):
    path = {}
    path['train_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014', 'laptop14_train.txt')
    path['test_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014','laptop14_test.txt')
    
    path['base_path_te_eval'] = base_path_te_eval
    data = {}
    data['pretrained_weights_tokenizer'] = 'bert-base-uncased'
    data['tokenizer_class'] = BertTokenizer
    data['max_length'] = 96

    evaluation = {}
    evaluation['data'] = data
    evaluation['path'] = path
    evaluation['gamma_range'] = [9, 1] # From gamma_range[0] for number of steps equals to gamma_range[1]

    pprint.pprint(evaluation)

    f1s = target_extraction_eval(evaluation)

    print("Here it is all the f1:", f1s)

### Polarity classification training

In [109]:
def pc_train(restart):
    path = {}
    path['train_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014', 'laptop14_train.txt')
    path['test_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014','laptop14_test.txt')

    dir = os.path.join(__BASEPATH__, __DATA__, 'Previous_trainings', datetime.now().strftime('%Y_%m_%d_%H:%M:%S'))
    os.mkdir(dir)

    for res in range(restart):
        path['pc_net_save_path'] = os.path.join(dir,'polarity_classification'+str(res)+'.pth')

        optim = {}
        optim['name'] = 'Adam'
        optim['lr'] = 3e-5
        optim['momentum'] = 0.9
        optim['scheduler'] = 'linear'
        optim['warmup'] = 0.1

        model = {}
        model['model_class'] = DistilBertModel
        model['pretrained_weights_model'] = 'distilbert-base-uncased'
        model['layers'] = {} 
        model['all_params_grad'] = True
        model['qa'] = False

        data = {}
        data['max_length'] = 96
        data['train_split'] = 1
        data['pretrained_weights_tokenizer'] = 'distilbert-base-uncased'
        data['tokenizer_class'] = DistilBertTokenizer
        
        training = {}
        training['batch_size'] = 1 # Polarity Classification doesn't work with more than one
        training['epochs'] = 10
        training['optimizer'] = optim
        training['model'] = model
        training['data'] = data
        training['path'] = path

        pprint.pprint(training)

        pc_train_loss, pc_valid_loss, pc_net = pc_main_loop(training)
        plt.plot(range(training['epochs']), pc_valid_loss['loss'])
        plt.show()
        
        # Aggregate all the statistics
        stats = [pc_train_loss, pc_valid_loss]

### Polarity classification evaluation

In [110]:
def pc_eval(base_path_pc_eval):
    path = {}
    path['train_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014', 'laptop14_train.txt')
    path['test_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014','laptop14_test.txt')

    path['base_path_pc_eval'] = base_path_pc_eval
    data = {}
    data['pretrained_weights_tokenizer'] = 'distilbert-base-uncased'
    data['tokenizer_class'] = DistilBertTokenizer
    data['max_length'] = 96

    evaluation = {}
    evaluation['data'] = data
    evaluation['path'] = path

    pprint.pprint(evaluation)

    polarity_classification_eval(evaluation)

### Target extraction and Polarity classification pipeline

In [111]:
def te_pc_pipe(base_path_eval, restart):
    path = {}
    path['train_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014', 'laptop14_train.txt')
    path['test_path'] = os.path.join(__BASEPATH__, __DATA__, 'Dataset_semeval_2014','laptop14_test.txt')
    
    path['base_path_eval'] = base_path_eval
    data = {}
    data['pretrained_weights_tokenizer'] = 'bert-base-uncased'
    data['tokenizer_class'] = BertTokenizer
    data['max_length'] = 96

    evaluation = {}
    evaluation['data'] = data
    evaluation['path'] = path
    evaluation['gamma'] = 8.5

    pprint.pprint(evaluation)

    test_set = SemEval2014(evaluation['data']['tokenizer_class'], evaluation['data']['pretrained_weights_tokenizer'], 
                           evaluation['path']['test_path'], evaluation['data']['max_length'])
    test_dataloader = DataLoader(test_set, batch_size=1, collate_fn=my_collate, shuffle=False)

    pbar = tqdm.tqdm(position=0, leave=True)

    te_net = torch.load(os.path.join(evaluation['path']['base_path_eval'],'te_net.pth'), map_location='cpu').to(get_device())
    pc_net = torch.load(os.path.join(evaluation['path']['base_path_eval'],'pc_net.pth'), map_location='cpu').to(get_device())

    te_net.eval()
    pc_net.eval()

    print("---------------------")
    print("\n Eval started")

    te_net.gamma = evaluation['gamma']
    conf_matrix = torch.zeros(2, 2)

    with torch.no_grad():
        print('\n ---------------------')
        print('\n Gamma: {} \n'.format(evaluation['gamma']))

        pbar.reset(total=int(len(test_dataloader)))
        
        for i, element in enumerate(test_dataloader):

            o = te_net.inference(element[1], False)
            pr = [tuple([int(el[0])-1,int(el[1])-1]) for el in o]
            gt_raw = element[3][0]
            gt_raw_proc = {}
            gt = []
            for count, c in enumerate(gt_raw):
                gt.append((c[0],c[1]))
                gt_raw_proc[(c[0],c[1])] = count
            
            for p in pr:
                if p in gt: 
                    # True Positive
                    pol = gt_raw[gt_raw_proc[p]]
                    pred = pc_net.forward(element[1], pol)
                    pred = torch.argmax(pred)
                    corr = torch.argmax(pol[2])
                    if pred==corr:
                        conf_matrix[0,0] += 1
                    
                else:
                    # False Positive
                    conf_matrix[1,0] += 1
            
            for g in gt:
                if g not in pr:
                    conf_matrix[0,1] += 1
            
            pbar.update()
            
        
        p = conf_matrix[0,0]/(conf_matrix[0,0] + conf_matrix[1,0])
        r = conf_matrix[0,0]/(conf_matrix[0,1] + conf_matrix[0,0])
        f1 = (2 * p * r) / (p + r)

        retrieved = conf_matrix[0,0] + conf_matrix[1,0]
        common = conf_matrix[0,0]

    print("\n \n Precision: {}, Recall: {}, F1: {}, Retrieved: {}, Common: {}".format(p, r, f1, retrieved, common))

    pbar.close()
    return f1

## MAIN

### Main function

In [112]:
def main(mode=None):
    base_path_te_eval = os.path.join(__BASEPATH__, __DATA__, 'Previous_trainings', 'best_te')
    base_path_pc_eval = os.path.join(__BASEPATH__, __DATA__, 'Previous_trainings', 'best_pc')
    base_path_eval = os.path.join(__BASEPATH__, __DATA__, 'Jointed')
    restart = 1 # Number of different random restart for a training session

    if mode == 'te_train':
        te_train(restart)
    elif mode == 'te_eval':
        te_eval(base_path_te_eval) 
    elif mode == 'pc_train':
        pc_train(restart) 
    elif mode == 'pc_eval':
        pc_eval(base_path_pc_eval) 
    elif mode == 'te_pc_pipe':
        te_pc_pipe(base_path_eval, restart)
    else:
        print(f"Choose a mode between the following: ['te_train', 'te_eval', 'pc_train', 'pc_eval', 'te_pc_pipe']")

In [113]:
if __name__ == '__main__':
    # All the available modes -> ['te_train', 'te_eval', 'pc_train', 'pc_eval', 'te_pc_pipe']
    main(mode='te_pc_pipe')

{'data': {'max_length': 96,
          'pretrained_weights_tokenizer': 'bert-base-uncased',
          'tokenizer_class': <class 'transformers.models.bert.tokenization_bert.BertTokenizer'>},
 'gamma': 8.5,
 'path': {'base_path_eval': '/content/drive/MyDrive/NLU_Project/Data/Jointed',
          'test_path': '/content/drive/MyDrive/NLU_Project/Data/Dataset_semeval_2014/laptop14_test.txt',
          'train_path': '/content/drive/MyDrive/NLU_Project/Data/Dataset_semeval_2014/laptop14_train.txt'}}


  0%|          | 1/411 [00:00<00:52,  7.81it/s]

---------------------

 Eval started

 ---------------------

 Gamma: 8.5 



100%|██████████| 411/411 [00:31<00:00, 12.99it/s]


 
 Precision: 0.494577020406723, Recall: 0.4115523397922516, F1: 0.4492610991001129, Retrieved: 461.0, Common: 228.0



