In [5]:
# Download and unzip dataset
# !gdown 1d7JABk4jViI-USjLsWmhGkvzi8uQIL5C
# !unzip ./data.zip

Archive:  ./data.zip
   creating: data/
  inflating: __MACOSX/._data         
  inflating: data/restaurants_train.csv  
  inflating: __MACOSX/data/._restaurants_train.csv  
  inflating: data/restaurants_test.csv  
  inflating: __MACOSX/data/._restaurants_test.csv  


##**Dataset**

In [1]:
import torch
from torch.utils.data import Dataset
import numpy as np
class ABSADataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        tokens, tags, pols = self.df.iloc[idx, :3].values
        tokens = tokens.replace("'", "").strip("][").split(', ')
        tags = tags.strip('][').split(', ')
        pols = pols.strip('][').split(', ')

        bert_tokens = []
        bert_att = []
        pols_labels = []
        start_ids = 0
        end_ids = -1
        for i in range(len(tokens)):
            t = self.tokenizer.tokenize(tokens[i])
            bert_tokens += t
            if int(tags[i]) == 1:
                start_ids = i
                end_ids = i
            elif int(tags[i]) == 2:
                end_ids += 1
            elif int(tags[i]) == 0:
                if start_ids <= end_ids:
                    bert_att.append(tokens[start_ids:end_ids+1])
                    pols_labels.append(int(pols[start_ids]) if int(pols[start_ids]) !=-1 else 1 )
                    end_ids = -1
        if start_ids <= end_ids:
            bert_att.append(tokens[start_ids:])
            pols_labels.append(int(pols[start_ids]) if int(pols[start_ids]) !=-1 else 1 )
        if len(bert_att) !=0 :
            id = np.random.randint(0,len(bert_att)) # in one sentence has one or more than one aspects
            bert_att_ = self.tokenizer.tokenize(" ".join(bert_att[id]))
            pols_label = pols_labels[id]
        else:
            bert_att_ = []
            pols_label = 1 # neutral
        
        segment_tensor = [0] + [0]*len(bert_tokens) + [0] + [1]*len(bert_att_)
        bert_tokens = ['[CLS]'] + bert_tokens + ['[SEP]'] + bert_att_


        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        ids_tensor = torch.tensor(bert_ids)
        pols_tensor = torch.tensor(pols_label)
        segment_tensor = torch.tensor(segment_tensor)

        return bert_tokens, ids_tensor, segment_tensor, pols_tensor

    def __len__(self):
        return len(self.df)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

train_df = pd.read_csv('./data/restaurants_train.csv')
test_df = pd.read_csv('./data/restaurants_test.csv')

In [3]:
train_df.iloc[0]

Tokens        ['But', 'the', 'staff', 'was', 'so', 'horrible...
Tags                                [0, 0, 1, 0, 0, 0, 0, 0, 0]
Polarities                  [-1, -1, 0, -1, -1, -1, -1, -1, -1]
Name: 0, dtype: object

In [4]:
from transformers import BertTokenizer

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [5]:
train_ds = ABSADataset(train_df, tokenizer)
test_ds = ABSADataset(test_df, tokenizer)

In [6]:
# next(iter(train_ds))
train_ds[3101]

(['i',
  've',
  'eaten',
  'that',
  'many',
  'times',
  'and',
  'am',
  'very',
  'familiar',
  'with',
  'the',
  'qui',
  '##nine'],
 tensor([ 1045,  2310,  8828,  2008,  2116,  2335,  1998,  2572,  2200,  5220,
          2007,  1996, 21864, 19105]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]),
 tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]))

In [28]:
from torch.nn.utils.rnn import pad_sequence

def padding(samples):
    ids_tensors = [s[1] for s in samples]
    ids_tensors = pad_sequence(ids_tensors, batch_first=True)

    segments_tensors = [s[2] for s in samples]
    segments_tensors = pad_sequence(segments_tensors, batch_first=True,padding_value=1) # pad to seq 2

    label_ids = torch.stack([s[3] for s in samples])

    masks_tensors = torch.zeros(ids_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(ids_tensors != 0, 1)

    return ids_tensors, segments_tensors, masks_tensors, label_ids

In [29]:
from torch.utils.data import DataLoader

batch_size = 32
train_loader = DataLoader(
    train_ds, batch_size=batch_size, shuffle=True, collate_fn=padding
)
test_loader = DataLoader(
    test_ds, batch_size=batch_size, shuffle=True, collate_fn=padding
)


##**Model**

In [57]:
from transformers import BertModel

class ABSABert(torch.nn.Module):
    def __init__(self, model_name):
        super(ABSABert, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 3)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, masks_tensors, segments_tensors, lable_tensors):
        outputs = self.bert(
            input_ids=ids_tensors,
            attention_mask=masks_tensors,
            token_type_ids=segments_tensors,
            
        )
        linear_outputs = self.linear(outputs['pooler_output'])
        

        if lable_tensors is not None:
            loss = self.loss_fn(linear_outputs, lable_tensors)
            return loss, linear_outputs
        else:
            return linear_outputs

In [58]:
model = ABSABert(model_name)

In [59]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [61]:
model.to(device)

ABSABert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

In [62]:
import time
import numpy as np

def train_epoch(model, optimizer, train_loader, device):
    losses = []
    for batch in (train_loader):
        ids_tensors, segments_tensors, masks_tensors, label_ids = batch
        ids_tensors = ids_tensors.to(device)
        segments_tensors = segments_tensors.to(device)
        label_ids = label_ids.to(device)
        masks_tensors = masks_tensors.to(device)

        loss, _ = model(
            ids_tensors=ids_tensors,
            masks_tensors=masks_tensors,
            segments_tensors=segments_tensors,
            lable_tensors=label_ids
        )
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return sum(losses)/len(losses)

def evaluate_epoch(model, valid_loader, device):
    losses = []

    preds, labels = [], []
    total = 0
    correct = 0
    with torch.no_grad():
        for batch in (valid_loader):
            ids_tensors, segments_tensors, masks_tensors, label_ids = batch
            ids_tensors = ids_tensors.to(device)
            segments_tensors = segments_tensors.to(device)
            masks_tensors = masks_tensors.to(device)
            label_ids = label_ids.to(device)

            loss, outputs = model(
                ids_tensors=ids_tensors,
                masks_tensors=masks_tensors,
                segments_tensors=segments_tensors,
                lable_tensors=label_ids
            )
            losses.append(loss.item())

            _, p = torch.max(outputs, dim=1)
            preds += list([int(i) for i in p])
            labels += list([int(i) for i in label_ids])
            correct += (outputs.argmax(dim = -1) == label_ids).float().sum().item()
            total   += outputs.shape[0]

    # acc = np.mean(np.array(preds) == np.array(labels))
    acc = correct/total
    return sum(losses)/len(losses), acc

def train(model, model_name, save_model, optimizer, train_loader, valid_loader, num_epochs, device):
    train_losses = []
    eval_accs, eval_losses = [], []
    best_loss_eval = 100
    times = []
    for epoch in range(1, num_epochs+1):
        epoch_start_time = time.time()
        # Training
        train_loss = train_epoch(model, optimizer, train_loader, device)
        train_losses.append(train_loss)

        # Evaluation
        eval_loss, eval_acc = evaluate_epoch(model, valid_loader, device)
        eval_accs.append(eval_acc)
        eval_losses.append(eval_loss)

        # Save best model
        if eval_loss < best_loss_eval:
            torch.save(model.state_dict(), save_model + f'/{model_name}.pt')

        times.append(time.time() - epoch_start_time)
        # Print loss, acc end epoch
        print("-" * 59)
        print(
            "| End of epoch {:3d} | Time: {:5.2f}s | Train Loss {:8.3f} "
            "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f} ".format(
                epoch, time.time() - epoch_start_time, train_loss, eval_acc, eval_loss
            )
        )
        print("-" * 59)

    # Load best model
    model.load_state_dict(torch.load(save_model + f'/{model_name}.pt'))
    model.eval()
    metrics = {
        'train_loss': train_losses,
        'valid_accuracy': eval_accs,
        'valid_loss': eval_losses,
        'time': times
    }
    return model, metrics

##**Training**

In [63]:
!mkdir "./model"

mkdir: cannot create directory ‘./model’: File exists


In [64]:
save_model = "./model"
model = ABSABert(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5,weight_decay=5e-5)
num_epochs = 10
best_model, metrics = train(
    model, model_name, save_model, optimizer, train_loader, test_loader, num_epochs, device
)

-----------------------------------------------------------
| End of epoch   1 | Time: 29.09s | Train Loss    0.922 | Valid Accuracy    0.609 | Valid Loss    0.858 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   2 | Time: 38.95s | Train Loss    0.804 | Valid Accuracy    0.636 | Valid Loss    0.741 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   3 | Time: 41.29s | Train Loss    0.666 | Valid Accuracy    0.666 | Valid Loss    0.696 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   4 | Time: 43.89s | Train Loss    0.588 | Valid Accuracy    0.666 | Valid Loss    0.722 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   5 | Time: 46.00s | Trai

##**Prediction**

In [65]:
test_df.iloc[0]

Tokens        ['The', 'bread', 'is', 'top', 'notch', 'as', '...
Tags                                   [0, 1, 0, 0, 0, 0, 0, 0]
Polarities                      [-1, 2, -1, -1, -1, -1, -1, -1]
Name: 0, dtype: object

In [72]:
def predict(model, tokenizer, sentence, aspect, device):
    t1 = tokenizer.tokenize(sentence)
    t2 = tokenizer.tokenize(aspect)

    word_pieces = ['[CLS]'] + t1 + ['[SEP]'] + t2

    segment_tensor = [0] + [0]*len(t1) + [0] + [1]*len(t2)

    input_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([input_ids]).to(device)
    segment_tensor = torch.tensor([segment_tensor]).to(device)

    with torch.no_grad():
        outputs = model(input_tensor, None, segment_tensor, None)
        _, predictions = torch.max(outputs, dim=1)

    return word_pieces, int(predictions), outputs.softmax(dim = -1)

In [73]:
" ".join(test_df.iloc[0]["Tokens"].replace("'", "").strip("][").split(', '))

'The bread is top notch as well'

In [74]:
sentence = " ".join(test_df.iloc[0]["Tokens"].replace("'", "").strip("][").split(', '))
aspect = "bread"
predict(best_model, tokenizer, sentence, aspect, device)

(['[CLS]',
  'the',
  'bread',
  'is',
  'top',
  'notch',
  'as',
  'well',
  '[SEP]',
  'bread'],
 2,
 tensor([[0.0012, 0.0140, 0.9848]], device='cuda:0'))