use the output of the last TWO layers

In [1]:
!pip install transformers
!pip install tokenizers



## Import library

In [2]:
import os
from pathlib import Path
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

In [3]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [4]:
class config:
    MAX_LEN = 128 # no text is longer than 128
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 16
    EPOCHS = 10
    BASE_PATH = Path("../")
    BERT_PATH = BASE_PATH  / "bert-base-uncased"
    MODEL_PATH = BASE_PATH  / "model_save/model_0421_2"
    TRAINING_FILE = BASE_PATH / "input/train-5fold/train_folds.csv"
    TESTING_FILE = BASE_PATH  / "input/test.csv"
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        os.path.join(BERT_PATH, 'vocab.txt'),
        lowercase=True
    )

## Utils

In [5]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))


class EarlyStopping:
    # https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta

    def __call__(self, val_loss, model, name):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, name)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model, name)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, name):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), name)
        self.val_loss_min = val_loss

## Data processing

In [6]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    """
    Preprocessing the data to the BERT model formatting
    """
    len_st = len(selected_text)
    idx0 = None
    idx1 = None
    for ind in (i for i, e in enumerate(tweet) if e == selected_text[0]):
        if tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids[1:-1]
    tweet_offsets = tok_tweet.offsets[1:-1]
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 3893,
        'negative': 4997,
        'neutral': 8699
    }
    
    input_ids = [101] + [sentiment_id[sentiment]] + [102] + input_ids_orig + [102]
    token_type_ids = [0, 0, 0] + [1] * (len(input_ids_orig) + 1) # +1 for the last [102]
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 3 + tweet_offsets + [(0, 0)]
    targets_start += 3
    targets_end += 3

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

## Data loader

In [7]:
class TweetDataset:
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        data = process_data(
            self.tweet[item], 
            self.selected_text[item], 
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

## The model

In [8]:
class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768*2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        bert_output = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        out = bert_output[2] 
        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out) # (batch_size, MAX_LEN, 2)

        start_logits, end_logits = logits.split(1, dim=-1) # (batch_size, MAX_LEN, 1), (batch_size, MAX_LEN, 1)
        start_logits = start_logits.squeeze(-1) # (batch_size, MAX_LEN)
        end_logits = end_logits.squeeze(-1) # (batch_size, MAX_LEN)

        return start_logits, end_logits

## Loss function

In [9]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = (start_loss + end_loss)
    return total_loss

## Training function

In [10]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    losses = AverageMeter()
    jaccards = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))
    
    for bi, d in enumerate(tk0):

        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        model.zero_grad()
        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        scheduler.step()

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            jaccard_score, _ = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            jaccard_scores.append(jaccard_score)

        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

## Evaluation function

In [11]:
def calculate_jaccard_score(
    original_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets,
    verbose=False):
    
    if idx_end < idx_start:
        idx_end = idx_start # or idx_start = idx_end
    
    filtered_output  = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "

    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    jac = jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output


def eval_fn(data_loader, model, device):
    model.eval()
    losses = AverageMeter()
    jaccards = AverageMeter()
    
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            outputs_start, outputs_end = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
            
            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, _ = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)

            jaccards.update(np.mean(jaccard_scores), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
    
    print(f"Jaccard = {jaccards.avg}")
    print(f"Loss = {losses.avg}")
    return jaccards.avg, losses.avg

## Training 

In [12]:
def run(fold):

    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )
    
    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(os.path.join(config.BERT_PATH, 'bert_config.json'))
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0, 
        num_training_steps=num_train_steps
    )
    es = EarlyStopping(patience=2, verbose=True)
    print(f"Training is Starting for fold={fold}")
    
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard, loss = eval_fn(valid_data_loader, model, device)
        #print(f"Jaccard Score = {jaccard}")
        #print(f"Loss score = {loss}")
        es(loss, model, name=config.MODEL_PATH / f"model_{fold}.bin")
        
        if es.early_stop:
            print("Early stopping")
            break
  
    return es.val_loss_min

## Run training

In [13]:
%%time
val_loss = []
for ifold in range(5):
    q = run(ifold)
    val_loss.append(q)
print(f'Mean val loss: {np.mean(val_loss)}')

  0%|          | 0/344 [00:00<?, ?it/s]

Training is Starting for fold=0


100%|██████████| 344/344 [02:34<00:00,  2.23it/s, jaccard=0.643, loss=2.43]
100%|██████████| 344/344 [00:15<00:00, 21.81it/s, jaccard=0.685, loss=1.9] 


Jaccard = 0.6851445427410803
Loss = 1.899860461592696
Validation loss decreased (inf --> 1.899860).  Saving model ...


100%|██████████| 344/344 [02:34<00:00,  2.23it/s, jaccard=0.697, loss=1.75]
100%|██████████| 344/344 [00:15<00:00, 21.85it/s, jaccard=0.694, loss=1.87]


Jaccard = 0.694296586079748
Loss = 1.8722019794639162
Validation loss decreased (1.899860 --> 1.872202).  Saving model ...


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.729, loss=1.5] 
100%|██████████| 344/344 [00:15<00:00, 21.93it/s, jaccard=0.694, loss=1.95]
  0%|          | 0/344 [00:00<?, ?it/s]

Jaccard = 0.6936610783015128
Loss = 1.9452155916522107
EarlyStopping counter: 1 out of 2


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.764, loss=1.26]
100%|██████████| 344/344 [00:15<00:00, 21.95it/s, jaccard=0.687, loss=2.11]


Jaccard = 0.6874403715006004
Loss = 2.105886080524413
EarlyStopping counter: 2 out of 2
Early stopping


  0%|          | 0/344 [00:00<?, ?it/s]

Training is Starting for fold=1


100%|██████████| 344/344 [02:34<00:00,  2.22it/s, jaccard=0.643, loss=2.45]
100%|██████████| 344/344 [00:15<00:00, 21.86it/s, jaccard=0.687, loss=1.89]


Jaccard = 0.6869452237662096
Loss = 1.8931933581362816
Validation loss decreased (inf --> 1.893193).  Saving model ...


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.697, loss=1.74]
100%|██████████| 344/344 [00:15<00:00, 21.95it/s, jaccard=0.691, loss=1.87]


Jaccard = 0.6912907146401235
Loss = 1.8667405604101732
Validation loss decreased (1.893193 --> 1.866741).  Saving model ...


100%|██████████| 344/344 [02:34<00:00,  2.23it/s, jaccard=0.732, loss=1.48]
100%|██████████| 344/344 [00:15<00:00, 21.66it/s, jaccard=0.691, loss=1.93]
  0%|          | 0/344 [00:00<?, ?it/s]

Jaccard = 0.6908770203294092
Loss = 1.9272176964794352
EarlyStopping counter: 1 out of 2


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.77, loss=1.22] 
100%|██████████| 344/344 [00:15<00:00, 21.85it/s, jaccard=0.688, loss=2.13]


Jaccard = 0.6880206895097066
Loss = 2.1280241215990223
EarlyStopping counter: 2 out of 2
Early stopping


  0%|          | 0/344 [00:00<?, ?it/s]

Training is Starting for fold=2


100%|██████████| 344/344 [02:34<00:00,  2.22it/s, jaccard=0.647, loss=2.41]
100%|██████████| 344/344 [00:15<00:00, 21.89it/s, jaccard=0.676, loss=1.9] 


Jaccard = 0.6763422605426532
Loss = 1.9016060155587824
Validation loss decreased (inf --> 1.901606).  Saving model ...


100%|██████████| 344/344 [02:34<00:00,  2.23it/s, jaccard=0.7, loss=1.74]  
100%|██████████| 344/344 [00:15<00:00, 21.93it/s, jaccard=0.683, loss=1.87]


Jaccard = 0.6830725408526428
Loss = 1.8656761168739633
Validation loss decreased (1.901606 --> 1.865676).  Saving model ...


100%|██████████| 344/344 [02:33<00:00,  2.23it/s, jaccard=0.734, loss=1.48]
100%|██████████| 344/344 [00:15<00:00, 21.95it/s, jaccard=0.687, loss=1.93]
  0%|          | 0/344 [00:00<?, ?it/s]

Jaccard = 0.6871837294247871
Loss = 1.9268038530056533
EarlyStopping counter: 1 out of 2


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.769, loss=1.21]
100%|██████████| 344/344 [00:15<00:00, 21.88it/s, jaccard=0.686, loss=2.12]


Jaccard = 0.6855625344364876
Loss = 2.123729288306955
EarlyStopping counter: 2 out of 2
Early stopping


  0%|          | 0/344 [00:00<?, ?it/s]

Training is Starting for fold=3


100%|██████████| 344/344 [02:34<00:00,  2.23it/s, jaccard=0.644, loss=2.5] 
100%|██████████| 344/344 [00:15<00:00, 21.82it/s, jaccard=0.679, loss=1.87]


Jaccard = 0.6790681498716744
Loss = 1.865243993369513
Validation loss decreased (inf --> 1.865244).  Saving model ...


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.698, loss=1.77]
100%|██████████| 344/344 [00:15<00:00, 21.87it/s, jaccard=0.689, loss=1.85]


Jaccard = 0.6890462069187815
Loss = 1.8466681400095417
Validation loss decreased (1.865244 --> 1.846668).  Saving model ...


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.728, loss=1.52]
100%|██████████| 344/344 [00:15<00:00, 21.97it/s, jaccard=0.691, loss=1.94]
  0%|          | 0/344 [00:00<?, ?it/s]

Jaccard = 0.6911166721074372
Loss = 1.940388842494147
EarlyStopping counter: 1 out of 2


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.761, loss=1.27]
100%|██████████| 344/344 [00:15<00:00, 21.85it/s, jaccard=0.69, loss=2.07] 


Jaccard = 0.6902120995524947
Loss = 2.074903417544699
EarlyStopping counter: 2 out of 2
Early stopping


  0%|          | 0/344 [00:00<?, ?it/s]

Training is Starting for fold=4


100%|██████████| 344/344 [02:34<00:00,  2.23it/s, jaccard=0.646, loss=2.42]
100%|██████████| 344/344 [00:15<00:00, 21.96it/s, jaccard=0.686, loss=1.92]


Jaccard = 0.6855133528239351
Loss = 1.9198039857521911
Validation loss decreased (inf --> 1.919804).  Saving model ...


100%|██████████| 344/344 [02:34<00:00,  2.23it/s, jaccard=0.695, loss=1.75]
100%|██████████| 344/344 [00:15<00:00, 21.95it/s, jaccard=0.695, loss=1.83]


Jaccard = 0.6953161404959611
Loss = 1.83362769072156
Validation loss decreased (1.919804 --> 1.833628).  Saving model ...


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.729, loss=1.49]
100%|██████████| 344/344 [00:15<00:00, 21.92it/s, jaccard=0.69, loss=1.88] 
  0%|          | 0/344 [00:00<?, ?it/s]

Jaccard = 0.6898779082432401
Loss = 1.8771422424163735
EarlyStopping counter: 1 out of 2


100%|██████████| 344/344 [02:33<00:00,  2.24it/s, jaccard=0.765, loss=1.24]
100%|██████████| 344/344 [00:15<00:00, 21.86it/s, jaccard=0.687, loss=2.04]

Jaccard = 0.687461169745305
Loss = 2.038793492733576
EarlyStopping counter: 2 out of 2
Early stopping
Mean val loss: 1.856982897495831



