In [1]:
!pip install transformers
!pip install tokenizers



In [2]:
import os
from pathlib import Path
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

import string

In [3]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [4]:
class config:
    MAX_LEN = 128 # no text is longer than 128
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 16
    EPOCHS = 5
    BASE_PATH = Path("../")
    BERT_PATH = BASE_PATH  / "bert-base-uncased"
    BERT_NAME = "bert-base-uncased"
    MODEL_PATH = BASE_PATH  / "model_save/model_0415-1.bin"
    TRAINING_FILE = BASE_PATH / "input/train.csv"
    TESTING_FILE = BASE_PATH  / "input/test.csv"
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        os.path.join(BERT_PATH, 'bert-base-uncased-vocab.txt'),
        lowercase=True
    )

In [5]:
class AverageMeter:
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [6]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(config.BERT_NAME)
        #self.bert_drop = nn.Dropout(0.3)
        self.l0 = nn.Linear(768, 2)

    def forward(self, ids, mask, token_type_ids):
    #def forward(self, ids, mask, token_type_ids):
        # not using sentiment at all
        sequence_output, pooled_output = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # sequence_output:  (batch_size, num_tokens or MAX_LEN?, 768)
        # pooled_output: Last layer hidden-state of the first token of the sequence (classification token)
        logits = self.l0(sequence_output)
        # (batch_size, num_tokens, 2)

        # (batch_size, num_tokens, 1), (batch_size, num_tokens, 1)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        # (batch_size, num_tokens), (batch_size, num_tokens)

        return start_logits, end_logits

In [7]:
class TweetDataset:
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.max_len = config.MAX_LEN
        self.tokenizer = config.TOKENIZER

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item): # item is index value
        # remove redundant spaces
        tweet = " ".join(str(self.tweet[item]).split())
        selected_text = " ".join(str(self.selected_text[item]).split())

        len_sel_text = len(selected_text)
        idx0 = -1
        idx1 = -1
        # is it possible that there are two matches?
        for ind in (i for i, e in enumerate(tweet) if e == selected_text[0]):
            if tweet[ind: ind + len_sel_text] == selected_text:
                idx0 = ind
                idx1 = ind + len_sel_text - 1
                break

        # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        char_targets = [0] * len(tweet)
        if idx0 != -1 and idx1 != -1:
            for j in range(idx0, idx1 + 1):
                if tweet[j] != " ":
                    char_targets[j] = 1
        # [0,0,0,0,0,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0]

        # https://github.com/huggingface/tokenizers
        tok_tweet = self.tokenizer.encode(tweet)
        tok_tweet_tokens = tok_tweet.tokens
        tok_tweet_ids = tok_tweet.ids
        tok_tweet_offsets = tok_tweet.offsets[1:-1]

        # [Not clear] partial match?
        # need to check if there are partial words in selected_text
        targets = [0] * (len(tok_tweet_tokens) - 2) # -2 for cls, sep
        # [0,0,0,0,0,0,0]
        for j, (offset1, offset2) in enumerate(tok_tweet_offsets):
            if sum(char_targets[offset1:offset2]) > 0:
                targets[j] = 1
        # [0,1,1,1,0,0,0]
        targets = [0] + targets + [0] # cls, sep
        targets_start = [0] * len(targets) # [0,1,0,0,0,0,0]
        targets_end = [0] * len(targets)   # [0,0,0,1,0,0,0]

        non_zero = np.nonzero(targets)[0]
        if len(non_zero) > 0:
            targets_start[non_zero[0]] = 1
            targets_end[non_zero[-1]] = 1

        mask = [1] * len(tok_tweet_ids)
        token_type_ids = [0] * len(tok_tweet_ids)

        padding_len = self.max_len - len(tok_tweet_ids)
        ids = tok_tweet_ids + [0] * padding_len
        mask = mask + [0] * padding_len
        token_type_ids = token_type_ids + [0] * padding_len
        targets = targets + [0] * padding_len
        targets_start = targets_start + [0] * padding_len
        targets_end  = targets_end + [0] * padding_len

        sentiment = [1, 0, 0]
        if self.sentiment[item] == 'positive':
            sentiment = [0, 0, 1]
        if self.sentiment[item] == 'negative':
            sentiment = [0, 1, 0]

        return {
            'ids':torch.tensor(ids, dtype=torch.long),
            'mask':torch.tensor(mask, dtype=torch.long),
            'token_type_ids':torch.tensor(token_type_ids, dtype=torch.long),
            'targets':torch.tensor(targets, dtype=torch.long),
            'targets_start':torch.tensor(targets_start, dtype=torch.long),
            'targets_end':torch.tensor(targets_end, dtype=torch.long),
            'padding_len':torch.tensor(padding_len, dtype=torch.long),
            'tweet_tokens':" ".join(tok_tweet_tokens),
            'orig_tweet':self.tweet[item],
            'sentiment':torch.tensor(sentiment, dtype=torch.long),
            'orig_sentiment':self.sentiment[item],
            'orig_selected':self.selected_text[item]
        }

In [8]:
def loss_fn(o1, o2, t1, t2): # start_logits, end_logits, targets_start, targets_end
    l1 = nn.BCEWithLogitsLoss()(o1, t1)
    l2 = nn.BCEWithLogitsLoss()(o2, t2)
    return l1 + l2

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    losses = AverageMeter()
    tk0 = tqdm(data_loader, total=len(data_loader))

    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.float)
        targets_end = targets_end.to(device, dtype=torch.float)

        optimizer.zero_grad()
        start_logits, end_logits = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        loss = loss_fn(start_logits, end_logits, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        scheduler.step()
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg)


def eval_fn(data_loader, model, device):
    model.eval()
    fin_outputs_start = []
    fin_outputs_end = []
    fin_padding_lens = []
    fin_tweet_tokens = []
    fin_orig_sentiment = []
    fin_orig_selected = []
    fin_orig_tweet = []

    for bi, d in enumerate(data_loader):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        tweet_tokens = d["tweet_tokens"]
        padding_len = d["padding_len"]
        orig_sentiment = d["orig_sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)

        start_logits, end_logits = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        fin_outputs_start.append(torch.sigmoid(start_logits).cpu().detach().numpy())
        fin_outputs_end.append(torch.sigmoid(end_logits).cpu().detach().numpy())
        fin_padding_lens.extend(padding_len.cpu().detach().numpy().tolist())

        fin_tweet_tokens.extend(tweet_tokens)
        fin_orig_sentiment.extend(orig_sentiment)
        fin_orig_selected.extend(orig_selected)
        fin_orig_tweet.extend(orig_tweet)

    fin_outputs_start = np.vstack(fin_outputs_start)
    fin_outputs_end = np.vstack(fin_outputs_end)

    threshold = 0.3
    jaccards = []
    for j in range(len(fin_tweet_tokens)):
        target_string = fin_orig_selected[j]
        tweet_tokens = fin_tweet_tokens[j]
        padding_len = fin_padding_lens[j]
        original_tweet = fin_orig_tweet[j]
        sentiment = fin_orig_sentiment[j]

        if padding_len > 0:
            mask_start = fin_outputs_start[j,:][:-padding_len] >= threshold
            mask_end = fin_outputs_end[j,:][:-padding_len] >= threshold
        else:
            mask_start = fin_outputs_start[j,:] >= threshold
            mask_end = fin_outputs_end[j,:] >= threshold

        mask = [0]*len(mask_start)
        idx_start = np.nonzero(mask_start)[0]
        idx_end = np.nonzero(mask_end)[0]

        if len(idx_start) > 0:
            idx_start = idx_start[0]
            if len(idx_end) > 0:
                idx_end = idx_end[0]
            else:
                idx_end = idx_start
        else:
            idx_start = 0
            idx_end = 0

        for mj in range(idx_start, idx_end + 1):
            mask[mj] = 1

        output_tokens = [x for p,x in enumerate(tweet_tokens.split()) if mask[p] == 1]
        output_tokens = [x for x in output_tokens if x not in ('[CLS]', '[SEP]')]

        final_output = ""
        for ot in output_tokens:
            if ot.startswith('##'):
                final_output = final_output + ot[2:]
            elif len(ot) == 1 and ot in string.punctuation:
                final_output = final_output + ot
            else: final_output = final_output + " " + ot
        final_output = final_output.strip()
        # this can be improved 
        if sentiment == 'neutral' or len(original_tweet.split()) < 4:
            final_output = original_tweet
        jac = jaccard(target_string.strip(), final_output.strip())
        jaccards.append(jac)
    mean_jac = np.mean(jaccards)
    return mean_jac

In [9]:
def run():
    #dfx = pd.read_csv(config.TRAINING_FILE, nrows=100).dropna().reset_index(drop=True)
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard

In [10]:
%%time
run()

100%|██████████| 387/387 [02:45<00:00,  2.34it/s, loss=0.0557]


Jaccard Score = 0.5830396511523096


100%|██████████| 387/387 [02:45<00:00,  2.34it/s, loss=0.0298]


Jaccard Score = 0.6100185495117904


100%|██████████| 387/387 [02:45<00:00,  2.34it/s, loss=0.0271]


Jaccard Score = 0.6146267574918401


100%|██████████| 387/387 [02:45<00:00,  2.34it/s, loss=0.0256]


Jaccard Score = 0.6223177109121496


100%|██████████| 387/387 [02:45<00:00,  2.34it/s, loss=0.0244]


Jaccard Score = 0.6270922695832994
CPU times: user 10min 38s, sys: 3min 52s, total: 14min 30s
Wall time: 14min 40s
