In [1]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import RobertaModel, RobertaConfig, BertTokenizer
import utils
from tqdm.autonotebook import tqdm

warnings.filterwarnings('ignore')

In [2]:
def seed_everything(seed_value):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and, when available,
    all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN into its
    deterministic mode.

    Args:
        seed_value: integer seed shared by all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # The cuDNN flags are plain process-wide settings, so they are set even
    # without a GPU. benchmark must be False: the auto-tuner may pick a
    # different convolution algorithm on every run, which defeats
    # `deterministic = True`.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed = 43
seed_everything(seed)

In [3]:
class TweetDataset(torch.utils.data.Dataset):
    """Tokenized (sentiment, tweet) pairs for span extraction.

    Every item is a dict holding the padded token ids, the attention mask,
    the lower-cased / whitespace-normalized tweet, the per-token character
    offsets into that tweet and the sentiment string.  When the frame has a
    ``selected_text`` column, the start / end token indices of the target
    span and the normalized target text are added as well.

    Args:
        df: frame with ``text`` and ``sentiment`` columns (and optionally
            ``selected_text``).
        max_len: fixed length every encoded sequence is padded or truncated to.
    """

    def __init__(self, df, max_len=128):
        self.df = df
        self.max_len = max_len
        # Column membership test: True for the training frame only.
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file='../input/roberta-base/vocab.json',
            merges_file='../input/roberta-base/merges.txt',
            lowercase=True,
            add_prefix_space=True)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        ids, masks, tweet, offsets = self.get_input_data(row)
        data = {
            'ids': ids,
            'masks': masks,
            'tweet': tweet,
            'offsets': offsets,
            'sentiment': row['sentiment'],
        }

        if self.labeled:
            start_idx, end_idx, selected_text = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx
            data['selected_text'] = selected_text

        return data

    def __len__(self):
        return len(self.df)

    def get_input_data(self, row):
        """Encode one row as ``0 sentiment 2 2 tweet 2`` padded with ``1``.

        The ids 0 / 2 / 1 are presumably RoBERTa's ``<s>`` / ``</s>`` /
        ``<pad>`` -- confirm against the vocab file.

        Returns:
            ``(ids, masks, tweet, offsets)``: ``ids`` and ``masks`` are 1-D
            long tensors, ``offsets`` is a ``(len(ids), 2)`` tensor of
            character spans into ``tweet`` (``(0, 0)`` for every non-tweet
            token).  All three have length ``max_len`` as long as ``max_len``
            can hold the prefix and the closing token.
        """
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids

        # Leading token + sentiment token(s) + the two separators.  Derived
        # from the actual sentiment length so offsets stay aligned with ids
        # even if a sentiment string encodes to more than one token.
        prefix_len = len(sentiment_id) + 3
        # Truncate the tweet so prefix + tweet + closing token fit max_len;
        # otherwise samples have different lengths and cannot be batched.
        budget = max(self.max_len - prefix_len - 1, 0)
        tweet_ids = list(encoding.ids)[:budget]
        tweet_offsets = list(encoding.offsets)[:budget]

        ids = [0] + sentiment_id + [2, 2] + tweet_ids + [2]
        offsets = [(0, 0)] * prefix_len + tweet_offsets + [(0, 0)]

        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [1] * pad_len
            offsets += [(0, 0)] * pad_len

        ids = torch.tensor(ids)
        masks = (ids != 1).long()  # 1 on real tokens, 0 on padding
        offsets = torch.tensor(offsets)

        return ids, masks, tweet, offsets

    def get_target_idx(self, row, tweet, offsets):
        """Locate the tokens that overlap the row's selected text.

        Args:
            row: object exposing ``selected_text``.
            tweet: normalized tweet as returned by ``get_input_data``.
            offsets: per-token ``(start, end)`` character spans into ``tweet``.

        Returns:
            ``(start_idx, end_idx, selected_text)`` -- first and last token
            index overlapping the first occurrence of the selection, and the
            normalized selection (with its leading space).

        Raises:
            ValueError: if the selection is empty, does not occur in the
                tweet, or overlaps no token (for instance because it was
                truncated away).
        """
        selected_text = " " + " ".join(row.selected_text.lower().split())
        needle = selected_text[1:]

        # First occurrence of the selection in the tweet, at character level.
        char_start = tweet.find(needle) if needle else -1

        char_targets = [0] * len(tweet)
        if char_start >= 0:
            char_targets[char_start:char_start + len(needle)] = [1] * len(needle)

        target_idx = []
        for j, (offset1, offset2) in enumerate(offsets):
            if any(char_targets[int(offset1):int(offset2)]):
                target_idx.append(j)

        if not target_idx:
            raise ValueError(
                "selected_text {!r} could not be aligned with any token of tweet {!r}".format(
                    selected_text, tweet))

        return target_idx[0], target_idx[-1], selected_text
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Build the training and validation loaders for one fold.

    Args:
        df: full training frame.
        train_idx: positional row indices of the training split.
        val_idx: positional row indices of the validation split.
        batch_size: batch size shared by both loaders.

    Returns:
        ``{"train": DataLoader, "val": DataLoader}``.  The training loader
        shuffles and drops the last incomplete batch; the validation loader
        keeps the original order and every sample.
    """
    make_loader = torch.utils.data.DataLoader

    training = make_loader(
        TweetDataset(df.iloc[train_idx]),
        batch_size=batch_size,
        shuffle=True,
        num_workers=8,
        drop_last=True)

    validation = make_loader(
        TweetDataset(df.iloc[val_idx]),
        batch_size=batch_size,
        shuffle=False,
        num_workers=2)

    return {"train": training, "val": validation}

def get_test_loader(df, batch_size=32):
    """Wrap ``df`` in a ``TweetDataset`` and return an order-preserving loader.

    Args:
        df: frame to run inference on.
        batch_size: number of samples per batch.
    """
    dataset = TweetDataset(df)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2)

In [4]:
class TweetModel(nn.Module):
    """RoBERTa encoder with a linear head predicting span start / end logits.

    The head is applied to the element-wise maximum of the last three
    hidden-state tensors returned by the encoder.
    """

    def __init__(self):
        super().__init__()

        # output_hidden_states=True makes the encoder return the full tuple of
        # hidden states, which forward() pools over.
        config = RobertaConfig.from_pretrained(
            '../input/roberta-base/config.json', output_hidden_states=True)
        self.roberta = RobertaModel.from_pretrained(
            '../input/roberta-base/pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE: the second positional argument of normal_ is the *mean*, so the
        # bias is drawn from N(0.01, 1).  Each bias is the same constant for
        # every token, so it does not change a softmax taken over the sequence.
        nn.init.normal_(self.fc.bias, 0.01)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, each of shape (batch, seq_len).

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: mask passed to the encoder alongside the ids.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: position 2 holds the hidden states
        # both for plain tuples and for ModelOutput objects, whereas unpacking
        # a ModelOutput would yield its key names.
        hs = outputs[2]

        x = torch.stack([hs[-1], hs[-2], hs[-3]])
        x = torch.max(x, 0)[0]  # element-wise max over the three tensors
        x = self.dropout(x)
        x = self.fc(x)
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [5]:
def position_loss(logits, positions):
    """Expected absolute distance between predicted and true token position.

    Args:
        logits: (batch, seq_len) weights over positions; the result is an
            expectation only if each row is a probability distribution.
        positions: true position per sample, shape (batch,) or (batch, 1).

    Returns:
        Scalar tensor: batch mean of ``sum_k |k - position| * logits[:, k]``.
    """
    # Built on the input's device/dtype instead of a hard-coded .cuda().
    index = torch.arange(logits.size(-1), device=logits.device, dtype=logits.dtype)
    # Weight every position by its distance to the target.  The previous
    # version computed this distance but then weighted by the raw index.
    distance = torch.abs(index - positions.reshape(-1, 1))
    return torch.mean(torch.sum(distance * logits, dim=-1))

def batch_out(a,b):
    """Per-sample outer product: ``out[n, i, j] = a[n, i] * b[n, j]``."""
    column = a.view(-1, a.size(1), 1)   # (batch, len_a, 1)
    row = b.view(-1, 1, b.size(1))      # (batch, 1, len_b)
    outer = torch.bmm(column, row)
    return outer

def broadcast_1d(a,b):
    """Repeat each per-sample value of ``a`` across ``b.size(1)`` columns."""
    n_rows = a.size(0)
    n_cols = b.size(1)
    as_column = a.view(n_rows, 1)
    return as_column.repeat(1, n_cols)

def broadcast_2d(a,b):
    """Tile each per-sample value of ``a`` into a square of side ``b.size(1)``."""
    n_rows = a.size(0)
    side = b.size(1)
    as_cell = a.view(n_rows, 1, 1)
    return as_cell.repeat(1, side, side)

def span_loss(start_logits, end_logits,start_positions, end_positions):
    """Expected absolute error of the predicted span length.

    Args:
        start_logits: (batch, seq_len) weights over start positions; the
            result is an expectation only if each row is a distribution.
        end_logits: (batch, seq_len) weights over end positions.
        start_positions: true start index per sample, (batch,) or (batch, 1).
        end_positions: true end index per sample, (batch,) or (batch, 1).

    Returns:
        Scalar tensor: batch mean of
        ``sum_{i,j} |(j - i + 1) - (end - start + 1)| * start[i] * end[j]``.
    """
    # Joint weight of every (start=i, end=j) pair; element-wise form of the
    # batched outer product.
    prob_matrix = start_logits.unsqueeze(2) * end_logits.unsqueeze(1)

    # Built on the input's device/dtype instead of a hard-coded .cuda().
    index = torch.arange(prob_matrix.size(-1), device=prob_matrix.device, dtype=prob_matrix.dtype)
    span_len = index.view(1, 1, -1) - index.view(1, -1, 1) + 1  # j - i + 1

    # Reshape to (batch, 1, 1) so the target length broadcasts over the
    # (i, j) grid; a (batch, 1) tensor would line up with the wrong axes.
    target_len = (end_positions - start_positions + 1).reshape(-1, 1, 1).to(prob_matrix.dtype)

    weights = torch.abs(span_len - target_len)
    return torch.mean(torch.sum(weights * prob_matrix, dim=(-2, -1)))

def expected_jaccard(start_logits, end_logits,start_positions, end_positions):
    """
    Expected token-level Jaccard between the predicted and the true span.

    Every (start=i, end=j) pair is weighted by start_logits[:, i] * end_logits[:, j]
    and scored with the Jaccard overlap between the index range [i, j] and the
    true range [S, E]; pairs with i > j score zero.

    start_logits: (batch, seq_len) weights over start positions. Despite the
        name, loss_fn passes softmax probabilities here.
    end_logits: (batch, seq_len) weights over end positions (same remark).
    start_positions: true start index S per sample; loss_fn passes it with
        shape (batch, 1), which the `idx <= start_positions` comparisons rely on.
    end_positions: true end index E per sample, same shape as start_positions.

    Returns a scalar tensor: the batch mean of the summed, weighted scores.
    """

    ZEROZ = torch.zeros_like(start_logits)
    ONEZ = torch.ones_like(start_logits)
    
    # p_2d[b, i, j] = start_logits[b, i] * end_logits[b, j]
    p_2d = batch_out(start_logits, end_logits)
    ZEROZ_2 = torch.zeros_like(p_2d)
    ONEZ_2 = torch.ones_like(p_2d)
    
    # x_j[b, i, j] = j (candidate end), x_i[b, i, j] = i (candidate start)
    x_j = (torch.cumsum(ONEZ_2, dim = -1) - 1)
    x_i = torch.transpose(x_j, 1, 2)

    # Length of the candidate span; <= 0 when the end precedes the start
    x_len_j_i = x_j - x_i + 1
    
    # True start / end tiled over the whole (i, j) grid, as floats
    S = ONEZ_2* broadcast_2d(start_positions,start_logits)
    E = ONEZ_2* broadcast_2d(end_positions,end_logits)

    # 1 where i <= j (a valid span), 0 elsewhere
    I_i_le_j = torch.where(x_len_j_i > 0, ONEZ_2, ZEROZ_2)

    # idx[b, k] = k
    idx = torch.cumsum(ONEZ, dim = -1) - 1
    #Calculate area 1 (i <= S, j <= E)
    # intersection = j - S + 1 (zero if j < S), union = E - i + 1
    
    I_1 = batch_out(torch.where(idx <= start_positions, ONEZ, ZEROZ), 
                    torch.where(idx <= end_positions, ONEZ, ZEROZ)) * I_i_le_j

    J_1 = torch.where((x_j - S + 1) > 0, x_j - S + 1.0, ZEROZ_2) * I_1 
    J_1 = torch.where(E - x_i + 1 > 0, J_1 / (E - x_i + 1), ZEROZ_2)

    #Calculate area 2 (i <= S, j > E)
    # candidate contains the true span: intersection = E - S + 1, union = j - i + 1
    I_2 = batch_out(torch.where(idx <= start_positions, ONEZ, ZEROZ), 
                    torch.where(idx > end_positions, ONEZ, ZEROZ)) * I_i_le_j

    J_2 = E - S + 1
    J_2 = torch.where(x_j - x_i + 1 > 0, J_2 / (x_j - x_i + 1), ZEROZ_2) * I_2

    #Calculate area 3 (i > S, j <= E)
    # candidate lies inside the true span: intersection = j - i + 1, union = E - S + 1
    I_3 = batch_out(torch.where(idx > start_positions, ONEZ, ZEROZ), 
                    torch.where(idx <= end_positions, ONEZ, ZEROZ)) * I_i_le_j

    J_3 = torch.where(x_j - x_i + 1 > 0, x_j - x_i + 1, ZEROZ_2)
    J_3 = torch.where(E - S + 1 > 0, J_3 / (E - S + 1), ZEROZ_2) * I_3

    #Calculate area 4 (i > S, j > E)
    # intersection = E - i + 1 (zero if i > E), union = j - S + 1
    I_4 = batch_out(torch.where(idx > start_positions, ONEZ, ZEROZ), 
                    torch.where(idx > end_positions, ONEZ, ZEROZ)) * I_i_le_j

    J_4 = torch.where(E - x_i + 1 > 0, E - x_i + 1, ZEROZ_2)
    J_4 = torch.where(x_j - S + 1 > 0, J_4 / (x_j - S + 1), ZEROZ_2) * I_4

    #Adding up together and take expectation
    # The four areas are disjoint, so each (i, j) cell gets exactly one score.
    J = J_1 + J_2 + J_3 + J_4
    E_J = J*p_2d

    return torch.mean(torch.sum(E_J, axis=(-2,-1)))

In [14]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Cross-entropy on both span ends minus 5x the expected Jaccard.

    Args:
        start_logits: (batch, seq_len) raw start scores.
        end_logits: (batch, seq_len) raw end scores.
        start_positions: true start index per sample.
        end_positions: true end index per sample.

    Returns:
        Scalar tensor ``ce(start) + ce(end) - 5 * expected_jaccard``.
    """
    cross_entropy = nn.functional.cross_entropy
    boundary_loss = (cross_entropy(start_logits, start_positions)
                     + cross_entropy(end_logits, end_positions))

    # expected_jaccard works on probabilities and column-shaped targets.
    start_probs = torch.softmax(start_logits, dim=1)
    end_probs = torch.softmax(end_logits, dim=1)
    start_column = start_positions.view(start_positions.size(0), 1)
    end_column = end_positions.view(end_positions.size(0), 1)

    overlap = expected_jaccard(start_probs, end_probs, start_column, end_column)

    # The overlap is a reward, hence the minus sign.
    return boundary_loss - 5. * overlap

In [15]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the text covered by tokens ``start_idx..end_idx`` (inclusive).

    Token substrings are concatenated; a single space is inserted after a
    token whenever the following token (if any) starts past this token's end.
    Returns an empty string when ``start_idx > end_idx``.
    """
    pieces = []
    token_count = len(offsets)
    for pos in range(start_idx, end_idx + 1):
        begin, finish = offsets[pos][0], offsets[pos][1]
        pieces.append(text[begin:finish])
        following = pos + 1
        if following < token_count and finish < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)

def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting word sets.

    Returns:
        A float in [0, 1].  When neither string contains a word the union is
        empty; the two (empty) sets are identical, so 1.0 is returned instead
        of raising ``ZeroDivisionError``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 1.0
    return float(len(c)) / union_size

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets, selected_text, sentiment=None):
    """Jaccard between the reference selection and the arg-max predicted span.

    The predicted span runs from the arg-max of ``start_logits`` to the
    arg-max of ``end_logits``; if the start lands after the end the whole
    ``text`` is used as the prediction.  ``start_idx``, ``end_idx`` and
    ``sentiment`` are accepted for call compatibility but not used.
    """
    first = np.argmax(start_logits)
    last = np.argmax(end_logits)

    prediction = get_selected_text(text, first, last, offsets) if first <= last else text

    return jaccard(selected_text, prediction)

In [16]:
def train_model(model, dataloaders_dict, criterion, optimizer, scheduler, num_epochs, filename):
    """Train ``model`` on the GPU and report its best validation Jaccard.

    Each epoch runs a ``'train'`` phase followed by a ``'val'`` phase, and the
    model weights are written to ``filename`` after every epoch (so the file
    holds the weights of the last epoch).

    Args:
        model: module called as ``model(ids, masks)`` returning
            ``(start_logits, end_logits)``.
        dataloaders_dict: ``{'train': loader, 'val': loader}``; batches are
            dicts with the keys read below.
        criterion: ``criterion(start_logits, end_logits, start_idx, end_idx)``
            returning a scalar loss tensor.
        optimizer: optimizer stepped once per training batch.
        scheduler: optional LR scheduler stepped once per training batch, or
            ``None`` to keep the learning rate fixed.
        num_epochs: number of epochs.
        filename: path the state dict is saved to.

    Returns:
        The highest epoch-level validation Jaccard seen (0 if there were no
        epochs).
    """
    model.cuda()
    max_jaccard = 0
    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            loader = dataloaders_dict[phase]
            # utils.AverageMeter is project-local; only .update(value, n) and
            # .avg are used here, to feed the progress bar.
            losses = utils.AverageMeter()
            jaccards = utils.AverageMeter()
            tk0 = tqdm(loader, total=len(loader))
            epoch_loss = 0.0
            epoch_jaccard = 0.0
            # Samples actually processed: a loader built with drop_last=True
            # yields fewer samples than len(loader.dataset).
            samples_seen = 0

            for data in tk0:
                ids = data['ids'].cuda()
                masks = data['masks'].cuda()
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                sentiment = data['sentiment']
                start_idx = data['start_idx'].cuda()
                end_idx = data['end_idx'].cuda()
                selected_text = data['selected_text']
                batch_len = len(ids)

                optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    start_logits, end_logits = model(ids, masks)
                    loss = criterion(start_logits, end_logits, start_idx, end_idx)

                    if is_train:
                        loss.backward()
                        optimizer.step()
                        if scheduler is not None:
                            scheduler.step()

                batch_loss = loss.item()
                epoch_loss += batch_loss * batch_len
                samples_seen += batch_len

                start_idx = start_idx.cpu().detach().numpy()
                end_idx = end_idx.cpu().detach().numpy()
                start_probs = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                end_probs = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

                jaccard_scores = []
                for i in range(batch_len):
                    jaccard_score = compute_jaccard_score(
                        tweet[i],
                        start_idx[i],
                        end_idx[i],
                        start_probs[i],
                        end_probs[i],
                        offsets[i],
                        selected_text[i],
                        sentiment[i],
                    )
                    epoch_jaccard += jaccard_score
                    jaccard_scores.append(jaccard_score)

                jaccards.update(np.mean(jaccard_scores), batch_len)
                losses.update(batch_loss, batch_len)
                tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

            denominator = max(samples_seen, 1)  # guards an empty loader
            epoch_loss = epoch_loss / denominator
            epoch_jaccard = epoch_jaccard / denominator

            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))

            # Track the best validation score so the return value is meaningful
            # (it used to stay at its initial 0).
            if not is_train and epoch_jaccard > max_jaccard:
                max_jaccard = epoch_jaccard

        # Saved unconditionally after every epoch, independent of the score.
        torch.save(model.state_dict(), filename)

    return max_jaccard

In [17]:
# Run configuration read by the training and inference cells below.
num_epochs = 4   # epochs per fold
batch_size = 32  # batch size passed to get_train_val_loaders
# Folds are stratified on the sentiment column in the training cell; the
# number of splits also fixes how many checkpoints inference loads.
skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=seed)

In [18]:
%%time

# Cross-validated fine-tuning: one fresh model per fold, one checkpoint each.
train_df = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
# Cast to str so missing values do not break .lower() / .split() downstream.
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)
fold_jaccard = 0

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df.sentiment), start=1): 
    print(f'Fold: {fold}')

    model = TweetModel()
    # Optimizer steps for this fold: only the fold's training rows count, and
    # the training loader drops the last incomplete batch.  Only needed when
    # the scheduler below is enabled.
    num_training_steps = (len(train_idx) // batch_size) * num_epochs
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # Pass the parameter groups: with model.parameters() the groups above were
    # ignored and AdamW's default weight decay hit every parameter, biases and
    # LayerNorm weights included.
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=3e-5, betas=(0.9, 0.999))
    scheduler = None # get_linear_schedule_with_warmup(optimizer, num_warmup_steps=8, num_training_steps=num_training_steps)
    criterion = loss_fn    
    dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx, batch_size)

    jac = train_model(
        model, 
        dataloaders_dict,
        criterion, 
        optimizer, 
        scheduler,
        num_epochs,
        f'roberta-finetune_l1crossloss2_fold{fold}.pth')
    
    fold_jaccard += jac
print('fold averaged jac = ', fold_jaccard/skf.get_n_splits())

Fold: 1


HBox(children=(FloatProgress(value=0.0, max=751.0), HTML(value='')))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.




Traceback (most recent call last):
  File "/home/qiao/.local/lib/python3.7/site-packages/IPython/core/magics/execution.py", line 1312, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 28, in <module>
  File "<ipython-input-16-33f044006b30>", line 17, in train_model
    for bi, data in enumerate(tk0):
  File "/home/qiao/.local/lib/python3.7/site-packages/tqdm/notebook.py", line 218, in __iter__
    for obj in super(tqdm_notebook, self).__iter__(*args, **kwargs):
  File "/home/qiao/.local/lib/python3.7/site-packages/tqdm/std.py", line 1129, in __iter__
    for obj in iterable:
  File "/home/qiao/.local/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 345, in __next__
    data = self._next_data()
  File "/home/qiao/.local/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 841, in _next_data
    idx, data = self._get_data()
  File "/home/qiao/.local/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 808, in _get_data
    suc

KeyboardInterrupt: 

In [19]:
%%time
# Ensemble inference: average the per-fold start/end probabilities, then take
# the arg-max span for every test tweet.
test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)
predictions = []
models = []
for fold in range(1, skf.n_splits + 1):
    model = TweetModel()
    model.cuda()
    # Same file name the training cell writes; the previous name
    # (roberta_fold{n}.pth) is never produced by this notebook.
    model.load_state_dict(torch.load(f'roberta-finetune_l1crossloss2_fold{fold}.pth'))
    model.eval()
    models.append(model)

for data in test_loader:
    ids = data['ids'].cuda()
    masks = data['masks'].cuda()
    tweet = data['tweet']
    offsets = data['offsets'].numpy()

    start_logits = []
    end_logits = []
    with torch.no_grad():
        for model in models:
            output = model(ids, masks)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

    # Mean probability over the fold models, per token.
    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)
    for i in range(len(ids)):    
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        if start_pred > end_pred:
            # Inconsistent span: fall back to the whole tweet.
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        predictions.append(pred)

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fc12ecb4950>
Traceback (most recent call last):
  File "/home/qiao/.local/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 962, in __del__
    self._shutdown_workers()
  File "/home/qiao/.local/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 942, in _shutdown_workers
    w.join()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 140, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.7/multiprocessing/popen_fork.py", line 48, in wait
    return self.poll(os.WNOHANG if timeout == 0.0 else 0)
  File "/usr/lib/python3.7/multiprocessing/popen_fork.py", line 28, in poll
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt: 
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/qiao/.local/lib/python3.7/site-packages/IPython/core/magics/execution.py", line 1312, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 7, in <module>
  File "<ipython-input-4-95a0299bb4db>", line 8, in __init__
    '../input/roberta-base/pytorch_model.bin', config=config)
  File "/home/qiao/.local/lib/python3.7/site-packages/transformers/modeling_utils.py", line 655, in from_pretrained
    model = cls(config, *model_args, **model_kwargs)
  File "/home/qiao/.local/lib/python3.7/site-packages/transformers/modeling_roberta.py", line 149, in __init__
    super().__init__(config)
  File "/home/qiao/.local/lib/python3.7/site-packages/transformers/modeling_bert.py", line 615, in __init__
    self.embeddings = BertEmbeddings(config)
  File "/home/qiao/.local/lib/python3.7/site-packages/transformers/modeling_bert.py", line 149, in __init__
    self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx

KeyboardInterrupt: 

In [20]:
def _collapse_punctuation(text):
    """Shorten repeated punctuation in single-word predictions only.

    The replacements run in the listed order.  The '..' pass comes before the
    '...' pass on purpose, to reproduce the original post-processing exactly.
    """
    if len(text.split()) != 1:
        return text
    for repeated, single in (('!!!!', '!'), ('..', '.'), ('...', '.')):
        text = text.replace(repeated, single)
    return text


sub_df = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
# Raises ValueError if inference did not produce one prediction per row.
sub_df['selected_text'] = predictions
sub_df['selected_text'] = sub_df['selected_text'].apply(_collapse_punctuation)
sub_df.to_csv('submission.csv', index=False)
sub_df.head()

ValueError: Length of values does not match length of index

In [None]:
# Sanity check: print every text / selected_text whose tokenizer round trip
# (encode -> decode) differs from the original after lower-casing and
# whitespace normalization.
td = TweetDataset(train_df)


def _normalized(value):
    """Lower-case ``value`` and collapse every whitespace run to one space."""
    return " ".join(value.lower().split())


def _round_trip(value):
    """Encode ``value`` with the dataset tokenizer and decode it again."""
    return td.tokenizer.decode(td.tokenizer.encode(value).ids)


for i in train_df.index:
    for column in ('text', 'selected_text'):
        original = train_df.loc[i, column]
        restored = _round_trip(original)
        if _normalized(restored) != _normalized(original):
            print(restored, original)