In [1]:
import pandas as pd, numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import utils

train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [2]:
def data_aug(train, multiple = 1):
    def aug(row):
        tweet = row['text']
        selected_text = row['selected_text']
        idx = tweet.find(selected_text) 
        if idx>=0:
            new_tweets = []
            prev = tweet[:idx].split()
            after = tweet[idx+len(selected_text):].split()
            pool = [(i,j) for i in range(len(prev)+1) for j in range(len(after)+1)]
            pool.remove((len(prev),len(after)))
            if len(pool) == 0:
                return None
            for r in np.random.choice(len(pool), multiple):
                r1, r2 = pool[r]
                start = ''
                end = ''
                if r1 > 0:
                    start =' '.join(prev[r1:]) + ' '
                if r2 > 0:
                    end = ' '+' '.join(after[:r2])
                
                new_tweets.append(start+selected_text+end)
            if len(new_tweets) > 0:
                return new_tweets
            return None
        else:
            return None
        
    train_aug = {'text':[],'selected_text':[], 'sentiment':[], 'textID':[]}
    for row in train.iterrows():
        row = row[1]
        new_tweets = aug(row)
        if new_tweets:
            for new_tweet in new_tweets:
                train_aug['text'].append(new_tweet)
                train_aug['selected_text'].append(row['selected_text'])
                train_aug['sentiment'].append(row['sentiment'])
                train_aug['textID'].append(row['textID'])
    train_aug = pd.DataFrame(train_aug).dropna()
    return train_aug

In [3]:
import tokenizers 
import transformers

ROBERTA_PATH = "../input/roberta-base"

TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json", 
        merges_file=f"{ROBERTA_PATH}/merges.txt", 
        lowercase=True,
        add_prefix_space=True
    )

model_config = transformers.BertConfig.from_pretrained(ROBERTA_PATH)
model_config.output_hidden_states = True

#device = torch.device("cpu")
device = torch.device("cuda")

MAX_LEN = 128
VOCAB_SIZE = 50000
EMBEDDING_DIM = 768
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
EPOCHS = 20

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    if len(selected_text) == 0:
        selected_text = " "
    if len(tweet) == 0:
        tweet = " "
        
    tweet = " " + " ".join(str(tweet).split(" "))
    selected_text = " " + " ".join(str(selected_text).split(" "))

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None
    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break
            
    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1

    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)

    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    input_ids = [0] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 3 + tweet_offsets + [(0, 0)]
    targets_start += 3
    targets_end += 3

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

In [5]:
class TweetModel(nn.Module):
    def __init__(self):
        super(TweetModel, self).__init__()
        self.roberta = transformers.RobertaModel.from_pretrained(ROBERTA_PATH, config=model_config)
        #self.embedding = self.roberta.get_input_embeddings()
        #self.lstm = nn.LSTM(EMBEDDING_DIM, MAX_LEN // 2, batch_first=True, bidirectional=True)
        self.drop_out = nn.Dropout(0.2)
        self.l0 = nn.Linear(EMBEDDING_DIM * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.05)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.view((-1,start_logits.size(1)))
        end_logits = end_logits.view((-1,end_logits.size(1)))

        return start_logits, end_logits

class TweetDataset:
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN
    
    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        data = process_data(
            self.tweet[item], 
            self.selected_text[item], 
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

In [6]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))
    
    for bi, d in enumerate(tk0):

        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        offsets = d["offsets"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        model.zero_grad()
        outputs_start, outputs_end = model(
            ids = ids,
            mask = mask,
            token_type_ids = token_type_ids,
        )
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        scheduler.step()

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            jaccard_score, _ = utils.calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px],
            )
            jaccard_scores.append(jaccard_score)

        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
        
    del ids
    del token_type_ids
    del mask
    del targets_start
    del targets_end
    torch.cuda.empty_cache()
    
def eval_fn(data_loader, model_pos, model_neg, device):
    model_pos.eval()
    model_neg.eval()
    losses = utils.AverageMeter()
    jaccards_pos = utils.AverageMeter()
    jaccards_neg = utils.AverageMeter()
    jaccards_full = utils.AverageMeter()

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)
            outputs_start, outputs_end = model_pos(
                ids=ids,
                mask = mask,
                token_type_ids = token_type_ids,
            )
            if sentiment == 'negative':
                outputs_start, outputs_end = model_neg(
                ids=ids,
                mask = mask,
                token_type_ids = token_type_ids,
                )
            loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
            jaccard_scores_pos = []
            jaccard_scores_neg = []
            jaccard_scores_full = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                if tweet_sentiment == 'neutral':
                    jaccard_score = utils.jaccard(tweet, selected_tweet)
                else:
                    jaccard_score, _ = utils.calculate_jaccard_score(
                        original_tweet=tweet,
                        target_string=selected_tweet,
                        sentiment_val=tweet_sentiment,
                        idx_start=np.argmax(outputs_start[px, :]),
                        idx_end=np.argmax(outputs_end[px, :]),
                        offsets=offsets[px]
                    )
                    if tweet_sentiment == 'positive':
                        jaccard_scores_pos.append(jaccard_score)
                    if tweet_sentiment == 'negative':
                        jaccard_scores_neg.append(jaccard_score)
                jaccard_scores_full.append(jaccard_score)
            jaccards_pos.update(np.mean(jaccard_scores_pos), ids.size(0))
            jaccards_neg.update(np.mean(jaccard_scores_neg), ids.size(0))
            jaccards_full.update(np.mean(jaccard_scores_full), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            tk0.set_postfix(loss=losses.avg, jaccard_pos=jaccards_pos.avg,jaccard_neg=jaccards_neg.avg,jaccard_full=jaccards_full.avg)
    
    return jaccards_full.avg

def loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = 0
    #if (start_loss + end_loss) > 0:
    #    total_loss = (start_loss * end_loss)/(start_loss + end_loss)
    total_loss = (start_loss + end_loss)
    return total_loss

In [7]:
from sklearn.model_selection import train_test_split
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

indexes = []
skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(train['text'],train['sentiment']):
    indexes.append((train_index, test_index))

def run(fold=0):
    train_index, test_index = indexes[fold]
    df_train, df_valid = train.iloc[train_index], train.iloc[test_index]
    
    def prepare(sentiment="neutral", df_train=df_train):
        df_train = df_train[df_train['sentiment']==sentiment]
        df_train = pd.concat([df_train, data_aug(df_train,0)], axis=0, sort=False)
    
        train_dataset = TweetDataset(
            tweet=df_train.text.values,
            sentiment=df_train.sentiment.values,
            selected_text=df_train.selected_text.values
        )

        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=TRAIN_BATCH_SIZE,
            num_workers=8
        )
        model = TweetModel()
        model.to(device)

        num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_parameters, lr=5e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=0, 
            num_training_steps=num_train_steps
        )
    
        return train_data_loader, model, optimizer, scheduler
    
    
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=2
    )
    
    train_data_loader_pos, model_pos, optimizer_pos, scheduler_pos = prepare(sentiment="positive",df_train=df_train)
    train_data_loader_neg, model_neg, optimizer_neg, scheduler_neg = prepare(sentiment="negative",df_train=df_train)

    es = utils.EarlyStopping(patience=2)
    print(f"Training is Starting for fold={fold}")  

    for epoch in range(EPOCHS):
        train_fn(train_data_loader_pos, model_pos, optimizer_pos, device, scheduler=scheduler_pos)
        train_fn(train_data_loader_neg, model_neg, optimizer_neg, device, scheduler=scheduler_neg)
        jaccard = eval_fn(valid_data_loader, model_pos, model_neg, device)
        print(f"Jaccard Score = {jaccard}")
        es(jaccard, model_neg)
        if es.early_stop:
            print("Early stopping")
            break
    torch.save(model_pos.state_dict(), f"model_pos_{fold}.bin")    
    torch.save(model_neg.state_dict(), f"model_neg_{fold}.bin")

In [8]:
run(0)
run(1)
run(2)
run(3)
run(4)

Training is Starting for fold=0


HBox(children=(FloatProgress(value=0.0, max=430.0), HTML(value='')))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)





HBox(children=(FloatProgress(value=0.0, max=389.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)



Jaccard Score = 0.6638014460835406


HBox(children=(FloatProgress(value=0.0, max=430.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=389.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.6631262258392302
EarlyStopping counter: 1 out of 2


HBox(children=(FloatProgress(value=0.0, max=430.0), HTML(value='')))




KeyboardInterrupt: 

In [None]:
def submit():
    df_test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
    df_test.loc[:, "selected_text"] = df_test.text.values
    #device = torch.device("cpu")

    model1 = TweetModel()
    model1.to(device)
    model1.load_state_dict(torch.load("model_0.bin"))
    model1.eval()

    model2 = TweetModel()
    model2.to(device)
    model2.load_state_dict(torch.load("model_1.bin"))
    model2.eval()

    model3 = TweetModel()
    model3.to(device)
    model3.load_state_dict(torch.load("model_2.bin"))
    model3.eval()

    model4 = TweetModel()
    model4.to(device)
    model4.load_state_dict(torch.load("model_3.bin"))
    model4.eval()

    model5 = TweetModel()
    model5.to(device)
    model5.load_state_dict(torch.load("model_4.bin"))
    model5.eval()
    final_output = []

    test_dataset = TweetDataset(
            tweet=df_test.text.values,
            sentiment=df_test.sentiment.values,
            selected_text=df_test.selected_text.values
    )

    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
    )

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            outputs_start1, outputs_end1 = model1(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            outputs_start2, outputs_end2 = model2(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            outputs_start3, outputs_end3 = model3(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            outputs_start4, outputs_end4 = model4(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            outputs_start5, outputs_end5 = model5(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            outputs_start = (
                outputs_start1 
                + outputs_start2 
                + outputs_start3 
                + outputs_start4 
                + outputs_start5
            ) / 5
            outputs_end = (
                outputs_end1 
                + outputs_end2 
                + outputs_end3 
                + outputs_end4 
                + outputs_end5
            ) / 5

            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                if tweet_sentiment == 'neutral':
                    output_sentence = tweet
                else:
                    _, output_sentence = utils.calculate_jaccard_score(
                        original_tweet=tweet,
                        target_string=selected_tweet,
                        sentiment_val=tweet_sentiment,
                        idx_start=np.argmax(outputs_start[px, :]),
                        idx_end=np.argmax(outputs_end[px, :]),
                        offsets=offsets[px]
                )
                final_output.append(output_sentence)
        return final_output

In [None]:
def post_process(selected):
    return " ".join(set(selected.lower().split()))

final_output = submit()
sample = pd.read_csv("../output/sample_submission.csv")
sample.loc[:, 'selected_text'] = final_output

sample.selected_text = sample.selected_text.map(post_process)
sample.to_csv("../output/submission.csv", index=False)

sample.head()