In [1]:
import pandas as pd, numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import utils

# Load the competition training set, replacing missing values with empty strings.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')
# Preview the first rows of the loaded frame.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [2]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len, training=True):
    """Turn one tweet into fixed-length model inputs, token offsets and span targets.

    The encoded sequence is laid out as ``[0] + tweet_tokens + [2, 2] + [sentiment_token] + [2]``
    and right-padded with ``1`` up to ``max_len``.
    NOTE(review): the literal ids 0 / 2 and the pad value 1 look like RoBERTa special-token
    ids; confirm they are what the XLNet tokenizer used in this notebook expects.

    Args:
        tweet: raw tweet text.
        selected_text: target substring of the tweet; only read when ``training`` is true.
        sentiment: one of ``'positive'``, ``'negative'``, ``'neutral'``.
        tokenizer: object exposing ``encode(text)`` and ``convert_ids_to_tokens(ids)``.
        max_len: length of every returned array; must be at least 5 (the number of
            fixed tokens placed around the tweet).
        training: when true, ``targets_start`` / ``targets_end`` are derived from
            ``selected_text``; otherwise they stay 0.

    Returns:
        dict with keys ``ids``, ``masks``, ``token_type_ids`` (NumPy arrays of length
        ``max_len``), ``targets_start``, ``targets_end`` (ints), ``orig_tweet``,
        ``orig_selected``, ``sentiment`` and ``offsets`` (list of ``max_len``
        ``(start, end)`` character spans into the whitespace-normalised tweet).

    Raises:
        KeyError: if ``sentiment`` is not one of the three supported labels.
        ValueError: if ``max_len`` is smaller than 5.
    """
    n_special = 5  # [0] prefix + [2, 2] separator + sentiment token + trailing [2]
    if max_len < n_special:
        raise ValueError(f"max_len must be >= {n_special}, got {max_len}")
    if sentiment not in ('positive', 'negative', 'neutral'):
        raise KeyError(sentiment)

    input_ids = np.ones(max_len)
    masks = np.zeros(max_len)
    token_type_ids = np.zeros(max_len)
    targets_start = 0
    targets_end = 0
    tweet_offsets = [(0, 0) for _ in range(max_len)]

    # Only the requested sentiment needs to be encoded.
    s_tok = tokenizer.encode(sentiment)[0]

    # Normalise whitespace and add a leading space so the first word tokenises like the others.
    text1 = " " + " ".join(tweet.split())
    # Truncate instead of crashing when the tweet would not fit next to the fixed tokens.
    enc = list(tokenizer.encode(text1))[:max_len - n_special]
    used = len(enc) + n_special
    input_ids[:used] = [0] + enc + [2, 2] + [s_tok] + [2]
    masks[:used] = 1

    # Character offsets are needed to decode predictions, so build them in every mode
    # (not only when training). Each token is assumed to cover len(token) characters.
    cursor = 0
    for i, piece in enumerate(tokenizer.convert_ids_to_tokens(enc)):
        tweet_offsets[i] = (cursor, cursor + len(piece))
        cursor += len(piece)

    if training:
        text2 = " ".join(selected_text.split())
        idx = text1.find(text2)
        chars = np.zeros(len(text1))
        # find() returns -1 when the selection is absent; never use that as a slice start.
        if idx >= 0:
            chars[idx:idx + len(text2)] = 1
        toks = [
            i for i, (a, b) in enumerate(tweet_offsets[:len(enc)])
            if np.sum(chars[a:b]) > 0
        ]
        if len(toks) > 0:
            # +1 accounts for the single token placed before the tweet tokens.
            targets_start = toks[0] + 1
            targets_end = toks[-1] + 1

    return {
        'ids': input_ids,
        'masks': masks,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

In [3]:
def data_aug(train_sentiment, multiple = 1):
    """Build extra training rows by randomly trimming the context around each selection.

    For every row whose ``selected_text`` occurs in ``text``, ``multiple`` random
    (prefix-cut, suffix-cut) pairs are drawn with ``np.random.choice`` and a new tweet
    is assembled as ``prefix + selected_text + suffix``. Rows whose selection is not
    found, or that have no surrounding words, contribute nothing.

    Args:
        train_sentiment: DataFrame with ``text``, ``selected_text``, ``sentiment``
            and ``textID`` columns.
        multiple: number of variants drawn per eligible row.

    Returns:
        DataFrame with columns ``text``, ``selected_text``, ``sentiment``, ``textID``
        holding only the generated rows.
    """
    def _variants(record):
        # Returns a non-empty list of new tweet strings, or None when nothing was made.
        full_text = record['text']
        span = record['selected_text']
        pos = full_text.find(span)
        if pos < 0:
            return None

        left_words = full_text[:pos].split()
        right_words = full_text[pos + len(span):].split()
        n_left, n_right = len(left_words), len(right_words)
        # Every (left cut, right cut) pair except the (n_left, n_right) combination.
        candidates = [
            (i, j)
            for i in range(n_left + 1)
            for j in range(n_right + 1)
            if (i, j) != (n_left, n_right)
        ]
        if not candidates:
            return None

        generated = []
        for pick in np.random.choice(len(candidates), multiple):
            cut_left, cut_right = candidates[pick]
            prefix = (' '.join(left_words[cut_left:]) + ' ') if cut_left > 0 else ''
            suffix = (' ' + ' '.join(right_words[:cut_right])) if cut_right > 0 else ''
            generated.append(prefix + span + suffix)
        return generated if generated else None

    columns = {'text': [], 'selected_text': [], 'sentiment': [], 'textID': []}
    for _, record in train_sentiment.iterrows():
        fresh = _variants(record)
        if not fresh:
            continue
        copies = len(fresh)
        columns['text'].extend(fresh)
        columns['selected_text'].extend([record['selected_text']] * copies)
        columns['sentiment'].extend([record['sentiment']] * copies)
        columns['textID'].extend([record['textID']] * copies)
    return pd.DataFrame(columns).dropna()

In [4]:
import transformers
import tokenizers
# Run configuration shared by the cells below.
MAX_LEN = 108  # fixed length of every encoded sample
TRAINING_FILE = "../input/tweet-sentiment-extraction/train.csv"
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 3
XLNet_PATH = "../input/xlnet_cased/xlnet_base_cased/"
# Load the tokenizer from the same directory as the model weights (single source of truth).
TOKENIZER = transformers.XLNetTokenizer.from_pretrained(XLNet_PATH,
                                                        remove_space=False,
                                                        do_lower_case=True)

# Prefer the GPU, but fall back to CPU so the notebook still runs without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
class TweetModel(transformers.XLNetLMHeadModel):
    """XLNet encoder followed by a linear head that scores span start/end per token.

    The encoder weights are loaded from ``XLNet_PATH``. ``conv1d_1`` and ``conv1d_2``
    are created but are not used by ``forward``.
    """

    def __init__(self, conf):
        """Build the encoder and heads from the given XLNet configuration."""
        super().__init__(conf)
        self.xlnet = transformers.XLNetModel.from_pretrained(XLNet_PATH, config=conf)
        self.drop_out = nn.Dropout(p=0.3)
        # Two outputs per token: one start logit and one end logit.
        self.l0 = nn.Linear(in_features=768, out_features=2)
        nn.init.normal_(self.l0.weight, std=0.02)
        self.conv1d_1 = nn.Conv1d(in_channels=MAX_LEN, out_channels=64, kernel_size=3, padding=1)
        self.conv1d_2 = nn.Conv1d(in_channels=64, out_channels=2, kernel_size=3, padding=1)

    def forward(self, ids, masks, token_type_ids):
        """Return ``(start_logits, end_logits)`` for a batch of encoded tweets.

        The second encoder output is indexed as a sequence of per-layer tensors, so the
        config is presumably created with ``output_hidden_states=True`` (verify at the
        call site).
        """
        encoder_outputs = self.xlnet(
            ids,
            attention_mask=masks,
            token_type_ids=token_type_ids,
        )
        _, hidden_states = encoder_outputs

        # Average the last three entries of the hidden-state sequence.
        last_three = torch.stack(
            [hidden_states[-1], hidden_states[-2], hidden_states[-3]], dim=0
        )
        features = self.drop_out(last_three.mean(dim=0))

        span_logits = self.l0(features)
        start_logits, end_logits = (
            piece.squeeze(-1) for piece in span_logits.split(1, dim=-1)
        )
        return start_logits, end_logits

class TweetDataset:
    """Map-style dataset that encodes tweets on demand via ``process_data``."""

    # Keys of the processed sample that are converted to ``torch.long`` tensors.
    _LEADING_TENSOR_KEYS = ("ids", "masks", "token_type_ids", "targets_start", "targets_end")
    # Keys passed through unchanged.
    _RAW_KEYS = ("orig_tweet", "orig_selected", "sentiment")

    def __init__(self, tweet, sentiment, selected_text, training):
        """Store the parallel sequences and the notebook-level tokenizer settings."""
        self.tweet, self.sentiment, self.selected_text = tweet, sentiment, selected_text
        self.tokenizer, self.max_len = TOKENIZER, MAX_LEN
        self.training = training

    def __len__(self):
        """Number of samples, taken from the tweet sequence."""
        return len(self.tweet)

    def __getitem__(self, item):
        """Encode sample ``item`` and return it as a dict of tensors and raw fields."""
        sample = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len,
            self.training,
        )

        encoded = {
            key: torch.tensor(sample[key], dtype=torch.long)
            for key in self._LEADING_TENSOR_KEYS
        }
        for key in self._RAW_KEYS:
            encoded[key] = sample[key]
        encoded['offsets'] = torch.tensor(sample["offsets"], dtype=torch.long)
        return encoded

In [6]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Sum of the mean cross-entropy losses for the span start and span end.

    Args:
        start_logits, end_logits: per-position scores, one row per sample.
        start_positions, end_positions: target position index for each sample.

    Returns:
        Scalar tensor: start cross-entropy plus end cross-entropy.
    """
    criterion = nn.CrossEntropyLoss()
    pairs = ((start_logits, start_positions), (end_logits, end_positions))
    start_loss, end_loss = (criterion(logits, target) for logits, target in pairs)
    return start_loss + end_loss

In [7]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Run one training epoch and show running loss / Jaccard in the progress bar.

    Args:
        data_loader: iterable of batches shaped like ``TweetDataset.__getitem__`` output.
        model: module called as ``model(ids=..., masks=..., token_type_ids=...)`` and
            returning ``(start_logits, end_logits)``.
        optimizer: optimizer stepped once per batch.
        device: device the input tensors are moved to.
        scheduler: optional learning-rate scheduler stepped once per batch; skipped
            when ``None``.

    Returns:
        None. Metrics are only reported through the progress bar.
    """
    model.train()
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))

    for d in tk0:
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        offsets = d["offsets"]

        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        masks = d["masks"].to(device, dtype=torch.long)
        targets_start = d["targets_start"].to(device, dtype=torch.long)
        targets_end = d["targets_end"].to(device, dtype=torch.long)

        model.zero_grad()
        outputs_start, outputs_end = model(
            ids=ids,
            masks=masks,
            token_type_ids=token_type_ids,
        )
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        # The scheduler is optional (default None); only step it when one was supplied.
        if scheduler is not None:
            scheduler.step()

        # Score the batch with the most probable start/end position of each sample.
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            jaccard_score, _ = utils.calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px],
                tokenizer=TOKENIZER,
            )
            jaccard_scores.append(jaccard_score)

        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

In [8]:
def eval_fn(data_loader, model, device):
    """Evaluate the model on a loader without gradients and return the mean Jaccard.

    Args:
        data_loader: iterable of batches shaped like ``TweetDataset.__getitem__`` output.
        model: module returning ``(start_logits, end_logits)``.
        device: device the input tensors are moved to.

    Returns:
        The running average held by the Jaccard meter after the last batch.
    """
    model.eval()
    loss_meter = utils.AverageMeter()
    jaccard_meter = utils.AverageMeter()
    tensor_keys = ("ids", "token_type_ids", "masks", "targets_start", "targets_end")

    with torch.no_grad():
        progress = tqdm(data_loader, total=len(data_loader))
        for batch in progress:
            tweets = batch["orig_tweet"]
            selections = batch["orig_selected"]
            sentiments = batch["sentiment"]
            offsets = batch["offsets"].numpy()

            ids, token_type_ids, masks, targets_start, targets_end = (
                batch[key].to(device, dtype=torch.long) for key in tensor_keys
            )

            start_scores, end_scores = model(
                ids=ids,
                masks=masks,
                token_type_ids=token_type_ids
            )
            loss = loss_fn(start_scores, end_scores, targets_start, targets_end)

            # Most probable start / end position for every sample of the batch.
            best_start = torch.softmax(start_scores, dim=1).cpu().detach().numpy().argmax(axis=1)
            best_end = torch.softmax(end_scores, dim=1).cpu().detach().numpy().argmax(axis=1)

            batch_scores = [
                utils.calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selections[px],
                    sentiment_val=sentiments[px],
                    idx_start=best_start[px],
                    idx_end=best_end[px],
                    offsets=offsets[px],
                    tokenizer=TOKENIZER
                )[0]
                for px, tweet in enumerate(tweets)
            ]

            batch_size = ids.size(0)
            jaccard_meter.update(np.mean(batch_scores), batch_size)
            loss_meter.update(loss.item(), batch_size)
            progress.set_postfix(loss=loss_meter.avg, jaccard=jaccard_meter.avg)

    return jaccard_meter.avg

In [9]:
from sklearn.model_selection import train_test_split
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

# Five stratified folds over the sentiment label; each entry is (train_index, test_index).
skf = StratifiedKFold(n_splits=5)
indexes = list(skf.split(train['text'], train['sentiment']))

# Validation Jaccard of each fold, appended by run().
jac_folds = []
def run(fold=0):
    """Train and validate one cross-validation fold, then save its weights.

    Uses the precomputed ``indexes[fold]`` split, trains for up to ``EPOCHS`` epochs
    with early stopping on the validation Jaccard, appends the last validation score
    to ``jac_folds`` and writes the model state to ``XLNetmodel_{fold}.bin``.
    """
    def _loader(frame, batch_size, shuffle, workers):
        # Wrap a DataFrame slice in a TweetDataset (with targets) and a DataLoader.
        dataset = TweetDataset(
            tweet=frame.text.values,
            sentiment=frame.sentiment.values,
            selected_text=frame.selected_text.values,
            training=True
        )
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=workers,
        )

    fit_rows, holdout_rows = indexes[fold]
    df_train = train.iloc[fit_rows]
    df_valid = train.iloc[holdout_rows]
    # data_aug is called with multiple=0 here.
    df_train = pd.concat([df_train, data_aug(df_train, 0)], axis=0, sort=False)

    train_data_loader = _loader(df_train, TRAIN_BATCH_SIZE, True, 8)
    valid_data_loader = _loader(df_valid, VALID_BATCH_SIZE, False, 2)

    model_config = transformers.XLNetConfig.from_pretrained("../input/xlnet_cased/xlnet_base_cased/config.json")
    # forward() reads the per-layer hidden states, so they must be returned.
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)

    # Parameters whose name contains one of these tags get no weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    named_params = list(model.named_parameters())

    def _is_exempt(name):
        return any(tag in name for tag in no_decay)

    optimizer_parameters = [
        {'params': [p for n, p in named_params if not _is_exempt(n)], 'weight_decay': 0.0006},
        {'params': [p for n, p in named_params if _is_exempt(n)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=4e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    stopper = utils.EarlyStopping(patience=2)
    print(f"Training is Starting for fold={fold}")

    for _ in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        fold_jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {fold_jaccard}")
        stopper(fold_jaccard, model)
        if stopper.early_stop:
            print("Early stopping")
            break

    jac_folds.append(fold_jaccard)
    torch.save(model.state_dict(), f"XLNetmodel_{fold}.bin")

In [10]:
# Train the five folds in order, then report the mean of their validation scores.
for fold in range(5):
    run(fold)
print('folds mean = ', np.mean(jac_folds))

Training is Starting for fold=0


HBox(children=(FloatProgress(value=0.0, max=687.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.6988950205599564


HBox(children=(FloatProgress(value=0.0, max=687.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7071640970816213


HBox(children=(FloatProgress(value=0.0, max=687.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7082537812626892
Training is Starting for fold=1


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.6889587184918052


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7023902385226309


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7081705955958753
Training is Starting for fold=2


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.696553309773101


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7030382110631687


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7036598453726495
Training is Starting for fold=3


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.6970969686565768


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7075787210707901


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7103120878250555
Training is Starting for fold=4


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.6929098674034506


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7003127411942135


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))


Jaccard Score = 0.7024750791190862
folds mean =  0.7065742778350712


In [11]:
def check(train, verbose=0):
    """Mean Jaccard obtained when the encoded targets themselves are decoded.

    Each row is encoded with ``process_data`` and its own ``targets_start`` /
    ``targets_end`` are passed to ``utils.calculate_jaccard_score`` in place of
    model predictions.

    Args:
        train: DataFrame with ``text``, ``selected_text`` and ``sentiment`` columns.
        verbose: when truthy, print the tweet, the selection and the scorer's result
            for every row.

    Returns:
        ``np.mean`` of the first element of the scorer's result over all rows.
    """
    scores = []
    for label in train.index:
        tweet = train.loc[label, 'text']
        selected_text = train.loc[label, 'selected_text']
        sentiment = train.loc[label, 'sentiment']

        sample = process_data(tweet, selected_text, sentiment, TOKENIZER, MAX_LEN, training=True)
        result = utils.calculate_jaccard_score(
            original_tweet=sample['orig_tweet'],
            target_string=sample['orig_selected'],
            sentiment_val=sample['sentiment'],
            idx_start=sample['targets_start'],
            idx_end=sample['targets_end'],
            offsets=sample['offsets'],
            tokenizer=TOKENIZER
        )
        if verbose:
            print(tweet, selected_text, result)
        scores.append(result[0])
    return np.mean(scores)

In [12]:
def inference(n_folds=2):
    """Predict the test set with an ensemble of fold models and write ``submission.csv``.

    The softmax outputs of the fold models are averaged, the arg-max start/end
    positions are decoded with ``utils.calculate_jaccard_score`` and a few
    punctuation clean-ups are applied to single-word predictions.

    NOTE(review): the dataset is built with ``training=False`` and decoding relies on
    the per-token ``offsets``; verify that ``process_data`` fills them in that mode.

    Args:
        n_folds: number of saved fold checkpoints (``XLNetmodel_0.bin`` ...) to
            ensemble. Defaults to 2, the value previously hard-coded.

    Returns:
        The submission DataFrame that was written to disk.
    """
    def _tidy_single_word(text):
        # Collapse repeated punctuation, but only for one-word predictions.
        for old, new in (('!!!!', '!'), ('..', '.'), ('...', '.')):
            if len(text.split()) == 1:
                text = text.replace(old, new)
        return text

    df_test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
    df_test['text'] = df_test['text'].astype(str)
    model_config = transformers.XLNetConfig.from_pretrained("../input/xlnet_cased/xlnet_base_cased/config.json")
    model_config.output_hidden_states = True
    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=np.zeros(len(df_test)),  # placeholder; not read when training=False
        training=False
    )
    test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=32,
            shuffle=False,
            num_workers=2)

    models = []
    for fold in range(n_folds):
        model = TweetModel(conf=model_config)
        # map_location keeps loading working when the checkpoint was saved on another device.
        model.load_state_dict(torch.load(f'XLNetmodel_{fold}.bin', map_location=device))
        model.to(device)
        model.eval()
        models.append(model)

    predictions = []
    for data in test_loader:
        ids = data['ids'].to(device)
        masks = data['masks'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        tweet = data['orig_tweet']
        offsets = data['offsets'].numpy()
        sentiment = data['sentiment']

        start_probs = []
        end_probs = []
        with torch.no_grad():
            for model in models:
                output = model(ids, masks, token_type_ids)
                start_probs.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_probs.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

        # Average the per-model probabilities before picking positions.
        start_probs = np.mean(start_probs, axis=0)
        end_probs = np.mean(end_probs, axis=0)
        for i in range(len(ids)):
            start_pred = np.argmax(start_probs[i])
            end_pred = np.argmax(end_probs[i])
            _, pred = utils.calculate_jaccard_score(tweet[i],
                                    "",
                                    sentiment[i],
                                    start_pred,
                                    end_pred,
                                    offsets[i],
                                    tokenizer=TOKENIZER,
                                    verbose=False)
            predictions.append(pred)

    sub_df = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
    sub_df['selected_text'] = predictions
    sub_df['selected_text'] = sub_df['selected_text'].apply(_tidy_single_word)
    sub_df.to_csv('submission.csv', index=False)
    # Return the frame so the calling cell can display it (the old trailing head() was discarded).
    return sub_df

In [13]:
# Run test-set inference; this writes submission.csv to the working directory.
inference()