In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import tokenizers
import re
import torch.nn as nn
import string
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn import model_selection

In [2]:
# config
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 16
    EPOCHS = 50
    BERT_PATH = 'bert-base-uncased'
    MODEL_PATH = 'model.bin'
    TRAINING_FILE = '../input/tweet-sentiment-extraction/train.csv'
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        '../input/berthub/assets/vocab.txt',
        lowercase = True
    )

In [3]:
# Dataset
class TweetDataset:
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.max_len = config.MAX_LEN
        self.tokenizer = config.TOKENIZER
    
    def __len__(self):
        return len(self.tweet)
    
    def __getitem__(self, item):
        tweet = " ".join(str(self.tweet[item]).split())
        selected_text = " ".join(str(self.selected_text[item]).split())

        len_sel_text = len(selected_text)
        idx0 = -1
        idx1 = -1
        
        # used to find the starting and the ending index of the text
        for ind in (i for i, e in enumerate(tweet) if e == selected_text[0]):
            if tweet[ind: ind + len_sel_text] == selected_text:
                idx0 = ind
                idx1 = ind + len_sel_text - 1
                break
       
        # have 1 if there`s character and 0 if there`s space
        char_targets = [0]*len(tweet)
        if idx0 != -1 and idx1 != -1:
            for j in range(idx0, idx1 + 1):
                if tweet[j] != " ":
                    char_targets[j] = 1
        
        # Encode
        tok_tweet = self.tokenizer.encode(tweet)
        tok_tweet_tokens = tok_tweet.tokens
        tok_tweet_ids = tok_tweet.ids
        tok_tweet_offsets = tok_tweet.offsets[1:-1]
        
        # Targets with -2 len for the tokens
        targets = [0] * (len(tok_tweet_tokens) - 2)

        # Checking if there`s characters between the offsets index
        for j, (offset1, offset2) in enumerate(tok_tweet_offsets):
            if sum(char_targets[offset1: offset2]) > 0:
                targets[j] = 1

        targets = [0] + targets + [0]
        targets_start = [0] * len(targets)
        targets_end = [0] * len(targets)

        # Indicies of non zero values
        non_zero = np.nonzero(targets)[0]

        if len(non_zero) > 0:
            targets_start[non_zero[0]] = 1
            targets_end[non_zero[-1]] = 1

        mask = [1]* len(tok_tweet_ids)
        token_type_ids = [0]* len(tok_tweet_ids)

        padding_len = self.max_len - len(tok_tweet_ids)
        ids = tok_tweet_ids + [0]*padding_len
        mask = mask + [0]*padding_len
        token_type_ids = token_type_ids + [0]*padding_len
        targets = targets + [0]*padding_len
        targets_start = targets_start + [0]*padding_len
        targets_end = targets_end + [0]*padding_len

        sentiment = [1, 0, 0]
        if self.sentiment[item] == 'positive':
            sentiment = [0, 0, 1]
        if self.sentiment[item] == 'negative':
            sentiment = [0, 1, 0]
        
        return {
            "ids" : torch.tensor(ids, dtype = torch.long),
            "mask" : torch.tensor(mask, dtype = torch.long),
            "token_type_ids" : torch.tensor(token_type_ids, dtype = torch.long),
            "targets" : torch.tensor(targets, dtype = torch.long),
            "targets_start" : torch.tensor(targets_start, dtype = torch.long),
            "targets_end" : torch.tensor(targets_end, dtype = torch.long),
            "padding_len" : torch.tensor(padding_len, dtype = torch.long),
            "sentiment" : torch.tensor(sentiment, dtype = torch.long),
            "orig_tweet" : self.tweet[item],
            "orig_sentiment" : self.sentiment[item],
            "tweet_tokens" : " ".join(tok_tweet_tokens),
            "orig_selected" : self.selected_text[item]
            }

In [4]:
# Testing the tweet dataset
df = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop = True)
dset = TweetDataset(
tweet = df.text.values,
sentiment = df.sentiment.values,
selected_text = df.selected_text.values)
print (dset[1])

{'ids': tensor([  101, 17111,  2080,  6517,  1045,  2097,  3335,  2017,  2182,  1999,
         2624,  5277,   999,   999,   999,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,

In [5]:
# Contraction Dictionary for the expansion
contractions_dict = {
    "ain`t": "am not", "aren`t": "are not", "can`t": "cannot", "can`t`ve": "cannot have", "`cause": "because",
    "could`ve": "could have", "couldn`t": "could not", "couldn`t`ve": "could not have", "didn`t": "did not", "doesn`t": "does not",
    "doesn’t": "does not", "don`t": "do not", "don’t": "do not", "hadn`t": "had not", "hadn`t`ve": "had not have", "hasn`t": "has not",
    "haven`t": "have not", "he`d": "he had", "he`d`ve": "he would have", "he`ll": "he will", "he`ll`ve": "he will have", "he`s": "he is",
    "how`d": "how did", "how`d`y": "how do you", "how`ll": "how will", "how`s": "how is", "i`d": "i would", "i`d`ve": "i would have",
    "i`ll": "i will", "i`ll`ve": "i will have", "i`m": "i am", "i`ve": "i have", "isn`t": "is not", "it`d": "it would", "it`d`ve": "it would have",
    "it`ll": "it will", "it`ll`ve": "it will have", "it`s": "it is", "let`s": "let us", "ma`am": "madam", "mayn`t": "may not","might`ve": "might have",
    "mightn`t": "might not", "mightn`t`ve": "might not have", "must`ve": "must have", "mustn`t": "must not", "mustn`t`ve": "must not have",
    "needn`t": "need not", "needn`t`ve": "need not have", "o`clock": "of the clock", "oughtn`t": "ought not", "oughtn`t`ve": "ought not have",
    "shan`t": "shall not","sha`n`t": "shall not", "shan`t`ve": "shall not have", "she`d": "she would", "she`d`ve": "she would have",
    "she`ll": "she will", "she`ll`ve": "she will have", "she`s": "she is", "should`ve": "should have", "shouldn`t": "should not",
    "shouldn`t`ve": "should not have", "so`ve": "so have", "so`s": "so is", "that`d": "that would", "that`d`ve": "that would have",
    "that`s": "that is", "there`d": "there would", "there`d`ve": "there would have", "there`s": "there is", "they`d": "they would",
    "they`d`ve": "they would have", "they`ll": "they will", "they`ll`ve": "they will have", "they`re": "they are", "they`ve": "they have",
    "to`ve": "to have", "wasn`t": "was not", "we`d": "we would", "we`d`ve": "we would have", "we`ll": "we will", "we`ll`ve": "we will have",
    "we`re": "we are", "we`ve": "we have", "weren`t": "were not", "what`ll": "what will", "what`ll`ve": "what will have", "what`re": "what are",
    "what`s": "what is", "what`ve": "what have", "when`s": "when is", "when`ve": "when have", "where`d": "where did", "where`s": "where is",
    "where`ve": "where have", "who`ll": "who will", "who`ll`ve": "who will have", "who`s": "who is", "who`ve": "who have", "why`s": "why is",
    "why`ve": "why have", "will`ve": "will have", "won`t": "will not", "won`t`ve": "will not have", "would`ve": "would have",
    "wouldn`t": "would not", "wouldn`t`ve": "would not have", "y`all": "you all", "y’all": "you all", "y`all`d": "you all would",
    "y`all`d`ve": "you all would have", "y`all`re": "you all are", "y`all`ve": "you all have", "you`d": "you would", "you`d`ve": "you would have",
    "you`ll": "you will", "you`ll`ve": "you will have", "you`re": "you are", "you`ve": "you have", "ain’t": "am not", "aren’t": "are not",
    "can’t": "cannot", "can’t’ve": "cannot have", "’cause": "because", "could’ve": "could have", "couldn’t": "could not", "couldn’t’ve": "could not have",
    "didn’t": "did not", "doesn’t": "does not", "don’t": "do not", "don’t": "do not", "hadn’t": "had not", "hadn’t’ve": "had not have",
    "hasn’t": "has not", "haven’t": "have not", "he’d": "he had", "he’d’ve": "he would have", "he’ll": "he will", "he’ll’ve": "he will have",
    "he’s": "he is", "how’d": "how did", "how’d’y": "how do you", "how’ll": "how will", "how’s": "how is", "i’d": "i would", "i’d’ve": "i would have",
    "i’ll": "i will", "i’ll’ve": "i will have", "i’m": "i am", "i`ve": "i have", "isn’t": "is not", "it’d": "it would", "it’d’ve": "it would have",
    "it’ll": "it will", "it’ll’ve": "it will have", "it’s": "it is", "let’s": "let us", "ma’am": "madam", "mayn’t": "may not",
    "might’ve": "might have", "mightn’t": "might not", "mightn’t’ve": "might not have", "must’ve": "must have", "mustn’t": "must not",
    "mustn’t’ve": "must not have", "needn’t": "need not", "needn’t’ve": "need not have", "o’clock": "of the clock",
    "oughtn’t": "ought not", "oughtn’t’ve": "ought not have", "shan’t": "shall not", "sha’n’t": "shall not", "shan’t’ve": "shall not have",
    "she’d": "she would", "she’d’ve": "she would have", "she’ll": "she will", "she’ll’ve": "she will have", "she’s": "she is",
    "should’ve": "should have", "shouldn’t": "should not", "shouldn’t’ve": "should not have", "so’ve": "so have", "so’s": "so is",
    "that’d": "that would", "that’d’ve": "that would have", "that’s": "that is", "there’d": "there would", "there’d’ve": "there would have",
    "there’s": "there is", "they’d": "they would", "they’d’ve": "they would have", "they’ll": "they will", "they’ll’ve": "they will have",
    "they’re": "they are", "they’ve": "they have", "to’ve": "to have", "wasn’t": "was not", "we’d": "we would", "we’d’ve": "we would have",
    "we’ll": "we will", "we’ll’ve": "we will have", "we’re": "we are", "we’ve": "we have", "weren’t": "were not", "what’ll": "what will",
    "what’ll’ve": "what will have", "what’re": "what are", "what’s": "what is", "what’ve": "what have", "when’s": "when is",
    "when’ve": "when have", "where’d": "where did", "where’s": "where is", "where’ve": "where have", "who’ll": "who will",
    "who’ll’ve": "who will have", "who’s": "who is", "who’ve": "who have","why’s": "why is", "why’ve": "why have", "will’ve": "will have",
    "won’t": "will not", "won’t’ve": "will not have", "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have",
    "y’all": "you all", "y’all": "you all", "y’all’d": "you all would", "y’all’d’ve": "you all would have", "y’all’re": "you all are",
    "y’all’ve": "you all have", "you’d": "you would", "you’d’ve": "you would have", "you’ll": "you will", "you’ll’ve": "you will have",
    "you’re": "you are", "you’re": "you are", "you’ve": "you have"
}

In [6]:
# Preprocessing
def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)

contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

def preprocessing(df):
    
    df['text'] = df['text'].astype(str)
    df['selected_text'] = df['selected_text'].astype(str)
    df['text'] = df['text'].str.lower()
    df['selected_text'] = df['selected_text'].str.lower()

    df['text'] = df['text'].apply(lambda x: expand_contractions(x))
    df['selected_text'] = df['selected_text'].apply(lambda x: expand_contractions(x))
    
    return df

In [7]:
# model
class TweetModel(nn.Module):
    def __init__(self):
        super(TweetModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH)
#         self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768, 2)
    
    def forward(self, ids, mask, token_type_ids):
        sequence_output, pooled_output = self.bert(
            ids,
            attention_mask=mask,
            return_dict=False,
            token_type_ids=token_type_ids
        )

        logits = self.l0(sequence_output)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [8]:
# utils
class AverageMeter:
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

class utils:
    def jaccard(str1, str2): 
        a = set(str1.lower().split()) 
        b = set(str2.lower().split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))

In [9]:
# engine

def loss_fn(o1, o2, t1, t2):
    l1 = nn.BCEWithLogitsLoss()(o1, t1)
    l2 = nn.BCEWithLogitsLoss()(o2, t2)
    return l1 + l2

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    losses = AverageMeter()
    tk0 = tqdm(data_loader, total = len(data_loader))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        
        ids = ids.to(device, dtype = torch.long)
        token_type_ids = token_type_ids.to(device, dtype = torch.long)
        mask = mask.to(device, dtype = torch.long)
        targets_start = targets_start.to(device, dtype = torch.float)
        targets_end = targets_end.to(device, dtype = torch.float)
        
        optimizer.zero_grad()
        
        o1, o2 = model(
            ids = ids,
            mask = mask,
            token_type_ids = token_type_ids
            )
        
        loss = loss_fn(o1, o2, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        scheduler.step()
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss = losses.avg)

def eval_fn(data_loader, model, device):
    model.eval()
    
    fin_output_start = []
    fin_output_end = []
    fin_padding_lens = []
    fin_tweet_tokens = []
    fin_orig_sentiment = []
    fin_orig_selected = []
    fin_orig_tweet = []
    
    with torch.no_grad():
        for bi, d in enumerate(data_loader):
            ids = d['ids']
            token_type_ids = d['token_type_ids']
            mask = d['mask']
            tweet_tokens = d['tweet_tokens']
            padding_len = d['padding_len']
            orig_sentiment = d['orig_sentiment']
            orig_selected = d['orig_selected']
            orig_tweet = d['orig_tweet']

            ids = ids.to(device, dtype = torch.long)
            token_type_ids = token_type_ids.to(device, dtype = torch.long)
            mask = mask.to(device, dtype = torch.long)

            o1, o2 = model(
                ids = ids,
                mask = mask,
                token_type_ids = token_type_ids
                )

            fin_output_start.append(torch.sigmoid(o1).cpu().detach().numpy())
            fin_output_end.append(torch.sigmoid(o2).cpu().detach().numpy())
            fin_padding_lens.extend(padding_len.cpu().detach().numpy())

            fin_tweet_tokens.extend(tweet_tokens)
            fin_orig_sentiment.extend(orig_sentiment)
            fin_orig_selected.extend(orig_selected)
            fin_orig_tweet.extend(orig_tweet)

        fin_output_start = np.vstack(fin_output_start)
        fin_output_end = np.vstack(fin_output_end)

        threshold = 0.3
        jaccards = []
        out = []

        for j in range(len(fin_tweet_tokens)):
            target_string = fin_orig_selected[j]
            tweet_tokens = fin_tweet_tokens[j]
            padding_len = fin_padding_lens[j]
            original_tweet = fin_orig_tweet[j]
            sentiment = fin_orig_sentiment[j]
            
            if padding_len>0:
                mask_start = fin_output_start[j,:][0: -padding_len] >= threshold
                mask_end = fin_output_end[j,:][0: -padding_len] >= threshold
            else:
                mask_start = fin_output_start[j,:] >= threshold
                mask_end = fin_output_end[j,:] >= threshold
                
            mask = [0]*len(mask_start)
            idx_start = np.nonzero(mask_start)[0]
            idx_end = np.nonzero(mask_end)[0]
        
            if len(idx_start) > 0:
                idx_start = idx_start[0]
                if len(idx_end) > 0:
                    idx_end = idx_end[0]
                else:
                    idx_end = idx_start
            else:
                idx_start = 0
                idx_end = 0
            
            for mj in range(idx_start, idx_end + 1):
                mask[mj] = 1
            
            output_token = [x for p, x in enumerate(tweet_tokens.split()) if mask[p] == 1]
            output_token = [x for x in output_token if x not in ("[CLS]", "[SEP]")]
            
            final_output = ""
            for ot in output_token:
                if ot.startswith("##"):
                    final_output = final_output + ot[2:]
                elif len(ot) == 1 and ot in string.punctuation:
                    final_output = final_output + ot
                else:
                    final_output = final_output + " " + ot
            
            final_output = final_output.strip()
            
            if sentiment == 'neutral' or len(original_tweet.split()) < 4:
                final_output = original_tweet
            
            jac = utils.jaccard(target_string.strip(), final_output.strip())
            jaccards.append(jac)
            
            out.append(final_output)
        
        mean_jac = np.mean(jaccards)
        return mean_jac, out

In [10]:
# train
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna()
    
    dfx = preprocessing(dfx)
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size = 0.1,
        random_state = 42,
        stratify = dfx.sentiment.values
        )
    
    df_train = df_train.reset_index(drop = True)
    df_valid = df_valid.reset_index(drop = True)
    
    train_dataset = TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text = df_train.selected_text.values
    )
    
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = config.TRAIN_BATCH_SIZE,
        num_workers = 4
    )
    
    valid_dataset = TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text = df_valid.selected_text.values
    )
    
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = config.TRAIN_BATCH_SIZE,
        num_workers = 1
    )
    
    device = torch.device('cuda')
    model = TweetModel()
    model.to(device)
    
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    
    optimizer = AdamW(model.parameters(), lr = 2e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = 0,
        num_training_steps = num_train_steps
        )
    
    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard, _ = eval_fn(valid_data_loader, model, device)
        print ("Epoch -- ", epoch+1)
        print (f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard

In [11]:
run()

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 773/773 [05:05<00:00,  2.53it/s, loss=0.0514]


Epoch --  1
Jaccard Score = 0.6135085776741219


100%|██████████| 773/773 [05:05<00:00,  2.53it/s, loss=0.0273]


Epoch --  2
Jaccard Score = 0.6356119449637566


100%|██████████| 773/773 [05:05<00:00,  2.53it/s, loss=0.0246]


Epoch --  3
Jaccard Score = 0.6455099089803653


100%|██████████| 773/773 [05:05<00:00,  2.53it/s, loss=0.0221]


Epoch --  4
Jaccard Score = 0.6555047030600513


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.0195]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  5
Jaccard Score = 0.6544880557234977


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.0165]


Epoch --  6
Jaccard Score = 0.6583562056163287


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.0139]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  7
Jaccard Score = 0.6568106261389729


100%|██████████| 773/773 [05:07<00:00,  2.52it/s, loss=0.0114]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  8
Jaccard Score = 0.6566552601504377


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.00939]


Epoch --  9
Jaccard Score = 0.6608508744223028


100%|██████████| 773/773 [05:08<00:00,  2.51it/s, loss=0.00774]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  10
Jaccard Score = 0.6554138069557789


100%|██████████| 773/773 [05:08<00:00,  2.51it/s, loss=0.00649]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  11
Jaccard Score = 0.6576970026560637


100%|██████████| 773/773 [05:05<00:00,  2.53it/s, loss=0.00537]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  12
Jaccard Score = 0.6558390749698537


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00457]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  13
Jaccard Score = 0.6553891178619283


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00392]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  14
Jaccard Score = 0.655557397743887


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00342]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  15
Jaccard Score = 0.6544803002581422


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.00298]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  16
Jaccard Score = 0.6500904691515337


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00259]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  17
Jaccard Score = 0.6525898946460472


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00234]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  18
Jaccard Score = 0.6558543504210026


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.00211]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  19
Jaccard Score = 0.6490588123291657


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.00187]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  20
Jaccard Score = 0.647828283544292


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.00174]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  21
Jaccard Score = 0.6578793379306374


100%|██████████| 773/773 [05:07<00:00,  2.52it/s, loss=0.00152]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  22
Jaccard Score = 0.6560437073629459


100%|██████████| 773/773 [05:07<00:00,  2.52it/s, loss=0.0014]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  23
Jaccard Score = 0.6537101812562314


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00138]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  24
Jaccard Score = 0.6503562208936083


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00122]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  25
Jaccard Score = 0.6505471349757299


100%|██████████| 773/773 [05:07<00:00,  2.52it/s, loss=0.00114]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  26
Jaccard Score = 0.6553455193450562


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00106]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  27
Jaccard Score = 0.6514193684264324


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000996]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  28
Jaccard Score = 0.65685652568292


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000914]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  29
Jaccard Score = 0.6533221762612781


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.000823]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  30
Jaccard Score = 0.6522970137389222


100%|██████████| 773/773 [05:07<00:00,  2.52it/s, loss=0.000781]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  31
Jaccard Score = 0.6545552449627873


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000714]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  32
Jaccard Score = 0.6517917623567582


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000681]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  33
Jaccard Score = 0.6502437084002477


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.000613]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  34
Jaccard Score = 0.6548485890432706


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000625]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  35
Jaccard Score = 0.6538633629686184


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000553]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  36
Jaccard Score = 0.6497744233514712


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.000534]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  37
Jaccard Score = 0.6533576801166271


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.0005]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  38
Jaccard Score = 0.6518379715888969


100%|██████████| 773/773 [05:08<00:00,  2.51it/s, loss=0.000476]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  39
Jaccard Score = 0.6521802572579011


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.000457]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  40
Jaccard Score = 0.6580721444159825


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.000379]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  41
Jaccard Score = 0.6577305165934998


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.000379]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  42
Jaccard Score = 0.6573580896551469


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.000339]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  43
Jaccard Score = 0.658617920694321


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.00033]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  44
Jaccard Score = 0.6552329305254257


100%|██████████| 773/773 [05:08<00:00,  2.51it/s, loss=0.000333]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  45
Jaccard Score = 0.6551668803100729


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000322]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  46
Jaccard Score = 0.6560984133866095


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000301]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  47
Jaccard Score = 0.6593419129607834


100%|██████████| 773/773 [05:07<00:00,  2.51it/s, loss=0.000289]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  48
Jaccard Score = 0.6575475543644635


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.000324]
  0%|          | 0/773 [00:00<?, ?it/s]

Epoch --  49
Jaccard Score = 0.6555148126774177


100%|██████████| 773/773 [05:06<00:00,  2.52it/s, loss=0.00025]


Epoch --  50
Jaccard Score = 0.658301136044717


In [12]:
df_test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
df_test.loc[:, "selected_text"] = df_test.text.values

In [13]:
test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=32,
    num_workers=1
)
device = 'cpu'

In [14]:
model = TweetModel()
model.load_state_dict(torch.load(config.MODEL_PATH))
model.eval()

TweetModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [15]:
_, out = eval_fn(data_loader, model, device = 'cpu')

In [16]:
sample = pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv")
sample.loc[:, 'selected_text'] = out
sample.to_csv("submission.csv", index=False)