In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext import datasets

import spacy

import time
import random

from sklearn.model_selection import train_test_split

TODOs:  
- make the embed non-trainable
- try simple cross entropy with start, end
- appropriate preprocessing --> lemmatize, stemmatize, (can't remove stop words, or can we?)
- try stacked lstms 
- try bidirectional
- use better lr or use lr scheduling
- try batch norm
- try to visualize the loss (both Lovasz softmax and bin crossentropy) for several examples -- good and bad!
- try a combination of both costs, ie, model returns 3 outputs 
- try to use the padding differently and ignore the pad index
- try to make changes to the model --> using conv1d layers
- try elmo embeddings --> char lvl, how will that work?
- try bert and its modifications --> will need more preprocessing  

In [2]:
import numpy as np

In [3]:
def foo(row):
    z = ['none', 'start']
    
    temp = row['start_idx']
    ans = np.zeros(len(row['text'].split()))
    ans[temp] = 1
    ans = np.array(ans, dtype=np.int32)
    ans = ' '.join([str(z[ans_]) for ans_ in ans])
    return ans

def foo2(row):
    try:
        temp = row['start_idx']
        temp += len(row['selected_text'].split())
        temp -= 1
        end_idx = temp
        ans = np.zeros(len(row['text'].split()))
        ans[temp] = 1
    except:
        print(row)
        import sys; sys.exit(0)
    ans = np.array(ans, dtype=np.int32)
    z = ['none', 'end']
    ans = ' '.join([str(z[ans_]) for ans_ in ans])
    return ans

def foo3(row):
    temp = row['text'].find(row['selected_text'])
    if temp>0: 
        if row['text'][temp-1]!=' ': 
            temp = row['text'].rfind(' ',0,temp)
            if temp==-1: 
                temp=0
    temp = len(row['text'][:temp].split())
    return temp

def foo4(row):
    row_final = []
    ele = 'none'
#     print(row['start'])
    for rows, rowe in zip(row['start'].split(), row['end'].split()):
#         print(rows)
        if rows == 'start': ele = 'selection'
        row_final.append(ele)
        if rowe == 'end': ele = 'none'
    row_final = ' '.join(row_final)
    return row_final

In [4]:
tqdm.pandas()
df = pd.read_csv('../tweet-sentiment-extraction/train.csv')

if len(df[df['textID']=='fdb77c3752'])!=0:
    df = df.drop([314])

df['start_idx'] = df.progress_apply(foo3, axis=1)
df['start'] = df.progress_apply(foo, axis=1)
df['end'] = df.progress_apply(foo2, axis=1)
df['selection'] = df.progress_apply(foo4, axis=1)
df.head()

  from pandas import Panel
100%|██████████| 27480/27480 [00:01<00:00, 23043.40it/s]
100%|██████████| 27480/27480 [00:01<00:00, 26551.43it/s]
100%|██████████| 27480/27480 [00:01<00:00, 20131.81it/s]
100%|██████████| 27480/27480 [00:00<00:00, 34612.88it/s]


Unnamed: 0,textID,text,selected_text,sentiment,start_idx,start,end,selection
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0,start none none none none none none,none none none none none none end,selection selection selection selection select...
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0,start none none none none none none none none ...,none end none none none none none none none none,selection selection none none none none none n...
2,088c60f138,my boss is bullying me...,bullying me,negative,3,none none none start none,none none none none end,none none none selection selection
3,9642c003ef,what interview! leave me alone,leave me alone,negative,2,none none start none none,none none none none end,none none selection selection selection
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0,start none none none none none none none none ...,none none end none none none none none none no...,selection selection selection none none none n...


In [5]:
train_df, test_df = train_test_split(df, test_size=0.1)
train_df, val_df = train_test_split(train_df, test_size=0.1)


train_df.to_csv('df_changed_train.csv')
val_df.to_csv('df_changed_val.csv')
test_df.to_csv('df_changed_test.csv')

train_path = 'df_changed_train.csv'
test_path = 'df_changed_test.csv'
val_path = 'df_changed_val.csv'

In [6]:
text_field = data.Field(sequential=True)
# text_id_field = data.Field(sequential=False)

labels_field = data.Field(sequential=False)
# start_field = data.Field(unk_token=None, pad_token='none')
# end_field = data.Field(unk_token=None, pad_token='none')

selection_field = data.Field(unk_token=None, pad_token='none')

In [7]:
fields={    # 'textID': ('textID', text_id_field),
            'text': ('text', text_field),
            
            # 'selected_text': ('selected_text', selected_text_field),
            'sentiment': ('labels', labels_field), 
            # 'start': ('start', start_field), 
            # 'end': ('end', end_field)
            
            'selection': ('selection', selection_field)
        }

In [8]:
train_data, val_data, test_data = data.TabularDataset.splits(
    path='', 
    train=train_path,
    validation=val_path, 
    test=test_path, 
    format='csv',
    fields=fields)

In [9]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(val_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 22258
Number of validation examples: 2474
Number of testing examples: 2748


In [10]:
text_field.build_vocab(train_data, 
                 min_freq = 5,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
labels_field.build_vocab(train_data)
# start_field.build_vocab(train_data)
# end_field.build_vocab(train_data)
selection_field.build_vocab(train_data)

In [11]:
selection_field.vocab.freqs
assert(selection_field.vocab.stoi['none']==0)

In [12]:
BATCH_SIZE = 32

device = torch.device('cuda:2')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: x.text,
    sort_within_batch = False,
    device = device)

In [13]:
class SentimentExtractor(nn.Module):
    def __init__(self,
                input_dim,
                outputs_dim,
                embedding_dim,
                hidden_dim,
                pad_idx,
                dropout_rate,
                num_layers,
                bidirectional
                ):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        
        if num_layers == 1:
            dropout_rate=0
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 
                            num_layers=num_layers, bidirectional=bidirectional, dropout=dropout_rate)
        
        # self.dropout = nn.Dropout(dropout_rate)
        
        if bidirectional:
            self.fc = nn.Linear(hidden_dim*2, outputs_dim)
        else:
            self.fc = nn.Linear(hidden_dim, outputs_dim)
    
    def forward(self, text, sentiment):
        # embeddings = self.dropout(self.embedding(text))
        embeddings = self.embedding(text)
        output, (hidden, cell) = self.lstm(embeddings)
        
        
        print(output.shape)
        # return self.fc(self.dropout(output))
        return self.fc(output)

In [14]:
input_dim = len(text_field.vocab)
outputs_dim = 1
hidden_dim = 64
num_layers = 1
bidirectional = False
dropout_rate = 0.25
pad_idx = text_field.vocab.stoi[text_field.pad_token]
embedding_dim = 100

model = SentimentExtractor(input_dim=input_dim,
                 outputs_dim=outputs_dim,
                 hidden_dim=hidden_dim,
                 num_layers=num_layers,
                 bidirectional=bidirectional,
                 dropout_rate=dropout_rate,
                 embedding_dim=embedding_dim,
                 pad_idx=pad_idx)

# tag_pad_idx = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]
# criterion = nn.CrossEntropyLoss(ignore_index=tag_pad_idx)

In [15]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)
        
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
        
model.apply(init_weights);

# print(f'The model has {count_parameters(model):,} trainable parameters')

In [16]:
model.to(device)
for param in model.parameters():
    param.requires_grad = True

pretrained_embeddings = text_field.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings);

model.embedding.weight.data[1] = torch.zeros(embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [17]:
with torch.no_grad():
    model.eval()
    for batch in train_iterator:
        text = batch.text
        # preds = model(text)
        print(text.shape)
        print(text[0])
        break

torch.Size([26, 32])
tensor([  85,   70,  972,   44,    0,  556,    6,    3,  389, 1823,    3, 1764,
         209, 1963,    8,    0,  144,  241,   26,    0,   10,  152,    8,    0,
          71, 3063, 4047,    0,    0,   93, 1204,  915], device='cuda:2')


In [18]:
# def custom_categorical_accuracy(y_pred, y_true, tag_pad_idx):
#     y_pred = y_pred.argmax(dim=1, keepdim=True)
#     non_pad_element_idxs = (y_true!=tag_pad_idx).nonzero()
#     correct = y_pred[non_pad_element_idxs].squeeze(1).eq(y_true[non_pad_element_idxs])
#     return correct.sum() / torch.FloatTensor([y_true[non_pad_element_idxs].shape[0]])

In [254]:
from torch.autograd import Variable
import torch.nn.functional as F

def lovasz_grad(gt_sorted):
    """
    Computes gradient of the Lovasz extension w.r.t sorted errors
    See Alg. 1 in paper
    """
    p = len(gt_sorted)
    gts = gt_sorted.sum()
    intersection = gts - gt_sorted.float().cumsum(0)
    union = gts + (1 - gt_sorted).float().cumsum(0)
    jaccard = 1. - intersection / union
    if p > 1: # cover 1-pixel case
        jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
    return jaccard

def lovasz_hinge_flat(logits, labels):
    """
    Binary Lovasz hinge loss
      logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
      labels: [P] Tensor, binary ground truth labels (0 or 1)
      ignore: label to ignore
    """
    if len(labels) == 0:
        # only void pixels, the gradients should be 0
        return logits.sum() * 0.
    signs = 2. * labels.float() - 1.
    errors = (1. - logits * Variable(signs))
    errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
    perm = perm.data
    gt_sorted = labels[perm]
    grad = lovasz_grad(gt_sorted)
    loss = torch.dot(F.relu(errors_sorted), Variable(grad))
    return loss

In [261]:
def train_one_epoch(model, iterator, loss_func, optimizer):    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    optimizer.zero_grad()
    model.to(device)
    with torch.enable_grad():
#     with torch.no_grad():
        model.train()
#         model.eval()
        for batch in iterator:
            text = batch.text
#             print('Text shape: ', text.shape)

            preds = model(batch.text)
#             print(preds.shape)
#             print(preds)
#             print('\n\n\n\n')
            preds = preds.view(-1)
            true = batch.selection
#             print(true.shape)
#             print(true)
            true = true.view(-1)
            
            j_loss = loss_func(preds, true)
            j_loss.backward()
#             print(j_loss)
            optimizer.step()

            epoch_loss += j_loss.item()
#             break
        
    return epoch_loss/len(iterator)

In [263]:
def val_one_epoch(model, iterator, loss_func):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    with torch.no_grad():
        
        for batch in iterator:
            text = batch.text
#             print('Text shape: ', text.shape)

            preds = model(batch.text)
#             print(preds.shape)
            preds = preds.view(-1)
            true = batch.selection
            true = true.view(-1)

            j_loss = loss_func(preds, true)
            
            epoch_loss += j_loss.item()

    return epoch_loss/len(iterator)

In [266]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [271]:
n_epochs = 100
best_val_loss = float('inf')

for epoch in range(n_epochs):
    
    start_time = time.time()
    
    train_loss = train_one_epoch(model, train_iterator, lovasz_hinge_flat, optimizer)
    val_loss = val_one_epoch(model, valid_iterator, lovasz_hinge_flat)
    
    end_time = time.time()
    
    epoch_time_min, epoch_time_secs = epoch_time(start_time, end_time)
    
    if val_loss < best_val_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_time_min}m {epoch_time_secs}s')
    print('Train Loss: {:.3f}'.format(train_loss))
    print('Val Loss: {:.3f}'.format(val_loss))

Epoch: 01 | Epoch Time: 0m 3s
Train Loss: 1.055
Val Loss: 1.051
Epoch: 02 | Epoch Time: 0m 3s
Train Loss: 1.037
Val Loss: 1.036
Epoch: 03 | Epoch Time: 0m 3s
Train Loss: 1.028
Val Loss: 1.031
Epoch: 04 | Epoch Time: 0m 3s
Train Loss: 1.023
Val Loss: 1.027
Epoch: 05 | Epoch Time: 0m 3s
Train Loss: 1.019
Val Loss: 1.024
Epoch: 06 | Epoch Time: 0m 3s
Train Loss: 1.016
Val Loss: 1.020
Epoch: 07 | Epoch Time: 0m 3s
Train Loss: 1.012
Val Loss: 1.018
Epoch: 08 | Epoch Time: 0m 3s
Train Loss: 1.010
Val Loss: 1.015
Epoch: 09 | Epoch Time: 0m 3s
Train Loss: 1.008
Val Loss: 1.013
Epoch: 10 | Epoch Time: 0m 3s
Train Loss: 1.006
Val Loss: 1.011
Epoch: 11 | Epoch Time: 0m 3s
Train Loss: 1.004
Val Loss: 1.008
Epoch: 12 | Epoch Time: 0m 3s
Train Loss: 1.002
Val Loss: 1.006
Epoch: 13 | Epoch Time: 0m 3s
Train Loss: 1.000
Val Loss: 1.004
Epoch: 14 | Epoch Time: 0m 3s
Train Loss: 0.998
Val Loss: 1.003
Epoch: 15 | Epoch Time: 0m 3s
Train Loss: 0.997
Val Loss: 1.001
Epoch: 16 | Epoch Time: 0m 3s
Train Loss

In [47]:
# x = torch.randn(100,1000)
# y = torch.randn(1000,1000)

In [55]:
# device = torch.device('cpu')

In [50]:
# x.to(device); y.to(device)

In [51]:
# z = torch.matmul(x,y)

In [155]:
# x = torch.randn((32,18))

In [154]:
# x.nonzero().shape

In [52]:
# loss = nn.CrossEntropyLoss()
# input_ = torch.randn(3, 5, requires_grad=True)
# target = torch.empty(3, dtype=torch.long).random_(5)
# print(input_); print(target)
# output = loss(input_, target)

In [None]:
# def my_loss(true, pred):
#     # this is the new criterion
    
#     # expected true to be of shape (batch_size, None) -- but only 0, 1
#     # expecting pred to be of shape (batch_size, None) -- but float values
    
#     true_start, true_end = true
#     pred_start, pred_end = pred
    
#     pred_start_idx = (pred_start==1).nonzero()
#     pred_end_idx = (pred_end==1).nonzero()
    
#     if pred_start_idx > pred_end_idx: return 100
    
#     true_start_idx = (true_start==1).nonzero()
#     true_end_idx = (true_end==1).nonzero()
    
#     I = torch.min(true_end_idx, pred_end_idx) - torch.max(true_start_idx, pred_start_idx)
#     if I < 0: return 1
#     P = pred_end_idx - pred_start_idx
#     T = true_end_idx - true_start_idx
    
#     smooth = 1e-6
    
#     loss = 1 - ((I + smooth) / (P + T - I + smooth))
#     return loss

In [None]:
# for batch in valid_iterator:
#     text = batch.text
#     print(text)
#     pred = model(text)
#     pred = torch.transpose
#     break

In [297]:
# from torch.autograd import Variable
# import torch.nn.functional as F

# def lovasz_grad(gt_sorted):
#     """
#     Computes gradient of the Lovasz extension w.r.t sorted errors
#     See Alg. 1 in paper
#     """
#     p = len(gt_sorted)
#     gts = gt_sorted.sum()
#     intersection = gts - gt_sorted.float().cumsum(0)
#     union = gts + (1 - gt_sorted).float().cumsum(0)
#     jaccard = 1. - intersection / union
#     if p > 1: # cover 1-pixel case
#         jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
#     return jaccard

# def lovasz_hinge_flat(logits, labels):
#     """
#     Binary Lovasz hinge loss
#       logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
#       labels: [P] Tensor, binary ground truth labels (0 or 1)
#       ignore: label to ignore
#     """
#     if len(labels) == 0:
#         # only void pixels, the gradients should be 0
#         return logits.sum() * 0.
#     signs = 2. * labels.float() - 1.
#     errors = (1. - logits * Variable(signs))
#     errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
#     perm = perm.data
#     gt_sorted = labels[perm]
#     grad = lovasz_grad(gt_sorted)
#     loss = torch.dot(F.relu(errors_sorted), Variable(grad))
#     return loss

In [299]:
# x1 = torch.FloatTensor(2,3)
# a = [[1,0,1],[0,1,1]]
# a = np.array([np.array(aa,dtype=np.float32) for aa in a])
# x2 = torch.tensor(a)

In [302]:
# x1 = x1.view(-1)
# x2 = x2.view(-1)

In [1]:
# lovasz_hinge_flat(x1, x2)