## Loading Data

In [1]:
import pandas as pd

In [2]:
DATA_FOLDER = "Data_Processed/Shared_Task_eng/"

In [3]:
data_val = pd.read_csv(DATA_FOLDER+'val_1.csv')
data_test = pd.read_csv(DATA_FOLDER+'test_1.csv')
data_train = pd.read_csv(DATA_FOLDER+'train_1.csv')

In [4]:
data_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1309 non-null   int64 
 1   ID          1309 non-null   int64 
 2   Text        1308 non-null   object
 3   Label       1309 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 41.0+ KB


In [5]:
data_val.dropna(inplace=True)
data_test.dropna(inplace=True)
data_train.dropna(inplace=True)

In [6]:
MAX_LEN = 1024

In [7]:
MAX_FEATURES = 30000

In [8]:
len(data_train)

9161

In [9]:
len(data_train[data_train['Label']==1]),len(data_train[data_train['Label']==0])

(880, 8281)

## Embeddings
- Ref: https://medium.com/mlearning-ai/load-pre-trained-glove-embeddings-in-torch-nn-embedding-layer-in-under-2-minutes-f5af8f57416a

In [10]:
vocab2idx,embeddings = {},[]

In [11]:
EMBEDDING_PATH = "Embeddings/glove.6B.100d.txt"

In [12]:
with open(EMBEDDING_PATH,'rt') as f:
    full_content = f.read().strip().split('\n')

In [13]:
full_content[0]

'the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062'

In [14]:
for i,line in enumerate(full_content):
    word = line.split(' ')[0]
    embedding = [float(val) for val in line.split(' ')[1:]]
    vocab2idx[word]=i
    embeddings.append(embedding)

In [15]:
len(vocab2idx),len(embeddings)

(400000, 400000)

In [16]:
import numpy as np
embs_np = np.array(embeddings)

In [17]:
#embedding for '<pad>' token: 0s
pad_emb_np = np.zeros((1,embs_np.shape[1]))   
#embedding for '<unk>' token: mean
unk_emb_np = np.mean(embs_np,axis=0,keepdims=True)    

In [19]:
# insert these embeddings at the top of embs_npa
embs_np = np.vstack((embs_np,pad_emb_np,unk_emb_np))

In [20]:
print(embs_np.shape)

(400002, 100)


In [21]:
embs_np[400000]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [22]:
vocab2idx['<unk>']=400001
vocab2idx['<pad>']=400000

In [23]:
len(vocab2idx)

400002

### Encode text

In [38]:
sample = data_train.iloc[101]['Text']

In [39]:
sample

'bro thats needy as fuck'

In [40]:
def encode(text,max_len=MAX_LEN):
    encoded=[]
    for word in text.split(' '):
        word = word.lower()
        try:
            idx = vocab2idx[word]
        except:
            idx = vocab2idx['<unk>']
        encoded.append(idx)
    if(len(encoded)<max_len):
        padding = [vocab2idx['<pad>']]*(max_len-len(encoded))
        encoded.extend(padding)
    else:
        encoded=encoded[:max_len]
    return encoded

In [42]:
encoded = encode(sample,MAX_LEN)
len(encoded)

1024

### Embedding Layer

In [52]:
import torch

In [53]:
emb_layer = torch.nn.Embedding.from_pretrained(torch.from_numpy(embs_np).float())

In [55]:
input = torch.LongTensor(encoded)
emb_layer(input).shape

torch.Size([1024, 100])

### Saving Embedding npa

In [56]:
NPA_PATH = "Embeddings/"

In [44]:
with open(NPA_PATH+'embs_np.npy','wb') as f:
    np.save(f,embs_np)

## Data Loaders

In [45]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [46]:
BATCH_SIZE = 32

In [47]:
X_train = data_train['Text'].values
Y_train = data_train['Label'].values
X_test = data_test['Text'].values
Y_test = data_test['Label'].values
X_val = data_val['Text'].values
Y_val = data_val['Label'].values

In [48]:
inputs = [encode(text) for text in X_train]

In [49]:
len(inputs),len(X_train)

(9161, 9161)

In [57]:
def get_dataloader(X,Y,batch_size,is_train=False):
    inputs = [encode(text) for text in X]
    labels = Y
    
    inputs = torch.tensor(inputs)
    labels = torch.tensor(labels,dtype=torch.long)
    
    data = TensorDataset(inputs,labels)
    
    if(is_train==False):
        sampler = SequentialSampler(data)
    else:
        sampler = RandomSampler(data) 
        
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    
    return dataloader

In [58]:
train_loader = get_dataloader(X_train,Y_train,BATCH_SIZE,True)
val_loader = get_dataloader(X_val,Y_val,BATCH_SIZE)
test_loader = get_dataloader(X_test,Y_test,BATCH_SIZE)

## Model

In [59]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [60]:
class LSTM(nn.Module):

    def __init__(self, weights,dimension=128):
        super(LSTM, self).__init__()
        
        self.weights = weights

        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs_np).float())
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=100,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)

        self.fc = nn.Linear(2*dimension, 2)

    def forward(self, text,labels):
        text_len = 1024

        text_emb = self.embedding(text)

        #packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
        output, _ = self.lstm(text_emb)
        #output, _ = pad_packed_sequence(packed_output, batch_first=True)

        out_forward = output[range(len(output)), text_len - 1, :self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        output = self.fc(text_fea)
        #text_fea = torch.squeeze(text_fea, 1)
        #text_out = torch.sigmoid(text_fea)
    
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(self.weights,dtype=torch.float))
        
        #oss_fct = nn.CrossEntropyLoss()

        loss = loss_fct(output,labels)
        return loss,output

In [77]:
weights = [1,5]

In [78]:
model =LSTM(weights)

## Training

In [63]:
from transformers import AdamW

In [79]:
LR = 1e-4
EPOCHS = 20

In [80]:
 optimizer = AdamW(model.parameters(),
                  lr = LR, 
                  eps = 1e-8
                )

In [81]:
import random
def fix_the_random(seed_val = 42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

In [82]:
fix_the_random()

### Test output and input

In [83]:
device = torch.device("cpu")
from sklearn.metrics import *

In [84]:
i=0
for batch in train_loader:
    i+=1
    inputs= batch[0].to(device)
    labels = batch[1].to(device)
    
#     print(inputs)
    
    output = model(inputs,labels)
    
    loss = output[0]
    logits = output[1]
    
    print(loss.item())
    
    loss.backward()
    
    y_true = labels.cpu().data.squeeze().numpy()

    y_pred = torch.max(logits,1)[1]
    
    print(y_pred),print(y_true)
    y_pred = y_pred.cpu().data.squeeze().numpy()
    
    print(accuracy_score(y_true, y_pred))
    print(f1_score(y_true, y_pred, labels = np.unique(y_pred)))
    
    optimizer.step()
    
    if(i==10):
        break

0.6860517859458923
tensor([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 0, 1])
[0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
0.71875
0.18181818181818182
0.6916463375091553
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
[0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
0.75
0.0
0.6878390908241272
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1])
[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
0.78125
0.0
0.691704273223877
tensor([0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0])
[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
0.65625
0.0
0.6998888850212097
tensor([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0])
[0 0 0 0 0 0 0 0

In [99]:
y_pred=[1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 1, 1, 1, 0, 1, 1]
y_true=[0,1,0,1,0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0]

accuracy_score(y_true,y_pred)

### Main Train Loop

In [86]:
import time
import datetime
from tqdm import tqdm

In [95]:
def evalMetric(y_true, y_pred,prefix):
   accuracy = accuracy_score(y_true, y_pred)
   mf1Score = f1_score(y_true, y_pred, average='macro')
   f1Score  = f1_score(y_true, y_pred, labels = np.unique(y_pred))
   fpr, tpr, _ = roc_curve(y_true, y_pred)
   area_under_c = auc(fpr, tpr)
   recallScore = recall_score(y_true, y_pred, labels = np.unique(y_pred))
   precisionScore = precision_score(y_true, y_pred, labels = np.unique(y_pred))
   return dict({prefix+"accuracy": accuracy, prefix+'mF1Score': mf1Score, 
                prefix+'f1Score': f1Score, prefix+'precision': precisionScore, 
                prefix+'recall': recallScore})

In [88]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [101]:
def EvaluateOnData(model,loader):
    
    model.eval() # put model in eval mode
    
    total_eval_loss = 0
    nb_eval_steps = 0

    y_pred = np.zeros(shape=(0),dtype='int')
    y_true = np.empty(shape=(0),dtype='int')
    
    for batch in loader:
        b_inputs = batch[0].to(device)
        b_labels = batch[1].to(device)
        
        with torch.no_grad(): # do not construct compute graph
            outputs = model(b_inputs,b_labels)
        
        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()

        b_y_true = b_labels.cpu().data.squeeze().numpy()

        b_y_pred = torch.max(logits,1)[1]
        b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

        y_pred = np.concatenate((y_pred,b_y_pred))
        y_true = np.concatenate((y_true,b_y_true))
        
    metrics = evalMetric(y_true,y_pred,"Val_")

    print(" Validation Accuracy: {0:.2f}".format(metrics['Val_accuracy']))

    # Calculate the average loss over all of the batches.
    avg_loss = total_eval_loss / len(loader)
    
    print("  Validation Loss: {0:.2f}".format(avg_loss))

    metrics['Val_avg_loss'] = avg_loss

    return metrics

In [102]:
def runTrainLoop(model,train_loader,t0,optimiser):
    print("")
    print('Training...')

    # Reset the total loss for this epoch.
    total_loss = 0
    model.train() # put model in train mode

    y_pred = np.zeros(shape=(0),dtype='int')
    y_true = np.empty(shape=(0),dtype='int')

    # For each batch of training data...
    for step, batch in tqdm(enumerate(train_loader)):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
        
        b_inputs = batch[0].to(device)
        b_labels = batch[1].to(device)
        
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        outputs = model(b_inputs,b_labels)

        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]
        logits = outputs[1]

        if step % 40 == 0 and not step == 0:
            print('batch_loss',loss.item())

        #Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()


        # Compute True and predicted labels to get
        # train metrics
        b_y_true = b_labels.cpu().data.squeeze().numpy()

        b_y_pred = torch.max(logits,1)[1]
        b_y_pred = b_y_pred.cpu().data.squeeze().numpy()
        
        # accumulate b_y_pred and b_y_true for each batch
        # and evaluate at once
        y_pred = np.concatenate((y_pred,b_y_pred))
        y_true = np.concatenate((y_true,b_y_true))

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        #scheduler.step()
        
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_loader)

    train_metrics = evalMetric(y_true,y_pred,"Train_")

    print('avg_train_loss',avg_train_loss)
    print('train_f1Score',train_metrics['Train_f1Score'])
    print('train_accuracy',train_metrics['Train_accuracy'])

    train_metrics['Train_avg_loss'] = avg_train_loss

    return train_metrics

In [103]:
def train(model,train_loader,val_loader,optimiser,epochs):
    train_stats = []
    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

          # Measure how long the training epoch takes.
        t0 = time.time()

        train_metrics = runTrainLoop(model,train_loader,t0,optimiser)
        
        print("")
        print("Running Validation...") 
        val_metrics = EvaluateOnData(model,val_loader)
        
        stats['epoch']=epoch_i+1
        
        stats.update(train_metrics)
        stats.update(val_metrics)

        train_stats.append(stats)
    
    return train_stats

In [92]:
train(model,train_loader,val_loader,optimizer,10)

0it [00:00, ?it/s]



Training...


40it [01:21,  2.05s/it]

batch_loss 0.694582998752594


80it [02:45,  2.13s/it]

batch_loss 0.713119626045227


120it [04:10,  2.15s/it]

batch_loss 0.636775553226471


160it [05:41,  2.42s/it]

batch_loss 0.734338104724884


200it [07:14,  2.30s/it]

batch_loss 0.4468008875846863


240it [08:54,  2.63s/it]

batch_loss 0.6707899570465088


280it [10:39,  2.70s/it]

batch_loss 0.6639588475227356


287it [10:56,  2.29s/it]


avg_train_loss 0.6365118128497426
train_f1Score 0.015590200445434297
train_accuracy 0.903503984281192

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.90
  Validation Loss: 0.63


Training...


40it [01:38,  2.49s/it]

batch_loss 0.5842412710189819


80it [03:25,  2.50s/it]

batch_loss 0.5210427045822144


120it [04:54,  2.08s/it]

batch_loss 0.42139768600463867


160it [06:20,  2.06s/it]

batch_loss 0.5073403120040894


200it [07:52,  2.31s/it]

batch_loss 0.45949965715408325


240it [09:16,  1.96s/it]

batch_loss 0.5583552122116089


280it [10:46,  1.94s/it]

batch_loss 0.5163238048553467


287it [10:58,  2.29s/it]


avg_train_loss 0.5784645129371603
train_f1Score 0.19504643962848298
train_accuracy 0.8864752756249318

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.73
  Validation Loss: 0.55


Training...


40it [01:29,  2.11s/it]

batch_loss 0.4915522038936615


80it [03:05,  2.30s/it]

batch_loss 0.43567827343940735


120it [04:41,  2.33s/it]

batch_loss 0.5196518898010254


160it [06:19,  2.49s/it]

batch_loss 0.5538341403007507


200it [07:55,  2.29s/it]

batch_loss 0.6837435364723206


240it [09:31,  2.24s/it]

batch_loss 0.5436350107192993


280it [11:09,  2.37s/it]

batch_loss 0.6906737089157104


287it [11:23,  2.38s/it]


avg_train_loss 0.5419966364052238
train_f1Score 0.3194125745754933
train_accuracy 0.8381181093767056

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.81
  Validation Loss: 0.54


Training...


40it [01:34,  2.59s/it]

batch_loss 0.3729540705680847


80it [03:14,  2.69s/it]

batch_loss 0.7452590465545654


120it [04:58,  2.72s/it]

batch_loss 0.2327832579612732


160it [06:38,  2.59s/it]

batch_loss 0.6020908951759338


200it [08:20,  2.43s/it]

batch_loss 0.5579232573509216


240it [10:03,  2.48s/it]

batch_loss 0.45141440629959106


280it [11:45,  2.59s/it]

batch_loss 0.3598862290382385


287it [12:02,  2.52s/it]


avg_train_loss 0.5197686439814884
train_f1Score 0.34669067987393065
train_accuracy 0.8416111778190154

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.77
  Validation Loss: 0.53


Training...


40it [01:41,  2.42s/it]

batch_loss 0.40852102637290955


80it [03:26,  2.66s/it]

batch_loss 0.334728866815567


120it [05:08,  2.57s/it]

batch_loss 0.794390857219696


160it [06:54,  2.78s/it]

batch_loss 0.38498279452323914


200it [08:42,  2.49s/it]

batch_loss 0.4041173458099365


240it [10:29,  2.65s/it]

batch_loss 0.5076082944869995


280it [12:18,  2.68s/it]

batch_loss 0.3602437973022461


287it [12:35,  2.63s/it]


avg_train_loss 0.5044918974102166
train_f1Score 0.37775735294117646
train_accuracy 0.852199541534767

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.88
  Validation Loss: 0.55


Training...


40it [01:45,  2.69s/it]

batch_loss 0.26938101649284363


80it [03:33,  2.68s/it]

batch_loss 0.6233281493186951


120it [05:23,  2.66s/it]

batch_loss 0.6074079871177673


160it [07:13,  2.75s/it]

batch_loss 0.5119422078132629


200it [09:02,  2.83s/it]

batch_loss 0.5399578809738159


240it [10:51,  2.64s/it]

batch_loss 0.4891956150531769


280it [12:41,  2.72s/it]

batch_loss 0.7480677962303162


287it [12:59,  2.72s/it]


avg_train_loss 0.5038676430643227
train_f1Score 0.38548307994114756
train_accuracy 0.8632245388058072

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.80
  Validation Loss: 0.52


Training...


40it [01:46,  2.69s/it]

batch_loss 0.43802499771118164


80it [03:35,  2.77s/it]

batch_loss 0.5172142386436462


120it [05:23,  2.67s/it]

batch_loss 0.3982948958873749


160it [07:13,  2.74s/it]

batch_loss 0.3961125612258911


200it [09:03,  2.79s/it]

batch_loss 0.7021610736846924


240it [10:57,  2.79s/it]

batch_loss 0.8751559853553772


280it [12:47,  2.69s/it]

batch_loss 0.8478622436523438


287it [13:05,  2.74s/it]


avg_train_loss 0.49057138532088607
train_f1Score 0.3899596593455849
train_accuracy 0.8514354328130117

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.86
  Validation Loss: 0.53


Training...


40it [01:47,  2.78s/it]

batch_loss 0.34622833132743835


80it [03:40,  2.88s/it]

batch_loss 0.8159197568893433


120it [05:35,  2.77s/it]

batch_loss 0.36838939785957336


160it [07:27,  2.83s/it]

batch_loss 0.33635732531547546


200it [09:25,  3.00s/it]

batch_loss 0.43092402815818787


240it [11:18,  2.80s/it]

batch_loss 0.4542291760444641


280it [13:11,  2.57s/it]

batch_loss 0.3616260588169098


287it [13:27,  2.81s/it]


avg_train_loss 0.48380928499565723
train_f1Score 0.40814393939393934
train_accuracy 0.8635520139722738

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.84
  Validation Loss: 0.52


Training...


40it [01:39,  2.43s/it]

batch_loss 0.5086437463760376


80it [03:18,  2.40s/it]

batch_loss 0.29981809854507446


120it [05:00,  2.60s/it]

batch_loss 0.4857662618160248


160it [06:46,  2.74s/it]

batch_loss 0.5236760973930359


200it [08:27,  2.45s/it]

batch_loss 0.13514329493045807


240it [10:14,  2.62s/it]

batch_loss 0.4792105555534363


280it [12:00,  2.65s/it]

batch_loss 0.6411433219909668


287it [12:17,  2.57s/it]


avg_train_loss 0.482620335102912
train_f1Score 0.40517626059794737
train_accuracy 0.8544918677000327

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.80
  Validation Loss: 0.52


Training...


40it [01:38,  2.55s/it]

batch_loss 0.7466442584991455


80it [03:23,  2.52s/it]

batch_loss 0.5486990809440613


120it [05:09,  2.64s/it]

batch_loss 0.615241289138794


160it [06:52,  2.50s/it]

batch_loss 0.40957292914390564


200it [08:34,  2.58s/it]

batch_loss 0.5289916396141052


240it [10:16,  2.47s/it]

batch_loss 0.44440916180610657


280it [12:00,  2.70s/it]

batch_loss 0.4740843176841736


287it [12:17,  2.57s/it]


avg_train_loss 0.4726619808204498
train_f1Score 0.4121878967414304
train_accuracy 0.8483789979259906

Running Validation...
 Validation Accuracy: 0.84
  Validation Loss: 0.53


[{'epoch': 1,
  'train metrics': {'accuracy': 0.903503984281192,
   'mF1Score': 0.48242779076449854,
   'f1Score': 0.015590200445434297,
   'precision': 0.3888888888888889,
   'recall': 0.007954545454545454,
   'avg_train_loss': 0.6365118128497426},
  'val_metrics': {'accuracy': 0.9029051987767585,
   'mF1Score': 0.4822191675783844,
   'f1Score': 0.015503875968992248,
   'precision': 0.25,
   'recall': 0.008,
   'avg_val_loss': 0.6279183124623647}},
 {'epoch': 2,
  'train metrics': {'accuracy': 0.8864752756249318,
   'mF1Score': 0.5669888686692033,
   'f1Score': 0.19504643962848298,
   'precision': 0.3058252427184466,
   'recall': 0.1431818181818182,
   'avg_train_loss': 0.5784645129371603},
  'val_metrics': {'accuracy': 0.7270642201834863,
   'mF1Score': 0.565886831744379,
   'f1Score': 0.30136986301369867,
   'precision': 0.19948186528497408,
   'recall': 0.616,
   'avg_val_loss': 0.5529958758412338}},
 {'epoch': 3,
  'train metrics': {'accuracy': 0.8381181093767056,
   'mF1Score': 0

In [94]:
train_stats = [{'epoch': 1,
  'train metrics': {'accuracy': 0.903503984281192,
   'mF1Score': 0.48242779076449854,
   'f1Score': 0.015590200445434297,
   'precision': 0.3888888888888889,
   'recall': 0.007954545454545454,
   'avg_train_loss': 0.6365118128497426},
  'val_metrics': {'accuracy': 0.9029051987767585,
   'mF1Score': 0.4822191675783844,
   'f1Score': 0.015503875968992248,
   'precision': 0.25,
   'recall': 0.008,
   'avg_val_loss': 0.6279183124623647}},
 {'epoch': 2,
  'train metrics': {'accuracy': 0.8864752756249318,
   'mF1Score': 0.5669888686692033,
   'f1Score': 0.19504643962848298,
   'precision': 0.3058252427184466,
   'recall': 0.1431818181818182,
   'avg_train_loss': 0.5784645129371603},
  'val_metrics': {'accuracy': 0.7270642201834863,
   'mF1Score': 0.565886831744379,
   'f1Score': 0.30136986301369867,
   'precision': 0.19948186528497408,
   'recall': 0.616,
   'avg_val_loss': 0.5529958758412338}},
 {'epoch': 3,
  'train metrics': {'accuracy': 0.8381181093767056,
   'mF1Score': 0.6137730654578513,
   'f1Score': 0.3194125745754933,
   'precision': 0.2678983833718245,
   'recall': 0.39545454545454545,
   'avg_train_loss': 0.5419966364052238},
  'val_metrics': {'accuracy': 0.8111620795107034,
   'mF1Score': 0.6189346125311533,
   'f1Score': 0.34828496042216356,
   'precision': 0.25984251968503935,
   'recall': 0.528,
   'avg_val_loss': 0.5423922451531015}},
 {'epoch': 4,
  'train metrics': {'accuracy': 0.8416111778190154,
   'mF1Score': 0.6282860268508217,
   'f1Score': 0.34669067987393065,
   'precision': 0.2870991797166294,
   'recall': 0.4375,
   'avg_train_loss': 0.5197686439814884},
  'val_metrics': {'accuracy': 0.7668195718654435,
   'mF1Score': 0.5942659839004124,
   'f1Score': 0.32967032967032966,
   'precision': 0.22727272727272727,
   'recall': 0.6,
   'avg_val_loss': 0.5303168107823628}},
 {'epoch': 5,
  'train metrics': {'accuracy': 0.852199541534767,
   'mF1Score': 0.6469487867146115,
   'f1Score': 0.37775735294117646,
   'precision': 0.31712962962962965,
   'recall': 0.46704545454545454,
   'avg_train_loss': 0.5044918974102166},
  'val_metrics': {'accuracy': 0.8753822629969419,
   'mF1Score': 0.6408379168316164,
   'f1Score': 0.35059760956175295,
   'precision': 0.3492063492063492,
   'recall': 0.352,
   'avg_val_loss': 0.5513924912708562}},
 {'epoch': 6,
  'train metrics': {'accuracy': 0.8632245388058072,
   'mF1Score': 0.6542658291064823,
   'f1Score': 0.38548307994114756,
   'precision': 0.33908541846419327,
   'recall': 0.4465909090909091,
   'avg_train_loss': 0.5038676430643227},
  'val_metrics': {'accuracy': 0.7958715596330275,
   'mF1Score': 0.6176590049429886,
   'f1Score': 0.3566265060240964,
   'precision': 0.25517241379310346,
   'recall': 0.592,
   'avg_val_loss': 0.5234039542151661}},
 {'epoch': 7,
  'train metrics': {'accuracy': 0.8514354328130117,
   'mF1Score': 0.6526891081514451,
   'f1Score': 0.3899596593455849,
   'precision': 0.3219837157660992,
   'recall': 0.4943181818181818,
   'avg_train_loss': 0.49057138532088607},
  'val_metrics': {'accuracy': 0.8631498470948012,
   'mF1Score': 0.6431648974677467,
   'f1Score': 0.36298932384341637,
   'precision': 0.3269230769230769,
   'recall': 0.408,
   'avg_val_loss': 0.529910690900756}},
 {'epoch': 8,
  'train metrics': {'accuracy': 0.8635520139722738,
   'mF1Score': 0.6655155230590918,
   'f1Score': 0.40814393939393934,
   'precision': 0.34983766233766234,
   'recall': 0.48977272727272725,
   'avg_train_loss': 0.48380928499565723},
  'val_metrics': {'accuracy': 0.8363914373088684,
   'mF1Score': 0.6308632076466989,
   'f1Score': 0.355421686746988,
   'precision': 0.28502415458937197,
   'recall': 0.472,
   'avg_val_loss': 0.5227955608833127}},
 {'epoch': 9,
  'train metrics': {'accuracy': 0.8544918677000327,
   'mF1Score': 0.6611417028379949,
   'f1Score': 0.40517626059794737,
   'precision': 0.33357825128581925,
   'recall': 0.5159090909090909,
   'avg_train_loss': 0.482620335102912},
  'val_metrics': {'accuracy': 0.8012232415902141,
   'mF1Score': 0.6163357400722023,
   'f1Score': 0.35,
   'precision': 0.2545454545454545,
   'recall': 0.56,
   'avg_val_loss': 0.5201039634099821}},
 {'epoch': 10,
  'train metrics': {'accuracy': 0.8483789979259906,
   'mF1Score': 0.662576184099771,
   'f1Score': 0.4121878967414304,
   'precision': 0.32838840188806473,
   'recall': 0.553409090909091,
   'avg_train_loss': 0.4726619808204498},
  'val_metrics': {'accuracy': 0.8386850152905199,
   'mF1Score': 0.638822982849889,
   'f1Score': 0.3701492537313432,
   'precision': 0.29523809523809524,
   'recall': 0.496,
   'avg_val_loss': 0.5321731847233888}}]

In [122]:
def processStats(stats):
    data={}
    cols = ['Epoch','Train_accuracy','Train_mF1Score',
    'Train_f1Score','Train_precision','Train_recall',
    'Train_avg_train_loss',
    'Val_accuracy','Val_mF1Score',
    'Val_f1Score','Val_precision','Val_recall',
    'Val_avg_val_loss'
    ]

    for col in cols:
        data[col]=[]
    
    for stat in stats:
        data['Epoch'].append(stat['epoch'])

        for key1,val1 in stat['train metrics'].items():
            data['Train_'+key1].append(val1)

        for key1,val1 in stat['val_metrics'].items():
            data['Val_'+key1].append(val1)
    
    return data

In [123]:
data = processStats(train_stats)

In [115]:
train_stats[0]['train metrics']

{'accuracy': 0.903503984281192,
 'mF1Score': 0.48242779076449854,
 'f1Score': 0.015590200445434297,
 'precision': 0.3888888888888889,
 'recall': 0.007954545454545454,
 'avg_train_loss': 0.6365118128497426}

In [127]:
df = pd.DataFrame.from_dict(data)

In [129]:
df.head(10)

Unnamed: 0,Epoch,Train_accuracy,Train_mF1Score,Train_f1Score,Train_precision,Train_recall,Train_avg_train_loss,Val_accuracy,Val_mF1Score,Val_f1Score,Val_precision,Val_recall,Val_avg_val_loss
0,1,0.903504,0.482428,0.01559,0.388889,0.007955,0.636512,0.902905,0.482219,0.015504,0.25,0.008,0.627918
1,2,0.886475,0.566989,0.195046,0.305825,0.143182,0.578465,0.727064,0.565887,0.30137,0.199482,0.616,0.552996
2,3,0.838118,0.613773,0.319413,0.267898,0.395455,0.541997,0.811162,0.618935,0.348285,0.259843,0.528,0.542392
3,4,0.841611,0.628286,0.346691,0.287099,0.4375,0.519769,0.76682,0.594266,0.32967,0.227273,0.6,0.530317
4,5,0.8522,0.646949,0.377757,0.31713,0.467045,0.504492,0.875382,0.640838,0.350598,0.349206,0.352,0.551392
5,6,0.863225,0.654266,0.385483,0.339085,0.446591,0.503868,0.795872,0.617659,0.356627,0.255172,0.592,0.523404
6,7,0.851435,0.652689,0.38996,0.321984,0.494318,0.490571,0.86315,0.643165,0.362989,0.326923,0.408,0.529911
7,8,0.863552,0.665516,0.408144,0.349838,0.489773,0.483809,0.836391,0.630863,0.355422,0.285024,0.472,0.522796
8,9,0.854492,0.661142,0.405176,0.333578,0.515909,0.48262,0.801223,0.616336,0.35,0.254545,0.56,0.520104
9,10,0.848379,0.662576,0.412188,0.328388,0.553409,0.472662,0.838685,0.638823,0.370149,0.295238,0.496,0.532173


In [130]:
df.to_csv('LSTM_stats.csv')

## Final Code

In [32]:
# Core
import random
import io

# Basics
import numpy as np
import pandas as pd
import torch

# Utility
from tqdm import tqdm #progress-bar
from itertools import chain, repeat, islice #padding

# Dataloader
from torch.utils.data import TensorDataset, DataLoader,RandomSampler, SequentialSampler

# Optimiser
from transformers import AdamW

# Metrics
from sklearn.metrics import *

# Model
import torch.nn as nn

## Model

In [36]:
class LSTM_Model(nn.Module):

    def __init__(self, weights,embs_np,dimension=128):
        super(LSTM_Model, self).__init__()
        
        self.weights = weights
        input_size = embs_np.shape[1]

        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs_np).float())
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)

        self.fc = nn.Linear(2*dimension, 2)

    def forward(self, text,labels):
        text_len = 1024

        text_emb = self.embedding(text)

        #packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
        output, _ = self.lstm(text_emb)
        #output, _ = pad_packed_sequence(packed_output, batch_first=True)

        out_forward = output[range(len(output)), text_len - 1, :self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        output = self.fc(text_fea)
        #text_fea = torch.squeeze(text_fea, 1)
        #text_out = torch.sigmoid(text_fea)
    
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(self.weights,dtype=torch.float))
        
        #oss_fct = nn.CrossEntropyLoss()

        loss = loss_fct(output,labels)
        return loss,output

## Main Class

In [73]:
class LSTM:
    def __init__(self,args):
        # fix the random
        random.seed(args['seed_val'])
        np.random.seed(args['seed_val'])
        torch.manual_seed(args['seed_val'])
        torch.cuda.manual_seed_all(args['seed_val'])
                
        self.device = torch.device(args['device'])
        
        self.embeddings,self.id2word,self.word2id = self.load_vec(args['embedding_path'])
        
    ##----------------------------------------------------------##
    ##------------------- Utility Functions --------------------##
    ##----------------------------------------------------------##
    def load_vec(self,emb_path, nmax=50000):
        vectors = []
        word2id = {}
        with io.open(emb_path, 'r', encoding='utf-8', newline='\n', 
                     errors='ignore') as f:
            next(f)
            for i, line in enumerate(f):
                word, vect = line.rstrip().split(' ', 1)
                vect = np.fromstring(vect, sep=' ')
                assert word not in word2id, 'word found twice'
                vectors.append(vect)
                word2id[word] = len(word2id)
                if len(word2id) == nmax:
                    break
        id2word = {v: k for k, v in word2id.items()}
        embeddings = np.vstack(vectors)
        merged_vec = self.add_pad_unk(embeddings)
        return merged_vec, id2word, word2id
    
    def encode_data(self,data,max_len):
        new_data=[]
        
        for row in tqdm(data):
            encoded=[]
            words=row.split(' ')
            unk_index = len(list(self.word2id.keys()))
            pad_index = unk_index+1
            num = min(max_len,len(words))
            for word in words[0:num]:
                word=word.lower()
                try:
                    index=self.word2id[word]
                except KeyError:
                    index=unk_index
                encoded.append(index)
            if(len(encoded)<max_len):
                padding = [pad_index]*(max_len-len(encoded))
                encoded.extend(padding)
            else:
                encoded=encoded[0:max_len]
            new_data.append(encoded)
                                                   
        return new_data
    
    def add_pad_unk(self,vector):
        pad_vec = np.zeros((1,vector.shape[1])) 
        unk_vec = np.mean(vector,axis=0,keepdims=True) 
        
        merged_vec=np.append(vector, unk_vec, axis=0)
        merged_vec=np.append(merged_vec, pad_vec, axis=0)
        
        return merged_vec
    
    ##----------------------------------------------------------##
    ##---------------------- Data Loader -----------------------##
    ##----------------------------------------------------------## 
    def get_dataloader(self,X,Y,max_len,batch_size,is_train=False):
        inputs = self.encode_data(X,max_len)
        labels = Y
        
        inputs = torch.tensor(inputs)
        labels = torch.tensor(labels,dtype=torch.long)

        data = TensorDataset(inputs,labels)

        if(is_train==False):
            sampler = SequentialSampler(data)
        else:
            sampler = RandomSampler(data) 

        dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

        return dataloader
    
    ##-----------------------------------------------------------##
    ##----------------- Training Utilities ----------------------##
    ##-----------------------------------------------------------##  
    def get_optimiser(self,learning_rate,model):
         return AdamW(model.parameters(),
                  lr = learning_rate, 
                  eps = 1e-8
                )
        
    def evalMetric(self,y_true, y_pred,prefix):
        accuracy = accuracy_score(y_true, y_pred)
        mf1Score = f1_score(y_true, y_pred, average='macro')
        f1Score  = f1_score(y_true, y_pred, labels = np.unique(y_pred))
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        area_under_c = auc(fpr, tpr)
        recallScore = recall_score(y_true, y_pred, labels = np.unique(y_pred))
        precisionScore = precision_score(y_true, y_pred, labels = np.unique(y_pred))
        return dict({prefix+"accuracy": accuracy, prefix+'mF1Score': mf1Score, 
                        prefix+'f1Score': f1Score, prefix+'precision': precisionScore, 
                        prefix+'recall': recallScore})
    
    
    ##-----------------------------------------------------------##
    ##---------------- Different Train Loops --------------------##
    ##-----------------------------------------------------------## 
    def evaluate(self,model,loader,which):
    
        model.eval() # put model in eval mode

        total_eval_loss = 0
        nb_eval_steps = 0

        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in loader:
            b_inputs = batch[0].to(self.device)
            b_labels = batch[1].to(self.device)

            with torch.no_grad(): # do not construct compute graph
                outputs = model(b_inputs,b_labels)

            loss = outputs[0]
            logits = outputs[1]

            total_eval_loss += loss.item()

            b_y_true = b_labels.cpu().data.squeeze().numpy()

            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

        metrics = self.evalMetric(y_true,y_pred,which+"_")

        # Calculate the average loss over all of the batches.
        avg_loss = total_eval_loss / len(loader)

        metrics[which+'_avg_loss'] = avg_loss

        return metrics
    
    
    def run_train_loop(self,model,train_loader,optimiser):
        
        total_loss = 0
        model.train() # put model in train mode

        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for step, batch in tqdm(enumerate(train_loader)):

            b_inputs = batch[0].to(self.device)
            b_labels = batch[1].to(self.device)

            model.zero_grad()        

            outputs = model(b_inputs,b_labels)

            loss = outputs[0]
            logits = outputs[1]

            total_loss += loss.item()

            loss.backward()

            b_y_true = b_labels.cpu().data.squeeze().numpy()

            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimiser.step()

        avg_train_loss = total_loss / len(train_loader)

        train_metrics = self.evalMetric(y_true,y_pred,"Train_")

        print('avg_train_loss',avg_train_loss)
        print('train_f1Score',train_metrics['Train_f1Score'])
        print('train_accuracy',train_metrics['Train_accuracy'])

        train_metrics['Train_avg_loss'] = avg_train_loss

        return train_metrics
    
    
    ##------------------------------------------------------------##
    ##----------------- Main Train Loop --------------------------##
    ##------------------------------------------------------------##
    def train(self,model,data_loaders,optimiser,epochs):
        train_stats = []
        train_loader,val_loader,test_loader = data_loaders
        for epoch_i in range(0, epochs):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            
            print("")
            print('Training...')
            train_metrics = self.run_train_loop(model,train_loader,optimiser)

            print("")
            print("Running Validation...") 
            val_metrics = self.evaluate(model,val_loader,"Val")
            
            print("Validation Loss: ",val_metrics['Val_avg_loss'])
            print("Validation Accuracy: ",val_metrics['Val_accuracy'])
            
            stats = {}

            stats['epoch']=epoch_i+1

            stats.update(train_metrics)
            stats.update(val_metrics)

            train_stats.append(stats)

        return train_stats
    
    ##-----------------------------------------------------------##
    ##--------------------- The Pipeline ------------------------##
    ##-----------------------------------------------------------##
    def run(self,args,df_train,df_val,df_test):
        X_train = df_train['Text'].values
        Y_train = df_train['Label'].values
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values
        X_val = df_val['Text'].values
        Y_val = df_val['Label'].values
        
        train_dl = self.get_dataloader(X_train,Y_train,args['max_len'],
                                       args['batch_size'],
                                       True)
        val_dl = self.get_dataloader(X_val,Y_val,args['max_len'],args['batch_size'])
        test_dl = self.get_dataloader(X_test,Y_test,args['max_len'],args['batch_size'])
        
        model = LSTM_Model(args['weights'],self.embeddings)
        
        optimiser =self.get_optimiser(args['learning_rate'],model)
        
        train_stats = self.train(model,[train_dl,val_dl,test_dl],
                            optimiser,args['epochs'])

In [41]:
DATA_FOLDER = "Data_Processed/Shared_Task_eng/"

In [42]:
df_train = pd.read_csv(DATA_FOLDER+"train_1.csv")
df_val = pd.read_csv(DATA_FOLDER+"val_1.csv")
df_test = pd.read_csv(DATA_FOLDER+"test_1.csv")

In [43]:
df_train.dropna(inplace=True)
df_val.dropna(inplace=True)
df_test.dropna(inplace=True)

In [44]:
len(df_train),len(df_val),len(df_test)

(9161, 1308, 2615)

In [45]:
df_train=df_train.iloc[0:1000]
df_val=df_val.iloc[0:200]
df_test=df_test.iloc[0:200]

In [76]:
args={
    'seed_val':42,
    'batch_size':32,
    'max_len':1024,
    'weights':[1.0,5.0],
    'epochs': 10,
    'learning_rate':1e-4,
    'device':'cpu',
    'embedding_path': "Embeddings/cc.en.300.vec",
}

In [77]:
lstm = LSTM(args)

In [78]:
lstm.run(args,df_train,df_val,df_test)

100%|██████████| 1000/1000 [00:00<00:00, 2300.20it/s]
100%|██████████| 200/200 [00:00<00:00, 2370.13it/s]
100%|██████████| 200/200 [00:00<00:00, 2286.57it/s]
0it [00:00, ?it/s]



Training...


32it [01:18,  2.45s/it]


avg_train_loss 0.6898938808590174
train_f1Score 0.110803324099723
train_accuracy 0.679

Running Validation...


0it [00:00, ?it/s]

Validation Loss:  0.670798625264849
Validation Accuracy:  0.905


Training...


32it [01:20,  2.52s/it]


avg_train_loss 0.6688580773770809
train_f1Score 0.0
train_accuracy 0.898

Running Validation...


0it [00:00, ?it/s]

Validation Loss:  0.652502885886601
Validation Accuracy:  0.905


Training...


29it [01:20,  2.79s/it]


KeyboardInterrupt: 