In [1]:
import pandas as pd
import numpy as np
import io
from tqdm import tqdm

In [2]:
# Root folder of the shared-task data plus the train/dev CSV paths relative to it.
# The pieces are joined by plain string concatenation when the CSVs are read,
# so DATA_PATH must keep its trailing slash.
DATA_PATH = "Data/Shared Task/"
TRAIN_FILE_NAME = "eng/trac2_eng_train.csv"
VAL_FILE_NAME = "eng/trac2_eng_dev.csv"

In [3]:
# Text file of pretrained word vectors that load_vec() parses.
EMBEDDING_PATH = "Embeddings/wiki.multi.en.vec"

## Load MUSE Embeddings

In [4]:
def load_vec(emb_path, nmax=50000):
    """Parse a text embedding file into a matrix and two lookup tables.

    The first line of the file is skipped (presumably the "count dim" header
    of the .vec format -- confirm against the file). Every following line is
    split at its first space into a word and a space-separated list of floats.

    Args:
        emb_path: path of the embedding text file.
        nmax: stop after this many words have been read.

    Returns:
        (embeddings, id2word, word2id): a 2-D array with one row per word,
        a dict id -> word and a dict word -> id. Ids follow file order.

    Raises:
        AssertionError: if the same word appears twice in the file.
    """
    vocab = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line
        for raw_line in handle:
            token, values = raw_line.rstrip().split(' ', 1)
            row = np.fromstring(values, sep=' ')
            assert token not in vocab, 'word found twice'
            rows.append(row)
            vocab[token] = len(vocab)  # next free id
            if len(vocab) == nmax:
                break
    id2word = dict(zip(vocab.values(), vocab.keys()))
    embeddings = np.vstack(rows)
    return embeddings, id2word, vocab

## Load Data

In [5]:
# Read the train and dev splits; each path is DATA_PATH concatenated with the file name.
df_train = pd.read_csv(DATA_PATH+TRAIN_FILE_NAME)
df_val = pd.read_csv(DATA_PATH+VAL_FILE_NAME)

In [77]:
# Show 10 randomly drawn training rows (no random_state is passed, so the rows differ between runs).
df_train.sample(10)

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
4120,C4.743,Kabir Singh' inspired man kills girl\n<http://...,CAG,NGEN
1562,C33.384,I m from pakistan \nKoi pakistan se girl ho t...,NAG,NGEN
2046,C7.2173,You Reated 9/10 \nWow 😍 \nI Reated 10 / 10,NAG,NGEN
1547,C4.2034,There is no one character of doctor like kabir...,NAG,NGEN
882,C4.2182,"I dunno, but I had this gut feeling inside of ...",NAG,NGEN
396,C7.2288,I read a blog post of India Today they were tr...,OAG,NGEN
3883,C59.1200,great show,NAG,NGEN
1586,C7.2595.5,"LKMKA, L - Leftist.these should be kich out",OAG,GEN
3006,C4.1160,"sir, please don't stop making video",NAG,NGEN
1793,C25.583.1,Yes so ppl like u can take dowry with an open ...,CAG,NGEN


In [100]:
# Class balance of the 'Sub-task B' column as (NGEN count, GEN count).
tuple(int((df_train['Sub-task B'] == tag).sum()) for tag in ('NGEN', 'GEN'))

(3954, 309)

## Encode Data
- Encode text to token ids (row indices into the embedding matrix)
- Encode labels to integers

In [7]:
# Load the embedding matrix and both vocabulary lookups (load_vec reads at most its default nmax=50000 words).
vector,id2word,word2id = load_vec(EMBEDDING_PATH)

In [8]:
def encode_data(df,word2id,textColName,labelColName):
    """Convert every row of ``df`` into ``[padded_token_ids, label, raw_text]``.

    Text is tokenised by splitting on single spaces. Words missing from
    ``word2id`` get the id ``len(word2id)``; every sequence is right-padded
    with the id ``len(word2id) + 1`` up to the longest sequence in ``df``.

    Args:
        df: DataFrame holding the text and label columns.
        word2id: dict mapping a word to its integer id.
        textColName: name of the column with the raw text.
        labelColName: name of the column with the label.

    Returns:
        A list with one ``[token_ids, label, text]`` list per row, where
        ``token_ids`` all share the same length.
    """
    # Both special ids depend only on the vocabulary size, so compute them once
    # instead of materialising the key list for every token and every row.
    unk_id = len(word2id)
    pad_id = unk_id + 1

    # First pass: longest whitespace-token count, used as the padded length.
    max_len = 0
    for _, row in tqdm(df.iterrows(), total=len(df)):
        max_len = max(max_len, len(row[textColName].split(' ')))

    # Second pass: map words to ids and pad. The token id no longer reuses the
    # name of the row-loop variable.
    new_data = []
    for _, row in df.iterrows():
        text = row[textColName]
        token_ids = [word2id.get(word, unk_id) for word in text.split(' ')]
        padded_ids = list(pad(token_ids, max_len, pad_id))
        new_data.append([padded_ids, row[labelColName], text])
    return new_data

In [9]:
from itertools import chain, repeat, islice
def pad_infinite(iterable, padding=None):
    """Yield every item of ``iterable`` and then ``padding`` without end."""
    yield from iterable
    yield from repeat(padding)


def pad(iterable, size, padding=None):
    """Return an iterator over exactly ``size`` items.

    Items come from ``iterable`` first; if it is shorter than ``size`` the
    remainder is filled with ``padding``, if it is longer it is cut off.
    """
    endless = pad_infinite(iterable, padding)
    return islice(endless, size)

In [10]:
# Encode both splits with "Text" as input and "Sub-task B" as label.
# Each call pads to the longest text of its own DataFrame.
train_data=encode_data(df_train,word2id,"Text","Sub-task B")
val_data=encode_data(df_val,word2id,"Text","Sub-task B")

100%|██████████| 4263/4263 [00:00<00:00, 15645.35it/s]
100%|██████████| 1066/1066 [00:00<00:00, 16898.52it/s]


In [11]:
# Number of encoded training samples.
len(train_data)

4263

Each sample in the data is a list of the form [padded_token_ids, label, original_text]

In [12]:
# Length of the first sample's padded token-id list (the padded sequence length, not the embedding width).
print("Word Vector length: ",len(train_data[0][0]))

Word Vector length:  779


### Get Vocab size

In [13]:
# Append two random rows for the ids encode_data() emits beyond the pretrained
# vocabulary: id len(word2id) is the unknown word, id len(word2id)+1 is padding.
# The append order (unknown first, padding second) has to match those ids.
# The row width is taken from the loaded matrix instead of a hard-coded 300.
# NOTE(review): these draws happen before fix_the_random() is called in this
# notebook, so the two rows differ between runs.
pad_vec=np.random.randn(1,vector.shape[1])
unk_vec=np.random.randn(1,vector.shape[1])
merged_vec=np.vstack([vector, unk_vec, pad_vec])

In [14]:
# (pretrained rows + 2 extra rows, embedding width)
merged_vec.shape

(50002, 300)

In [15]:
# Number of rows of the merged embedding matrix, i.e. pretrained words plus the two extra rows.
vocab_size = merged_vec.shape[0]

### Encode Labels

In [16]:
def encodeLabels(data,misogynyLabel,nonmisogynyLabel):
    """Replace string labels by integers, in place.

    For every sample (a list whose element 1 is the label) the label becomes
    1 when it equals ``misogynyLabel`` and 0 when it equals
    ``nonmisogynyLabel``. Any other label is left untouched. Returns None.
    """
    for sample in data:
        current = sample[1]
        if current == misogynyLabel:
            sample[1] = 1
            continue
        if current == nonmisogynyLabel:
            sample[1] = 0

In [17]:
# In-place label encoding for both splits: "GEN" -> 1, "NGEN" -> 0.
encodeLabels(train_data,"GEN","NGEN")
encodeLabels(val_data,"GEN","NGEN")

## Data Loader

In [18]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

In [79]:
def return_cnngru_dataloader(samples, batch_size,is_train=False):
    """Wrap encoded samples in a PyTorch ``DataLoader``.

    Args:
        samples: sequence of samples whose element 0 is a list of token ids
            (all of equal length) and whose element 1 is an integer label.
        batch_size: number of samples per batch.
        is_train: when truthy the samples are drawn in random order
            (``RandomSampler``); otherwise in their given order
            (``SequentialSampler``).

    Returns:
        A ``DataLoader`` yielding ``(inputs, labels)`` batches; ``labels``
        are ``torch.long``.
    """
    inputs = torch.tensor([sample[0] for sample in samples])
    labels = torch.tensor([sample[1] for sample in samples], dtype=torch.long)

    dataset = TensorDataset(inputs, labels)

    # Plain truthiness test instead of comparing against the literal False.
    sampler = RandomSampler(dataset) if is_train else SequentialSampler(dataset)

    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

In [80]:
# Samples per batch for both the train and validation loaders.
BATCH_SIZE = 32

In [81]:
# Training batches are drawn in random order; validation batches keep the data order.
train_dataloader = return_cnngru_dataloader(train_data,BATCH_SIZE,True)
validation_dataloader=return_cnngru_dataloader(val_data,BATCH_SIZE,False)

## Model

In [22]:
import torch.nn as nn

In [319]:
# Settings read by the CNN_GRU model:
#   train_embed - assigned to embedding.weight.requires_grad (False keeps the embeddings frozen)
#   weights     - per-class weights handed to CrossEntropyLoss; index 0 is NGEN, index 1 is GEN
#   vocab_size  - number of rows requested for the nn.Embedding layer
args= {
    'train_embed': False,
    'weights': [1.0,10.0],
    'vocab_size': vocab_size
}

In [320]:
def global_max_pooling(tensor, dim, topk):
    """Return the ``topk`` largest values of ``tensor`` along dimension ``dim``.

    Only the values are returned; the matching indices are dropped.
    """
    return torch.topk(tensor, topk, dim)[0]

class CNN_GRU(nn.Module):
    """Three parallel 1-D convolutions followed by a GRU and a linear classifier.

    Args:
        args: dict with the keys ``'vocab_size'`` (rows of the embedding
            layer), ``'train_embed'`` (whether the embedding weights receive
            gradients) and ``'weights'`` (per-class weights for the loss).
        vector: 2-D numpy array of pretrained embeddings, one row per token id.
    """

    def __init__(self,args,vector):
        super(CNN_GRU, self).__init__()
        self.embedsize = vector.shape[1]
        # Kernel sizes 2, 3 and 4 over the embedding channels, 100 filters each.
        self.conv1 = nn.Conv1d(self.embedsize,100, 2)
        self.conv2 = nn.Conv1d(self.embedsize,100, 3,padding=1)
        self.conv3 = nn.Conv1d(self.embedsize,100, 4,padding=2)
        self.maxpool1D = nn.MaxPool1d(4, stride=4)
        self.seq_model = nn.GRU(100, 100, bidirectional=False, batch_first=True)
        self.embedding = nn.Embedding(args["vocab_size"], self.embedsize)
        # Replace the randomly initialised table by the pretrained vectors.
        self.embedding.weight = nn.Parameter(torch.tensor(vector.astype(np.float32), dtype=torch.float32))
        self.embedding.weight.requires_grad = args["train_embed"]
        self.num_labels=2
        self.weights=args['weights']
        self.out = nn.Linear(100, self.num_labels)

    def forward(self,x,labels=None):
        """Score a batch of token-id sequences.

        Args:
            x: integer tensor of token ids; it is embedded and then permuted
               as a (batch, sequence) input.
            labels: optional tensor of class indices, one per sample.

        Returns:
            ``output`` of shape (batch, num_labels) when ``labels`` is None,
            otherwise the tuple ``(loss, output)`` with a class-weighted
            cross-entropy loss.
        """
        h_embedding = self.embedding(x)
        # Conv1d expects (batch, channels, length).
        channels_first = h_embedding.permute(0,2,1)
        new_conv1 = self.maxpool1D(self.conv1(channels_first))
        new_conv2 = self.maxpool1D(self.conv2(channels_first))
        new_conv3 = self.maxpool1D(self.conv3(channels_first))
        # Concatenate along the length axis, pool again, then go back to
        # (batch, length, channels) for the batch_first GRU.
        concat = self.maxpool1D(torch.cat([new_conv1, new_conv2, new_conv3], dim=2))
        h_seq, _ = self.seq_model(concat.permute(0,2,1))
        # Global max over the time axis. Reducing dim 1 explicitly (instead of
        # an unqualified squeeze) keeps the batch axis for single-sample batches.
        global_h_seq = torch.max(h_seq, dim=1)[0]
        output = self.out(global_h_seq)

        if labels is not None:
            # Build the class weights on the same device as the logits.
            class_weights = torch.tensor(self.weights, dtype=torch.float, device=output.device)
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(output.view(-1, self.num_labels), labels.view(-1))
            return loss,output
        return output

In [321]:
# Build the network on top of the merged embedding matrix (pretrained rows plus the two extra rows).
model = CNN_GRU(args,merged_vec)

## Training

In [322]:
from transformers import AdamW

In [323]:
# Optimiser learning rate and number of passes over the training data.
LR = 1e-4
EPOCHS = 20

In [324]:
 optimizer = AdamW(model.parameters(),
                  lr = LR, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8
                )

In [325]:
import random
def fix_the_random(seed_val = 42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) with ``seed_val``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_val)

In [326]:
# Seed all RNGs with the default value 42.
# NOTE(review): the model and the extra embedding rows were created in earlier
# cells, so this call does not make their random initialisation reproducible.
fix_the_random()

### Testing train loop

In [327]:
from sklearn.metrics import *

In [284]:
# Smoke test: run the first 10 training batches and print the loss, the
# predictions and two metrics for each of them.

# Assigned here so the cell also runs on a fresh kernel; the same value is
# assigned again before the main training run.
device = torch.device("cpu")

for i, batch in enumerate(train_dataloader, start=1):
    inputs = batch[0].to(device)
    labels = batch[1].to(device)

    # Clear the gradients left over from the previous step; without this every
    # backward() adds to the gradients of all earlier batches.
    optimizer.zero_grad()

    # With labels supplied the model returns (loss, logits).
    loss, logits = model(inputs,labels)

    print(loss.item())

    loss.backward()

    y_true = labels.cpu().numpy().reshape(-1)

    # Index of the larger logit = predicted class.
    y_pred = torch.max(logits,1)[1]

    print(y_pred)
    print(y_true)
    y_pred = y_pred.cpu().numpy().reshape(-1)

    print(accuracy_score(y_true, y_pred))
    print(f1_score(y_true, y_pred, labels = np.unique(y_pred)))

    optimizer.step()

    if i == 10:
        break

0.8678257465362549
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1])
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.03125
0.06060606060606061
0.7170535922050476
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 1, 1, 1])
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.09375
0.06451612903225806
0.6959130764007568
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
[0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1]
0.84375
0.0
0.5491247773170471
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.96875
0.0
0.5842214822769165
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 

### Main Train Loop

In [328]:
import time
import datetime

In [329]:
def evalMetric(y_true, y_pred,prefix):
    """Compute classification metrics and return them as a prefixed dict.

    Args:
        y_true: array of true class labels.
        y_pred: array of predicted class labels (hard labels, not scores).
        prefix: string prepended to every key, e.g. ``"Val_"``.

    Returns:
        dict with the keys ``prefix + 'accuracy'``, ``'mF1Score'`` (macro F1),
        ``'f1Score'``, ``'precision'``, ``'recall'`` and ``'auc'``.
    """
    # Labels that actually occur in the predictions; passed through to the
    # scorers exactly as before.
    # NOTE(review): with the scorers' default averaging this argument may have
    # no effect -- confirm against the scikit-learn documentation.
    predicted_labels = np.unique(y_pred)

    accuracy = accuracy_score(y_true, y_pred)
    mf1Score = f1_score(y_true, y_pred, average='macro')
    f1Score = f1_score(y_true, y_pred, labels=predicted_labels)
    recallScore = recall_score(y_true, y_pred, labels=predicted_labels)
    precisionScore = precision_score(y_true, y_pred, labels=predicted_labels)

    # The area under the ROC curve used to be computed and then thrown away;
    # it is now part of the result. It is derived from hard predictions, so the
    # curve has a single operating point.
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    area_under_c = auc(fpr, tpr)

    return {
        prefix+"accuracy": accuracy,
        prefix+'mF1Score': mf1Score,
        prefix+'f1Score': f1Score,
        prefix+'precision': precisionScore,
        prefix+'recall': recallScore,
        prefix+'auc': area_under_c,
    }

In [330]:
def format_time(elapsed):
    """Render a duration given in seconds as a ``datetime.timedelta`` string.

    The value is rounded to whole seconds first, so the result looks like
    ``h:mm:ss`` (with a leading day count for durations of a day or more).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [331]:
def EvaluateOnData(model,loader):
    """Evaluate ``model`` on every batch of ``loader`` without gradient tracking.

    Reads the notebook-level ``device`` to place the batches. Prints the
    accuracy and the mean batch loss.

    Args:
        model: module returning ``(loss, logits)`` when called with inputs and labels.
        loader: iterable of ``(inputs, labels)`` batches that supports ``len()``.

    Returns:
        The dict produced by ``evalMetric`` with the prefix ``"Val_"`` plus
        the key ``'Val_avg_loss'``.
    """
    model.eval() # put model in eval mode

    total_eval_loss = 0

    y_pred = np.zeros(shape=(0),dtype='int')
    y_true = np.empty(shape=(0),dtype='int')

    for batch in loader:
        b_inputs = batch[0].to(device)
        b_labels = batch[1].to(device)

        with torch.no_grad(): # do not construct compute graph
            loss, logits = model(b_inputs,b_labels)

        total_eval_loss += loss.item()

        # reshape(-1) always gives a 1-D array; squeeze() turned a
        # single-sample batch into a 0-d array that cannot be concatenated.
        b_y_true = b_labels.cpu().numpy().reshape(-1)

        # Force a (batch, classes) layout before taking the arg-max so a
        # model that drops the batch axis for one sample is still handled.
        b_logits = logits.reshape(-1, logits.size(-1))
        b_y_pred = torch.max(b_logits,1)[1].cpu().numpy().reshape(-1)

        y_pred = np.concatenate((y_pred,b_y_pred))
        y_true = np.concatenate((y_true,b_y_true))

    metrics = evalMetric(y_true,y_pred,"Val_")

    print(" Validation Accuracy: {0:.2f}".format(metrics['Val_accuracy']))

    # Calculate the average loss over all of the batches.
    avg_loss = total_eval_loss / len(loader)

    print("  Validation Loss: {0:.2f}".format(avg_loss))

    metrics['Val_avg_loss'] = avg_loss

    return metrics

In [332]:
def runTrainLoop(model,train_loader,t0,optimiser):
    """Train ``model`` for one pass over ``train_loader``.

    Reads the notebook-level ``device`` to place the batches. Prints the
    batch loss every 40 steps and a summary at the end.

    Args:
        model: module returning ``(loss, logits)`` when called with inputs and labels.
        train_loader: iterable of ``(inputs, labels)`` batches that supports ``len()``.
        t0: start time of the epoch; kept for interface compatibility, not read.
        optimiser: optimiser whose ``step()`` is called after every batch.

    Returns:
        The dict produced by ``evalMetric`` with the prefix ``"Train_"`` plus
        the key ``'Train_avg_loss'``.
    """
    print("")
    print('Training...')

    # Reset the total loss for this epoch.
    total_loss = 0
    model.train() # put model in train mode

    y_pred = np.zeros(shape=(0),dtype='int')
    y_true = np.empty(shape=(0),dtype='int')

    # For each batch of training data (total= lets the bar show progress).
    for step, batch in tqdm(enumerate(train_loader), total=len(train_loader)):

        b_inputs = batch[0].to(device)
        b_labels = batch[1].to(device)

        # Gradients accumulate by default, so clear them before each batch.
        model.zero_grad()

        # With labels supplied the model returns (loss, logits).
        loss, logits = model(b_inputs,b_labels)

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            print('batch_loss',loss.item())

        # Accumulate the loss so the epoch average can be reported.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # True and predicted labels for the train metrics. reshape(-1) keeps
        # the arrays 1-D even when the batch holds a single sample.
        b_y_true = b_labels.cpu().numpy().reshape(-1)
        b_logits = logits.reshape(-1, logits.size(-1))
        b_y_pred = torch.max(b_logits,1)[1].cpu().numpy().reshape(-1)

        # Accumulate per batch and evaluate once at the end.
        y_pred = np.concatenate((y_pred,b_y_pred))
        y_true = np.concatenate((y_true,b_y_true))

        # Clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Step the optimiser that was passed in; the previous code stepped the
        # notebook-level `optimizer` and ignored this argument.
        optimiser.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_loader)

    train_metrics = evalMetric(y_true,y_pred,"Train_")

    print('avg_train_loss',avg_train_loss)
    print('train_f1Score',train_metrics['Train_f1Score'])
    print('train_accuracy',train_metrics['Train_accuracy'])

    train_metrics['Train_avg_loss'] = avg_train_loss

    return train_metrics

In [333]:
def train(model,train_loader,val_loader,optimiser,epochs):
    """Alternate one training pass and one validation pass for ``epochs`` epochs.

    Returns:
        A list with one dict per epoch holding the 1-based ``'epoch'`` number
        followed by the training metrics and then the validation metrics.
    """
    history = []
    for epoch_number in range(1, epochs + 1):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_number, epochs))

        # Start time of the epoch, handed to the training pass.
        epoch_start = time.time()
        train_metrics = runTrainLoop(model, train_loader, epoch_start, optimiser)

        print("")
        print("Running Validation...") 
        val_metrics = EvaluateOnData(model, val_loader)

        # Later entries win on key clashes, matching successive dict updates.
        history.append({'epoch': epoch_number, **train_metrics, **val_metrics})

    return history

In [334]:
# Device the batches are moved to inside the training and evaluation loops.
device = torch.device("cpu")

In [335]:
# Full training run; train_stats gets one metrics dict per epoch.
train_stats = train(model,train_dataloader,validation_dataloader,optimizer,EPOCHS)

0it [00:00, ?it/s]



Training...


41it [00:07,  5.11it/s]

batch_loss 0.5896029472351074


81it [00:15,  5.10it/s]

batch_loss 0.4902842044830322


121it [00:23,  5.14it/s]

batch_loss 0.6900213360786438


134it [00:25,  5.16it/s]


avg_train_loss 0.6553460544169839
train_f1Score 0.16276477146042362
train_accuracy 0.8238329814684494

Running Validation...


1it [00:00,  5.15it/s]

 Validation Accuracy: 0.80
  Validation Loss: 0.62


Training...


41it [00:07,  5.41it/s]

batch_loss 0.4612290561199188


81it [00:15,  5.17it/s]

batch_loss 0.8244192600250244


121it [00:23,  4.94it/s]

batch_loss 0.7256960868835449


134it [00:26,  5.14it/s]


avg_train_loss 0.6395130186383404
train_f1Score 0.19722901385493075
train_accuracy 0.7689420595824537

Running Validation...


1it [00:00,  5.71it/s]

 Validation Accuracy: 0.76
  Validation Loss: 0.62


Training...


41it [00:07,  5.09it/s]

batch_loss 0.9179143309593201


81it [00:15,  5.15it/s]

batch_loss 0.7374975085258484


121it [00:23,  5.01it/s]

batch_loss 0.4540960192680359


134it [00:25,  5.20it/s]


avg_train_loss 0.6338299973242318
train_f1Score 0.20802377414561662
train_accuracy 0.749941355852686

Running Validation...


1it [00:00,  5.51it/s]

 Validation Accuracy: 0.91
  Validation Loss: 0.63


Training...


41it [00:07,  5.05it/s]

batch_loss 0.7036627531051636


81it [00:15,  5.02it/s]

batch_loss 0.6168279051780701


121it [00:23,  5.35it/s]

batch_loss 0.5486738085746765


134it [00:25,  5.24it/s]


avg_train_loss 0.6367299923701073
train_f1Score 0.18600682593856652
train_accuracy 0.7762139338494018

Running Validation...


1it [00:00,  5.41it/s]

 Validation Accuracy: 0.74
  Validation Loss: 0.62


Training...


41it [00:08,  5.33it/s]

batch_loss 0.7423107624053955


81it [00:15,  4.93it/s]

batch_loss 0.6425331830978394


121it [00:23,  5.21it/s]

batch_loss 0.5328797698020935


134it [00:26,  5.15it/s]


avg_train_loss 0.6295516281875212
train_f1Score 0.20036652412950517
train_accuracy 0.6929392446633826

Running Validation...


1it [00:00,  5.45it/s]

 Validation Accuracy: 0.73
  Validation Loss: 0.63


Training...


41it [00:07,  5.05it/s]

batch_loss 0.5604899525642395


81it [00:15,  5.15it/s]

batch_loss 0.518089771270752


121it [00:23,  5.20it/s]

batch_loss 0.730506420135498


134it [00:25,  5.17it/s]


avg_train_loss 0.6340348800616478
train_f1Score 0.20361990950226244
train_accuracy 0.7522871217452498

Running Validation...


1it [00:00,  5.25it/s]

 Validation Accuracy: 0.64
  Validation Loss: 0.63


Training...


41it [00:08,  5.01it/s]

batch_loss 0.8015630841255188


81it [00:15,  5.08it/s]

batch_loss 0.5658503770828247


121it [00:23,  5.19it/s]

batch_loss 0.5749363303184509


134it [00:26,  5.14it/s]


avg_train_loss 0.6344919333707041
train_f1Score 0.20726072607260723
train_accuracy 0.7182735163030729

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.65
  Validation Loss: 0.63


Training...


41it [00:07,  5.14it/s]

batch_loss 0.7022523283958435


81it [00:15,  5.19it/s]

batch_loss 0.5381507277488708


121it [00:23,  5.29it/s]

batch_loss 0.49218177795410156


134it [00:25,  5.22it/s]


avg_train_loss 0.6278990818493402
train_f1Score 0.2051282051282051
train_accuracy 0.72366877785597

Running Validation...


1it [00:00,  5.83it/s]

 Validation Accuracy: 0.67
  Validation Loss: 0.63


Training...


41it [00:07,  5.30it/s]

batch_loss 0.612427830696106


81it [00:15,  5.12it/s]

batch_loss 0.6621097326278687


121it [00:23,  5.36it/s]

batch_loss 0.5450568199157715


134it [00:25,  5.20it/s]


avg_train_loss 0.6292072372205222
train_f1Score 0.19568151147098517
train_accuracy 0.7203847056063805

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.81
  Validation Loss: 0.62


Training...


41it [00:07,  5.19it/s]

batch_loss 0.6330550909042358


81it [00:15,  5.39it/s]

batch_loss 0.5309252142906189


121it [00:22,  5.40it/s]

batch_loss 0.7789674401283264


134it [00:25,  5.30it/s]


avg_train_loss 0.6297627407223431
train_f1Score 0.18772563176895307
train_accuracy 0.7361013370865588

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.57
  Validation Loss: 0.65


Training...


41it [00:07,  5.33it/s]

batch_loss 0.7399698495864868


81it [00:15,  5.17it/s]

batch_loss 0.5843307375907898


121it [00:23,  4.86it/s]

batch_loss 0.6415930390357971


134it [00:26,  5.12it/s]


avg_train_loss 0.6311814974048244
train_f1Score 0.1973030518097942
train_accuracy 0.7346938775510204

Running Validation...


1it [00:00,  5.38it/s]

 Validation Accuracy: 0.76
  Validation Loss: 0.62


Training...


41it [00:07,  4.97it/s]

batch_loss 0.46096330881118774


81it [00:15,  5.10it/s]

batch_loss 0.5134145617485046


121it [00:23,  5.24it/s]

batch_loss 0.5325857996940613


134it [00:25,  5.16it/s]


avg_train_loss 0.6255035502697105
train_f1Score 0.20875420875420878
train_accuracy 0.7243725076237392

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.79
  Validation Loss: 0.61


Training...


41it [00:08,  5.10it/s]

batch_loss 0.6289860606193542


81it [00:16,  5.12it/s]

batch_loss 0.8440609574317932


121it [00:23,  5.28it/s]

batch_loss 0.6486392021179199


134it [00:26,  5.12it/s]


avg_train_loss 0.6218367196730713
train_f1Score 0.2115830115830116
train_accuracy 0.7604973023692235

Running Validation...


1it [00:00,  5.68it/s]

 Validation Accuracy: 0.66
  Validation Loss: 0.62


Training...


41it [00:08,  4.61it/s]

batch_loss 0.5124680399894714


81it [00:15,  5.04it/s]

batch_loss 0.49595698714256287


121it [00:23,  5.06it/s]

batch_loss 0.7312994599342346


134it [00:25,  5.15it/s]


avg_train_loss 0.6275608942135057
train_f1Score 0.21536252692031588
train_accuracy 0.7436077879427633

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.71
  Validation Loss: 0.62


Training...


41it [00:08,  4.95it/s]

batch_loss 0.8137422800064087


81it [00:15,  5.05it/s]

batch_loss 0.7655908465385437


121it [00:23,  5.26it/s]

batch_loss 0.6766626834869385


134it [00:26,  5.12it/s]


avg_train_loss 0.6237769055722365
train_f1Score 0.2041958041958042
train_accuracy 0.7330518414262257

Running Validation...


1it [00:00,  5.26it/s]

 Validation Accuracy: 0.58
  Validation Loss: 0.64


Training...


41it [00:07,  5.32it/s]

batch_loss 0.4436948895454407


81it [00:15,  5.28it/s]

batch_loss 0.7013877630233765


121it [00:23,  5.12it/s]

batch_loss 0.5792557001113892


134it [00:26,  5.13it/s]


avg_train_loss 0.6217894934451402
train_f1Score 0.2068502350570853
train_accuracy 0.7229650480882008

Running Validation...


1it [00:00,  5.31it/s]

 Validation Accuracy: 0.75
  Validation Loss: 0.61


Training...


41it [00:07,  5.34it/s]

batch_loss 0.5777749419212341


81it [00:15,  5.06it/s]

batch_loss 0.9759151935577393


121it [00:23,  5.22it/s]

batch_loss 0.6877971887588501


134it [00:25,  5.23it/s]


avg_train_loss 0.61810189530031
train_f1Score 0.20870767104353835
train_accuracy 0.731409805301431

Running Validation...


1it [00:00,  5.50it/s]

 Validation Accuracy: 0.73
  Validation Loss: 0.62


Training...


41it [00:08,  5.24it/s]

batch_loss 0.5116952657699585


81it [00:15,  5.02it/s]

batch_loss 0.5719243288040161


121it [00:23,  4.67it/s]

batch_loss 0.5590273141860962


134it [00:26,  5.14it/s]


avg_train_loss 0.6228241175413132
train_f1Score 0.20849933598937584
train_accuracy 0.7203847056063805

Running Validation...


1it [00:00,  5.95it/s]

 Validation Accuracy: 0.68
  Validation Loss: 0.63


Training...


41it [00:08,  5.25it/s]

batch_loss 0.7105596661567688


81it [00:15,  5.29it/s]

batch_loss 0.41482114791870117


121it [00:23,  5.23it/s]

batch_loss 0.7235344648361206


134it [00:25,  5.23it/s]


avg_train_loss 0.6207500965292774
train_f1Score 0.20383036935704515
train_accuracy 0.7269528501055594

Running Validation...


0it [00:00, ?it/s]

 Validation Accuracy: 0.63
  Validation Loss: 0.63


Training...


41it [00:07,  5.23it/s]

batch_loss 0.7106671333312988


81it [00:15,  5.05it/s]

batch_loss 0.46385622024536133


121it [00:23,  5.28it/s]

batch_loss 0.6183617115020752


134it [00:25,  5.17it/s]


avg_train_loss 0.6242387127965244
train_f1Score 0.2154273801250869
train_accuracy 0.7351630307295332

Running Validation...
 Validation Accuracy: 0.64
  Validation Loss: 0.63


In [336]:
# Validation F1 of the last epoch; indexing from the end stays valid when EPOCHS changes.
train_stats[-1]['Val_f1Score']

0.18454935622317595

## Final Code

In [7]:
# Core
import io  #open embedding file
import random  #fix random

# Basics
import pandas as pd
import numpy as np
import torch

# Utility
from tqdm import tqdm #progress-bar
from itertools import chain, repeat, islice #padding

# Dataloader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Model
import torch.nn as nn

# Optimiser
from transformers import AdamW

# Metrics
from sklearn.metrics import *

## Model

In [40]:
def global_max_pooling(tensor, dim, topk):
    """Pick the ``topk`` largest entries of ``tensor`` along ``dim`` and return their values."""
    values = torch.topk(tensor, topk, dim)[0]
    return values

class CNN_GRU_Model(nn.Module):
    """CNN + GRU text classifier with two output classes.

    Args:
        args: dict providing ``'vocab_size'`` (rows of the embedding layer),
            ``'train_embed'`` (whether embedding weights get gradients) and
            ``'weights'`` (per-class loss weights).
        vector: 2-D numpy array of pretrained embeddings, one row per token id.
    """

    def __init__(self,args,vector):
        super(CNN_GRU_Model, self).__init__()
        self.embedsize = vector.shape[1]
        # Three convolutions with kernel sizes 2, 3 and 4, 100 filters each.
        self.conv1 = nn.Conv1d(self.embedsize,100, 2)
        self.conv2 = nn.Conv1d(self.embedsize,100, 3,padding=1)
        self.conv3 = nn.Conv1d(self.embedsize,100, 4,padding=2)
        self.maxpool1D = nn.MaxPool1d(4, stride=4)
        self.seq_model = nn.GRU(100, 100, bidirectional=False, batch_first=True)
        self.embedding = nn.Embedding(args["vocab_size"], self.embedsize)
        # Overwrite the random table with the pretrained vectors.
        self.embedding.weight = nn.Parameter(torch.tensor(vector.astype(np.float32), dtype=torch.float32))
        self.embedding.weight.requires_grad = args["train_embed"]
        self.num_labels=2
        self.weights=args['weights']
        self.out = nn.Linear(100, self.num_labels)

    def forward(self,x,labels=None):
        """Return logits of shape (batch, num_labels), or ``(loss, logits)`` when ``labels`` is given.

        Args:
            x: integer tensor of token ids, handled as (batch, sequence).
            labels: optional tensor with one class index per sample.
        """
        embedded = self.embedding(x)
        # Conv1d wants channels before length.
        conv_input = embedded.permute(0,2,1)
        pooled = [
            self.maxpool1D(conv(conv_input))
            for conv in (self.conv1, self.conv2, self.conv3)
        ]
        # Join the three feature maps along the length axis and pool once more.
        concat = self.maxpool1D(torch.cat(pooled, dim=2))
        h_seq, _ = self.seq_model(concat.permute(0,2,1))
        # Max over the time axis only, so a batch of one sample keeps its
        # batch axis (an unqualified squeeze removed it).
        global_h_seq = torch.max(h_seq, dim=1)[0]
        output = self.out(global_h_seq)

        if labels is not None:
            # Create the class weights on the device the logits live on.
            class_weights = torch.tensor(self.weights, dtype=torch.float, device=output.device)
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(output.view(-1, self.num_labels), labels.view(-1))
            return loss,output
        return output

## Main Class

In [75]:
class CNN_GRU:
    """Training pipeline around `CNN_GRU_Model`.

    Responsibilities: seed the RNGs, load pre-trained word vectors, turn
    data frames with `Text` / `Label` columns into padded id sequences,
    wrap them in DataLoaders, and run the train / validation loop.
    """

    def __init__(self,args):
        """Seed all RNGs, load the embeddings and select the device.

        Reads `args['seed_val']`, `args['embedding_path']` and
        `args['device']`.
        """
        # fix the random
        random.seed(args['seed_val'])
        np.random.seed(args['seed_val'])
        torch.manual_seed(args['seed_val'])
        torch.cuda.manual_seed_all(args['seed_val'])

        # id2word is not needed by the pipeline, only the matrix and word2id.
        self.vector, _, self.word2id = self.load_vec(args['embedding_path'])

        self.device = torch.device(args['device'])

    ##-----------------------------------------------------------##
    ##------------------ Utility Functions ----------------------##
    ##-----------------------------------------------------------##
    def load_vec(self,emb_path, nmax=50000):
        """Read at most `nmax` word vectors from a text embedding file.

        The first line of the file is skipped; every following line is
        split at the first space into the word and its space-separated
        vector components.

        Returns:
            (embeddings, id2word, word2id) where `embeddings` is the
            row-stacked numpy matrix and the dicts map between words and
            their row index.

        Raises:
            AssertionError: if the same word occurs twice.
        """
        vectors = []
        word2id = {}
        with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
            next(f)  # skip the first (header) line
            for line in f:
                word, vect = line.rstrip().split(' ', 1)
                vect = np.fromstring(vect, sep=' ')
                assert word not in word2id, 'word found twice'
                vectors.append(vect)
                word2id[word] = len(word2id)
                if len(word2id) == nmax:
                    break
        id2word = {v: k for k, v in word2id.items()}
        embeddings = np.vstack(vectors)
        return embeddings, id2word, word2id

    def pad_infinite(self,iterable, padding=None):
        """Yield the items of `iterable`, then `padding` forever."""
        return chain(iterable, repeat(padding))

    def pad(self,iterable, size, padding=None):
        """Return an iterator over exactly `size` items: `iterable`
        truncated or right-padded with `padding`."""
        return islice(self.pad_infinite(iterable, padding), size)

    def encode_data(self,df,word2id):
        """Convert every row of `df` into a padded list of token ids.

        Texts are split on single spaces. Words missing from `word2id` map
        to the id `len(word2id)` (unknown) and sequences are right-padded
        with `len(word2id) + 1` up to the longest text in `df`; these two
        ids correspond to the rows appended by `add_pad_unk`.

        Returns:
            list of `[token_ids, label, text]`, one entry per row.
        """
        # Computed once instead of once per unknown token / per row.
        vocab_size = len(word2id)
        unk_id = vocab_size
        pad_id = vocab_size + 1

        texts = list(df['Text'])
        labels = list(df['Label'])

        # Tokenise in a single pass and derive the padding length from it.
        token_lists = [text.split(' ') for text in tqdm(texts, total=len(texts))]
        max_len = max((len(tokens) for tokens in token_lists), default=0)

        new_data = []
        for tokens, label, text in zip(token_lists, labels, texts):
            token_ids = [word2id.get(token, unk_id) for token in tokens]
            padded_ids = list(self.pad(token_ids, max_len, pad_id))
            new_data.append([padded_ids, label, text])
        return new_data

    def add_pad_unk(self,vector):
        """Append one random row for unknown words and one for padding.

        The rows are appended in the order (unknown, padding) so that they
        land on the ids `len(vector)` and `len(vector) + 1` used by
        `encode_data`. The width follows `vector` instead of assuming 300.
        """
        embed_dim = vector.shape[1]
        # Draw order (pad first, then unk) is kept so seeded runs produce the
        # same two vectors as before.
        pad_vec = np.random.randn(1, embed_dim)
        unk_vec = np.random.randn(1, embed_dim)

        return np.concatenate([vector, unk_vec, pad_vec], axis=0)

    ##-----------------------------------------------------------##
    ##------------------ Dataloader -----------------------------##
    ##-----------------------------------------------------------##

    def get_dataloader(self,samples, batch_size,is_train=False):
        """Wrap encoded samples in a DataLoader.

        Args:
            samples: output of `encode_data`; only element 0 (token ids)
                and element 1 (label) of each entry are used.
            batch_size: number of samples per batch.
            is_train: shuffle with a RandomSampler when True, otherwise
                iterate in order.
        """
        inputs = torch.tensor([sample[0] for sample in samples])
        labels = torch.tensor([sample[1] for sample in samples], dtype=torch.long)

        data = TensorDataset(inputs, labels)
        sampler = RandomSampler(data) if is_train else SequentialSampler(data)

        return DataLoader(data, sampler=sampler, batch_size=batch_size)

    ##-----------------------------------------------------------##
    ##----------------- Training Utilities ----------------------##
    ##-----------------------------------------------------------##

    def get_optimiser(self,learning_rate,model):
        """Create the AdamW optimiser for all parameters of `model`.

        Uses `torch.optim.AdamW` rather than the deprecated
        `transformers.AdamW`. `weight_decay=0.0` reproduces the default of
        the transformers implementation (torch's default is 0.01).
        """
        return torch.optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            eps=1e-8,
            weight_decay=0.0,
        )

    def evalMetric(self,y_true, y_pred,prefix):
        """Compute classification metrics and return them as a dict.

        Keys are `prefix` + one of "accuracy", "mF1Score", "f1Score",
        "precision", "recall". The ROC/AUC computation of the earlier
        version was removed because its result was never returned.
        """
        predicted_labels = np.unique(y_pred)

        accuracy = accuracy_score(y_true, y_pred)
        mf1Score = f1_score(y_true, y_pred, average='macro')
        f1Score = f1_score(y_true, y_pred, labels=predicted_labels)
        recallScore = recall_score(y_true, y_pred, labels=predicted_labels)
        precisionScore = precision_score(y_true, y_pred, labels=predicted_labels)

        return {
            prefix + "accuracy": accuracy,
            prefix + 'mF1Score': mf1Score,
            prefix + 'f1Score': f1Score,
            prefix + 'precision': precisionScore,
            prefix + 'recall': recallScore,
        }

    @staticmethod
    def _flat_numpy(tensor):
        """Detach `tensor`, move it to CPU and return it as a 1-D array.

        `reshape(-1)` (instead of `squeeze()`) keeps a batch of one sample
        one-dimensional, so it can still be concatenated.
        """
        return tensor.detach().cpu().numpy().reshape(-1)

    ##-----------------------------------------------------------##
    ##---------------- Different Train Loops --------------------##
    ##-----------------------------------------------------------##

    def evaluate(self,model,loader,which):
        """Run `model` over `loader` without gradients and score it.

        Returns the `evalMetric` dict with keys prefixed by `which + "_"`
        plus `which + "_avg_loss"`, the mean loss per batch.
        """
        model.eval() # put model in eval mode

        total_eval_loss = 0

        # Seeded with empty int arrays so the concatenated result keeps an
        # integer dtype.
        pred_chunks = [np.zeros(shape=(0), dtype='int')]
        true_chunks = [np.empty(shape=(0), dtype='int')]

        for batch in loader:
            b_inputs = batch[0].to(self.device)
            b_labels = batch[1].to(self.device)

            with torch.no_grad(): # do not construct compute graph
                outputs = model(b_inputs, b_labels)

            loss = outputs[0]
            logits = outputs[1]

            total_eval_loss += loss.item()

            true_chunks.append(self._flat_numpy(b_labels))
            pred_chunks.append(self._flat_numpy(torch.max(logits, 1)[1]))

        y_pred = np.concatenate(pred_chunks)
        y_true = np.concatenate(true_chunks)

        metrics = self.evalMetric(y_true, y_pred, which + "_")

        # Calculate the average loss over all of the batches.
        metrics[which + '_avg_loss'] = total_eval_loss / len(loader)

        return metrics

    def run_train_loop(self,model,train_loader,optimiser):
        """Train `model` for one pass over `train_loader`.

        Gradients are clipped to a norm of 1.0 before each optimiser step.
        Prints the average loss, F1 and accuracy and returns the
        `evalMetric` dict (prefix "Train_") plus "Train_avg_loss".
        """
        total_loss = 0
        model.train() # put model in train mode

        pred_chunks = [np.zeros(shape=(0), dtype='int')]
        true_chunks = [np.empty(shape=(0), dtype='int')]

        # Iterating the loader directly (not enumerate) lets tqdm show the
        # total number of batches.
        for batch in tqdm(train_loader, total=len(train_loader)):

            b_inputs = batch[0].to(self.device)
            b_labels = batch[1].to(self.device)

            model.zero_grad()

            outputs = model(b_inputs, b_labels)

            loss = outputs[0]
            logits = outputs[1]

            total_loss += loss.item()

            loss.backward()

            true_chunks.append(self._flat_numpy(b_labels))
            pred_chunks.append(self._flat_numpy(torch.max(logits, 1)[1]))

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimiser.step()

        y_pred = np.concatenate(pred_chunks)
        y_true = np.concatenate(true_chunks)

        avg_train_loss = total_loss / len(train_loader)

        train_metrics = self.evalMetric(y_true, y_pred, "Train_")

        print('avg_train_loss',avg_train_loss)
        print('train_f1Score',train_metrics['Train_f1Score'])
        print('train_accuracy',train_metrics['Train_accuracy'])

        train_metrics['Train_avg_loss'] = avg_train_loss

        return train_metrics

    ##------------------------------------------------------------##
    ##----------------- Main Train Loop --------------------------##
    ##------------------------------------------------------------##

    def train(self,model,data_loaders,optimiser,epochs):
        """Train for `epochs` epochs, validating after each one.

        Args:
            data_loaders: sequence `(train_loader, val_loader, test_loader)`.
                The test loader is unpacked but not used here.

        Returns:
            list with one dict per epoch holding the epoch number and the
            train and validation metrics.
        """
        train_stats = []
        train_loader, val_loader, test_loader = data_loaders
        for epoch_i in range(0, epochs):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

            print("")
            print('Training...')
            train_metrics = self.run_train_loop(model, train_loader, optimiser)

            print("")
            print("Running Validation...")
            val_metrics = self.evaluate(model, val_loader, "Val")

            print("Validation Loss: ",val_metrics['Val_avg_loss'])
            print("Validation Accuracy: ",val_metrics['Val_accuracy'])

            stats = {'epoch': epoch_i + 1}
            stats.update(train_metrics)
            stats.update(val_metrics)

            train_stats.append(stats)

        return train_stats

    ##-----------------------------------------------------------##
    ##------------------------ The Pipeline ---------------------##
    ##-----------------------------------------------------------##
    def run(self,args,df_train,df_val,df_test):
        """Encode the three splits, build the model and train it.

        Side effect: stores the vocabulary size in
        `args['model']['vocab_size']`.

        Returns:
            the per-epoch statistics produced by `train`.
        """
        train_data = self.encode_data(df_train, self.word2id)
        val_data = self.encode_data(df_val, self.word2id)
        test_data = self.encode_data(df_test, self.word2id)

        merged_vec = self.add_pad_unk(self.vector)

        # Number of rows (words + unk + pad), not shape[1] which is the
        # embedding width.
        args['model']['vocab_size'] = merged_vec.shape[0]

        train_dl = self.get_dataloader(train_data, args['batch_size'], True)
        val_dl = self.get_dataloader(val_data, args['batch_size'], False)
        test_dl = self.get_dataloader(test_data, args['batch_size'], False)

        # Batches are moved to self.device in the loops, so the model has to
        # live there as well.
        model = CNN_GRU_Model(args['model'], merged_vec).to(self.device)

        optimiser = self.get_optimiser(args['learning_rate'], model)

        train_stats = self.train(model, [train_dl, val_dl, test_dl],
                                 optimiser, args['epochs'])
        return train_stats

In [76]:
# Folder holding the pre-split train/val/test CSV files that are loaded below.
DATA_FOLDER = "Data_Processed/Shared_Task_eng/"

In [77]:
# Load the three splits (train / validation / test) from DATA_FOLDER.
df_train, df_val, df_test = (
    pd.read_csv(DATA_FOLDER + file_name)
    for file_name in ("train_1.csv", "val_1.csv", "test_1.csv")
)

In [78]:
# Remove, in place, every row that has a missing value in any column.
for split_frame in (df_train, df_val, df_test):
    split_frame.dropna(inplace=True)

In [79]:
# Preview the first rows of the training split (columns include Text and Label).
df_train.head()

Unnamed: 0.1,Unnamed: 0,ID,Text,Label
0,953,954,Fuck yourself Feminism,1
1,3413,3414,"I subscribe to that propaganda, for real.\r\nI...",0
2,728,729,You are genuinely struggling with how approach...,1
3,1322,1323,Please do review on joker which is based on ev...,0
4,2096,2097,But 99% liberals are praising this movie. Only...,0


In [80]:
# Run configuration. The top-level entries are read by the CNN_GRU pipeline,
# the nested `model` entry is handed to CNN_GRU_Model.
args = dict(
    seed_val=42,
    embedding_path="Embeddings/wiki.multi.en.vec",
    batch_size=32,
    learning_rate=1e-4,
    epochs=10,
    device='cpu',
    model=dict(
        train_embed=False,   # keep the pre-trained embeddings frozen
        weights=[1.0, 8.0],  # per-class weights for the cross-entropy loss
    ),
)

In [81]:
# Build the pipeline object: the constructor seeds the RNGs with
# args['seed_val'] and loads the embedding file at args['embedding_path'].
cnn_gru = CNN_GRU(args)

In [83]:
# Encode all three splits and train for args['epochs'] epochs, validating after
# each epoch. The test split is encoded but not evaluated by the training loop.
cnn_gru.run(args,df_train,df_val,df_test)

100%|██████████| 9161/9161 [00:00<00:00, 15493.18it/s]
100%|██████████| 1308/1308 [00:00<00:00, 16681.77it/s]
100%|██████████| 2615/2615 [00:00<00:00, 17030.19it/s]
0it [00:00, ?it/s]



Training...


287it [03:31,  1.36it/s]


avg_train_loss 0.6221524381471428
train_f1Score 0.27088305489260145
train_accuracy 0.7332168977185897

Running Validation...


0it [00:00, ?it/s]

Validation Loss:  0.6043016016483307
Validation Accuracy:  0.7431192660550459


Training...


4it [00:03,  1.15it/s]


KeyboardInterrupt: 