In [0]:
# Install the `wget` package; it is imported further down to download the dataset archive.
!pip install wget

In [0]:
# Install Hugging Face `transformers` (provides BertTokenizer, BertModel, AdamW and the LR scheduler used below).
# NOTE(review): no version is pinned; the calls below use `pad_to_max_length` and `transformers.AdamW`,
# which suggests an older release was intended — confirm and pin.
!pip install transformers

In [0]:
import torch
from transformers import BertTokenizer,BertForSequenceClassification,AdamW,get_linear_schedule_with_warmup, BertModel
import tensorflow as tf
import os
import wget
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.nn import CrossEntropyLoss, MSELoss

In [0]:
import torch.nn as nn
import torch.nn.functional as F
import csv


In [0]:
# Ask TensorFlow for the name of the GPU device it can see and print it.
# This is informational only; the PyTorch device is selected separately via torch.cuda.
gpu=tf.test.gpu_device_name()
print(gpu)

In [0]:
# Choose the PyTorch device: fall back to the CPU unless CUDA is usable.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' %torch.cuda.device_count())
    print('We will use the GPU:',torch.cuda.get_device_name(0))

In [0]:
# Fetch the PAWS Wiki archive once; later runs reuse the local copy.
url = 'https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz'
archive_path = './paws_wiki_labeled_final.tar.gz'

archive_missing = not os.path.exists(archive_path)
if archive_missing:
    wget.download(url, archive_path)

In [0]:
if not os.path.exists('./pawsWiki_l_final/'):
    !tar -xvf paws_wiki_labeled_final.tar.gz

In [0]:
# Load the train / dev / test splits from the extracted tab-separated files.
train = pd.read_csv('./final/train.tsv', sep='\t')
val = pd.read_csv('./final/dev.tsv', sep='\t')
test = pd.read_csv('./final/test.tsv', sep='\t')

# Quick look at the training split: dimensions and first rows.
print(train.shape)
print(train.head())

In [0]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint (the same
# checkpoint name the CBERT model loads its encoder from).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [0]:
# Data preprocessing: seeded sampling of each split, tokenisation of the
# sentence columns, and conversion of the labels into tensors.
seed = 2

# NOTE(review): n=49401 presumably equals the full training-set size, which
# would make this a seeded shuffle rather than a sub-sample — confirm.
train = train.sample(n=49401, random_state=seed)
train_pairs = train.iloc[:, 1:3].to_numpy().tolist()  # columns 1 and 2 of each row
tokenized_train = tokenizer.batch_encode_plus(
    train_pairs,
    max_length=128,
    pad_to_max_length=True,
    return_tensors='pt',
)
labels_train = torch.tensor(train.label.values[:])

val = val.sample(n=8000, random_state=seed)
val_pairs = val.iloc[:, 1:3].to_numpy().tolist()
tokenized_val = tokenizer.batch_encode_plus(
    val_pairs,
    max_length=128,
    pad_to_max_length=True,
    return_tensors='pt',
)
labels_val = torch.tensor(val.label.values[:])

In [0]:
# Draw a seeded sample of 1000 test rows, tokenise columns 1 and 2 of each row,
# and turn the labels into a tensor.
test = test.sample(n=1000, random_state=seed)
test_pairs = test.iloc[:, 1:3].to_numpy().tolist()
tokenized_test = tokenizer.batch_encode_plus(
    test_pairs,
    max_length=128,
    pad_to_max_length=True,
    return_tensors='pt',
)
labels_test = torch.tensor(test.label.values[:])

In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

batch_size = 32

# Every batch yields its tensors in this order, followed by the labels:
# input_ids, attention_mask, token_type_ids.
_ENCODING_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')

# Training batches, drawn in random order.
train_data = TensorDataset(*[tokenized_train[key] for key in _ENCODING_KEYS], labels_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches, also drawn in random order.
val_data = TensorDataset(*[tokenized_val[key] for key in _ENCODING_KEYS], labels_val)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

In [0]:
# Test batches: tensors ordered as input_ids, attention_mask, token_type_ids, label.
test_data = TensorDataset(
    *[tokenized_test[key] for key in ('input_ids', 'attention_mask', 'token_type_ids')],
    labels_test,
)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [0]:

class CBERT(nn.Module):
    """BERT encoder followed by dropout and a linear head on the pooled output.

    Args:
        num_labels: Width of the output layer. A value of 1 makes ``forward``
            use an MSE (regression) loss; any other value uses cross-entropy.
    """

    def __init__(self,num_labels):
        super(CBERT, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's hidden width and the requested number
        # of labels. It used to be fixed at (768, 2), which ignored `num_labels`
        # and made the regression branch in `forward` unusable.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids,token_type,attention_mask,labels=None):
        """Run the encoder and the classification head.

        Note the positional order: ``token_type`` comes BEFORE ``attention_mask``.

        Args:
            input_ids: Token ids passed to the encoder.
            token_type: Segment ids, forwarded as ``token_type_ids``.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional targets. When given, a loss is computed and
                placed first in the returned tuple.

        Returns:
            ``(logits, pooled_output, *extra)`` without labels, or
            ``(loss, logits, pooled_output, *extra)`` with labels, where
            ``extra`` is whatever the encoder returns after its first two
            outputs. ``pooled_output`` is the encoder output taken before
            dropout is applied.
        """
        outputs = self.bert(input_ids,token_type_ids=token_type,attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output_dp = self.dropout(pooled_output)
        logits = self.classifier(pooled_output_dp)

        outputs = (logits,pooled_output) + outputs[2:]
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs

In [0]:
# Custom BERT classifier (CBERT) with two output labels.
# Stock alternative kept for reference:
# model=BertForSequenceClassification.from_pretrained('bert-base-cased',num_labels=2,output_attentions=False,output_hidden_states=True)
model = CBERT(2)
# Move the model to the device chosen earlier instead of calling .cuda()
# unconditionally, which raises on a machine without a GPU even though the
# notebook sets up a CPU fallback.
model.to(device)


In [0]:
# AdamW over all model parameters (encoder and classification head) with learning rate 2e-5.
# NOTE(review): this AdamW is the one imported from `transformers`; it is reportedly
# deprecated in later releases — confirm against the installed version.
optimizer=AdamW(model.parameters(),lr=2e-5)

In [0]:
# Number of epochs the learning-rate schedule is sized for.
epochs = 4

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * (epochs) steps.
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [0]:
# Function to calculate the accuracy of predictions
def flat_accuracy(preds, labels):
    """Return the accuracy of arg-max predictions against the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.
        labels: Array of true class ids; it is flattened before scoring.

    Returns:
        The value of ``accuracy_score(y_true, y_pred)``.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # scikit-learn metrics take (y_true, y_pred), in that order.
    return accuracy_score(labels_flat, pred_flat)


def flat_precision(preds, labels):
    """Return the precision of arg-max predictions against the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.
        labels: Array of true class ids; it is flattened before scoring.

    Returns:
        The value of ``precision_score(y_true, y_pred)``.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # scikit-learn metrics take (y_true, y_pred). Passing the predictions
    # first, as before, made this function report recall instead of precision.
    return precision_score(labels_flat, pred_flat)
    
def flat_recall(preds, labels):
    """Return the recall of arg-max predictions against the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.
        labels: Array of true class ids; it is flattened before scoring.

    Returns:
        The value of ``recall_score(y_true, y_pred)``.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # scikit-learn metrics take (y_true, y_pred). Passing the predictions
    # first, as before, made this function report precision instead of recall.
    return recall_score(labels_flat, pred_flat)
    
def flat_f1score(preds, labels):
    """Return the F1 score of arg-max predictions against the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.
        labels: Array of true class ids; it is flattened before scoring.

    Returns:
        The value of ``f1_score(y_true, y_pred)``.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # scikit-learn metrics take (y_true, y_pred), in that order.
    return f1_score(labels_flat, pred_flat)
    

In [0]:
import random

seed_val = 42

# Seed every random number generator in use with the same value:
# Python's `random`, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices).
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

# Container for per-epoch training losses.
loss_values = list()


In [0]:
# NOTE(review): the scheduler cell sizes its decay using its own `epochs`
# value, while this cell overrides `epochs` to 1 per run — confirm whether the
# cell is meant to be re-run once per epoch.
epochs = 1
for epoch_i in range(0, epochs):

    #Put model into training mode
    model.train()

    total_loss=0

    for step, batch in enumerate(train_dataloader):

        #Unpack the training batch (order: input_ids, attention_mask, token_type_ids, labels)
        b_input_ids = batch[0].to(device)
        b_attention_mask=batch[1].to(device)
        b_token_type = batch[2].to(device)
        b_labels = batch[3].to(device)

        #Clear the gradients left over from the previous step. PyTorch adds new
        #gradients onto existing ones, so without this reset every update would
        #use the sum of the gradients of all batches seen so far.
        optimizer.zero_grad()

        #Perform a forward pass; with labels supplied the loss is the first output.
        #Note the model's argument order: token_type before attention_mask.
        outputs=model(b_input_ids,b_token_type,b_attention_mask,b_labels)

        loss = outputs[0]
        total_loss += loss.item()

        #Perform backward pass to calculate gradients
        loss.backward()

        #Clip the norm of the gradients to 1.0.
        #Used to prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        #Update weights
        optimizer.step()

        #Update learning rate
        scheduler.step()

    #Avg loss over training data for an epoch
    avg_train_loss = total_loss / len(train_dataloader)

    #Keep the per-epoch loss in the list created for that purpose
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))


    #### Validation

    #Evaluation mode
    model.eval()

    #Tracking variables (sums of per-batch metric values)
    val_accuracy=0
    val_precision = 0
    val_recall = 0
    val_f1score = 0
    nb_val_steps=0

    #Evaluate data for one epoch
    for batch in val_dataloader:

        #Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        #Unpack the inputs from dataloader
        b_input_ids,b_attention_mask, b_token_type, b_labels = batch

        #Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            outputs = model(b_input_ids,b_token_type,b_attention_mask)

        #Get the "logits" output by the model. The "logits" are output values prior to applying an activation function like the softmax.
        logits = outputs[0]

        #Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        #Calculate the metrics for this batch of validation sentences.
        tmp_val_accuracy = flat_accuracy(logits, label_ids)
        tmp_val_precision = flat_precision(logits, label_ids)
        tmp_val_recall = flat_recall(logits, label_ids)
        tmp_val_f1score = flat_f1score(logits, label_ids)

        #Accumulate the per-batch values.
        val_accuracy += tmp_val_accuracy
        val_precision += tmp_val_precision
        val_recall += tmp_val_recall
        val_f1score += tmp_val_f1score
        #Track the number of batches
        nb_val_steps += 1

    # Report the metrics for this validation run, averaged over batches.
    print("Average Val  Accuracy: {0:.2f}".format(val_accuracy/nb_val_steps))
    print("Average Val  precison: {0:.2f}".format(val_precision/nb_val_steps))
    print("Average Val  recall: {0:.2f}".format(val_recall/nb_val_steps))
    print("Average Val  f1score: {0:.2f}".format(val_f1score/nb_val_steps))



In [0]:
# Mount Google Drive at /content/gdrive (Colab-only; `google.colab` is not
# available outside that environment).
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# File name of the checkpoint and its full location under the mounted Drive.
model_save_name = 'paws_49k_4e_cbert.pt'
path = "/content/gdrive/My Drive/models/paws/{}".format(model_save_name)


In [0]:
# Create the destination folder first: torch.save does not create missing
# parent directories and would fail on a Drive without this folder.
os.makedirs(os.path.dirname(path), exist_ok=True)
# Persist only the model weights (state dict), not the whole module object.
torch.save(model.state_dict(), path)

In [0]:
# Restore the saved weights. `map_location=device` remaps the stored tensors
# onto the device selected earlier, so a checkpoint written on a GPU can still
# be loaded when only the CPU is available.
model.load_state_dict(torch.load(path, map_location=device))
# Switch to evaluation mode (disables dropout) for the inference cells below.
model.eval()

In [0]:
# Take a seeded sample of 500 test rows and encode columns 1 and 2 of each row.
test_sample = test.sample(n=500, random_state=seed)
test_pairs = test_sample.iloc[:, 1:3].to_numpy().tolist()
tokenized_test = tokenizer.batch_encode_plus(
    test_pairs,
    max_length=128,
    pad_to_max_length=True,
    return_tensors='pt',
)
labels_test = torch.tensor(test_sample.label.values[:])

# Move all tensors to the device as one batch: ids, mask, segment ids, labels.
batch = tuple(
    tensor.to(device)
    for tensor in (
        tokenized_test['input_ids'],
        tokenized_test['attention_mask'],
        tokenized_test['token_type_ids'],
        labels_test,
    )
)
b_input_ids, b_attention_mask, b_token_type, b_labels = batch

# Single forward pass over the whole sample, without gradient tracking.
with torch.no_grad():
    test_outputs = model(b_input_ids, b_token_type, attention_mask=b_attention_mask)
    


In [0]:
# Without labels the model returns (logits, pooled_output, ...), so index 1 is
# the pooled output for each test pair; convert it to nested Python lists.
test_sentence_embeddings = test_outputs[1].tolist()


In [0]:
# Empty accumulators for the training-set embeddings and their labels.
# Note that `labels_train` is rebound here: it previously held the label tensor.
train_sentence_embeddings, labels_train = [], []

In [0]:
# Start from empty accumulators so that re-running this cell replaces the
# collected data instead of appending a second copy.
train_sentence_embeddings = []
labels_train = []

# Inference only: evaluation mode keeps dropout off, and no_grad avoids
# building an autograd graph for every batch.
model.eval()
with torch.no_grad():
    for step, batch in enumerate(train_dataloader):

        #Unpack the training batch (order: input_ids, attention_mask, token_type_ids, labels)
        b_input_ids = batch[0].to(device)
        b_attention_mask=batch[1].to(device)
        b_token_type = batch[2].to(device)
        b_labels = batch[3].to(device)

        #Labels are stored as individual 0-d tensors, in the same order as the
        #embeddings collected below (the loader shuffles, so order matters).
        labels_train.extend(b_labels)

        #Forward pass without labels; output index 1 is the pooled output.
        outputs=model(b_input_ids,b_token_type,b_attention_mask)[1]
        train_sentence_embeddings.extend(outputs.tolist())


In [0]:
# Maps a neighbour count k to the accuracy measured for it.
mp = {}
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# load `scipy.spatial.distance`, which the nearest-neighbour cell relies on.
import scipy.spatial.distance


In [0]:
# k-nearest-neighbour probe on the pooled embeddings: each evaluated test pair
# is labelled by majority vote among its closest training embeddings (cosine
# distance), for k = 2..9.
#
# Changes from the previous version of this cell:
#   * k now follows the loop variable (it was fixed at 7, so all eight passes
#     were identical);
#   * accuracy is divided by the number of samples actually scored (it was
#     divided by 100 while only 10 were scored);
#   * distances and neighbour ordering are computed once, since they do not
#     depend on k.

# Number of test embeddings to score.
n_eval = 10
eval_embeddings = test_sentence_embeddings[:n_eval]
eval_labels = labels_test[:len(eval_embeddings)].tolist()

# labels_train holds one 0-d tensor per training example; stack them into a
# single array that can be indexed with the neighbour positions.
train_label_array = torch.stack(list(labels_train)).cpu().numpy()

# Row i holds the cosine distance from test embedding i to every training embedding.
distances = scipy.spatial.distance.cdist(eval_embeddings, train_sentence_embeddings, "cosine")
# Training indices sorted by increasing distance; a stable sort keeps ties in
# index order, as the original sorted() call did.
neighbour_order = np.argsort(distances, axis=1, kind="stable")

mp = {}
for closest_n in range(2, 10):
    correct_pred = 0
    for test_idx, true_label in enumerate(eval_labels):
        nearest = neighbour_order[test_idx, :closest_n]
        count_yes = int(np.sum(train_label_array[nearest] == 1))
        # Strict majority is required for class 1; a tie falls back to class 0.
        pred = 1 if count_yes > closest_n / 2 else 0
        if pred == true_label:
            correct_pred += 1
    accuracy = float(correct_pred) / len(eval_labels)
    print(closest_n)
    print(accuracy)
    mp[closest_n] = accuracy