In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import re
import nltk 
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [2]:
df = pd.read_csv("/kaggle/input/full-quora-csv/questions.csv")
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404346,404346,789792,789793,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404347,404347,789794,789795,Do you believe there is life after death?,Is it true that there is life after death?,1
404348,404348,789796,789797,What is one coin?,What's this coin?,0
404349,404349,789798,789799,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [3]:
class QuoraDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.word2index = {}  # sets index accodringly to unique ness - most common lower index e.g.1 
        self.word2count = {}  # counts each unique word 
        self.index2word = {}  # reverse of word2index
        self.n_words = 0
        # self.questions_pair, self.labels = self.convert_data_to_tuples(df, False, False)
        self.questions_pair = []
        self.labels = []
        self.convert_data_to_tuples(self.df)
    
    def __len__(self):
        return len(self.questions_pair)
    

    def __getitem__(self, index):
        questions = self.questions_pair[index]
        if questions:
            q1 = questions[0]
            q1_indices = []
            for word in q1.split():
                q1_indices.append(self.word2index[word])

            q2 = questions[1]
            q2_indices = []
            for word in q2.split():
                q2_indices.append(self.word2index[word])

            return {
                'q1': q1,
                'q2': q2,
                'q1_token': q1_indices, 
                'q2_token': q2_indices, 
                'labels': self.labels[index], 
            }

    
    def text_to_wordlist(self, text, remove_stopwords = False, stem_words = False):
        text = text.lower().split()

        # Optionally, remove stop words
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            text = [w for w in text if not w in stops]

        text = " ".join(text)

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

        # Optionally, shorten words to their stems
        if stem_words:
            text = text.split()
            stemmer = SnowballStemmer('english')
            stemmed_words = [stemmer.stem(word) for word in text]
            text = " ".join(stemmed_words)

        # Return a list of words
        text = text.strip()
        return text
    
    
    def add_sentence(self, sentence):
        for word in sentence.split(" "):
            self.add_word(word)
            
    
    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words + 1
            self.word2count[word] = 1
            self.index2word[self.n_words + 1] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            
    
    def convert_data_to_tuples(self, df, remove_stopwords = False, stem_words = False):
        for i, row in df.iterrows():
            q1 = self.text_to_wordlist(str(row['question1']), remove_stopwords = False, stem_words = False)
            q2 = self.text_to_wordlist(str(row['question2']), remove_stopwords = False, stem_words = False)
            self.add_sentence(q1)
            self.add_sentence(q2)
            label = int(row['is_duplicate'])
            if q1 and q2:
                self.questions_pair.append((q1, q2))
                self.labels.append(label)

In [4]:
dataset = QuoraDataset(df)

## DataLoader

In [5]:
print(len(dataset.questions_pair))

404331


In [6]:
def collate(batch):
    q1_text_list = []
    q2_text_list = []
    q1_list = []
    q2_list = []
    labels = []
    for item in batch:
        q1_text_list.append(item['q1'])
        q2_text_list.append(item['q2'])
        q1_list.append(item['q1_token'])
        q2_list.append(item['q2_token'])
        labels.append(item['labels'])
          
        
    q1_lengths = [len(q) for q in q1_list]
    q2_lengths = [len(q) for q in q2_list]
    
    return {
        'q1_text': q1_text_list,
        'q2_text': q2_text_list, 
        'q1_token': q1_list, 
        'q2_token': q2_list,
        'q1_lengths': q1_lengths, 
        'q2_lengths': q2_lengths,
        'labels': labels
    }

In [7]:
batch_size = 32
train_split = 0.8
val_split = 0.2

dataset_size = len(dataset)
indices = list(range(dataset_size))

split_train = int(train_split*dataset_size)

shuffle_dataset = True
random_seed = 46

if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[:split_train], indices[split_train:]

assert len(train_indices) + len(val_indices) == dataset_size

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=collate)
val_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=val_sampler, collate_fn=collate)

print('Training Set Size {}, Validation Set Size {},'.format(len(train_indices), len(val_indices)))

Training Set Size 323464, Validation Set Size 80867,


## Model

In [8]:
class EmbeddingLSTMNet(nn.Module):
    def __init__(self, num_vocab, embedding_dim, hidden_cells,
                 num_layers, embedding_rquires_grad, dropout):
        super(EmbeddingLSTMNet, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(
                            input_size=embedding_dim, 
                            hidden_size=hidden_cells, 
                            num_layers=num_layers, 
                            batch_first=True
                            )
        self.fc1 = nn.Linear(hidden_cells, hidden_cells)
        self.fc2 = nn.Linear(hidden_cells, hidden_cells)
        self.relu = nn.ReLU()
        # initialize embeddings 
        # self.embedding = nn.Embedding.from_pretrained(pretrained_weights)
        self.embedding = nn.Embedding(num_embeddings=num_vocab + 1, embedding_dim=embedding_dim)
        self.embedding.weight.requires_grad = embedding_rquires_grad

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self, question, lengths):
        """ 
        Params:
        -------
        question : (batch dim, sequence)
                   i.e. [ [i1, i2, i3],
                          [j1, j2, j4, j5] ]
        lenghts : list
                  list all the lengths of each question  
        
        Return:
        -------
        result : torch.tensor
                 output tesnor of of forward pass 
        """
        # Reverse the sequence lengths indices in decreasing order (pytorch requirement for pad and pack)
        sorted_indices = np.flipud(np.argsort(lengths))
        lengths = np.flipud(np.sort(lengths))
        lengths = lengths.copy()
        
        # Reorder questions in the decreasing order of their lengths
        ordered_questions = [torch.LongTensor(question[i]).to(self.device) for i in sorted_indices]
        # Pad sequences with 0s to the max length sequence in the batch
        ordered_questions = pad_sequence(ordered_questions, batch_first=True)
        # Retrieve Embeddings
        embeddings = self.embedding(ordered_questions).to(self.device)
        
        
        # Model forward 
        embeddings = self.dropout(embeddings)
        # Pack the padded sequences and pass it through LSTM
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        out, (hn, cn) = self.lstm(packed)
        # Unpack the padded sequence and pass it through the linear layers 
        unpacked, unpacked_len = pad_packed_sequence(out, batch_first=True, total_length=int(lengths[0]))
        out = self.fc1(unpacked)
        out = self.relu(out)
        out = self.fc2(out)
        
        # Reorder the output to the original order in which the questions were passed
        result = torch.FloatTensor(out.size())
        for i, encoded_matrix in enumerate(out):
            result[sorted_indices[i]] = encoded_matrix
        return result


class SiameseNetwork(nn.Module):
    def __init__(self, embedding_lstm_net):
        super(SiameseNetwork, self).__init__()
        """
        Siamese LSTM Network 

        Params:
        -------
        embedding_lstm_net : nn.Module embedded LSTM Network 
        """
        self.embedding = embedding_lstm_net
        
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
    def forward(self, q1, q2, q1_lengths, q2_lengths):
        """ Forward pass 
        Params:
        -------
        q1 : pad sequence tensor question 1  
        q2 : pad sequence tensor question 2  
        q1_lengths : torch.tensor original lengths of each question 1
        q2_lengths : torch.tensor original lengths of each question 1
        Returns:
        --------
        similarity_score : torch.tensor
        """
        output_q1 = self.embedding(q1, q1_lengths)
        output_q2 = self.embedding(q2, q2_lengths)
        similarity_score = torch.zeros(output_q1.size()[0]).to(self.device)
        # Calculate Similarity Score between both questions in a single pair
        for index in range(output_q1.size()[0]):
            # Sequence lenghts are being used to index and retrieve the activations before the zero padding since they were not part of original question
            q1 = output_q1[index, q1_lengths[index] - 1, :]
            q2 = output_q2[index, q2_lengths[index] - 1, :]
            similarity_score[index] = self.manhattan_distance(q1, q2)
        
        return similarity_score
    
    def manhattan_distance(self, q1, q2):
        """ Computes the Mannhatten distance between the two question tokens """
        return torch.exp(-torch.sum(torch.abs(q1 - q2), dim=0)).to(self.device)
    def cosine_similarity(self, q1, q2):
        cos = nn.CosineSimilarity(dim=0, eps=1e-6)
        return cos(q1, q1)

In [9]:
num_vocab = len(dataset.word2index)
embedding_dim = 300
hidden_cells = 100
num_layers = 3
embedding_rquires_grad = False
dropout = 0.0

# embedding net
embedding_net = EmbeddingLSTMNet(
    num_vocab = num_vocab,
    embedding_dim = embedding_dim,
    hidden_cells = hidden_cells,
    num_layers = num_layers,
    embedding_rquires_grad = embedding_rquires_grad,
    dropout = dropout)

# siamese model
model = SiameseNetwork(embedding_net).to(device)

In [10]:
# test model class with one batch from the dataloader 
for i, batch in enumerate(train_dataloader):
    q1, q2 = batch['q1_token'], batch['q2_token']
    q1_len, q2_len = batch['q1_lengths'], batch['q2_lengths']
    y = torch.FloatTensor(batch['labels'])

model(q1, q2, q1_len, q2_len)

tensor([0.7516, 0.8349, 0.8511, 0.7871, 0.8032, 0.8300, 0.7910, 0.7947],
       device='cuda:0', grad_fn=<CopySlices>)

## Model Trainer

In [11]:
class ModelTrainer:
    def __init__(
        self, model, hparams, train_dataloader, val_dataloader, 
        train_indices, val_indices,lr_scheduler_enabler=True):
        """ 
        This Class fits the model 

        Params:
        -------  
        model : nn.Module
                Pytorch NN Model that is spposed to be fitted/trained
        hparams : dict
                  Dictionary of Hyperparametes  
        train_dataloader : torch.utils.data.DataLoader
                           Training DataLoader
        val_dataloader : torch.utils.data.DataLoader
                         Validation DataLoader 
        train_indices : list 
                        list of the train indices
        val_indices : list
                      list of the val indices 
        lr_scheduler_enabler : bool
                               if True enables Learning rate scheduler, if False disables it
        """
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.lr_scheduler_enabler = lr_scheduler_enabler
        self.hparams = hparams 
        self.learning_rate = hparams['learning_rate']
        self.epochs = hparams['epoch']
        self.train_indices = train_indices
        self.val_indices = val_indices

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if self.device == "cuda":
            self.threshold = hparams['threshold'].to(self.device)
        else:
            self.threshold = hparams['threshold'].to(self.device)

        self.model = model
        self.optimizer = self.optimization()
        self.loss_fn = self.loss()
        self.lr_scheduler = self.learning_rate_scheduler() 

        self.data = dict()
        self.data["train_loss"] = list()
        self.data["train_acc"] = list()
        self.data["val_loss"] = list()
        self.data["val_acc"] = list()


    def train_epoch(self, epoch):
        """ Trains an epoch """
        self.model.train()

        loss_history = []
        correct_total = 0
        with tqdm(self.train_dataloader, unit="batch") as tepoch:
            for i, batch in enumerate(tepoch):
                tepoch.set_description(f"Epoch [{epoch+1}/{self.epochs}]  Training")
                if self.device == "cuda":
                    q1, q2 = batch['q1_token'].to(self.device), batch['q2_token'].to(self.device)
                    q1_len, q2_len = batch['q1_lengths'].to(self.device), batch['q2_lengths'].to(self.device)
                    y = torch.FloatTensor(batch['labels']).to(self.device)
                else:
                    q1, q2 = batch['q1_token'], batch['q2_token']
                    q1_len, q2_len = batch['q1_lengths'], batch['q2_lengths']
                    y = torch.FloatTensor(batch['labels'])
                
                # Reset the gardients 
                self.optimizer.zero_grad()

                # Model forward and predictions
                similarity = self.model(q1, q2, q1_len, q2_len)
                y_pred = (similarity > self.threshold).float() * 1
                y = y.to("cuda")
                correct = self.inferece(y_pred, y)
                correct_total += correct

                # Calculate the loss 
                loss = self.loss_fn(similarity, y)
                loss_history.append(loss.item())

                # Calculate gradients by performign the backward pass
                loss.backward()
                
                # Update weights
                self.optimizer.step()

                if i % 100 == 0:
                    tepoch.set_postfix(train_loss=np.mean(loss_history), train_acc=f'{(correct/y.size()[0])*100} %' )
            
            # Enable learning rate scheduler  
            if self.lr_scheduler_enabler:
                self.lr_scheduler.step()

        return  np.mean(loss_history), (correct_total/len(self.train_indices))*100

    def evaluate(self):
        """ Validates an epoch """
        self.model.eval()

        loss_history = []
        correct_total = 0
        with torch.no_grad():
            for i, batch in enumerate(self.val_dataloader):
                if self.device == "cuda":
                    q1, q2 = batch['q1_token'].to(self.device), batch['q2_token'].to(self.device)
                    q1_len, q2_len = batch['q1_lengths'].to(self.device), batch['q2_lengths'].to(self.device)
                    y = torch.FloatTensor(batch['labels']).to(self.device)
                else:
                    q1, q2 = batch['q1_token'], batch['q2_token']
                    q1_len, q2_len = batch['q1_lengths'], batch['q2_lengths']
                    y = torch.FloatTensor(batch['labels'])

                # Model forward and predictions
                similarity = self.model(q1, q2, q1_len, q2_len)
                y_pred = (similarity > self.threshold).float() * 1
                correct = self.inferece(y_pred, y)
                correct_total += correct

                # Calculate the loss
                y = y.to("cuda")
                loss = self.loss_fn(similarity, y)
                loss_history.append(loss.item())

        avg_val_acc =  correct_total/len(self.val_indices) * 100 
        return np.mean(loss_history), avg_val_acc
    
    def inferece(self, y_pred, y):
        """ Performs inference """
        y = y.to("cuda")
        return (y_pred == y).sum().item()

    def fit(self):
        """ Fits the model """
        train_loss = 0
        val_loss = 0
        val_acc = 0
        best_acc = 0
        index_model = 0
        for e in range(self.epochs):
            train_loss, train_acc = self.train_epoch(e)
            val_loss, val_acc = self.evaluate()
            if val_acc > best_acc:
                best_acc = val_acc
                PATH = f"Model_{index_model}_val_acc_{best_acc}"
                torch.save(self.model.state_dict(), PATH)
                index_model += 1
                
            print(f'Epoch [{e+1}/{self.epochs}] Validation: val_loss: {val_loss} val_acc: {val_acc} %')
            
            self.data["train_loss"].append(train_loss)
            self.data["train_acc"].append(train_acc)
            self.data["val_loss"].append(val_loss)
            self.data["val_acc"].append(val_acc)

    def test(self):
        """ Tests the model """
        self.model.eval()

        predictions = []
        labels_list = []
        loss_history = []
        correct_total = 0
        with torch.no_grad():
            for i, batch in enumerate(self.val_dataloader):
                if self.device == "cuda":
                    q1, q2 = batch['q1_token'].to(self.device), batch['q2_token'].to(self.device)
                    q1_len, q2_len = batch['q1_lengths'].to(self.device), batch['q2_lengths'].to(self.device)
                    y = torch.FloatTensor(batch['labels']).to(self.device)
                else:
                    q1, q2 = batch['q1_token'], batch['q2_token']
                    q1_len, q2_len = batch['q1_lengths'], batch['q2_lengths']
                    y = torch.FloatTensor(batch['labels'])

                # Model forward and predictions
                similarity = self.model(q1, q2, q1_len, q2_len)
                y_pred = (similarity > self.threshold).float() * 1
                predictions.append(y_pred), labels_list.append(y)
                correct = self.inferece(y_pred, y)
                correct_total += correct

                # Calculate the loss 
                
                loss = self.loss_fn(similarity, y)
                loss_history.append(loss.item())
        
        # Calculate the accuracy
        avg_val_acc =  correct_total/len(self.val_indices) * 100 
        print('- - - Model Performance - - -')
        print(f'\nModel Accuracy:  {avg_val_acc}')
        print(f'Correct predictions: {correct_total}, Incorret predictions: {len(self.val_indices) - correct_total}')
        print('')
        cm = plotConfusionMatrix(np.hstack(predictions), np.hstack(labels_list),['similar', 'dissimilar'], title="Confusion Matrix Plot of Test Set")
        print(f'TP: {cm[0,0]}')
        print(f'FP: {cm[1,0]}')
        print(f'FN: {cm[0,1]}')
        print(f'TN: {cm[1,1]}')
        print(f'\nPrecision Score: {precision_score(np.hstack(predictions), np.hstack(labels_list))}')
        print(f'Recall Score: {recall_score(np.hstack(predictions), np.hstack(labels_list))}')
        print(f'F1 Score: {f1_score(np.hstack(predictions), np.hstack(labels_list))}')

        # adopted from https://www.codegrepper.com/code-examples/python/roc+curve+pytorch
        fpr, tpr, threshold = roc_curve(np.hstack(predictions), np.hstack(labels_list))
        roc_auc = auc(fpr, tpr)
        plt.title('Receiver Operating Characteristic (ROC)')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        plt.legend(loc = 'lower right')
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.grid()
        plt.show()
        # adopted from https://www.codegrepper.com/code-examples/python/roc+curve+pytorch

    def predict(self, test_sample_dict):
        """ Uses the model to predict the similarity of a given input pair of questions"""
        self.model.eval()
        
        print('question 1:', test_sample_dict['q1_text'])
        print('question 2:', test_sample_dict['q2_text'])
        print('tokens  q1:', test_sample_dict['q1_token'])
        print('tokens  q2:', test_sample_dict['q2_token'])

        q1, q2 = test_sample_dict['q1_token'], test_sample_dict['q2_token']
        q1_len, q2_len = test_sample_dict['q1_lengths'], test_sample_dict['q2_lengths']
        y = torch.FloatTensor(test_sample_dict['labels'])
        
        # Model forward and predictions
        similarity = self.model(q1, q2, q1_len, q2_len)
        y_pred = (similarity > self.threshold).float() * 1
        
        print(f'\n\nModel predicts {y_pred.item()} --> Actual value {y.item()}')
        if y_pred.item() == y.item():
            print(f'Model prediction is correct :)')

            if y_pred.item() == 1.0:
                print(f'\nThe questions {test_sample_dict["q1_text"]} and {test_sample_dict["q2_text"]} are similar!')
            else:
                print(f'\nThe questions {test_sample_dict["q1_text"]} and {test_sample_dict["q2_text"]} are dissimilar!')    
        else:
            print(f'Model prediction is inaccurate :(')
            if y_pred.item() == 1.0:
                print(f'\nThe questions {test_sample_dict["q1_text"]} and {test_sample_dict["q2_text"]} should be dissimilar!')
            else:
                print(f'\nThe questions {test_sample_dict["q1_text"]} and {test_sample_dict["q2_text"]} should be similar!')  
        
    def optimization(self):
        """ Initializes the optimizer """
        return torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
    
    def learning_rate_scheduler(self):
        """ Initializes the learning rate scheduler """
        return torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9)

    def loss(self):
        """ Initializes the loss """
        return nn.MSELoss() #nn.CrossEntropyLoss()
    
    def return_data(self):
        """ Output the data """
        return self.data

In [12]:
hparams = {
    'threshold': torch.Tensor([0.5]),  # threshold for determining similiarity
    'learning_rate': 1e-03,  # learning rate
    'epoch': 10  # number of epochs
}

In [13]:
trainer = ModelTrainer(
    model, 
    hparams, 
    train_dataloader, 
    val_dataloader,
    train_indices, 
    val_indices
)

In [None]:
trainer.fit()

Epoch [1/10]  Training:  84%|████████▍ | 8535/10109 [05:08<00:55, 28.59batch/s, train_acc=71.875 %, train_loss=0.164]