In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from transformers import BertTokenizerFast, BertModel
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)


In [2]:
import torch
import torch.nn as nn

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.model_selection import train_test_split

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

**test whether the system has a GPU support and fix device variable accordingly**

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Set the random seeds for deterministic results.**

In [4]:
SEED = 1234

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data analysis

In [None]:
df = pd.read_csv("/kaggle/input/qqp-new/train.tsv", sep='\t')
df.info()
# remove Null Values
df.dropna(inplace=True)

In [None]:
# train_sentences_lens = train_df['question1'].apply(lambda x: len(x.split(' '))).tolist()
# train_sentences_lens.extend(train_df['question2'].apply(lambda x: len(x.split(' '))).tolist())
# sns.distplot(train_sentences_lens)

As we see from the graph, the number of cases where words counts greater than 40 is too small.

In [None]:
MAX_LEN = 40

In [None]:
print(len(df))
df.head()

363846


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,402555,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,360472,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,150662,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,183004,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [None]:
# train_df = df[:75000]
# val_df = df[75000:90000]

# print(len(train_df))
# print(len(val_df))

In [None]:
train_df = df[:50000]
val_df = df[50000:60000]

print(len(train_df))
print(len(val_df))

50000
10000


In [None]:
qids = pd.Series(list(train_df['qid1']) + list(train_df['qid2']))

print ('Unique Questions number: {}\n'.format(len(np.unique(qids))))

q_vals=qids.value_counts()[0:5]
print ('Top 5 most frequently asked questions: ')

for pair in q_vals.iteritems():
    print(train_df.loc[train_df['qid2']==pair[0]]['question1'].head(1).values + " count: " + str(pair[1]))

q_vals=q_vals.values

Unique Questions number: 88759

Top 5 most frequently asked questions: 
['Is it unhealthy to look forward to death? count: 23']
['Instagram: is using too much hashtags vulgar? count: 14']
['How could I lose a few pounds quickly? count: 14']
['How can I earn money part time online? count: 13']
['How can I loose weight in a week? count: 12']


### Checking whether there are any repeated pair of questions

In [None]:
duplicate_rows = train_df[train_df.duplicated(['qid1','qid2'])]
print ("Number of duplicate questions : ", len(duplicate_rows))

Number of duplicate questions :  0


# Dataset Preparation

In [None]:
BERT_VERSION = 'bert-base-uncased'
POOLED_OUTPUT_DIM = 768

In [None]:
tokenizer = BertTokenizer.from_pretrained(BERT_VERSION)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# split data to train and validation sets
# train_df, val_df = train_test_split(train_df, test_size=0.1)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [None]:
class BertDataSet:
    def __init__(self, first_questions, second_questions, targets, tokenizer):
        self.first_questions = first_questions
        self.second_questions = second_questions
        self.targets = targets
        self.tokenizer = tokenizer
        self.length = len(first_questions)
        
    def __len__(self):
        return self.length

    def __getitem__(self, item):
        first_question = str(self.first_questions[item])
        second_question = str(self.second_questions[item])

        # removes extra white spaces from questions
        first_question = " ".join(first_question.split())
        second_question = " ".join(second_question.split())
        
        ### [CLS] question1 [SEP] questions2 [SEP] ... [PAD]
        inputs = self.tokenizer.encode_plus(
            first_question,
            second_question,
            add_special_tokens=True,
            padding='max_length',
            max_length=2 * MAX_LEN + 3, # max length of 2 questions and 3 special tokens
            truncation=True   
        )
        
        # return targets 0, when using data set in testing and targets are none
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(int(self.targets[item]), dtype=torch.long) if self.targets is not None else 0
        }
        

In [None]:
# creates dataset and returns dataloader of it
def get_data_loader(df, targets, batch_size, shuffle, tokenizer):
    dataset = BertDataSet(
        first_questions=df["question1"].values,
        second_questions=df["question2"].values,
        targets=targets,
        tokenizer=tokenizer
    )
    
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle=shuffle
    )
    
    return data_loader

In [None]:
# training batch size we gonna use throughout this notebook.
BS = 48

In [None]:
# create data loaders of training and validation data.
train_data_loader = get_data_loader(
    df=train_df,
    targets=train_df["is_duplicate"].values,
    batch_size=BS,
    shuffle=True,
    tokenizer=tokenizer
)

val_data_loader = get_data_loader(
    df=val_df,
    targets=val_df["is_duplicate"].values,
    batch_size=4 * BS,
    shuffle=True,
    tokenizer=tokenizer
)

# Model

In [None]:
class BertModel(nn.Module):
    def __init__(self, bert_path):
        super(BertModel, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(POOLED_OUTPUT_DIM, 1)

    def forward(self, ids, mask, token_type_ids):
        _, pooled = self.bert(ids, attention_mask=mask,token_type_ids=token_type_ids)
        
        # add dropout to prevent overfitting.
        pooled = self.dropout(pooled) 
        return self.out(pooled)

model = BertModel(BERT_VERSION).to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




# Training

In [None]:
# loss function is simple binary cross entropy loss
# need sigmoid to put probabilities in [0,1] interval
def loss_fn(outputs, targets):
    outputs = torch.squeeze(outputs)
    return nn.BCELoss()(nn.Sigmoid()(outputs), targets)

In [None]:
# computes perplexity on validation data
def calculate_perplexity(data_loader, model, device):
    model.eval()
    
    # tells Pytorch not to store values of intermediate computations for backward pass because we not gonna need gradients.
    with torch.no_grad():
        total_loss = 0
        for batch in data_loader:
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            total_loss += loss_fn(outputs, targets).item()
            
    model.train()

    return np.exp(total_loss / len(data_loader))

In [None]:
def train_loop(epochs, train_data_loader, val_data_loader, model, optimizer, device, scheduler=None):
    it = 1
    total_loss = 0
    curr_perplexity = None
    perplexity = None
    
    model.train()
    for epoch in range(epochs):
        print('Epoch: ', epoch + 1)
        for batch in train_data_loader:
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            optimizer.zero_grad()
            
            # do forward pass, will save intermediate computations of the graph for later backprop use.
            outputs = model(ids, mask=mask, token_type_ids=token_type_ids)
            
            loss = loss_fn(outputs, targets)
            total_loss += loss.item()
            
            # running backprop.
            loss.backward()
            
            # doing gradient descent step.
            optimizer.step()
            
            # we are logging current loss/perplexity in every 100 iteration
            if it % 100 == 0:
                
                # computing validation set perplexity in every 500 iteration.
                if it % 500 == 0:
                    curr_perplexity = calculate_perplexity(val_data_loader, model, device)
                    
                    if scheduler is not None:
                        scheduler.step()

                    # making checkpoint of best model weights.
                    if not perplexity or curr_perplexity < perplexity:
                        torch.save(model.state_dict(), 'saved_model')
                        perplexity = curr_perplexity

                print('| Iter', it, '| Avg Train Loss', total_loss / 100, '| Dev Perplexity', curr_perplexity)
                total_loss = 0

            it += 1
            torch.cuda.empty_cache

In [None]:
def run(model, train_df, device, train_data_loader, val_data_loader):
    EPOCHS = 3
    
    lr = 3e-5
    num_training_steps = int(len(train_data_loader) * EPOCHS)
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )
    
    
    train_loop(EPOCHS, train_data_loader, val_data_loader,  model, optimizer, device, scheduler)

In [None]:
run(model, train_df, device, train_data_loader, val_data_loader)

Epoch:  1
| Iter 100 | Avg Train Loss 0.5441265693306923 | Dev Perplexity None
| Iter 200 | Avg Train Loss 0.41708993166685104 | Dev Perplexity None
| Iter 300 | Avg Train Loss 0.41198499530553817 | Dev Perplexity None
| Iter 400 | Avg Train Loss 0.3844555076956749 | Dev Perplexity None
| Iter 500 | Avg Train Loss 0.362734806984663 | Dev Perplexity 1.4798520481679371
| Iter 600 | Avg Train Loss 0.3655783827602863 | Dev Perplexity 1.4798520481679371
| Iter 700 | Avg Train Loss 0.35742919743061063 | Dev Perplexity 1.4798520481679371
| Iter 800 | Avg Train Loss 0.34129196882247925 | Dev Perplexity 1.4798520481679371
| Iter 900 | Avg Train Loss 0.3408063222467899 | Dev Perplexity 1.4798520481679371
| Iter 1000 | Avg Train Loss 0.33567011684179304 | Dev Perplexity 1.3758246944501382
Epoch:  2
| Iter 1100 | Avg Train Loss 0.2813141692429781 | Dev Perplexity 1.3758246944501382
| Iter 1200 | Avg Train Loss 0.22725310616195202 | Dev Perplexity 1.3758246944501382
| Iter 1300 | Avg Train Loss 0.2

# Testing

In [None]:
# test_df = pd.read_csv("/kaggle/input/qqp-new/test.tsv", sep='\t')
test_df = df[60000:70000]
test_df.info()
test_df.dropna(inplace=True)
print(len(test_df))
test_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 60000 to 69999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            10000 non-null  int64 
 1   qid1          10000 non-null  int64 
 2   qid2          10000 non-null  int64 
 3   question1     10000 non-null  object
 4   question2     10000 non-null  object
 5   is_duplicate  10000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 546.9+ KB
10000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
60000,220588,327713,265849,Can someone propose flags for New Zealand with...,Why is New Zealand considering a new flag?,0
60001,140549,223336,223337,What are some interview tricks for a fresher?,What are some interview tricks and etiquettes ...,1
60002,305900,429354,76735,Can a Gemini man and a Gemini woman have a suc...,What is the compatibility of a Gemini man and ...,1
60003,164731,72639,43909,How may I know whether my wife is cheating on me?,How do I know my partner is cheating on me?,1
60004,335903,463184,78272,How do I exit from a WhatsApp group without le...,What are polite ways to exit a WhatsApp group?,0


In [None]:
str1 = test_df['question1'].values[0]
str2 = test_df['question2'].values[0]
print(str1)
print(str2)

Can someone propose flags for New Zealand without being a New Zealander now that they are considering changing their flag?
Why is New Zealand considering a new flag?


In [None]:
import torch.nn.functional as F
# this function returns probabilities for every test case.
def test(model, test_df, device):
    predictions = torch.empty(0).to(device, dtype=torch.float)
    
    test_dataset = BertDataSet(
        first_questions=test_df["question1"].values,
        second_questions=test_df["question2"].values,
        targets=None,
        tokenizer=tokenizer
    )
    
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=512
    )
    
    with torch.no_grad():
        model.eval()
        for batch in tqdm(test_data_loader):
            ids = batch["ids"]
            mask = batch["mask"]
            token_type_ids = batch["token_type_ids"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            predictions = torch.cat((predictions, nn.Sigmoid()(outputs)))
            
    return predictions.cpu().numpy().squeeze()

predictions = test(model, test_df, device)
len(predictions)

100%|██████████| 20/20 [00:22<00:00,  1.11s/it]


10000

In [None]:
predictions[:10]

array([1.4656390e-01, 9.6779877e-01, 3.4735525e-01, 9.7892362e-01,
       2.4780437e-01, 9.8654538e-01, 5.5120763e-04, 9.9289036e-01,
       9.4114757e-01, 8.8789716e-04], dtype=float32)

In [None]:
print(len(test_df['is_duplicate'].values))
test_df['is_duplicate'].values[:10]

10000


array([0, 1, 1, 1, 0, 1, 0, 1, 1, 0])

In [None]:
# from sklearn.metrics import f1_score
# for i in range(0, 1000):
#     print(i)
#     preds = np.array(predictions) > i/1000
#     print("f1:", f1_score(test_df['is_duplicate'], preds, average='micro'))

In [None]:
from sklearn.metrics import f1_score

preds = np.array(predictions) > 0.5
f1 = f1_score(test_df['is_duplicate'], preds, average='micro')
print(f"F1 score: {f1}")

F1 score: 0.872


In [None]:
from sklearn.metrics import log_loss
# calculate log loss
logloss = log_loss(test_df["is_duplicate"], predictions)
print(f"Log loss: {logloss}")

Log loss: 0.32408189836947565


In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test_df['is_duplicate'], preds)

0.872

# Evaluation

In [None]:
# def val_metrics(predictions, Y):
#     correct = 0
#     total = 0
#     sum_loss = 0.0
#     for y in :
#         loss = F.binary_cross_entropy_with_logits(y_hat, y)
#         y_pred = y_hat > 0
#         correct += (y_pred.float() == y).float().sum()
#         total += y.shape[0]
#         sum_loss += loss.item()*y.shape[0]
#     return sum_loss/total, correct/total

In [None]:
# prints if two questions is similar and score of confidence
def eval(model, tokenizer, first_question, second_question, device):
    inputs = tokenizer.encode_plus(
        first_question,
        second_question,
        add_special_tokens=True,
    )

    ids = torch.tensor([inputs["input_ids"]], dtype=torch.long).to(device, dtype=torch.long)
    mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long).to(device, dtype=torch.long)
    token_type_ids = torch.tensor([inputs["token_type_ids"]], dtype=torch.long).to(device, dtype=torch.long)

    with torch.no_grad():
        model.eval()
        output = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        prob = nn.Sigmoid()(output).item()

        print("questions [{}] and [{}] are {} with score {}".format(first_question, second_question, 'similar' if prob > 0.5 else 'not similar', prob))

In [None]:
# change questions to test model
first_question = "Can someone propose flags for New Zealand without being a New Zealander now that they are considering changing their flag?"
second_question = "Why is New Zealand considering a new flag?"
eval(model, tokenizer, first_question, second_question, device)

questions [Can someone propose flags for New Zealand without being a New Zealander now that they are considering changing their flag?] and [Why is New Zealand considering a new flag?] are not similar with score 0.14656387269496918
