# Quora Question Pairs (BERT continuation)

## This paper continues the work on identifying question pairs that have the same intent.

In this part we will try two approaches using BERT:
  1. Encoding sentences and measuring their cosine similarity;
  2. Training a BERT classifier.
   

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 10.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 23.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 30.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found e

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
from transformers import BertForSequenceClassification, AdamW, BertTokenizer
from sklearn.metrics import f1_score

In [None]:
# Load the question-pairs training set, using the `id` column as the index.
# `on_bad_lines='warn'` replaces the deprecated `error_bad_lines=False`
# (pandas >= 1.3): malformed rows are reported with a warning and skipped
# instead of aborting the load.
df = pd.read_csv("train.csv", engine='python', on_bad_lines='warn', index_col='id')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Number of rows that were loaded into the frame.
df.shape[0]

404290

In [None]:
# Pack every row into a (question1, question2, is_duplicate) tuple.
data_list = list(zip(*(df[column] for column in ('question1', 'question2', 'is_duplicate'))))

In [None]:
# Keep only the first 100,000 pairs; everything after that is dropped.
data_list = data_list[:100000]

### Configuration

In [None]:
# Settings shared by the cells below.
config = dict(
    model_name='prajjwal1/bert-tiny',  # checkpoint id handed to from_pretrained
    pad_len=100,  # per-question token budget; pairs are tokenized to 2 * pad_len
    batch_size=16,
    lr=5e-5,
    split=0.9,
    epochs=2,
)

### Get tokenizer and model

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(config['model_name'], do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    config['model_name'],  # checkpoint chosen in `config` (a small BERT, hidden size 128)
    num_labels=2,  # binary classification: duplicate / not duplicate
    output_attentions=False,  # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
# `weight_decay=0.0` keeps the deprecated class's default; torch.optim.AdamW
# would otherwise apply 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config['lr'],
    eps=1e-8,
    weight_decay=0.0,
)
criterion = nn.CrossEntropyLoss()

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

### Create dataset and dataloaders

In [None]:
class PairsDataset(torch.utils.data.Dataset):
    """Question pairs tokenized on the fly for sequence classification.

    Every element of ``data`` is indexed as ``(question1, question2, label)``.

    Args:
        data: Indexable collection of ``(question1, question2, label)`` samples.
        tokenizer: Callable invoked with the concatenated text and Hugging
            Face-style keyword arguments; its result must expose
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch dimension.
        pad_len: Token budget per question; each concatenated pair is padded or
            truncated to ``2 * pad_len`` tokens.
        device: Device the returned tensors are placed on. Defaults to
            ``cuda:0`` when CUDA is available, otherwise the CPU.
    """

    def __init__(self, data, tokenizer, pad_len, device=None):
        self.data = data
        self.tokenizer = tokenizer
        self.pad_len = pad_len
        if device is None:
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = device

    def __len__(self):
        """Return the number of question pairs."""
        return len(self.data)

    def _check_question_len_(self, question: list, max_len: int):
        """Return ``question`` cut down to at most ``max_len`` items."""
        if len(question) > max_len:
            question = question[:max_len]
        return question

    def __getitem__(self, indx):
        """Tokenize the pair at ``indx``.

        Returns:
            dict with ``'question_pairs'`` (token ids), ``'attention_mask'`` and
            ``'label'`` (a one-element long tensor), all on ``self.device``.
        """
        sample = self.data[indx]
        question1 = sample[0]
        question2 = sample[1]
        label = int(sample[2])
        # The two questions are joined with '.' and encoded as a single text.
        # The length limit comes from this instance's pad_len rather than from
        # a module-level config, so the constructor argument is honoured.
        questions_tokens = self.tokenizer(
            question1 + '.' + question2,
            add_special_tokens=True,
            truncation=True,
            max_length=self.pad_len * 2,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True,
            return_tensors='pt',
            is_split_into_words=False,
        )
        # return_tensors='pt' adds a batch dimension of 1; strip it with [0].
        # Tensors are returned already on self.device, so consumers may feed
        # the collated batches to the model directly.
        input_ids = questions_tokens['input_ids'][0].to(self.device)
        attn_mask = questions_tokens['attention_mask'][0].to(self.device)
        label = torch.tensor([label], dtype=torch.long).to(self.device)
        return {
            'question_pairs': input_ids,
            'attention_mask': attn_mask,
            'label': label
        }


In [None]:
# splitting dataset
def train_test_split(data: list, percent_train: float) -> tuple:
    """Split ``data`` sequentially into a train part and a test part.

    No shuffling is done: the first ``int(len(data) * percent_train)`` items
    form the train part and the remainder forms the test part.

    Args:
        data: Sequence to split.
        percent_train: Fraction of ``data`` (e.g. ``0.9``) assigned to train.

    Returns:
        ``(train, test)`` slices of ``data``.
    """
    edge = int(len(data) * percent_train)
    return data[:edge], data[edge:]

In [None]:
# Split the pairs and wrap each part in a dataset and a DataLoader.
# The train fraction is read from `config['split']` instead of repeating the
# literal, so the configuration cell stays the single source of truth.
train, test = train_test_split(data_list, config['split'])
train_dataset = PairsDataset(train, tokenizer, config['pad_len'])
test_dataset = PairsDataset(test, tokenizer, config['pad_len'])

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)

In [None]:
# Move the model to the selected device; as the last expression of the cell,
# the returned module is echoed by the notebook.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, element

In [None]:
def train_model(model, dataloader, optimizer, criterion, epoch_num, device=None):
    """Fine-tune ``model`` for ``epoch_num`` passes over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose first output is the loss.
        dataloader: Sized iterable yielding dicts with the keys
            ``'question_pairs'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer stepped once per batch.
        criterion: Unused; the loss is taken from the model's own output
            because the labels are passed to the model. Kept so existing calls
            keep working.
        epoch_num: Number of passes over ``dataloader``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        list of float: the loss of every optimisation step, in order.
    """
    if device is None:
        # Follow the model so inputs always land where its weights live.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    loss_list = []
    total_steps = epoch_num * len(dataloader)
    step = 0
    for epoch in range(epoch_num):
        model.train()
        for i, batch in enumerate(dataloader):
            step += 1
            # Clear gradients left over from the previous step.
            model.zero_grad()
            input_ids = batch['question_pairs'].to(device)
            input_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            out = model(input_ids, token_type_ids=None, attention_mask=input_mask, labels=labels)
            loss = out[0]
            loss_list.append(loss.item())
            loss.backward()
            optimizer.step()
            # Progress report every 100 batches.
            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{epoch_num}], Step [{step}/{total_steps}], Loss: {loss.item():.4f}, Batch: {i}')
    return loss_list

In [None]:
# Fine-tune the classifier on the training loader for the configured number of epochs.
train_model(model, train_loader, optimizer, criterion, config['epochs'])



Epoch [1/2], Step [100/11250], Loss: 0.7159, Batch: 99
Epoch [1/2], Step [200/11250], Loss: 0.7145, Batch: 199
Epoch [1/2], Step [300/11250], Loss: 0.7514, Batch: 299
Epoch [1/2], Step [400/11250], Loss: 0.6357, Batch: 399
Epoch [1/2], Step [500/11250], Loss: 0.4808, Batch: 499
Epoch [1/2], Step [600/11250], Loss: 0.7559, Batch: 599
Epoch [1/2], Step [700/11250], Loss: 0.5588, Batch: 699
Epoch [1/2], Step [800/11250], Loss: 0.5744, Batch: 799
Epoch [1/2], Step [900/11250], Loss: 0.7104, Batch: 899
Epoch [1/2], Step [1000/11250], Loss: 0.5158, Batch: 999
Epoch [1/2], Step [1100/11250], Loss: 0.6240, Batch: 1099
Epoch [1/2], Step [1200/11250], Loss: 0.4622, Batch: 1199
Epoch [1/2], Step [1300/11250], Loss: 0.4407, Batch: 1299
Epoch [1/2], Step [1400/11250], Loss: 0.4360, Batch: 1399
Epoch [1/2], Step [1500/11250], Loss: 0.6277, Batch: 1499
Epoch [1/2], Step [1600/11250], Loss: 0.3654, Batch: 1599
Epoch [1/2], Step [1700/11250], Loss: 0.6719, Batch: 1699
Epoch [1/2], Step [1800/11250], Lo

In [None]:
# Test dataset
# numpy is imported here because the notebook's own `import numpy as np`
# only appears in a later cell.
import numpy as np

# Put model in evaluation mode
model.eval()
# Tracking variables
predictions_test = []  # one logits array per batch
true_labels = []
# Predict
for batch in test_loader:
    # Unpack the inputs from our dataloader and make sure they sit on the
    # same device as the model.
    b_input_ids = batch['question_pairs'].to(device)
    b_input_mask = batch['attention_mask'].to(device)

    # Tell the model not to compute or store gradients; inference does not
    # need them.
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits to CPU and store them together with the true labels
    predictions_test.append(logits.detach().cpu().numpy())
    true_labels.extend(batch['label'].view(-1).cpu().tolist())

# `predictions_test` stays a per-batch list of logits; the predicted class
# ids get their own name.
all_predictions_test = np.argmax(np.concatenate(predictions_test, axis=0), axis=1)
# f1_score expects (y_true, y_pred) in that order.
f1 = f1_score(true_labels, all_predictions_test, average='micro')
print("F1 score: {:.2%}".format(f1))
print('    DONE.')



    DONE.


In [None]:
# Echo the ground-truth labels gathered during evaluation.
true_labels

[1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,


In [None]:

# Count the evaluated samples by summing the real size of every logits batch,
# instead of repeating the batch list 16 times (which hard-codes the batch
# size and over-counts when the last batch is smaller).
sum(len(batch_logits) for batch_logits in predictions_test)

10000

In [None]:
import numpy as np
# Join the per-batch logits along the first axis into a single array.
predictions_test_0 = np.concatenate(predictions_test, axis=0)
# Echo the stacked logits.
predictions_test_0

array([[-0.8493309 ,  0.9644388 ],
       [-0.49059242,  0.3957568 ],
       [-0.49093825,  0.48052153],
       ...,
       [-0.38956925,  0.3530875 ],
       [ 1.6686777 , -2.1583014 ],
       [ 0.14449486, -0.44310337]], dtype=float32)

In [None]:
# Predicted class per row = index of the largest logit.
all_predictions_test = predictions_test_0.argmax(axis=1)

In [None]:
# Echo the predicted class ids.
all_predictions_test

array([1, 1, 1, ..., 1, 0, 0])

F1 score: 79.70%
