In [1]:
import datasets
import torch
import pandas as pd
import torch.nn.functional as F
import math
import pickle

from collections import Counter
from spacy.lang.en import English
from tqdm.auto import tqdm
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2TokenizerFast, BertConfig, BertModel, BertTokenizerFast

In [2]:
# Build a BertConfig from the library defaults only (no checkpoint is read here).
config = BertConfig()

In [3]:
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [4]:
config.is_decoder

False

In [4]:
# Load the configuration that ships with the 'bert-base-uncased' checkpoint.
bert_config = BertConfig.from_pretrained('bert-base-uncased')

In [5]:
bert_config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [7]:
# Load the pretrained 'bert-base-uncased' weights into a bare BertModel.
# The warning printed by this cell lists the checkpoint's `cls.*` weights that BertModel does not use.
bert_model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
bert_model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [15]:
# Freeze the model: disable gradient tracking on every parameter of bert_model.
# requires_grad_ returns the module itself, which is why the cell echoes the module structure.
bert_model.requires_grad_(False)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [16]:
bert_model.encoder.layer[0].attention.self.query.weight

Parameter containing:
tensor([[-0.0164,  0.0261, -0.0263,  ...,  0.0154,  0.0768,  0.0548],
        [-0.0326,  0.0346, -0.0423,  ..., -0.0527,  0.1393,  0.0078],
        [ 0.0105,  0.0334,  0.0109,  ..., -0.0279,  0.0258, -0.0468],
        ...,
        [-0.0085,  0.0514,  0.0555,  ...,  0.0282,  0.0543, -0.0541],
        [-0.0198,  0.0944,  0.0617,  ..., -0.1042,  0.0601,  0.0470],
        [ 0.0015, -0.0952,  0.0099,  ..., -0.0191, -0.0508, -0.0085]])

In [2]:
# Load the "qqp" configuration of the "glue" dataset through the `datasets` library
# (served from the local Hugging Face cache when already downloaded, as the log below shows).
qqp_dataset = datasets.load_dataset("glue","qqp")

Reusing dataset glue (/home/dingyizhou/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Keep only the question pairs whose label equals 1
# (presumably the "duplicate"/paraphrase class of GLUE QQP — TODO confirm against the dataset card).
# The filter is applied to every split; the test split ends up with 0 rows (see the DatasetDict printed further down).
qqp_paraphrase_dataset = qqp_dataset.filter(lambda example: example['label']==1)

Loading cached processed dataset at /home/dingyizhou/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e0a57b1225e8408c.arrow
Loading cached processed dataset at /home/dingyizhou/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e7f91f78dbacfd6e.arrow
Loading cached processed dataset at /home/dingyizhou/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-495a383dd1f86105.arrow


In [6]:
len(qqp_paraphrase_dataset["train"]['question1'])

134378

In [7]:
qqp_paraphrase_dataset["train"]['question1'][0]

'How do I control my horny emotions?'

In [28]:
qqp_paraphrase_dataset

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 134378
    })
    validation: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 14885
    })
    test: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 0
    })
})

In [None]:
# spaCy English pipeline object; only its tokenizer is used in this notebook.
nlp = English()
# Word-level tokenizer applied to every question when building the *_sentence_lst lists.
tokenizer = nlp.tokenizer

In [30]:
bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [12]:

def _tokenize_questions(questions):
    """Split every question string into a list of spaCy token texts.

    Relies on the notebook-level ``tokenizer`` and wraps the iteration in
    ``tqdm`` so that each column gets its own progress bar.

    Args:
        questions: iterable of raw question strings.

    Returns:
        A list with one list of token strings per input question, in input order.
    """
    return [[token.text for token in tokenizer(question)] for question in tqdm(questions)]


# One tokenized list per (split, column); replaces four copy-pasted loops.
train_sentence_lst1 = _tokenize_questions(qqp_paraphrase_dataset["train"]["question1"])
train_sentence_lst2 = _tokenize_questions(qqp_paraphrase_dataset["train"]["question2"])
eval_sentence_lst1 = _tokenize_questions(qqp_paraphrase_dataset["validation"]["question1"])
eval_sentence_lst2 = _tokenize_questions(qqp_paraphrase_dataset["validation"]["question2"])

# All tokenized sentences in the original processing order (train q1, train q2,
# validation q1, validation q2). The order matters: vocabulary ids are later
# assigned in first-seen order over this list.
total_sentence_lst = (
    train_sentence_lst1
    + train_sentence_lst2
    + eval_sentence_lst1
    + eval_sentence_lst2
)

  0%|          | 0/134378 [00:00<?, ?it/s]

  0%|          | 0/134378 [00:00<?, ?it/s]

  0%|          | 0/14885 [00:00<?, ?it/s]

  0%|          | 0/14885 [00:00<?, ?it/s]

In [13]:
# Corpus-wide token frequencies over every tokenized sentence (first-seen order is kept).
counter = Counter(
    token
    for sentence_tokens in total_sentence_lst
    for token in sentence_tokens
)

In [14]:
# Minimum corpus frequency a token needs in order to receive its own id.
vocab_threshold = 5

# Ids 0-3 are reserved for the special tokens; every sufficiently frequent
# token then receives the next free id, in the order the counter yields them.
vocab_dict = dict(zip(('[START]', '[END]', '[UNK]', '[PAD]'), range(4)))
frequent_tokens = [token for token, freq in counter.items() if freq >= vocab_threshold]
for token in frequent_tokens:
    vocab_dict[token] = len(vocab_dict)

print(len(counter), len(vocab_dict))

38190 15672


In [41]:
len(qqp_paraphrase_dataset['train']['question1'])

134378

In [43]:
# Bundle the raw question strings with their spaCy token lists, one Dataset per split.
train_questions = qqp_paraphrase_dataset['train']
raw_train_qqp = datasets.Dataset.from_dict(
    {
        "question1_text": train_questions['question1'],
        "question2_text": train_questions['question2'],
        "question1_wordlst": train_sentence_lst1,
        "question2_wordlst": train_sentence_lst2,
    }
)

eval_questions = qqp_paraphrase_dataset['validation']
raw_eval_qqp = datasets.Dataset.from_dict(
    {
        "question1_text": eval_questions['question1'],
        "question2_text": eval_questions['question2'],
        "question1_wordlst": eval_sentence_lst1,
        "question2_wordlst": eval_sentence_lst2,
    }
)

In [None]:

def tokenize_func(samples, vocab=None, max_length=None):
    """Turn tokenized sequences into fixed-length id rows plus attention masks.

    Every sequence in ``samples['text']`` is encoded as
    ``[START] tokens... [END]``, truncated so the row never exceeds the target
    length, and right-padded with ``[PAD]``. Tokens that are not in the
    vocabulary are mapped to ``[UNK]``.

    Args:
        samples: mapping with a ``'text'`` entry holding a list of token lists.
        vocab: token -> id mapping that contains the four special tokens
            ``[START]``, ``[END]``, ``[UNK]`` and ``[PAD]``. Defaults to the
            notebook-level ``vocab_dict``.
        max_length: exact length of every returned row. Defaults to the
            notebook-level ``max_len`` (which must be defined before the call
            when this argument is omitted).

    Returns:
        ``{'input_ids': [...], 'attention_mask': [...]}`` where both values
        hold one list of ``max_length`` ints per input sequence; the mask is 1
        for START/token/END positions and 0 for padding.

    Raises:
        ValueError: if the target length is smaller than 2, i.e. there is no
            room for the START and END markers.
    """
    # Resolve the notebook globals lazily so explicit arguments always win.
    if vocab is None:
        vocab = vocab_dict
    if max_length is None:
        max_length = max_len
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to fit the [START] and [END] tokens")

    start_id, end_id = vocab['[START]'], vocab['[END]']
    unk_id, pad_id = vocab['[UNK]'], vocab['[PAD]']

    input_ids = []
    attention_mask = []
    for seq in samples['text']:
        # Reserve two slots for START/END; slicing is a no-op for short sequences.
        kept = seq[:max_length - 2]
        n_pad = max_length - len(kept) - 2
        input_ids.append(
            [start_id] + [vocab.get(token, unk_id) for token in kept] + [end_id] + [pad_id] * n_pad
        )
        attention_mask.append([1] * (len(kept) + 2) + [0] * n_pad)
    return {'input_ids': input_ids, 'attention_mask': attention_mask}

In [48]:
bert_tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [55]:
# Fixed row length (including [START]/[END]) used by the custom word-level encoding.
max_len = 32


def _encode_wordlst(word_lists, vocab, max_length):
    """Encode token lists as ``[START] ... [END]`` rows padded/truncated to ``max_length``.

    Returns a ``(input_ids, attention_mask)`` pair of parallel lists.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to fit the [START] and [END] tokens")

    start_id, end_id = vocab['[START]'], vocab['[END]']
    unk_id, pad_id = vocab['[UNK]'], vocab['[PAD]']

    input_ids, attention_mask = [], []
    for seq in word_lists:
        # Reserve two slots for START/END; slicing is a no-op for short sequences.
        kept = seq[:max_length - 2]
        n_pad = max_length - len(kept) - 2
        input_ids.append(
            [start_id] + [vocab.get(token, unk_id) for token in kept] + [end_id] + [pad_id] * n_pad
        )
        attention_mask.append([1] * (len(kept) + 2) + [0] * n_pad)
    return input_ids, attention_mask


def total_tokenize_func(samples, vocab=None, max_length=None, bert_tok=None, bert_max_length=None):
    """Encode a batch of question pairs with both the BERT and the custom tokenizer.

    Intended for ``Dataset.map(..., batched=True)``: every value in ``samples``
    is a list with one entry per example.

    Args:
        samples: batch with ``question1_text`` / ``question2_text`` (raw
            strings) and ``question1_wordlst`` / ``question2_wordlst`` (token
            lists).
        vocab: token -> id mapping containing ``[START]``, ``[END]``,
            ``[UNK]`` and ``[PAD]``. Defaults to the notebook-level
            ``vocab_dict``.
        max_length: row length of the custom encoding. Defaults to the
            notebook-level ``max_len``.
        bert_tok: callable tokenizer used for the ``*_bert`` columns. Defaults
            to the notebook-level ``bert_tokenizer``.
        bert_max_length: optional ``max_length`` forwarded to ``bert_tok``.
            When left as ``None`` the argument is not passed at all, so the
            tokenizer pads to its own maximum (512 for the tokenizer shown in
            this notebook), exactly as before.

    Returns:
        dict of new columns: ``question{1,2}_<key>_bert`` for every key the
        BERT tokenizer returns, followed by ``question{1,2}_input_ids`` and
        ``question{1,2}_attention_mask`` from the custom encoding.

    Raises:
        ValueError: if the custom row length is smaller than 2.
    """
    # Resolve the notebook globals lazily so explicit arguments always win.
    if vocab is None:
        vocab = vocab_dict
    if max_length is None:
        max_length = max_len
    if bert_tok is None:
        bert_tok = bert_tokenizer

    bert_kwargs = {'padding': 'max_length', 'truncation': True}
    if bert_max_length is not None:
        bert_kwargs['max_length'] = bert_max_length

    res = {}

    # BERT encoding: one column per tokenizer output key and per question.
    for prefix in ('question1_', 'question2_'):
        encoded = bert_tok(samples[prefix + 'text'], **bert_kwargs)
        for key, value in encoded.items():
            res[prefix + key + '_bert'] = value

    # Custom word-level encoding, shared by both questions.
    question1_input_ids, question1_attention_mask = _encode_wordlst(
        samples['question1_wordlst'], vocab, max_length
    )
    question2_input_ids, question2_attention_mask = _encode_wordlst(
        samples['question2_wordlst'], vocab, max_length
    )

    # Insertion order is kept identical to the original column order.
    res['question1_input_ids'] = question1_input_ids
    res['question2_input_ids'] = question2_input_ids
    res['question1_attention_mask'] = question1_attention_mask
    res['question2_attention_mask'] = question2_attention_mask

    return res

In [56]:
raw_eval_qqp

Dataset({
    features: ['question1_text', 'question2_text', 'question1_wordlst', 'question2_wordlst'],
    num_rows: 14885
})

In [57]:
# Add the BERT and custom-vocabulary encodings as new columns of the validation split.
# batched=True: total_tokenize_func receives each column as a list of examples.
qqp_eval = raw_eval_qqp.map(total_tokenize_func, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

In [61]:
# Add the BERT and custom-vocabulary encodings as new columns of the training split.
# batched=True: total_tokenize_func receives each column as a list of examples.
qqp_train = raw_train_qqp.map(total_tokenize_func, batched=True)

  0%|          | 0/135 [00:00<?, ?ba/s]

In [63]:
qqp_train.save_to_disk("data/qqp_paraphrase_train")

In [64]:
qqp_eval.save_to_disk("data/qqp_paraphrase_eval")

In [65]:
vocab_dict

{'[START]': 0,
 '[END]': 1,
 '[UNK]': 2,
 '[PAD]': 3,
 'How': 4,
 'do': 5,
 'I': 6,
 'control': 7,
 'my': 8,
 'horny': 9,
 'emotions': 10,
 '?': 11,
 'What': 12,
 'can': 13,
 'one': 14,
 'after': 15,
 'MBBS': 16,
 'is': 17,
 'the': 18,
 'best': 19,
 'self': 20,
 'help': 21,
 'book': 22,
 'you': 23,
 'have': 24,
 'read': 25,
 'Why': 26,
 'did': 27,
 'it': 28,
 'change': 29,
 'your': 30,
 'life': 31,
 'will': 32,
 'be': 33,
 'Hillary': 34,
 'Clinton': 35,
 "'s": 36,
 'policy': 37,
 'towards': 38,
 'India': 39,
 'if': 40,
 'she': 41,
 'becomes': 42,
 'president': 43,
 'Which': 44,
 'to': 45,
 'study': 46,
 'for': 47,
 'general': 48,
 'relativity': 49,
 'from': 50,
 'basic': 51,
 'are': 52,
 'coolest': 53,
 'Android': 54,
 'hacks': 55,
 'and': 56,
 'tricks': 57,
 'know': 58,
 'motivational': 59,
 'videos': 60,
 'lose': 61,
 'weight': 62,
 'fast': 63,
 'does': 64,
 'an': 65,
 'IQ': 66,
 'test': 67,
 'work': 68,
 'what': 69,
 'determined': 70,
 'Is': 71,
 'safe': 72,
 'use': 73,
 'Xiaomi': 7

In [67]:
# Persist the token -> id vocabulary alongside the saved datasets.
with open("data/qqp_paraphrase_vocab_dict", "wb") as vocab_file:
    pickle.dump(vocab_dict, vocab_file)

In [2]:
# Reload the encoded splits written earlier with save_to_disk (same paths).
qqp_train = datasets.load_from_disk("data/qqp_paraphrase_train")
qqp_eval = datasets.load_from_disk("data/qqp_paraphrase_eval")

In [3]:
qqp_eval

Dataset({
    features: ['question1_text', 'question2_text', 'question1_wordlst', 'question2_wordlst', 'question1_input_ids_bert', 'question1_token_type_ids_bert', 'question1_attention_mask_bert', 'question2_input_ids_bert', 'question2_token_type_ids_bert', 'question2_attention_mask_bert', 'question1_input_ids', 'question2_input_ids', 'question1_attention_mask', 'question2_attention_mask'],
    num_rows: 14885
})

In [4]:
# Restore the token -> id vocabulary written by this notebook.
# NOTE: pickle.load can execute arbitrary code; only use it on files you produced yourself.
with open("data/qqp_paraphrase_vocab_dict", "rb") as vocab_file:
    vocab_dict = pickle.load(vocab_file)

In [5]:
vocab_dict

{'[START]': 0,
 '[END]': 1,
 '[UNK]': 2,
 '[PAD]': 3,
 'How': 4,
 'do': 5,
 'I': 6,
 'control': 7,
 'my': 8,
 'horny': 9,
 'emotions': 10,
 '?': 11,
 'What': 12,
 'can': 13,
 'one': 14,
 'after': 15,
 'MBBS': 16,
 'is': 17,
 'the': 18,
 'best': 19,
 'self': 20,
 'help': 21,
 'book': 22,
 'you': 23,
 'have': 24,
 'read': 25,
 'Why': 26,
 'did': 27,
 'it': 28,
 'change': 29,
 'your': 30,
 'life': 31,
 'will': 32,
 'be': 33,
 'Hillary': 34,
 'Clinton': 35,
 "'s": 36,
 'policy': 37,
 'towards': 38,
 'India': 39,
 'if': 40,
 'she': 41,
 'becomes': 42,
 'president': 43,
 'Which': 44,
 'to': 45,
 'study': 46,
 'for': 47,
 'general': 48,
 'relativity': 49,
 'from': 50,
 'basic': 51,
 'are': 52,
 'coolest': 53,
 'Android': 54,
 'hacks': 55,
 'and': 56,
 'tricks': 57,
 'know': 58,
 'motivational': 59,
 'videos': 60,
 'lose': 61,
 'weight': 62,
 'fast': 63,
 'does': 64,
 'an': 65,
 'IQ': 66,
 'test': 67,
 'work': 68,
 'what': 69,
 'determined': 70,
 'Is': 71,
 'safe': 72,
 'use': 73,
 'Xiaomi': 7

In [7]:
len(qqp_train)

134378

In [14]:
from data_utils import load_qqp_dataset_and_tokenizer_from_disk, QQPParaphraseDataset

In [3]:
# NOTE(review): this rebinds qqp_train, qqp_eval and vocab_dict, which earlier cells loaded directly from disk.
# The outputs below show objects exposing .keys() with 120940 / 13438 rows, versus the 134378 / 14885 rows
# saved earlier — presumably the helper re-splits the saved training set; confirm in data_utils.
qqp_train, qqp_eval, vocab_dict = load_qqp_dataset_and_tokenizer_from_disk(data_path='data',)

In [9]:
len(qqp_train['question1_text'])


120940

In [8]:
len(qqp_eval['question1_text'])

13438

In [10]:
qqp_train.keys()

dict_keys(['question1_text', 'question2_text', 'question1_wordlst', 'question2_wordlst', 'question1_input_ids_bert', 'question1_token_type_ids_bert', 'question1_attention_mask_bert', 'question2_input_ids_bert', 'question2_token_type_ids_bert', 'question2_attention_mask_bert', 'question1_input_ids', 'question2_input_ids', 'question1_attention_mask', 'question2_attention_mask'])

In [12]:
qqp_eval['question1_text']

['"Is ""Pokemon Ranger and The Temple of The Sea"" considered appropriate for kids?"',
 'What is the best textbook for Hebrew?',
 'How do I take control on masturbation?',
 'Who is your favourite female movie director and why?',
 'Do ghost actually exists?',
 'Which is the best institute in Mumbai for doing Financial Modeling certification course?',
 'What is exam pattern of MH CET MBA?',
 'What is the difference between a porn figure and a prostitute?',
 '"Why my \'\'i"" is different than yours?"',
 'How do you become more masculine?',
 'How can improve my managerial skills?',
 'Why do people ask question on Quora that can be easily and definitively answered by Googling?',
 'How did you find a job abroad?',
 'Why would a boy love a girl?',
 'What do you hate about school?',
 'What are the consequences of lying about your ethnicity on your college applications?',
 'What are compounds? What are some examples?',
 'What would you do if you woke up to find that nuclear war had started?',
 

In [13]:
qqp_eval['question2_text']

['"Is ""Pokémon Ranger and The Temple of The Sea"" considered childish?"',
 "What's the best self study book to learn Hebrew?",
 'How do I control on masturbation?',
 'Who is the best female movie director?',
 'Does ghost really exist?',
 'Which is the best institute in Mumbai from where a fresher can learn financial modeling?',
 'What is the exam pattern of MH CET MBA?',
 'What is a difference between a prostitute and a porn star?',
 '"Why my \'\' I "" is different than yours?"',
 'How can one become more masculine?',
 'How do you improve your managerial skills?',
 'Why do so many people ask questions on Quora that can be easily answered by any number of legitimate sources on the Web? Have they not heard of Google or Bing?',
 'How do I find a job abroad?',
 'Why do boys love girls?',
 'Why do you hate school?',
 'Can I lie about my ethnicity to top college admissions?',
 'What are some examples of compounds?',
 'What would you do, if nuclear war began?',
 'How should I start learning 

In [15]:
train_dataset = QQPParaphraseDataset(qqp_train)

In [18]:
eval_dataset = QQPParaphraseDataset(qqp_eval)

In [22]:
# Pickle the current qqp_train / qqp_eval objects into the working directory, one file each.
for out_path, payload in (("qqp_train_dict", qqp_train), ("qqp_eval_dict", qqp_eval)):
    with open(out_path, "wb") as out_file:
        pickle.dump(payload, out_file)

In [21]:
eval_dataset[0]

{'question1_input_ids_bert': tensor([  101,  1000,  2003,  1000,  1000, 20421, 11505,  1998,  1996,  3379,
          1997,  1996,  2712,  1000,  1000,  2641,  6413,  2005,  4268,  1029,
          1000,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,  