In [1]:
import pandas as pd
import numpy as np
import requests
import json
import pandas as pd
import io
from nltk.parse import DependencyGraph
from typing import List, Tuple, Dict
import tqdm
import spacy
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from collections import Counter
import time
import pickle

### Data Loading

In [2]:
# Load the Quora question-pairs dump; the file is tab-separated.
df = pd.read_csv("quora_duplicate_questions.tsv", delimiter="\t")
# For quick experiments, uncomment to keep only the first 30000 rows:
# df = df.head(30000)

In [3]:
# Keep only the pairs labelled as duplicates.
duplicates = df.loc[df["is_duplicate"] == 1]

# Lower-cased question strings, one list per side of the pair.
question1 = duplicates["question1"].str.lower().tolist()
question2 = duplicates["question2"].str.lower().tolist()

duplicates.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
15,15,31,32,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,1
16,16,33,34,What does manipulation mean?,What does manipulation means?,1
18,18,37,38,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...,1
20,20,41,42,Why do rockets look white?,Why are rockets and boosters painted white?,1
29,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1


### Encoder 

In [13]:
class BertEncoder(object):
    """Sentence encoder that returns the last-layer BERT state over the [CLS] token."""

    def __init__(self, device = 'cpu', model_name = 'bert-base-uncased'):
        """
        Parameters
        ----------
        device: Device string the model and the input tensors are moved to (e.g. 'cpu' or 'cuda').
        model_name: Name or path handed to ``from_pretrained`` for both the tokenizer and the model.
        """

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()  # evaluation mode: the encoder is only used for inference
        self.model.to(device)
        self.device = device

    def tokenize(self, original_sentence: List[str]) -> Tuple[List[str], Dict[int, int]]:

        """
        Parameters
        ----------
        original_sentence: The sentence as a list of already-split words.
        Returns
        -------
        bert_tokens: The sentence, tokenized by BERT tokenizer, wrapped in "[CLS]" ... "[SEP]".
        orig_to_tok_map: An output dictionary consisting of a mapping (alignment) between indices in the original tokenized sentence, and indices in the sentence tokenized by the BERT tokenizer. See https://github.com/google-research/bert
        """

        bert_tokens = ["[CLS]"]
        orig_to_tok_map = {}

        for i, w in enumerate(original_sentence):
            bert_tokens.extend(self.tokenizer.tokenize(w))
            # Recorded after the extend, so word i maps to the index of its LAST
            # piece. A word that yields no pieces maps to the preceding token.
            orig_to_tok_map[i] = len(bert_tokens) - 1

        bert_tokens.append("[SEP]")

        return (bert_tokens, orig_to_tok_map)

    def encode(self, sentence: str) -> Tuple["np.ndarray", Dict[int, int]]:
        """
        Parameters
        ----------
        sentence: Raw sentence; it is split on single spaces before tokenization.
        Returns
        -------
        predictions: 1-D numpy array, the model's first output at position 0 (the [CLS] token).
        orig2tok: The word-index to token-index alignment produced by ``tokenize``.
        """

        # NOTE(review): no truncation is applied here; very long inputs are
        # presumably rejected by the model itself - confirm if that matters.
        tokenized_text, orig2tok = self.tokenize(sentence.split(" "))
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens]).to(self.device)

        with torch.no_grad():
            outputs = self.model(tokens_tensor)

        # take last hidden layer, the state over the CLS token
        predictions = outputs[0].detach().cpu().numpy()[0, 0, :]
        return (predictions.squeeze(), orig2tok)

### Encode questions

In [14]:
# Use the GPU when one is available; otherwise fall back to the CPU so this cell
# does not fail on machines without CUDA.
bert = BertEncoder('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Encode both questions of every duplicate pair. Each entry of `states` is a
# 2-tuple holding the bert.encode results for (question1, question2), in order.
states = []

for first_question, second_question in tqdm.tqdm(zip(question1, question2), total=len(question1)):
    states.append((bert.encode(first_question), bert.encode(second_question)))

 38%|███▊      | 57322/149263 [21:24<36:12, 42.31it/s] 

In [None]:
# Row ids of the duplicate pairs, in the same order as question1 / question2.
pair_ids = duplicates.loc[:, "id"].tolist()

In [None]:
# Split `states` into the two sides of each pair, then keep only the first item of
# every encode result (the vector); the alignment map is not stored.
encoded_first, encoded_second = zip(*states)
q1 = tuple(vector for vector, _alignment in encoded_first)
q2 = tuple(vector for vector, _alignment in encoded_second)

# One record per pair: (vector1, vector2, question1 text, question2 text, pair id).
states_with_strings_and_ids = list(zip(q1, q2, question1, question2, pair_ids))
with open("bert-base.cls.last-layer.pickle", "wb") as f:
    pickle.dump(states_with_strings_and_ids, f)

In [None]:
def to_string(np_array):
    """Format each value of an iterable with four decimals and join them with tabs."""
    formatted_values = ("%0.4f" % value for value in np_array)
    return "\t".join(formatted_values)
    
# Export the embeddings and their labels as two parallel files: line i of
# vecs.tsv holds the tab-separated vector for the question on line i of
# labels.tsv (all question1 entries first, then all question2 entries).
with open("vecs.tsv", "w", encoding="utf-8") as f:
    for v in q1 + q2:
        # encode() as defined in this notebook returns a 1-D vector, for which
        # v[0] is a single float and to_string cannot iterate it. atleast_2d
        # accepts both a 1-D vector and a 2-D (rows, dim) array and always
        # yields the first row as the vector to write.
        f.write(to_string(np.atleast_2d(v)[0]) + "\n")

# Explicit UTF-8 so non-ASCII question text does not depend on the platform's
# default encoding.
with open("labels.tsv", "w", encoding="utf-8") as f:
    for s in question1 + question2:
        f.write(s + "\n")

In [25]:
# Sanity check: shape of the first stored question1 encoding.
# NOTE(review): the output recorded under this cell does not look like the 1-D
# vector that encode() (as defined in this notebook) returns - presumably stale
# output from an earlier run; confirm by re-running from a fresh kernel.
q1[0].shape

(27, 768)