# Sentence BERT
The purpose of this notebook is to experiment with the functionality that Sentence-BERT provides in the context of semantic search and clustering.

## Initial setup

In [103]:
import csv
import json
import time

import numpy as np
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

def timer(f, *args, **kwargs):
    """Call ``f(*args, **kwargs)``, print how long it took and return its result.

    Args:
        f: Any callable.
        *args: Positional arguments forwarded to ``f``.
        **kwargs: Keyword arguments forwarded to ``f``.

    Returns:
        Whatever ``f`` returns.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    out = f(*args, **kwargs)
    elapsed = time.perf_counter() - start

    # Callables such as functools.partial objects have no __name__; fall back
    # to their repr instead of failing after the work has already been done.
    name = getattr(f, "__name__", repr(f))
    print(f"Took {round(elapsed, 3)} to execute {name}")
    return out
#end def

class SimMeasurer():
    """Cosine-similarity search of a query against a fixed list of sentences.

    The sentence embeddings and their norms are computed once at construction
    time so that each query only costs one matrix-vector product.
    """

    def __init__(self, text: list, model: "SentenceTransformer", embedded_text=None):
        """Store the corpus and its embeddings.

        Args:
            text: The sentences to search in.
            model: Object with an ``encode(list_of_str)`` method returning one
                embedding per input sentence.
            embedded_text: Optional precomputed embeddings for ``text`` (one
                row per sentence). When omitted or empty, ``model.encode(text)``
                is called.
        """
        self.text = text
        self.model = model

        # `not embedded_text` raises for multi-element numpy arrays, so test
        # for "missing or empty" explicitly.
        if embedded_text is None or len(embedded_text) == 0:
            embedded_text = model.encode(text)
        #end if

        # Normalise lists of vectors to a single 2-D array.
        self.embedded_text = np.asarray(embedded_text)
        self.precomputed_norm = norm(self.embedded_text, axis=1)
    #end def

    def matrix_cosine_sim(self, query):
        """Return the cosine similarity between ``query`` and every stored embedding.

        Args:
            query: A single embedding vector. Any shape holding exactly one
                vector is accepted (flat, row or column); it is flattened first.

        Returns:
            1-D numpy array with one similarity per stored sentence.
        """
        query = np.asarray(query).reshape(-1)
        query_norm = norm(query)

        prod = np.matmul(self.embedded_text, query)

        cosine_sim = np.divide(prod, query_norm * self.precomputed_norm)
        return cosine_sim
    #end def

    def get_n_most_similar(self, query, n=1):
        """Return indices and scores of the ``n`` sentences most similar to ``query``.

        Args:
            query: A single embedding vector (see ``matrix_cosine_sim``).
            n: Maximum number of hits; fewer are returned when the corpus is
                smaller than ``n``.

        Returns:
            ``(top_idx, top_scores)``: two lists ordered by descending score.
        """
        similarity_scores = self.matrix_cosine_sim(query)

        # Sorting the negated scores gives descending order. A stable sort
        # keeps the lowest index first on ties, like repeated argmax would,
        # without overwriting picked entries with 0 (which re-selects the same
        # index once the remaining similarities are negative).
        order = np.argsort(-similarity_scores, kind="stable")[:max(n, 0)]

        return order.tolist(), similarity_scores[order].tolist()
    #end def

    def get_most_similar(self, query, n=3, scores=False):
        """Encode ``query`` and return the ``n`` most similar stored sentences.

        Args:
            query: The query sentence as a string.
            n: Number of sentences to return.
            scores: When true, always return ``(sentences, scores)``.

        Returns:
            ``(sentences, scores)`` when ``scores`` is true or at least one hit
            was found; otherwise just the (empty) list of sentences.
        """
        start = time.perf_counter()
        # Use the model this instance was built with, not a notebook global.
        query_vector = self.model.encode([query])
        top_idx, top_scores = self.get_n_most_similar(query_vector, n)
        print(f'Took {round(time.perf_counter()-start, 3)} s to encode query and find {n} most similar sentences')

        hits = [self.text[t] for t in top_idx]

        # NOTE(review): historically the `scores` flag was shadowed by the
        # score list, so a pair was returned whenever there was at least one
        # hit. Callers in this notebook unpack the pair without passing the
        # flag, so that behaviour is kept; `scores=True` is now also reliable.
        if scores or top_scores:
            return hits, top_scores
        else:
            return hits
    #end def

#end class

  
class QuoraDataset():
    """Rows of a question-pair CSV file, loaded fully into memory.

    Each row is kept as a dict keyed by the CSV header; the methods below read
    the ``question1``, ``question2`` and ``is_duplicate`` columns.
    """

    def __init__(self, file_path):
        """Read every row of the CSV at ``file_path``."""
        self.file_path = file_path
        self.dict_data = self._read_from_file()

        self.len = len(self.dict_data)
    #end def

    def _read_from_file(self):
        """Return the CSV rows as a list of dicts."""
        # newline='' is what the csv module requires so that line breaks inside
        # quoted fields are parsed correctly.
        # NOTE(review): assumes the file is UTF-8 encoded - confirm.
        with open(self.file_path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            dict_data = [row for row in reader]

        return dict_data
    #end def

    def get_questions(self, n=1, deduplicate=True):
        """Return up to ``n`` questions in file order.

        Args:
            n: Maximum number of questions to return.
            deduplicate: When true (default), each distinct question string is
                returned once and ``question2`` is skipped for rows flagged
                ``is_duplicate == '1'``. When false, every non-empty question
                is returned, repeats included.

        Returns:
            A list with at most ``n`` questions; shorter when the file does not
            hold enough of them.
        """
        questions = []
        if n <= 0:
            return questions
        #end if

        seen = set()
        for row in self.dict_data:
            for q_idx in [1,2]:
                question = row.get(f'question{q_idx}')

                # Skip missing/empty cells and, when deduplicating, repeats.
                if question and not (deduplicate and question in seen):
                    seen.add(question)
                    questions.append(question)
                    if len(questions) >= n:
                        return questions
                    #end if
                #end if

                if deduplicate and row.get('is_duplicate') == '1':
                    break
                #end if
            #end for
        #end for

        # Fewer than n questions available: return what was collected.
        return questions
    #end def

    def __len__(self):
        """Number of rows read from the file."""
        return self.len
    #end def

    def __str__(self):
        return f"I am a collection of {self.len} questions from path {self.file_path}"
    #end def


#end class

Load dataset and model

In [94]:
# Load the 'bert-base-nli-stsb-mean-tokens' SentenceTransformer and read
# train.csv into a QuoraDataset, printing how long each step takes.
model = timer(SentenceTransformer, 'bert-base-nli-stsb-mean-tokens')
dataset = timer(QuoraDataset, 'train.csv')
print(dataset)

100%|██████████| 405M/405M [00:47<00:00, 8.47MB/s]
Took 52.265 to execute SentenceTransformer
Took 2.807 to execute QuoraDataset
I am a collection of 404290 questions from path train.csv


Extract questions from the dataset and encode them using the Sentence-BERT model

In [95]:
# Take up to 1000 questions from the dataset, then embed them with the model;
# both steps are timed.
questions = timer(dataset.get_questions, n=1000)
encoded_questions = timer(model.encode, questions)

Took 0.001 to execute get_questions
Took 19.124 to execute encode


Measure the similarity between a query and the embedded questions from above

In [150]:
# Reuse the embeddings computed above instead of re-encoding the questions.
sim = SimMeasurer(questions, model, encoded_questions)

query = 'How do i learn faster?'
# Ask for the scores explicitly rather than relying on the shape of the
# return value.
most_sim = sim.get_most_similar(query, n=10, scores=True)
matches, match_scores = most_sim

print(query + "\n------------------")
# Pad every sentence to the width of the longest hit so the rows line up.
offset = len(max(matches, key=len, default=""))
for t, s in zip(matches, match_scores):
    # Fixed three decimals ('.3f'); '.3' means three significant digits and
    # yields ragged output such as 0.53 next to 0.544.
    print("Score: {:.3f} - {:<{offset}}".format(s, t, offset=offset))

Took 0.041 s to encode query and find 10 most similar sentences
How do i learn faster?
------------------
Score: 0.544 - How we can travel faster than light?                            
Score: 0.533 - How do I manage time for studies?                               
Score: 0.53 - How do I start writing again?                                   
Score: 0.52 - How do I keep motivation to learn a language?                   
Score: 0.508 - How can I become a good speaker?                                
Score: 0.492 - How do I post a question that was marked as needing improvement?
Score: 0.491 - What is the alternative to machine learning?                    
Score: 0.475 - How do I start writing?                                         
Score: 0.469 - How can one learn to trust again?                               
Score: 0.463 - How much funds should I raise?                                  


## Classifier

In [None]:
import torch


In [186]:
import re

# NOTE(review): extract_key is not defined in the visible part of this
# notebook - confirm it is available before this cell runs.
text = extract_key("Description")

# Split on a whitespace character that follows '.' or '?', except directly
# after a letter-dot-letter-char run or a capitalised two-letter abbreviation.
sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
# Compiled for whitespace normalisation, which is not applied yet.
newline_pattern = re.compile(r'\s+')
sentences = list(map(sentence_pattern.split, text))

Extracted 47 values for key Description from dataset


## Kaggle Quora dataset
This dataset is perfect for a couple of reasons:
    1. It ties in to the report I wrote previously
    2. It has clean sentences that I can embed
    3. In a later iteration I can fine-tune the Sentence-BERT model on this data, which will allow me to dig further into the architecture of the model, etc.


['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']


In [179]:
from random import sample
from random import seed
import csv

# Fixed seed so the train/dev split is the same on every run.
seed(42)

# newline='' is what the csv module requires for correct handling of line
# breaks inside quoted fields.
# NOTE(review): assumes train.csv is UTF-8 encoded - confirm.
with open('train.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    header = next(reader)
    print(header)
    data = [r for r in reader]

# Hold out 10% of the rows as the dev set. The size is derived from `data`;
# the previously referenced `labels` is not defined anywhere in this cell.
dev_size = round(0.1*len(data))

dev_data_idx = set(sample(range(len(data)), dev_size))

# NOTE(review): the header row is not written to either output file - confirm
# that downstream readers expect headerless CSVs.
with open('quora-train.csv', 'w', newline='', encoding='utf-8') as train, \
        open('quora-dev.csv', 'w', newline='', encoding='utf-8') as dev:
    train_writer = csv.writer(train)
    dev_writer = csv.writer(dev)
    for i, row in enumerate(data):
        if i in dev_data_idx:
            dev_writer.writerow(row)
        else:
            train_writer.writerow(row)

['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']


## Fine-tuning SentenceBERT on this task
