## Sentence Transformers (specifically stsb-mpnet-base-v2)
This is THE BEST sentence-level embedding model on Hugging Face.
But we'll see if it's good enough for the real world. 


In [None]:
!pip install sentence-transformers rank_bm25

In [3]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted", "Kastan is a fun programmer"]

# Load the pretrained model and embed all three sentences in one call.
model = SentenceTransformer('sentence-transformers/stsb-mpnet-base-v2')
embeddings = model.encode(sentences)
# print(embeddings)
print("embeddings.shape:", embeddings.shape)  # the recorded run printed (3, 768)

# Plain dot products between sentence 0 and the other two (no normalisation is applied here).
score01 = embeddings[0] @ embeddings[1]  # recorded run: 5.535631
score02 = embeddings[0] @ embeddings[2]  # recorded run: 0.17808935
# NOTE: only three sentences are encoded (indices 0-2), so embeddings[3] cannot be scored.

print(score01, score02)  # in the recorded run the first two are closer than the first and third

Ignored unknown kwarg option direction
embeddings.shape: (3, 768)
5.535631 0.17808935


# Doc Query
This does pure question-to-text lookup (no generation).
I like that because the answers are hopefully more factual.

Also this implementation works directly with PDFs! That's awesome for easily using all kinds of new data!

In [None]:
from docquery import document, pipeline
import json
import re

class DocQuery():
    """Answer a batch of questions about a document with docquery's
    ``document-question-answering`` pipeline."""

    def __init__(self):
        pass

    def query(self, data, top_k, doc_path="../Student_Notes.pdf"):
        """Ask every question in *data* against the document at *doc_path*.

        Args:
            data: Sequence of dicts, each with a ``'questions'`` string. Every
                occurrence of a newline followed by ``Q`` and one more
                character is removed from that string before it is asked.
            top_k: Forwarded unchanged to the pipeline as ``top_k``.
            doc_path: Path handed to ``document.load_document``. Defaults to
                the location that used to be hard-coded.

        Returns:
            A list with one ``{'questions': ..., 'answers': ...}`` dict per
            entry of *data*, in input order; ``'answers'`` is whatever the
            pipeline returned for that question. An empty *data* yields ``[]``
            without loading the model or the document.
        """
        # NOTE(review): the '.' in the pattern is an unescaped regex wildcard,
        # so the character after "Q" is removed whatever it is -- confirm that
        # a literal "Q." prefix was not the intent.
        questions = [re.sub('\nQ.', '', entry['questions']) for entry in data]
        if not questions:
            # Nothing to ask: skip the expensive model and document loading.
            return []

        p = pipeline('document-question-answering')
        doc = document.load_document(doc_path)

        # Build a fresh result dict per question instead of rebinding the
        # `data` parameter, as the previous loop did.
        return [
            {
                'questions': q,
                'answers': p(question=q, **doc.context, top_k=top_k),
            }
            for q in questions
        ]

In [None]:
# Call the DocQuery class
d = DocQuery()
# Read the questions explicitly as UTF-8 so the result does not depend on the
# platform's default encoding (the output below is written as UTF-8 too).
with open('../GPT-3_section_level.json', encoding='utf-8') as f:  # use the questions in section data as query
    query = json.load(f)
all_data = d.query(query, 3)
with open('docquery_section_output.json', 'w', encoding='utf-8') as f:
    json.dump(all_data, f, ensure_ascii=False, indent=4)

In [None]:
!docquery scan "What is the invoice number?" https://templates.invoicehome.com/invoice-template-us-neat-750px.png

### Simple Wiki Retrieval 

In [None]:
# Retrieve answers from a given context (retrieval pipeline)
import re
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import torch
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os

class WikiRetrieval():
    """Answer-span extraction from given contexts and passage search over a
    Simple English Wikipedia dump.

    The two encoders and the encoded passage corpus are expensive to build, so
    they are created on the first ``search`` call and reused by later calls.
    """

    def __init__(self):
        # Filled in lazily by _load_search_resources().
        self._bi_encoder = None
        self._cross_encoder = None
        self._passages = None
        self._corpus_embeddings = None

    def retrieval_pipeline(self):
        """Extract answer spans for each question/context pair in
        ``GPT-3_paragraph_level.json``.

        For every entry, the ``'questions'`` string and the
        ``['positive_ctxs']['text']`` context are cleaned and passed to a
        ``deepset/roberta-base-squad2`` question-answering pipeline configured
        with ``top_k=5`` and ``max_answer_len=128``. Entries whose cleaned
        question or context is empty are skipped.

        Returns:
            A list with one pipeline result per kept entry. According to the
            original author's note a result looks like
            ``[{'score': 0.47, 'start': 20, 'end': 28, 'answer': 'textbook'}, ...]``.
        """
        with open('GPT-3_paragraph_level.json', encoding='utf-8') as f:
            data = json.load(f)

        # top_k sets how many answers the pipeline returns (each with a score).
        model = "deepset/roberta-base-squad2"
        pipe = pipeline('question-answering', model=model, tokenizer=model, max_answer_len=128, top_k=5)

        all_retrieve_data = []
        for entry in data:
            # NOTE(review): '.' is an unescaped regex wildcard here -- confirm
            # that a literal "Q." prefix was not the intent.
            question = re.sub('\nQ.', '', entry['questions'])
            context = re.sub('\n', ' ', entry['positive_ctxs']['text'])
            if not question or not context:
                continue
            all_retrieve_data.append(pipe(question=question, context=context))
        return all_retrieve_data

    def _load_search_resources(self):
        """Build the encoders and the encoded corpus once; later calls are no-ops."""
        if getattr(self, '_corpus_embeddings', None) is not None:
            return

        if not torch.cuda.is_available():
            print("Warning: No GPU found. Please add GPU to your notebook")

        # The bi-encoder embeds passages and queries for semantic search.
        bi_encoder = SentenceTransformer('sentence-transformers/stsb-mpnet-base-v2')
        bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens

        # The cross-encoder re-ranks the bi-encoder's candidates.
        cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        # Download the dump on first use only.
        wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'
        if not os.path.exists(wikipedia_filepath):
            util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

        passages = []
        with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
            for line in fIn:
                article = json.loads(line.strip())
                # Only the first paragraph of each article is indexed.
                passages.append(article['paragraphs'][0])

        print("Passages:", len(passages))

        # Encoding the whole corpus is the slow step (the original note says
        # about 5 minutes, depending on GPU speed), hence the caching.
        corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

        self._bi_encoder = bi_encoder
        self._cross_encoder = cross_encoder
        self._passages = passages
        self._corpus_embeddings = corpus_embeddings

    def search(self, query):
        """Find the best-matching Wikipedia passages for *query*.

        The bi-encoder retrieves 32 candidate passages, the cross-encoder
        scores each (query, passage) pair, and the five highest-scoring
        passages are kept.

        Args:
            query: The question text.

        Returns:
            ``{query: [passage, ...]}`` with at most five passages, best
            first, newlines replaced by spaces.
        """
        self._load_search_resources()
        top_k = 32  # Number of passages retrieved by the bi-encoder

        ##### Semantic search #####
        question_embedding = self._bi_encoder.encode(query, convert_to_tensor=True)
        # Follow the corpus' device instead of forcing CUDA, which raised on
        # CPU-only machines right after the "no GPU" warning.
        question_embedding = question_embedding.to(self._corpus_embeddings.device)
        hits = util.semantic_search(question_embedding, self._corpus_embeddings, top_k=top_k)
        hits = hits[0]  # Only one query was submitted

        ##### Re-ranking #####
        cross_inp = [[query, self._passages[hit['corpus_id']]] for hit in hits]
        cross_scores = self._cross_encoder.predict(cross_inp)
        for hit, score in zip(hits, cross_scores):
            hit['cross-score'] = score

        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
        output = [self._passages[hit['corpus_id']].replace("\n", " ") for hit in hits[0:5]]
        return {query: output}

    def search_wiki(self, data):
        """Run ``search`` for every non-empty question in *data*.

        Args:
            data: Sequence of dicts, each with a ``'questions'`` string.

        Returns:
            A list of ``search`` results, one per entry whose cleaned question
            is non-empty, in input order.
        """
        all_retrieve_data = []
        for entry in data:
            question = re.sub('\nQ.', '', entry['questions'])
            # Check before searching so empty questions cost nothing.
            if not question:
                continue
            all_retrieve_data.append(self.search(query=question))
        return all_retrieve_data
        

In [None]:
# Call the WikiRetrieval class
r = WikiRetrieval()
# Read the questions explicitly as UTF-8 so the result does not depend on the
# platform's default encoding (the output below is written as UTF-8 too).
with open('GPT-3_paragraph_level.json', encoding='utf-8') as f:
    query = json.load(f)
all_retrieve_data = r.search_wiki(query)
with open('wiki_retrieval_paragraph.json', 'w', encoding='utf-8') as f:
    json.dump(all_retrieve_data, f, ensure_ascii=False, indent=4)

### Paragraph Retrieval (Cross-Encoder Re-Ranking)

In [None]:
# Retrieve answers from a given context (retrieval pipeline)
import re
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import torch
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os

class ParagraphRetrieval():
    """Passage search over the paragraphs stored in ``../paragraphs.json``,
    with bi-encoder retrieval followed by cross-encoder re-ranking.

    The two encoders and the encoded paragraph corpus are expensive to build,
    so they are created on the first ``search`` call and reused afterwards.
    """

    def __init__(self):
        # Filled in lazily by _load_search_resources().
        self._bi_encoder = None
        self._cross_encoder = None
        self._passages = None
        self._corpus_embeddings = None

    def clean(self, text):
        """Return *text* with every newline removed (nothing is inserted in its place)."""
        return re.sub('\n', '', text)

    def _load_search_resources(self):
        """Build the encoders and the encoded corpus once; later calls are no-ops."""
        if getattr(self, '_corpus_embeddings', None) is not None:
            return

        if not torch.cuda.is_available():
            print("Warning: No GPU found. Please add GPU to your notebook")

        # The bi-encoder embeds passages and queries for semantic search.
        bi_encoder = SentenceTransformer('sentence-transformers/stsb-mpnet-base-v2')
        bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens

        # The cross-encoder re-ranks the bi-encoder's candidates.
        cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        # paragraphs.json is the retrieval dataset.
        with open('../paragraphs.json', encoding='utf-8') as f:
            data = json.load(f)

        # Use every paragraph. The previous chunked index loop skipped indices
        # 99, 199, ... and never reached the final partial chunk.
        # Assumes every top-level value is a paragraph string (the old code
        # read them as data["0"], data["1"], ...) -- TODO confirm.
        passages = [self.clean(paragraph) for paragraph in data.values()]

        print("Passages:", len(passages))

        # Encoding the corpus is the slow step, hence the caching.
        corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

        self._bi_encoder = bi_encoder
        self._cross_encoder = cross_encoder
        self._passages = passages
        self._corpus_embeddings = corpus_embeddings

    def search(self, query):
        """Find the best-matching paragraphs for *query*.

        The bi-encoder retrieves 32 candidate paragraphs, the cross-encoder
        scores each (query, paragraph) pair, and the five highest-scoring
        paragraphs are kept.

        Args:
            query: The question text.

        Returns:
            ``{query: [entry, ...]}`` with at most five entries, best first;
            each entry is the cross-encoder score, a space, and the paragraph.
        """
        self._load_search_resources()
        top_k = 32  # Number of passages retrieved by the bi-encoder

        ##### Semantic search #####
        question_embedding = self._bi_encoder.encode(query, convert_to_tensor=True)
        # Follow the corpus' device instead of forcing CUDA, which raised on
        # CPU-only machines right after the "no GPU" warning.
        question_embedding = question_embedding.to(self._corpus_embeddings.device)
        hits = util.semantic_search(question_embedding, self._corpus_embeddings, top_k=top_k)
        hits = hits[0]  # Only one query was submitted

        ##### Re-ranking #####
        cross_inp = [[query, self._passages[hit['corpus_id']]] for hit in hits]
        cross_scores = self._cross_encoder.predict(cross_inp)
        for hit, score in zip(hits, cross_scores):
            hit['cross-score'] = score

        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
        # Passages were already stripped of newlines by clean() when loaded.
        output = [
            str(hit['cross-score']) + " " + self._passages[hit['corpus_id']]
            for hit in hits[0:5]
        ]
        return {query: output}

    def search_paragraph(self, data):
        """Run ``search`` for every non-empty question in *data*.

        Args:
            data: Sequence of dicts, each with a ``'questions'`` string.

        Returns:
            A list of ``search`` results, one per entry whose cleaned question
            is non-empty, in input order.
        """
        all_retrieve_data = []
        for entry in data:
            # NOTE(review): '.' is an unescaped regex wildcard here -- confirm
            # that a literal "Q." prefix was not the intent.
            question = re.sub('\nQ.', '', entry['questions'])
            # Check before searching so empty questions cost nothing.
            if not question:
                continue
            all_retrieve_data.append(self.search(query=question))
        return all_retrieve_data
        

In [None]:
# Call the class
p = ParagraphRetrieval()
# Read the questions explicitly as UTF-8 so the result does not depend on the
# platform's default encoding (the output below is written as UTF-8 too).
with open('../GPT-3_paragraph_level.json', encoding='utf-8') as f:
    query = json.load(f)
all_retrieve_data = p.search_paragraph(query)
with open('sentence_transformer_paragraph_cross_encoder.json', 'w', encoding='utf-8') as f:
    json.dump(all_retrieve_data, f, ensure_ascii=False, indent=4)