In [1]:
from typing import List, Dict
from transformers import pipeline
import spacy
import re
import pandas as pd
import pickle
import os
from nlp import Document, Question, PassageTokenizer, BagWords, QA
from llm import Embedder, Agent
from openai.embeddings_utils import cosine_similarity
from tqdm import tqdm
from time import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import time
# import datetime

# def tt(elapsed):
#     # Round to the nearest second.
#     elapsed_rounded = int(round((elapsed)))
    
#     # Format as hh:mm:ss
#     return str(datetime.timedelta(seconds=elapsed_rounded))

In [3]:
class Cache():
    '''
    This is just a dictionary of documents. Should be
    a service, such as Az Table or Cosmos
    '''
    def __init__(self, documents: Dict = None):
        documents = documents if documents else {}
        self.documents = documents
    
    def set_document(self, doc: Document):
        self.documents[doc.id] = doc

    def get_document(self, id: str):
        if id in self.documents.keys():
            return self.documents[id]
        else:
            return None
    
    def save(self, root: str):
        path = os.path.join(root, 'cache.pkl')
        with open(path, 'wb') as f:
            pickle.dump(self.documents, f)
    
    @classmethod
    def load(cls, path):
        with open(path, 'rb') as f:
            documents = pickle.load(f)
        return cls(documents)

In [4]:
class ConversationPipeline():
    
    def __init__(self, 
                    cache_path: str = None, 
                    bow_path: str = None,
                    bow_corpus: List[str] = None,
                    similarity_threshold: float = 0.5, 
                    qa_threshold: float = 0.5, 
                    do_sem_similairty: bool = True, 
                    do_sin_similairty: bool = True, 
                    do_qa: bool = True):
    
        
        if bow_path:
            self.bow = BagWords.load(bow_path)  
        elif bow_corpus:
            self.bow = BagWords(bow_corpus)
        else:
            raise("ERROR: need to init BOW somehow")
        self.cache = Cache.load(cache_path) if cache_path else Cache()
        self.tok = PassageTokenizer() # missing arguments
        self.embedder = Embedder() # missing arguments
        self.qa = QA() # missing arguments
        self.similarity_threshold = similarity_threshold
        self.qa_threshold = qa_threshold
        self.do_sem_similairty = do_sem_similairty
        self.do_sin_similairty = do_sin_similairty
        self.do_qa = do_qa

    def encode_document(self, full_text: str, id: str, metadata: str):
        doc = self.cache.get_document(id)
        if doc is None:
            doc = Document(full_text, id, metadata)
            doc.passages = self.tok.get_passages(doc.document)
            doc.embeddings = self.embedder(doc.passages)
            doc.bow = self.bow.transform(doc.passages)
            self.cache.set_document(doc)
        return doc
    
    def encode_question(self, question: str):
        emb = self.embedder(question)
        bow = self.bow.transform(question)
        return Question(question, emb, bow)


    def get_best_passages(self, doc: Document, question: Question) -> List[Dict]:
        passages = []

        # check semantic similarity
        if self.do_sem_similairty:
            sem_sims = [self.embedder.similarity(question.embedding, d) for d in doc.embeddings]
        else:
            sem_sims = [0]*doc.size

        # check sintax similarity
        if self.do_sin_similairty:
            sin_sims = [self.bow.similarity(question.bow, d) for d in doc.bow]
        else:
            sin_sims = [0]*doc.size
        
        sims = [max(sem, sim) for sem, sim in zip(sem_sims, sin_sims)]
        for p, s in zip(doc.passages, sims):
            if s < self.similarity_threshold:
                continue
            
            passage = {'doc_id': doc.id, 'metadata': doc.metadata, 'passage': p} #doc.passages[idx]
            # check reading comprehension
            if self.do_qa:
                score = self.qa(p, question) #doc.passages[idx]
                if score < self.qa_threshold:
                    continue
                passage['score'] = score
            else:
                passage['score'] = s
            
            passages.append(passage)
        
        return passages

We are assuming a search service returns a list of documents,
each having a system ID, used to match the cache, and metadata 
to be return to the user as a consumable reference to the document

In [5]:
documents = pd.read_csv('./documents.csv')
documents['metadata'] = documents['id'].apply(lambda r: f"bla-{r}")
documents.head(3)

Unnamed: 0,doc,id,metadata
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,0,bla-0
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,1,bla-1
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,2,bla-2


We instantiate the conversation pipeline with default parameters, using the loaded documents as corpus

In [6]:
corpus = list(documents.doc)
corpus[:3]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [7]:
agent = ConversationPipeline(bow_corpus=corpus)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [8]:
question = 'What size were the doors of the Bricklin car?'
t0 = time()
q = agent.encode_question(question)
print(time()-t0)
print(q.__dict__.keys())
print(q.question)

0.4715242385864258
dict_keys(['question', 'embedding', 'bow'])
What size were the doors of the Bricklin car?


In [9]:
document = documents.iloc[0]
print(document['doc'])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [10]:
t0 = time()
doc = agent.encode_document(document['doc'], document['id'], document['metadata'])
print(time()-t0)
print(doc.__dict__.keys())
print("\n***".join(doc.passages))

0.6968181133270264
dict_keys(['document', 'id', 'metadata', 'passages', 'embeddings', 'bow'])
From: lerxst@wam.umd.edu (where's my thing)
 Subject: WHAT car is this!?
 Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day.
***Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin.
***It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body.
***In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this

In [11]:
# this time the cache is used
t0 = time()
doc = agent.encode_document(document['doc'], document['id'], document['metadata'])
print(time()-t0)

0.0001685619354248047


In [12]:
t0 = time()
doc_passages = agent.get_best_passages(doc, q)
print(time()-t0)
print(doc_passages)

0.7087578773498535
[{'doc_id': 0, 'metadata': 'bla-0', 'passage': 'If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\n Thanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n', 'score': 0.6073856353759766}]


In [13]:
passages = []
t0 = time()
for i in range(10):
    document = documents.iloc[i]
    doc = agent.encode_document(document['doc'], document['id'], document['metadata'])
    doc_passages = agent.get_best_passages(doc, q)
    passages.extend(doc_passages)
print(time()-t0)

12.36342453956604


In [14]:
passages = []
t0 = time()
for i in range(10):
    document = documents.iloc[i]
    doc = agent.encode_document(document['doc'], document['id'], document['metadata'])
    doc_passages = agent.get_best_passages(doc, q)
    passages.extend(doc_passages)
print(time()-t0)

3.680506944656372


In [15]:
passages

[{'doc_id': 0,
  'metadata': 'bla-0',
  'passage': 'If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\n Thanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n',
  'score': 0.6073856353759766},
 {'doc_id': 7,
  'metadata': 'bla-7',
  'passage': 'ALL these SCSI facts have been\nposted to this newsgroup in my Mac & IBM info sheet {available by FTP on \nsumex-aim.stanford.edu (36.44.0.6) in the info-mac/report as \nmac-ibm-compare[version #].txt (It should be 173 but 161 may still be there)}\n\n Part of this problem is both Mac and IBM PC are inconsiant about what SCSI\nis which.  ',
  'score': 0.7398799061775208}]