In [None]:
!pip -q install txtai[all] langchain pypdf > /dev/null

In [None]:
%%capture

from langchain.text_splitter import RecursiveCharacterTextSplitter
from txtai.embeddings import Embeddings
from langchain.document_loaders import PyPDFLoader
import glob

In [None]:
# Load every PDF in the studies folder; pdf_text collects the documents returned by each loader.
# glob returns paths in arbitrary order, so they are sorted: the position of each chunk is
# later used as its id when indexing, and a stable order keeps those ids reproducible.
pdf_paths = sorted(glob.glob('/content/studies/*.pdf'))
if not pdf_paths:
  # Fail here with a clear message instead of an IndexError in a later cell.
  raise FileNotFoundError("No PDF files found in /content/studies")

pdf_text = []
for pdf in pdf_paths:
  pdf_text.extend(PyPDFLoader(pdf).load())

In [None]:
# Peek at the first loaded document to sanity-check the PDF text extraction.
pdf_text[0]

In [None]:
# Chunking parameters. Sizes are counted in characters because length_function is len.
CHUNK_SIZE = 350
CHUNK_OVERLAP = 25

document_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

In [None]:
# Split the text of every loaded document into chunks and gather them in one flat list.
# Nothing is printed per document: dumping every page would flood the cell output.
split_data = [
  chunk
  for doc in pdf_text
  for chunk in document_splitter.split_text(doc.page_content)
]

In [None]:
# Index settings. "content": True stores the text alongside the vectors;
# the default behavior is to only store indexed vectors.
embeddings_config = {
    "path": "sentence-transformers/nli-mpnet-base-v2",
    "content": True,
    "objects": True,
}

embeddings = Embeddings(embeddings_config)

In [None]:
# Build one (id, text, tags) tuple per chunk -- the chunk's position is its id, tags are
# left empty -- and index the whole list.
documents = [(position, chunk, None) for position, chunk in enumerate(split_data)]
embeddings.index(documents)

In [37]:
# Optional: uncomment to load an existing index from the "index" path instead of rebuilding it.
# NOTE(review): no matching save call appears in this notebook -- confirm the index exists first.
#embeddings.load("index")

In [39]:
# Natural-language search; request up to 7 matches.
embeddings.search("What is self ask?", limit=7)

[{'id': '634',
  'text': 'Right Answer\nSelf-ask\nChain of thought\nQuestion When was the ﬁrst location of the world’s largest coffeehouse chain opened?\nRight Answer\nSelf-ask\nChain of thought\nQuestion Who directed the highest grossing ﬁlm?\nRight Answer\nSelf-ask\nChain of thought\n24',
  'score': 0.5150313973426819},
 {'id': '447',
  'text': '001\n002\n−0.04−0.02 0.00 0.02 0.04−0.04−0.020.000.020.04\nSelf-ask\nAnswered CorrectlyBoth Subquestions\nAnswered CorrectlyFigure 4: Self-ask is able to narrow and sometimes close the compositionality gap on CC. Here,\nself-ask uses a 1-shot prompt. Chain of thought performed within 1% of self-ask on this dataset and',
  'score': 0.4897139072418213},
 {'id': '631',
  'text': 'Self-Ask (ours) 30.0 36.1 35.4 13.8 27.0 16.2\nSelf-Ask + Search Engine (ours) 40.1 52.6 53.1 15.2 27.2 19.6\nTable 14 shows examples from Bamboogle where chain of thought outputs a full-sentence ﬁnal\nanswer instead of a short form answer. The prompt for chain of thoug

## Similar Clause

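The `similar("query", "number of candidates")` clause runs a similarity search inside a SQL statement: the first argument is the search text and the optional second argument is the number of candidate results to consider.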

In [41]:
# SQL form of the semantic search. The table is written as `txtai`, the same table name
# used by the other SQL queries in this notebook.
embeddings.search("select * from txtai where similar('what is self ask')")

[{'id': '444',
  'text': 'Self-ask (depicted in Figure 3) requires a one- or few-shot prompt that demonstrates how to answer\nthe questions. Our prompt starts with those examples, after which we append the inference-time\nquestion. We then insert the phrase “Are follow up questions needed here:” at the end of the prompt',
  'score': 0.5317630171775818},
 {'id': '60',
  'text': 'over two or more Wikipedia passages, and (2) FEVER (Thorne et al., 2018), a fact veriﬁcation\nbenchmark where each claim is annotated SUPPORTS, REFUTES, or NOT ENOUGH INFO, based\non if there exists a Wikipedia passage to verify the claim. In this work, we operate in a question-only',
  'score': 0.5158118009567261},
 {'id': '439',
  'text': 'elicitive prompts can occasionally answer even more compositional questions than direct prompts\ncorrectly answer sub-questions for, separately. This might be because elicitive prompts contain\nmore information than direct ones. Note that the rest of this section shows that 

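Remember that input documents take the form of `(id, data, tags)` tuples. The next query also filters on `entry`, a timestamp column returned with each row (see the output below).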

In [22]:
# Combine similarity search with a regular SQL filter: keep only rows whose entry value
# is on or after 2023-04-20.
recent_language_model_sql = """SELECT text, flag, entry 
                    FROM txtai 
                    WHERE similar('Language models') 
                    AND entry >= '2023-04-20'"""
embeddings.search(recent_language_model_sql)

[{'text': '2{jeffreyzhao,dianyu,dunan,izhak,yuancao}@google.com\nABSTRACT\nWhile large language models (LLMs) have demonstrated impressive performance\nacross tasks in language understanding and interactive decision making, their\nabilities for reasoning (e.g. chain-of-thought prompting) and acting (e.g. action',
  'flag': None,
  'entry': '2023-04-21 10:26:59.312376'},
 {'text': 'However, as the language space Lis unlimited, learning in this augmented action space is difﬁcult\nand requires strong language priors. In this paper, we mainly focus on the setup where a frozen\nlarge language model, PaLM-540B (Chowdhery et al., 2022)1, is prompted with few-shot in-context',
  'flag': None,
  'entry': '2023-04-21 10:26:59.312376'},
 {'text': 'com/Authors-Notes/sparrow/sparrow-final.pdf .\nEhsan Hosseini-Asl, Bryan McCann, Chien-Sheng Wu, Semih Yavuz, and Richard Socher. A simple\nlanguage model for task-oriented dialogue. Advances in Neural Information Processing Systems ,\n33:20179–20191, 2

In [27]:
# SQL functions work in the select list too: return the text length of each match
# whose similarity score is at least 0.15.
text_length_sql = """SELECT length(text) FROM txtai 
                    WHERE similar('language models')
                    AND score >= 0.15"""
embeddings.search(text_length_sql)

[{'length(text)': 297}, {'length(text)': 289}, {'length(text)': 347}]

## Introducing Translation Pipeline

In [42]:
from txtai.pipeline import Translation

# Translation pipeline instance, registered below as a custom function for SQL queries
translate = Translation()

# Embeddings index with content storage and the translation pipeline listed under "functions"
translate_config = {
    "path": "sentence-transformers/nli-mpnet-base-v2",
    "content": True,
    "functions": [translate],
}

embeddings_translate = Embeddings(translate_config)


In [43]:
# Index the same chunks in the translation-enabled index: one (id, text, tags) tuple per
# chunk, using the chunk's position as its id and no tags.
translate_documents = [(position, chunk, None) for position, chunk in enumerate(split_data)]
embeddings_translate.index(translate_documents)

In [44]:
# SQL query returning the single best match for the question together with German, Spanish
# and French versions of its text, each under its own column alias.
# NOTE(review): `translation` is presumably the SQL name under which the Translation pipeline
# passed via "functions" is exposed -- confirm against the txtai documentation.
query = """
select
  text,
  translation(text, 'de') 'text (DE)',
  translation(text, 'es') 'text (ES)',
  translation(text, 'fr') 'text (FR)'
from txtai where similar('What is Language Model')
limit 1
"""

In [None]:
# Run a search using a custom SQL function (the translation calls inside `query`),
# against the index that was created with that function registered.
embeddings_translate.search(query)

## Introducing Extractor Pipeline


In [47]:

%%capture

from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor

# Extractor built on the translation-enabled index, with google/flan-t5-base as its model
extractor_model = "google/flan-t5-base"
extractor = Extractor(embeddings_translate, extractor_model)

In [52]:
def prompt(question):
  """Build the instruction text for `question`.

  The result has three lines: the fixed instruction, the question, and a
  trailing "Context: " label with nothing after it.
  """
  instruction = "Answer the following question using only the context below. Say 'no answer' when the question can't be answered."
  return "\n".join([instruction, f"Question: {question}", "Context: "])

def search(query, question=None):
  """Run one "answer" task through the extractor and return its answer.

  `query` is passed to the extractor as-is; `question` is wrapped by prompt().
  When `question` is empty or omitted, `query` is used as the question too.
  Returns the second element of the first result from the extractor.
  """
  # Fall back to the query when no question is given
  question = question or query

  task = ("answer", query, prompt(question), False)
  results = extractor([task])
  return results[0][1]

In [53]:
# Ask one question end-to-end and print it next to the extracted answer
question = "What is self-ask algorithm?"
answer = search(question)
print(f"{question} {answer}")

What is self-ask algorithm? structured prompting lets us easily plug in a search engine to answer the follow-up questions
