In [1]:
from langchain.chains           import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.llms             import GPT4All
from langchain.embeddings       import HuggingFaceEmbeddings
from langchain.text_splitter    import RecursiveCharacterTextSplitter
from langchain.vectorstores     import Chroma

In [4]:
# Read the raw speech as UTF-8 text.
loader = TextLoader('./state_of_the_union.txt', encoding='utf8')
# Use load() rather than load_and_split(): load_and_split() already chunks the
# text with LangChain's default splitter, and the following cell applies its
# own RecursiveCharacterTextSplitter. Splitting twice would cut the second-pass
# chunks at the first pass's boundaries and overlaps.
documents = loader.load()

In [7]:
# Chunk the loaded documents for embedding: chunk_size=1024 with an overlap of
# 64 between neighbouring chunks (measured by the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
)
texts = text_splitter.split_documents(documents)

In [11]:
# Sentence-embedding model used to vectorise the chunks; the recorded output
# below shows its files being downloaded when this cell was run.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)e9125/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 3.38MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 626kB/s]
Downloading (…)7e55de9125/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 25.2MB/s]
Downloading (…)55de9125/config.json: 100%|██████████| 612/612 [00:00<00:00, 4.28MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 372kB/s]
Downloading (…)125/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 188kB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:06<00:00, 14.7MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 411kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 345kB/s]
Downloading (…)e9125/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 724kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 1.56MB

In [12]:
# Embed every chunk and build a Chroma vector store backed by the local
# 'db' directory.
db = Chroma.from_documents(
    texts,
    embeddings,
    persist_directory='db',
)

In [13]:
# Explicitly ask the Chroma store to persist itself; it was created with
# persist_directory='db' in the previous cell.
# NOTE(review): presumably this flushes the index to disk so a later session
# can reopen it - confirm against the installed Chroma version.
db.persist()

In [14]:
# Local GPT4All-J weights and the context size handed to the LangChain wrapper.
# NOTE(review): the load log recorded below reports n_ctx = 2048 - confirm
# whether the n_ctx value passed here actually takes effect.
model_path = '/app/ggml-gpt4all-j-v1.3-groovy.bin'
model_n_ctx = 1000
llm = GPT4All(
    model=model_path,
    n_ctx=model_n_ctx,
    backend='gptj',
    verbose=False,
)

Found model file.
gptj_model_load: loading model from '/app/ggml-gpt4all-j-v1.3-groovy.bin' - please wait ...
gptj_model_load: n_vocab = 50400
gptj_model_load: n_ctx   = 2048
gptj_model_load: n_embd  = 4096
gptj_model_load: n_head  = 16
gptj_model_load: n_layer = 28
gptj_model_load: n_rot   = 64
gptj_model_load: f16     = 2
gptj_model_load: ggml ctx size = 5401.45 MB
gptj_model_load: kv self size  =  896.00 MB
gptj_model_load: ................................... done
gptj_model_load: model size =  3609.38 MB / num tensors = 285


In [15]:
# Retrieval-augmented QA chain: fetch the 3 nearest chunks from the vector
# store, pass them to the LLM with the 'stuff' chain type, and return the
# retrieved source documents alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    chain_type='stuff',
    return_source_documents=True,
    verbose=False,
)

In [17]:
%%time
# Run one question through the retrieval-QA chain and time it (%%time must stay
# on the first line of the cell). The recorded output shows the chain returning
# a dict with 'query', 'result' and 'source_documents' keys.
# NOTE(review): the query spells the name "Kentaji" while the retrieved source
# text spells it "Ketanji" - confirm whether the misspelling is intentional.
res = qa("What did the president say about Kentaji Brown Jackson?")
# Bare expression so the notebook displays the full result dict.
res

 The context provided does not give any information regarding what President Donald Trump said or referred to when nominating Circuit Court of Appeals Judge Ketanji "Keta" Brown Jackson, therefore it is impossible for me to answer this question.
CPU times: user 12min 3s, sys: 381 ms, total: 12min 3s
Wall time: 3min 4s


{'query': 'What did the president say about Kentaji Brown Jackson?',
 'result': ' The context provided does not give any information regarding what President Donald Trump said or referred to when nominating Circuit Court of Appeals Judge Ketanji "Keta" Brown Jackson, therefore it is impossible for me to answer this question.',
 'source_documents': [Document(page_content='One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. \n\nA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges ap

In [18]:
%%time
# Same question as the previous query cell but without the trailing "?"; the
# recorded outputs of the two runs differ. `res` is overwritten by this run.
# NOTE(review): the query spells the name "Kentaji" while the retrieved source
# text spells it "Ketanji" - confirm whether the misspelling is intentional.
res = qa("What did the president say about Kentaji Brown Jackson")
# Bare expression so the notebook displays the full result dict.
res

 The President nominated Circuit Court Judge Ketanjali "Keta" Brown-Jackson to serve on
CPU times: user 11min 17s, sys: 213 ms, total: 11min 17s
Wall time: 2min 52s


{'query': 'What did the president say about Kentaji Brown Jackson',
 'result': ' The President nominated Circuit Court Judge Ketanjali "Keta" Brown-Jackson to serve on',
 'source_documents': [Document(page_content='One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. \n\nA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n\nWe can

In [10]:
!pip install sentence_transformers

[0mCollecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torch>=1.6.0 (from sentence_transformers)
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hCollecting torchvision (from sentence_transformers)
  Downloading torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6