## Imports

In [2]:
import glob
import os
import textwrap

from huggingface_hub import hf_hub_download
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS, Chroma

## Constants

In [3]:
# Hugging Face API token, read from the environment so that the secret is never
# stored in (or committed with) the notebook. Export HUGGINGFACEHUB_API_TOKEN
# before starting the kernel. The value is None when the variable is unset.
# NOTE(review): HuggingFaceHub is expected to raise a clear error for a missing
# token — confirm against the installed LangChain version.
HUGGING_FACE_API_KEY = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [18]:
# Hosted Hugging Face model used as the LLM for the question-answering chain.
# Generation settings: sampling temperature 0.9, output capped at 512.
model = HuggingFaceHub(
    huggingfacehub_api_token=HUGGING_FACE_API_KEY,
    repo_id="facebook/mbart-large-50",
    model_kwargs=dict(temperature=0.9, max_length=512),
)


## Loading and splitting the PDF

In [5]:
# Load the PDF as one Document per page. `load()` is used instead of
# `load_and_split()` because the latter already applies a default text
# splitter, which would chunk the text a first time before the explicit
# splitter below runs and leave `pages` holding chunks rather than pages.
loader = PyPDFLoader("../data/12_rules.pdf")
pages = loader.load()

# Split the pages into chunks of at most 700 characters with no overlap;
# these chunks are what gets embedded and indexed later.
splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=0)
texts = splitter.split_documents(pages)

In [7]:
# Preview the first five chunks.
texts[0:5]

[Document(page_content='Jordan B. Peterson\n12 RULES FOR LIFE\nAn Antidote for Chaos\nForeword by Norman Doidge\nIllustrations by Ethan Van Scriver', metadata={'source': '../data/12_rules.pdf', 'page': 2}),
 Document(page_content='Table of Contents\nForeword by Norman Doidge\nOverture\nRULE 1\n / Stand up straight with your shoulders back\nRULE 2\n / Treat yourself like someone you are responsible for helping\nRULE 3\n / Make friends with people who want the best for you\nRULE 4\n / Compare yourself to who you were yesterday, not to who someone else is today\nRULE 5\n / Do not let your children do anything that makes you dislike them\nRULE 6\n / Set your house in perfect order before you criticize the world\nRULE 7\n / Pursue what is meaningful (not what is expedient)\nRULE 8\n / Tell the truth—or, at least, don’t lie\nRULE 9\n / Assume that the person you are listening to might know something you don’t\nRULE 10', metadata={'source': '../data/12_rules.pdf', 'page': 3}),
 Document(page_

In [8]:
# print(pages[200].page_content)

## Embedding and similarity search

In [9]:
# Embedding model used to turn text chunks into vectors for the index.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [13]:
# NOTE(review): scratch cell that only prints a fixed string — looks like a
# kernel liveness check; candidate for removal.
print("hi")

hi


In [12]:
# Embed every chunk and build a Chroma vector store over them, then print a
# marker once indexing has finished.
document_index = Chroma.from_documents(
    documents=texts,
    embedding=hf_embeddings,
)
print("Done")

Using embedded DuckDB without persistence: data will be transient


Done


In [None]:
# NOTE(review): scratch cell that only prints a fixed string; candidate for
# removal. The message previously read "helloe world" (typo).
print('hello world')

In [14]:
# Retrieve the two chunks returned by a similarity search for the question and
# print each one prefixed with the page number stored in its metadata.
results = document_index.similarity_search("What is chaos?", k=2)

for result in results:
    print(f'{result.metadata["page"]}: {result.page_content}\n')

63: Chaos is the domain of ignorance itself. It’s 
unexplored territory
. Chaos is
what extends, eternally and without limit, beyond the boundaries of all states,
all ideas, and all disciplines. It’s the foreigner, the stranger, the member of
another gang, the rustle in the bushes in the night-time, the monster under the
bed, the hidden anger of your mother, and the sickness of your child. Chaos is
the despair and horror you feel when you have been profoundly betrayed. It’s
the place you end up when things fall apart; when your dreams die, your
career collapses, or your marriage ends. It’s the underworld of fairytale and
myth, where the dragon and the gold it guards eternally co-exist. Chaos is

64: discover your partner’s infidelity. Chaos is the experience of reeling unbound
and unsupported through space when your guiding routines and traditions
collapse.
Order is the place and time where the oft-invisible axioms you live by
organize your experience and your actions so that what shou

In [None]:
# NOTE(review): scratch cell that only prints a fixed string; candidate for
# removal.
print("hello world")

## Question answering

In [20]:
# Run a similarity search for the question, then pass the retrieved documents
# and the question to a "refine" QA-with-sources chain backed by `model`.
# The chain's output is stored in `result` (not displayed by this cell).
query = "What is chaos?"
documents = document_index.similarity_search(query)

chain = load_qa_with_sources_chain(llm=model, chain_type="refine")
result = chain(
    {
        "input_documents": documents,
        "question": query,
    }
)