<a href="https://colab.research.google.com/github/SinaRampe/applications-with-LangChain/blob/main/Chroma_DB_Multi_pdf_retriever_Langchain_plus_mapreduce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's third-party dependencies (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a fresh install may pull releases that differ
# from the ones this notebook was run against — consider pinning; TODO confirm versions.
!pip -q install langchain openai tiktoken chromadb pypdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m789.1/789.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.2/69.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.8/248.8 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m990.5 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies .

# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files
- ChromaDB
- Source info 
- gpt-3.5-turbo API

## Setting up LangChain 


In [50]:
import os
from getpass import getpass

# Never store the key in the notebook. Keep a key that is already present in
# the environment; otherwise ask for it interactively without echoing it, so
# it does not end up in the saved cell source or output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [51]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader

## Load and process multiple documents

In [52]:
# Point a PyPDFDirectoryLoader at the data/ directory and load its contents
# into raw_documents.
loader = PyPDFDirectoryLoader("data/")
raw_documents = loader.load()
# NOTE(review): documents retrieved later in this notebook carry a 'page' entry in
# their metadata, so this count is presumably pages rather than PDF files — confirm.
print(f"loaded {len(raw_documents)} documents")

loaded 219 documents


In [53]:
# Split the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(raw_documents)

In [16]:
len(texts)

963

In [18]:
texts[11]

Document(page_content="war mein Schwergewicht.«\n»Nicht sehr witzig«, meinte Herr Koreander, »was noch?«\nBastian zögerte, ehe er aufzählte:\n»Spinner, Mondkalb, Aufschneider, Schwindler...«\n»Spinner? Warum?«\n»Ich red' manchmal mit mir selber.«\n»Was redest du da zum Beispiel?«\n»Ich denk' mir Geschichten aus, ich erfinde Namen und Wörter, die's noch nicht gibt, und so.«\n»Und das erzählst du dir selbst? Warum?«\n»Na ja, sonst ist doch niemand da, den so was interessiert.«\nHerr Koreander schwieg eine Weile nachdenklich.\n»Was meinen denn deine Eltern dazu?«\nBastian antwortete nicht gleich. Erst nach einer Weile murmelte er: »Vater sagt nichts. Er sagt\nnie was. Es ist ihm alles ganz gleich.«\n»Und deine Mutter?«\n»Die - ist nicht mehr da.«\n»Sind deine Eltern geschieden?«\n»Nein«, sagte Bastian, »sie ist tot.«\nIn diesem Augenblick klingelte das Telefon. Herr Koreander erhob sich mit einiger\nAnstrengung aus seinem Lehnstuhl und schlurfte in ein kleines Kabinett, das hinter dem", m

## create the DB

In [44]:
# Directory handed to Chroma as persist_directory, so the embeddings are
# stored on disk.
persist_directory = 'db'

# OpenAI embeddings for now; the plan is to swap in local embeddings later.
embedding = OpenAIEmbeddings(disallowed_special=())

# Embed every chunk in `texts` and store the result in a Chroma vector store.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embedding,
    documents=texts,
)



In [20]:
# Persist the DB to disk, then drop the in-memory reference so the next cell
# has to reload it from persist_directory.
vectordb.persist()
vectordb = None

In [21]:
# Re-open the database that was persisted to persist_directory, reusing the
# same embedding object, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding)



## Make a retriever

In [22]:
# Expose the vector store as a retriever with its default search settings.
retriever = vectordb.as_retriever()

In [23]:
# Sanity check: fetch the documents the retriever considers relevant to a sample question.
docs = retriever.get_relevant_documents("What is NLP?")

In [24]:
len(docs)

4

In [25]:
# Recreate the retriever so that it passes k=2 to the search (two documents per query).
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [26]:
retriever.search_type

'similarity'

In [27]:
retriever.search_kwargs

{'k': 2}

## Make a chain

In [28]:
# Question-answering chain: the retriever supplies the context documents, the
# OpenAI LLM answers, and the source documents are returned with the result.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [29]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping returned by the QA chain. ``'result'`` holds the
            answer text. ``'source_documents'``, when present, holds objects
            whose ``metadata`` dict may contain a ``'source'`` entry.

    Returns:
        None. The answer and one line per source document are written to stdout.

    Raises:
        KeyError: If ``llm_response`` has no ``'result'`` entry.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # A response without source documents simply lists no sources.
    for source in llm_response.get("source_documents") or []:
        # Documents built from plain texts can have empty metadata, so fall
        # back to a placeholder instead of raising KeyError.
        metadata = getattr(source, "metadata", None) or {}
        print(metadata.get('source', '<unknown source>'))

In [30]:
# Full example: run the chain on a question and print the answer with its sources.
query = "What is NLP?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 I don't know.


Sources:
data/die-unendliche-geschichte-michael-ende.pdf
data/die-unendliche-geschichte-michael-ende.pdf


In [31]:
# Break it down: show the raw chain output instead of the formatted summary.
# The bare `llm_response` expression at the end displays the whole dict
# (query, result, source_documents).
query = "Wer bedeutet BBB?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)  # formatted view, left disabled for this cell
llm_response

{'query': 'Wer bedeutet BBB?',
 'result': ' Bastian Balthasar Bux.',
 'source_documents': [Document(page_content='»Habt ihr das gehört? Habt ihr das begriffen? Er ist unser Tolwäter! Er heißt Nastiban\nBaltebux! Nein, er heißt Buxian Wähltoter! Quatsch, er heißt Saratät Buxiwohl! Nein,\nBaldrian Hix! Schlux! Babeltran Totwähler! Nix! Flax! Trix!«\nDie ganze Gesellschaft schien außer sich vor Begeisterung. Sie schüttelten sich gegenseitig\ndie Hände, lüpften die Hüte und schlugen sich auf Schultern und Bäuche, daß große\nStaubwolken aufstiegen.\n»Was sind wir für Glückspilze!« riefen sie. »Hoch lebe unser Buxtäter Sansibar Bastelwohl!«\nUnd immerfort schreiend und lachend stob der ganze riesige Schwärm in die Höhe und\nwirbelte fort. Der Lärm verhallte in der Ferne.\nBastian stand da und wußte kaum noch, wie er richtig hieß.\nEr war sich nicht mehr so sicher, ob er wirklich etwas Gutes getan hatte.', metadata={'source': 'data/die-unendliche-geschichte-michael-ende.pdf', 'page': 144}),
 

In [32]:
# Ask another question; the answer and its sources are printed by process_llm_response.
query = "What is Huggingface"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 I don't know.


Sources:
data/die-unendliche-geschichte-michael-ende.pdf
data/die-unendliche-geschichte-michael-ende.pdf


In [34]:
# Question asked in German, the language of the indexed PDF's text.
query = "Was ist Auryn?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Auryn ist ein Medaillon, das ein Emblem der Kindlichen Kaiserin ist und seinen Träger zu ihrem Stellvertreter macht. Auf der Rückseite des Medaillons ist eine Inschrift mit vier kurzen Worten in eigenartig verschlungenen Buchstaben.


Sources:
data/die-unendliche-geschichte-michael-ende.pdf
data/die-unendliche-geschichte-michael-ende.pdf


In [None]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x7f799c2b3640>)

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


## QA chain over a FAISS index (`stuff` and `map_rerank` chain types)

In [40]:
# Install faiss-cpu for the FAISS vector store used in this section.
# NOTE(review): an install in the middle of the notebook is easy to miss; consider
# moving it into the first install cell.
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [63]:
# Install PyPDF2, which provides the PdfReader imported in the next cell.
# NOTE(review): consider moving this into the first install cell as well.
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [64]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader

In [41]:
from langchain.vectorstores import FAISS 

In [67]:
!ls data/

die-unendliche-geschichte-michael-ende.pdf


In [68]:
# Open the PDF with PyPDF2's PdfReader so that its pages can be read one by one.
doc_reader = PdfReader('data/die-unendliche-geschichte-michael-ende.pdf')

In [69]:
# Read the text of every page and concatenate it into raw_text.
# Pages for which extract_text() returns nothing (None or '') are skipped.
# The page texts are joined directly, with no separator added between pages.
page_texts = (page.extract_text() for page in doc_reader.pages)
raw_text = ''.join(page_text for page_text in page_texts if page_text)

In [81]:
# Split raw_text on newlines into overlapping chunks; sizes are measured with len().
# Note: `texts` is rebound here to the value returned by split_text(raw_text);
# earlier in the notebook it held the split Documents.
text_splitter = CharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,  # consecutive chunks share text (striding over the text)
    separator="\n",
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [82]:
# Embed the text chunks with `embedding` (created in the Chroma section) and
# index them in a FAISS vector store.
docsearch = FAISS.from_texts(texts, embedding)

In [83]:
docsearch.embedding_function

<bound method OpenAIEmbeddings.embed_query of OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='2022-12-01', openai_api_base=None, openai_api_type=None, embedding_ctx_length=8191, openai_api_key=None, openai_organization=None, allowed_special=set(), disallowed_special=set(), chunk_size=1000, max_retries=6, request_timeout=None)>

In [84]:
# Look up the chunks most similar to the question (no k is given, so the store's default applies).
query = "Wer ist BBB?"
docs = docsearch.similarity_search(query)

In [85]:
len(docs)

4

In [86]:
docs[1]

Document(page_content='»Wer?« fragte eine neu Dazugekommene.\n»Der Dingsda«, erwiderten die anderen.Und die neu Angekommene sagte: »Ich kenne den\nDingsda nicht. Wer ist das überhaupt?«\nDie erste rief: »He, Dingsda, wer bist du überhaupt?«\n»Ich bin kein Dingsda!« schrie Bastian nun doch ziemlich wütend, »ich bin Bastian Balthasar\nBux und habe aus euch die Schlamuffen gemacht, damit ihr nicht mehr weint und jammert.', metadata={})

In [78]:
# Build a plain QA chain with no retriever attached: the documents are passed
# in explicitly when the chain is run.
chain = load_qa_chain(OpenAI(), 
                      chain_type="stuff") # "stuff": all the docs are stuffed into the prompt at once

In [79]:
# check the prompt
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

In [87]:
# Retrieve the 20 most similar chunks and let the "stuff" chain answer from them.
query = "who is the book authored by?"
docs = docsearch.similarity_search(query,k=20)
chain.run(input_documents=docs, question=query)

' Die Unendliche Geschichte ist von Michael Ende geschrieben.'

In [89]:
# "map_rerank" chain: each document is answered separately and the answer is
# returned together with a score; return_intermediate_steps=True keeps those
# per-document answers and scores in the result.
chain = load_qa_chain(
    OpenAI(),
    return_intermediate_steps=True,
    chain_type="map_rerank",
)

# Fetch the 10 most similar chunks and run the chain on them.
query = "Wer ist Fuchur?"
docs = docsearch.similarity_search(query, k=10)
results = chain(
    {"input_documents": docs, "question": query},
    return_only_outputs=True,
)
results

{'intermediate_steps': [{'answer': ' Fuchur ist ein Freund von Atréju.',
   'score': '80'},
  {'answer': ' Fuchur ist ein Geschöpf Phantâsiens.', 'score': '100'},
  {'answer': ' Fuchur ist eine Figur in Michael Ende\'s Buch "Die Unendliche Geschichte" (The Neverending Story).',
   'score': '100'},
  {'answer': ' Fuchur ist ein weißer Drache.', 'score': '90'},
  {'answer': " Fuchur is a character in Michael Ende's novel The Neverending Story.",
   'score': '70'},
  {'answer': ' Fuchur ist ein weißer Blitz, der den Protagonisten an seinem langen, blauschwarzen Haarschopf packt und in den nachtschwarzen Himmel emporbraust.',
   'score': '100'},
  {'answer': ' This document does not answer the question.', 'score': '0'},
  {'answer': ' Fuchur is ein Freund von Atréju.', 'score': '100'},
  {'answer': ' Fuchur ist eine Figur aus dem Buch Die Unendliche Geschichte.',
   'score': '100'},
  {'answer': ' Fuchur ist ein Glücksdrache.', 'score': '100'}],
 'output_text': ' Fuchur ist ein Geschöpf Ph

In [90]:
results['output_text']

' Fuchur ist ein Geschöpf Phantâsiens.'

In [91]:
results['intermediate_steps']

[{'answer': ' Fuchur ist ein Freund von Atréju.', 'score': '80'},
 {'answer': ' Fuchur ist ein Geschöpf Phantâsiens.', 'score': '100'},
 {'answer': ' Fuchur ist eine Figur in Michael Ende\'s Buch "Die Unendliche Geschichte" (The Neverending Story).',
  'score': '100'},
 {'answer': ' Fuchur ist ein weißer Drache.', 'score': '90'},
 {'answer': " Fuchur is a character in Michael Ende's novel The Neverending Story.",
  'score': '70'},
 {'answer': ' Fuchur ist ein weißer Blitz, der den Protagonisten an seinem langen, blauschwarzen Haarschopf packt und in den nachtschwarzen Himmel emporbraust.',
  'score': '100'},
 {'answer': ' This document does not answer the question.', 'score': '0'},
 {'answer': ' Fuchur is ein Freund von Atréju.', 'score': '100'},
 {'answer': ' Fuchur ist eine Figur aus dem Buch Die Unendliche Geschichte.',
  'score': '100'},
 {'answer': ' Fuchur ist ein Glücksdrache.', 'score': '100'}]

In [92]:
# check the prompt
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nIn addition to giving an answer, also return a score of how fully it answered the user's question. This should be in the following format:\n\nQuestion: [question here]\nHelpful Answer: [answer here]\nScore: [score between 0 and 100]\n\nHow to determine the score:\n- Higher is a better answer\n- Better responds fully to the asked question, with sufficient level of detail\n- If you do not know the answer based on the context, that should be a score of 0\n- Don't be overconfident!\n\nExample #1\n\nContext:\n---------\nApples are red\n---------\nQuestion: what color are apples?\nHelpful Answer: red\nScore: 100\n\nExample #2\n\nContext:\n---------\nit was night and the witness forgot his glasses. he was not sure if it was a sports car or an suv\n---------\nQuestion: what type was the car?\nHelpful Answer: a sports car or an su

## Deleting the DB

In [None]:
# Archive the persisted db/ directory as db.zip before it is deleted below.
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/chroma-embeddings.parquet (deflated 29%)
  adding: db/index/ (stored 0%)
  adding: db/index/id_to_uuid_66206206-1387-4ea7-a77f-7126acab2376.pkl (deflated 36%)
  adding: db/index/index_metadata_66206206-1387-4ea7-a77f-7126acab2376.pkl (deflated 5%)
  adding: db/index/uuid_to_id_66206206-1387-4ea7-a77f-7126acab2376.pkl (deflated 39%)
  adding: db/index/index_66206206-1387-4ea7-a77f-7126acab2376.bin (deflated 17%)
  adding: db/chroma-collections.parquet (deflated 50%)


In [None]:
# To clean up, delete the collection held by the vector store and persist
# that change.
vectordb.delete_collection()
vectordb.persist()

# Then remove the on-disk directory itself (db.zip is not touched by this command).
!rm -rf db/

## Starting again: loading the DB from disk

Restart the runtime, then run the cells below to reload the persisted database.

In [None]:
# Restore the db/ directory from the db.zip archive.
!unzip db.zip

Archive:  db.zip
   creating: db/
  inflating: db/chroma-embeddings.parquet  
   creating: db/index/
  inflating: db/index/id_to_uuid_66206206-1387-4ea7-a77f-7126acab2376.pkl  
  inflating: db/index/index_metadata_66206206-1387-4ea7-a77f-7126acab2376.pkl  
  inflating: db/index/uuid_to_id_66206206-1387-4ea7-a77f-7126acab2376.pkl  
  inflating: db/index/index_66206206-1387-4ea7-a77f-7126acab2376.bin  
  inflating: db/chroma-collections.parquet  


In [None]:
import os
from getpass import getpass

# Never store the key in the notebook. Keep a key that is already present in
# the environment; otherwise ask for it interactively without echoing it, so
# it does not end up in the saved cell source or output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [None]:
# Same on-disk location and embedding settings as when the DB was created.
persist_directory = 'db'
embedding = OpenAIEmbeddings(disallowed_special=())

# Open the persisted Chroma store from persist_directory.
vectordb2 = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

# Retriever that passes k=2 to the search.
retriever = vectordb2.as_retriever(search_kwargs={"k": 2})



In [None]:
# Chat model used for answering: gpt-3.5-turbo with temperature=0.
turbo_llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [None]:
# Question-answering chain backed by the chat model: the retriever supplies the
# context documents and the source documents are returned with the result.
qa_chain = RetrievalQA.from_chain_type(
    llm=turbo_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping returned by the QA chain. ``'result'`` holds the
            answer text. ``'source_documents'``, when present, holds objects
            whose ``metadata`` dict may contain a ``'source'`` entry.

    Returns:
        None. The answer and one line per source document are written to stdout.

    Raises:
        KeyError: If ``llm_response`` has no ``'result'`` entry.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # A response without source documents simply lists no sources.
    for source in llm_response.get("source_documents") or []:
        # Documents built from plain texts can have empty metadata, so fall
        # back to a placeholder instead of raising KeyError.
        metadata = getattr(source, "metadata", None) or {}
        print(metadata.get('source', '<unknown source>'))

In [None]:
# Full example against the reloaded DB, using the gpt-3.5-turbo chain.
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

There is no information provided about Pando raising money, so I don't know the answer to that question.


Sources:
data/Natural Language Processing with Transformers Building Language Applications with Hugging Face.pdf
data/Natural Language Processing with Transformers Building Language Applications with Hugging Face.pdf


### Chat prompts

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)

{question}
