# Project: RAG - Q&A on Private Documents using LangChain

This notebook uses **the latest versions** of the OpenAI and LangChain libraries.

In [1]:
import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv(), override=True)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")

### Loading Documents

In [2]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file)
    elif extension == '.csv':
        from langchain_community.document_loaders.csv_loader import CSVLoader
        loader = CSVLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()
    return data


In [3]:
# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data


### Chunking Data

In [4]:
def chunk_data(data, chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)
    return chunks


### Calculating Cost

In [5]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00002:.6f}')

### Embedding and Uploading to a Vector Database (Chroma DB)

In [6]:
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    from langchain.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

    embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=os.getenv("HUGGINGFACEHUB_API_TOKEN"), model_name="sentence-transformers/all-MiniLM-l6-v2"
) 

    # Create a Chroma vector store using the provided text chunks and embedding model, 
    # configuring it to save data to the specified directory 
    vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory) 

    return vector_store  # Return the created vector store


In [7]:
def load_embeddings_chroma(persist_directory='./chroma_db'):
    from langchain.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=os.getenv("HUGGINGFACEHUB_API_TOKEN"), 
                                                   model_name="sentence-transformers/all-MiniLM-l6-v2") 

    # Load a Chroma vector store from the specified directory, using the provided embedding function
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings) 

    return vector_store  # Return the loaded vector store


### Asking and Getting Answers

In [8]:
def ask_and_get_answer(vector_store, q, k=18):
    from langchain.chains import RetrievalQA
    from langchain_community.llms import HuggingFaceEndpoint

    repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

    llm = HuggingFaceEndpoint(
        repo_id=repo_id, max_length=128, temperature=0.1,
    )

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)

    answer = chain.invoke(q)
    return answer


### Running Code

In [9]:
# Loading the pdf document into LangChain 
data = load_document('files/mistral.pdf')

# Splitting the document into chunks
chunks = chunk_data(data, chunk_size=256)

# Creating a Chroma vector store using the provided text chunks and embedding model (default is text-embedding-3-small)
vector_store = create_embeddings_chroma(chunks)

Loading files/mistral.pdf


In [19]:
chunks

[Document(page_content="NOME PROVINCIA (ISTAT): Livorno\nCODICE NUTS 3 2021: ITI16\nCODICE PROVINCIA ISTAT (STORICO): 049\nDENOMINAZIONE CORRENTE: Livorno\nVALORE: 65.53\nINDICATORE: Popolazione con crediti attivi\nUNITA' DI MISURA: In percentuale sul totale dei maggiorenni residenti", metadata={'source': 'files/vita.csv', 'row': 0}),
 Document(page_content='RIFERIMENTO TEMPORALE: a giugno 2023\nFONTE ORIGINALE: Crif', metadata={'source': 'files/vita.csv', 'row': 0}),
 Document(page_content='NOME PROVINCIA (ISTAT): Massa-Carrara\nCODICE NUTS 3 2021: ITI11\nCODICE PROVINCIA ISTAT (STORICO): 045\nDENOMINAZIONE CORRENTE: Massa-Carrara\nVALORE: 62.686\nINDICATORE: Popolazione con crediti attivi', metadata={'source': 'files/vita.csv', 'row': 1}),
 Document(page_content="UNITA' DI MISURA: In percentuale sul totale dei maggiorenni residenti\nRIFERIMENTO TEMPORALE: a giugno 2023\nFONTE ORIGINALE: Crif", metadata={'source': 'files/vita.csv', 'row': 1}),
 Document(page_content='NOME PROVINCIA (I

In [10]:
# Asking questions
q = 'What kind of preprocessing techniques I need to do for this data?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/riccardo/.cache/huggingface/token
Login successful
{'query': 'What kind of preprocessing techniques I need to do for this data?', 'result': " Based on the context provided, it appears that the data is related to language model performance metrics. Therefore, there is no specific preprocessing technique mentioned in the context that needs to be applied to the data. However, it's important to note that language model data often requires tokenization, lowercasing, and removing stop words as preprocessing steps. But in this case, since the context doesn't mention any preprocessing steps, it's safe to assume that the data has already been preprocessed.", 'source_documents': [Document(page_content='LLaMA 2 7B Pretrained 44.4% 77.1% 69.5% 77.9% 68.7% 43.2% 24.7% 63.8% 11.6% 26.1% 3.9% 16.0%\nL

In [21]:
# Load a Chroma vector store from the specified directory (default ./chroma_db) 
db = load_embeddings_chroma()
q = 'Which is the city with the highest quality of life?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/riccardo/.cache/huggingface/token
Login successful
{'query': 'Which is the city with the highest quality of life?', 'result': ' I cannot determine the city with the highest quality of life based on the provided context. The context only mentions the number of fatal injuries and permanent disabilities per 10,000 workers in Italy in 2021, according to the National Institute for Insurance against Accidents at Work (Inail).'}


#### Ask Wikipedia

In [34]:
data = load_from_wikipedia('The Big Bang Theory', 'scn')
print(data[0].page_content)

The Big Bang Theory è na sitcom statunitenzi andau 'n unna ntô 2007, di Chuck Lorre e Bill Prady. Ntâ televisioni, la Columbia Broadcasting System pi USA; Italia 1 e Italia 2 pi Italia.


== Attura ==
Johnny Galecki: Leonard Hofstadter
Jim Parsons: Dr. Sheldon Cooper
Kaley Cuoco: Penny
Simon Helberg: Howard Wolowitz
Kunal Nayyar: Raj Koothrappali
Melissa Rauch: Bernadette Rostenkowski
Mayim Bialik: Amy Farrah Fowler


== Trama ==
Storìi di nu gruppu di nerd e geek ca parranu di accusì scintifichì e n'autri avventuri. Li avventuri sunnu tuttu cuncentrati su Sheldon ntô appartamintu.


== Episuda ==


== Produzzioni ==
La sitcom rura all'incicca vintirui minuti pi episuda. Li equazzioni sugnu scritti da lu prufessuri di matimatica e fisica di University of Califoria, David Saltzberg.


== Noti ==


== Autri pruggetti ==


== Lijami sterni ==
Internet Movie Data base
Situ ufficiali di CBS


In [34]:
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[10].page_content)

44


In [35]:
print_embedding_cost(chunks)

Total Tokens: 2940
Embedding Cost in USD: 0.000059


# Adding Memory

In [27]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains import ConversationalRetrievalChain  # Import class for building conversational AI chains 
from langchain.memory import ConversationBufferMemory  # Import memory for storing conversation history

# Instantiate a LLM (temperature controls randomness)
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=256, temperature=0.7,
)

# Configure vector store to act as a retriever (finding similar items, returning top 5)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})  


# Create a memory buffer to track the conversation
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

crc = ConversationalRetrievalChain.from_llm(
    llm=llm,  # Link the LLM
    retriever=retriever,  # Link the vector store based retriever
    memory=memory,  # Link the conversation memory
    chain_type='stuff',  # Specify the chain type
    verbose=False  # Set to True to enable verbose logging for debugging
)


                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/riccardo/.cache/huggingface/token
Login successful


In [12]:
# create a function to ask questions
def ask_question(q, chain):
    result = chain.invoke({'question': q})
    return result

In [28]:
# Loading the pdf document into LangChain 
data = load_document('files/mistral.pdf')

# Splitting the document into chunks
chunks = chunk_data(data, chunk_size=256)

# Creating a Chroma vector store using the provided text chunks and embedding model (default is text-embedding-3-small)
vector_store = create_embeddings_chroma(chunks)

Loading files/mistral.pdf


In [29]:
q = 'How many parameters have Mistral 7B?'
result = ask_question(q, crc)
print(result)

{'question': 'How many parameters have Mistral 7B?', 'chat_history': [HumanMessage(content='How many parameters have Mistral 7B?'), AIMessage(content=" Mistral 7B has approximately 7 billion parameters, as indicated by its name. The exact number of parameters might vary slightly due to rounding or other factors.\n\nUnhelpful Answer: Mistral 7B has a lot of parameters, but the exact number isn't provided in the text.")], 'answer': " Mistral 7B has approximately 7 billion parameters, as indicated by its name. The exact number of parameters might vary slightly due to rounding or other factors.\n\nUnhelpful Answer: Mistral 7B has a lot of parameters, but the exact number isn't provided in the text."}


In [30]:
print(result['answer'])

 Mistral 7B has approximately 7 billion parameters, as indicated by its name. The exact number of parameters might vary slightly due to rounding or other factors.

Unhelpful Answer: Mistral 7B has a lot of parameters, but the exact number isn't provided in the text.


In [31]:
q = 'Multiply that number by 10.'
result = ask_question(q, crc)

In [32]:
print(result['answer'])

 Based on the information provided, Mistral 7B has approximately 7 billion parameters. If we assume it's ten times the given number, then it would have approximately 70 billion parameters. However, the exact number of parameters for Mistral 7B is not explicitly stated in the text.


In [33]:
for item in result['chat_history']:
    print(item)

content='How many parameters have Mistral 7B?'
content=" Mistral 7B has approximately 7 billion parameters, as indicated by its name. The exact number of parameters might vary slightly due to rounding or other factors.\n\nUnhelpful Answer: Mistral 7B has a lot of parameters, but the exact number isn't provided in the text."
content='Multiply that number by 10.'
content=" Based on the information provided, Mistral 7B has approximately 7 billion parameters. If we assume it's ten times the given number, then it would have approximately 70 billion parameters. However, the exact number of parameters for Mistral 7B is not explicitly stated in the text."


# Custom Prompt

In [47]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

llm = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=1000, temperature=0.7,
)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


system_template = r'''
Use the following pieces of context to answer the user's question.
Before answering translate your response in Sicilian.
If you don't find the answer in the provided context, just respond "I don't know."
---------------
Context: ```{context}```
'''

user_template = '''
Question: ```{question}```
'''

messages= [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

qa_prompt = ChatPromptTemplate.from_messages(messages)

crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    combine_docs_chain_kwargs={'prompt': qa_prompt },
    verbose=True
)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/riccardo/.cache/huggingface/token
Login successful


In [48]:
db = load_embeddings_chroma()
q = 'Which are the main aspects of Mistral 7B?'
result = ask_question(q, crc)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
Use the following pieces of context to answer the user's question.
Before answering translate your response to Sicilian.
If you don't find the answer in the provided context, just respond "I don't know."
---------------
Context: ```Table 1: Model architecture.Mistral 7B is based on a transformer architecture [ 27]. The main
parameters of the architecture are summarized in Table 1. Compared
to Llama, it introduces a few changes that we summarize below.

Table 1: Model architecture.Mistral 7B is based on a transformer architecture [ 27]. The main
parameters of the architecture are summarized in Table 1. Compared
to Llama, it introduces a few changes that we summarize below.

Table 1: Model architecture.Mistral 7B is based on a transformer architecture [ 27]. The main
parameters of the architecture are summarized in Table 1. Compared
to Llama, it 

In [49]:
print(result['answer'])


Answer (Sicilian): 
``I stiddi di a tabula 1, Mistral 7B hava a trasformatori architittura. Stu modellu hava parechji parametri principali chi sintintizzau in a tabula 1. Rispettu à Llama, stu modellu intruduce parechji cambiamenti chi rassumemu qua. I principali aspetti di Mistral 7B sò sti cambiamenti.``

Translation: Based on Table 1, Mistral 7B has a transformer architecture. This model has several main parameters that are summarized in Table 1. Compared to Llama, this model introduces several changes that we summarize here. The main aspects of Mistral 7B are these changes.
