# Project: Question Answering on Private Documents

In [9]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [10]:
pip install -q pypdf

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install -q docx2txt

Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install -q langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.


In [14]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain_community.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain_community.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()
    return data

# Wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain_community.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data


In [15]:
def chunk_data(data, chunk_size=256):
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)
    return chunks

## Embeding Cost

In [17]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00255:.6f}')

## Embedding and Uploading to a Vector Database (Pinecone)

In [27]:
def insert_or_fetch_embeddings(index_name, chunks):
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    pc = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('OK')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=PodSpec(environment='gcp-starter')
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('OK')
        return vector_store
        

In [24]:
def delete_pinecone_index(index_name='all'):
    import pinecone
    pc = pinecone.Pinecone()
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes ...')
        for index in indexes:
            pc.delete_index(index)
        print('OK')
    else:
        print(f'Deleting index {index_name} ...', end='')
        pc.delete_index(index_name)
        print('OK')

## Asking and Getting Answers

In [18]:
def ask_and_get_answer(vector_store, q, k=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    answer = chain.run(q)
    return answer
    

## Running Code

In [19]:
data = load_document('files/us_constitution.pdf')
# print(data[1],page_content)
# print(data[10].metadata)

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

Loading files/us_constitution.pdf
You have 41 pages in your data
There are 1137 characters in the page


In [24]:
# data = load_document('files/the_great_gatsby.docx')
# print(data[0].page_content)

In [23]:
# data = load_from_wikipedia('GPT-4', 'de')
# print(data[0].page_content)

In [21]:
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

190
Representatives
shall
chuse
their
Speaker
and
other
Of ficers;and
shall
have
the
sole
Power
of
Impeachment.
Section
3:
The
Senate
The
Senate
of
the
United
States
shall
be
composed
of
two
Senators
from
each
State,
chosen
by
the
Legislature
thereof,
for
six


In [22]:
print_embedding_cost(chunks)

Total Tokens: 16711
Embedding Cost in USD: 0.042613


In [25]:
delete_pinecone_index()

Deleting all indexes ...
OK


In [28]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index askadocument and embeddings ...OK


In [28]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

  warn_deprecated(


The pieces of text provided are actually excerpts from the United States Constitution. The Constitution outlines the framework for the government of the United States, establishing the three branches of government (legislative, executive, and judicial) and defining the powers and limitations of each branch. It also guarantees certain rights and freedoms to the people and sets the foundation for the country's laws and governance.


In [27]:
import time
i = 1
print('Write Quit or Exit to quit.')

while True:
    q = input(f'Question #{i}: ')
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break

    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

Write Quit or Exit to quit.


Question #1:  quit


Quitting ... bye bye!


In [54]:
delete_pinecone_index()

Deleting all indexes ...
OK


In [55]:
data = load_from_wikipedia('ChatGPT', 'id')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index chatgpt and embeddings ...OK


In [29]:
q = "Apa itu chatgpt"
answer = ask_and_get_answer(vector_store, q)
print(answer)

Maaf, saya tidak memiliki informasi terkait "chatgpt" dalam konteks yang diberikan. Apakah ada pertanyaan lain yang bisa saya bantu jawab?


## Using Chroma as a Vector DB

In [1]:
pip install -q chromadb

Note: you may need to restart the kernel to use updated packages.


In [30]:
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Instantiate an embedding model from OpenAI (smaller version for efficiency)
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)  

    # Create a Chroma vector store using the provided text chunks and embedding model, 
    # configuring it to save data to the specified directory 
    vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory) 

    return vector_store  # Return the created vector store


In [31]:
def load_embeddings_chroma(persist_directory='./chroma_db'):
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Instantiate the same embedding model used during creation
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536) 

    # Load a Chroma vector store from the specified directory, using the provided embedding function
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings) 

    return vector_store  # Return the loaded vector store


In [25]:
data = load_document('files/rag_powered_by_google_search.pdf')
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)

Loading files/rag_powered_by_google_search.pdf


In [28]:
q = 'What is Vertex AI Search'
answer = ask_and_get_answer(vector_store, q)
print(answer)

Vertex AI Search is a feature within Google's Vertex AI platform. It offers customizable answers, search tuning, vector search, grounding, and compliance updates for enterprises. It also includes new generative AI capabilities and enterprise-ready features to enhance AI and machine learning tasks related to search functions.


In [24]:
# Load a Chroma vector store from the specified directory (default ./chroma_db) 
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

The StackOverflow dataset used in the context provided had 8 million pairs of questions and answers.


In [30]:
q = 'Multiply that number by 2'
answer = ask_and_get_answer(vector_store, q)
print(answer)

I'm sorry, but there is no specific number provided in the context that can be multiplied by 2.


### Adding Memory (Chat History)

In [36]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain  # Import class for building conversational AI chains 
from langchain.memory import ConversationBufferMemory  # Import memory for storing conversation history

# Instantiate a ChatGPT LLM (temperature controls randomness)
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)  

# Configure vector store to act as a retriever (finding similar items, returning top 5)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})  


# Create a memory buffer to track the conversation
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

crc = ConversationalRetrievalChain.from_llm(
    llm=llm,  # Link the ChatGPT LLM
    retriever=retriever,  # Link the vector store based retriever
    memory=memory,  # Link the conversation memory
    chain_type='stuff',  # Specify the chain type
    verbose=False  # Set to True to enable verbose logging for debugging
)


In [39]:
def ask_question(q, chain):
    result = chain.invoke({'question': q})
    return result

In [32]:
data = load_document('files/rag_powered_by_google_search.pdf')
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)

Loading files/rag_powered_by_google_search.pdf


In [40]:
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)
print(result)

{'question': 'How many pairs of questions and answers had the StackOverflow dataset?', 'chat_history': [HumanMessage(content='How many pairs of questions and answers had the StackOverflow dataset?'), AIMessage(content='The StackOverflow dataset had 8 million pairs of questions and answers.')], 'answer': 'The StackOverflow dataset had 8 million pairs of questions and answers.'}


In [41]:
print(result['answer'])

The StackOverflow dataset had 8 million pairs of questions and answers.


In [43]:
q = 'Multiply that number by 10'
result = ask_question(q, crc)
print(result['answer'])

The result of multiplying 8 million pairs of questions and answers by 10 would be 80 million pairs of questions and answers.


In [46]:
q = 'Devide that result by 80'
result = ask_question(q, crc)
print(result['answer'])

The result of dividing 80 million pairs of questions and answers by 80 is 1 million.


In [45]:
for item in result['chat_history']:
    print(item)

content='How many pairs of questions and answers had the StackOverflow dataset?'
content='The StackOverflow dataset had 8 million pairs of questions and answers.'
content='Multiply that number by 10'
content='Multiplying 8 million pairs of questions and answers by 10 would result in 80 million pairs of questions and answers.'
content='Multiply that number by 10'
content='The result of multiplying 8 million pairs of questions and answers by 10 would be 80 million pairs of questions and answers.'
content='Devide that result by 80'
content='The result of dividing 80 million pairs of questions and answers by 80 is 1 million.'
