# Project: Question Answering on Private Documents

In [5]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [9]:
pip install -q pypdf

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install -q docx2txt

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


In [26]:
pip install -q langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.


In [11]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain_community.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain_community.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()
    return data

# Wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain_community.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data


In [12]:
def chunk_data(data, chunk_size=256):
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)
    return chunks

## Embeding Cost

In [35]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00255:.6f}')

print_embedding_cost(chunks)

Total Tokens: 16711
Embedding Cost in USD: 0.042613


## Embedding and Uploading to a Vector Database (Pinecone)

In [22]:
def insert_or_fetch_embeddings(index_name, chunks):
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    pc = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('OK')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=PodSpec(environment='gcp-starter')
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('OK')
        return vector_store
        

In [3]:
def delete_pinecone_index(index_name='all'):
    import pinecone
    pc = pinecone.Pinecone()
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes ...')
        for index in indexes:
            pc.delete_index(index)
        print('OK')
    else:
        print(f'Deleting index {index_name} ...', end='')
        pc.delete_index(index_name)
        print('OK')

## Asking and Getting Answers

In [26]:
def ask_and_get_answer(vector_store, q, k=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    answer = chain.run(q)
    return answer
    

## Running Code

In [13]:
data = load_document('files/us_constitution.pdf')
# print(data[1],page_content)
# print(data[10].metadata)

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

Loading files/us_constitution.pdf
You have 41 pages in your data
There are 1137 characters in the page


In [24]:
# data = load_document('files/the_great_gatsby.docx')
# print(data[0].page_content)

In [23]:
# data = load_from_wikipedia('GPT-4', 'de')
# print(data[0].page_content)

In [14]:
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

190
Representatives
shall
chuse
their
Speaker
and
other
Of ficers;and
shall
have
the
sole
Power
of
Impeachment.
Section
3:
The
Senate
The
Senate
of
the
United
States
shall
be
composed
of
two
Senators
from
each
State,
chosen
by
the
Legislature
thereof,
for
six


In [6]:
delete_pinecone_index()

Deleting all indexes ...
OK


In [23]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index askadocument and embeddings ...OK


In [28]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

  warn_deprecated(


The pieces of text provided are actually excerpts from the United States Constitution. The Constitution outlines the framework for the government of the United States, establishing the three branches of government (legislative, executive, and judicial) and defining the powers and limitations of each branch. It also guarantees certain rights and freedoms to the people and sets the foundation for the country's laws and governance.


In [29]:
import time
i = 1
print('Write Quit or Exist to quit.')

while True:
    q = input(f'Question #{i}: ')
    i = i + 1
    if q.lower() in ['quit', 'exist']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break

    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

Write Quit or Exist to quit.


Question #{i}:  What is the first amendment of the US Constitution?



Answer: The First Amendment of the US Constitution guarantees freedom of religion, speech, press, assembly, and the right to petition the government.

 -------------------------------------------------- 



Question #{i}:  Explain the concept of "Federalism" as it is presented in US Constitution



Answer: Federalism, as presented in the US Constitution, is the system of government in which power is divided between a central authority (the federal government) and individual states. The Constitution outlines specific powers granted to the federal government, such as national defense and regulating interstate commerce, while reserving other powers to the states. This division of power aims to create a balance between a strong central government and individual state autonomy. The 10th Amendment further reinforces this concept by stating that any powers not explicitly given to the federal government are reserved for the states or the people.

 -------------------------------------------------- 



Question #{i}:  Describe the bill of Rights



Answer: The Bill of Rights is the first ten amendments to the United States Constitution. It includes protections for individual liberties such as the freedom of speech, religion, and the press, the right to assemble peaceably, the right to petition the government, and the right to bear arms. The Bill of Rights also includes protections against unreasonable searches and seizures, the right to a fair trial, and protections against cruel and unusual punishment.

 -------------------------------------------------- 



Question #{i}:  Q1: How does the constitution address the issue of Presidential succession? Q2: Describe the bill of rights. Answer both question



Answer: A1: The Constitution addresses the issue of Presidential succession by stating that in case of the removal, death, or resignation of the President, the Vice President shall become President, and if there is a vacancy in the office of the Vice President, the President shall nominate a Vice President.

A2: The Bill of Rights refers to the first ten amendments to the United States Constitution. These amendments protect individual liberties and rights such as freedom of speech, religion, and the right to a fair trial. They were added to the Constitution to address concerns over individual freedoms and limitations of governmental power.

 -------------------------------------------------- 



Question #{i}:  quit


Quitting ... bye bye!


In [54]:
delete_pinecone_index()

Deleting all indexes ...
OK


In [55]:
data = load_from_wikipedia('ChatGPT', 'id')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Creating index chatgpt and embeddings ...OK


In [57]:
q = "Apa itu chatgpt"
answer = ask_and_get_answer(vector_store, q)
print(answer)

ChatGPT adalah bot obrolan kecerdasan buatan yang merupakan model bahasa generatif yang menggunakan teknologi transformer untuk menghasilkan teks. Model ini dilatih dengan miliaran kalimat dari berbagai sumber untuk memahami berbagai gaya bahasa dan konteks percakapan. Model awalnya berbasis GPT-3.5 dan saat ini telah diperbarui dengan GPT-4 yang dirilis pada 14 Maret 2023.
