Get Answers from a PDF Using LangChain and Astra DB

In [None]:
! pip install -q cassio datasets langchain openai tiktoken

In [None]:
pip install -U langchain-community

In [None]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra # libaries in langchain help to connect with cassandra db and perform necessary tasks like text embeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper # wrap vector in one specific  package
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings # convert text to vectors

In [None]:
from datasets import load_dataset # to use dataset from huggingface

In [None]:
import cassio # for Astra DB integration in Langchain. Also, to initialize the DB coonection

In [None]:
!pip install PyPDF2 # library to read text present inside the pdf

In [None]:
from PyPDF2 import PdfReader # to read text from pdf

In [None]:
# Credentials for connecting to the Astra DB instance hosted in the cloud.
# Each value is read from the environment variable of the same name when it is
# set, so real secrets do not have to be saved inside the notebook; otherwise
# the placeholder is used and must be replaced by hand.
import os

ASTRA_DB_APPLICATION_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN', 'Enter your token here')
ASTRA_DB_ID = os.environ.get('ASTRA_DB_ID', 'Enter your ID here')

In [None]:
# API key used by the OpenAI LLM and embedding clients.
# Read from the OPENAI_API_KEY environment variable when it is set, so the real
# key does not have to be saved inside the notebook; otherwise the placeholder
# is used and must be replaced by hand.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "Enter your key here")

In [None]:
# Open the PDF to be queried. The path is hard-coded; change it to point at
# your own file. The reader's pages are iterated below to extract the text.
pdfreader = PdfReader('/content/budget_speech.pdf')

In [None]:
from typing_extensions import Concatenate  # not referenced in this cell

# Concatenate the extractable text of every page into one string, skipping
# pages for which extract_text() returns nothing. The result keeps a single
# leading space.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = " " + "".join(text for text in page_texts if text)

In [None]:
# Show the full extracted text as the cell output for a quick visual check.
raw_text

In [None]:
# Initialize the cassio connection to the Astra DB database, passing the
# application token and database ID defined above.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [None]:
# Create the LangChain LLM and embedding objects, both authenticated with
# OPENAI_API_KEY.
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
# The keyword must be spelled `openai_api_key`; any other spelling is not a
# recognized parameter and the key would not reach the embeddings client.
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [None]:
# Create the LangChain vector store backed by the Astra DB table
# "qa_mini_demo", using `embedding` to vectorize the stored texts.
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    # Pass the None object (not the string "None") so the store falls back to
    # the session and keyspace configured by cassio.init().
    session=None,
    keyspace=None,
)

In [None]:
# The splitter lives in the `langchain.text_splitter` module (with an underscore).
from langchain.text_splitter import CharacterTextSplitter

# Split the extracted text on newlines into chunks of at most roughly 800
# characters, with 200 characters of overlap between consecutive chunks, so
# each chunk stays small enough to embed without exceeding token limits.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,  # chunk sizes are measured in characters
)

texts = text_splitter.split_text(raw_text)

In [None]:
texts[:50] # preview the first 50 text chunks (fewer if the split produced fewer)

In [None]:
# Embed the first 50 text chunks (or fewer, if the split produced fewer) and
# store them in the Astra DB vector store.
first_chunks = texts[:50]
astra_vector_store.add_texts(first_chunks)
print("Inserted %i headlines." % len(first_chunks))

# Wrap the store so it can be queried together with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

In [None]:
# Interactive question-answering loop over the indexed PDF text.
# Reads questions from standard input until the user types 'quit'; blank input
# is ignored. For each question it prints the LLM answer obtained through the
# vector index, followed by the four most similar stored chunks with their
# similarity scores.
first_question = True
while True:
    if first_question:
        prompt = "\nEnter your question (or type 'quit' to exit):"
    else:
        prompt = "\nWhat's your next question (or type 'quit' to exit):"
    query_text = input(prompt).strip()

    if query_text.lower() == 'quit':
        break

    if query_text == '':
        continue

    first_question = False

    # The % operator is applied to the string so the question is interpolated.
    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    # A trailing newline separates the answer from the document list.
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELAVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        # Show only the first 84 characters of each matching chunk.
        print("     [%0.4f]  \"%s ...\"" % (score, doc.page_content[:84]))

