Install the required dependencies:

In [1]:
!pip install -q cassio datasets langchain openai tiktoken -U langchain-community

Import the packages you'll need:

In [2]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

In [3]:
!pip install PyPDF2



In [4]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [5]:
from PyPDF2 import PdfReader

### Setup

In [None]:
import os
from getpass import getpass


def _read_secret(env_var: str, prompt: str) -> str:
    """Return the value of ``env_var``, prompting with hidden input when it is unset or blank.

    Args:
        env_var: Name of the environment variable to read first.
        prompt: Text shown to the user if the variable is missing or empty.

    Returns:
        The stripped secret value.
    """
    value = os.environ.get(env_var, "").strip()
    return value or getpass(prompt).strip()


# Credentials must never be committed inside a notebook: they are read from the
# environment, falling back to a hidden prompt so nothing is echoed into the output.
# NOTE(review): a token and database ID were previously hardcoded in this cell;
# treat that token as leaked and revoke it in the Astra console.
ASTRA_DB_APPLICATION_TOKEN = _read_secret(
    "ASTRA_DB_APPLICATION_TOKEN",
    'Astra DB application token (the "AstraCS:..." string found in your Token JSON file): ',
)
ASTRA_DB_ID = _read_secret("ASTRA_DB_ID", "Astra DB database ID: ")

OPENAI_API_KEY = _read_secret("OPENAI_API_KEY", "OpenAI API key: ")

#### Provide your secrets:

The setup cell above needs your Astra DB connection details (application token and database ID) and your OpenAI API key:

In [7]:
# Location of the PDF to index (a relative path, resolved against the current
# working directory). Point this at your own document.
pdf_path = 'UP.pdf'
pdfreader = PdfReader(pdf_path)

In [8]:
# NOTE(review): `Concatenate` is not used in the visible cells — it looks like a
# stray auto-import; confirm nothing later relies on it before removing.
from typing_extensions import Concatenate

# Extract the text of every page and concatenate it into one string.
# Pages for which extract_text() yields nothing (None or "") are skipped.
# A single join avoids rebuilding the string once per page.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = "".join(text for text in page_texts if text)

In [9]:
# Show only the beginning of the extracted text: displaying the whole document
# bloats the notebook and buries the narrative.
raw_text[:1000]

"Uttar Pradesh\ndrishtiias.com\n/printpdf/know-your-state-uttar-pradesh\nState:\n Uttar Pradesh (U.P.)\nFormed:\n 1 April 1937-as United province\nStatehood:\n 26 January 1950-rechristened as Uttar Pradesh\nCapital:\n Lucknow\nGovernor:\n Ram Naik\nChief Minister:\n Yogi Adityanath\nDeputy Chief Minister(s):\n Kesav Prasad Maurya, Dr. Dinesh Sharma\nOfficial language:\n Hindi\nOther languages: \nUrdu, Awadhi, Bundeli, Bhojpuri, etc.\nMajor Religions:\n Hinduism, Islam\nArea: \n2,40,928 Square km (4th largest in India)\nPopulation: \n19,9,812,341 (Most populous in India)\nPopulation density:\n 829 per square km\nSex Ratio:\n 912 per thousand\nLiteracy rate: \n69.72%\nMale literacy:\n 79.24%\nFemale literacy:\n 59.26%\n1/41GDP (2018-19):\n Rs 14.89 Lakh Crore\nHigh Court:\n Allahabad High Court\nDistricts:\n 75\nCities and Towns:\n 915\nDevelopment blocks:\n 822\nNagar Nigams:\n 17\nMembers of Lok Sabha from U.P.:\n 80\nMembers of Rajya Sabha from U.P.:\n 31\nMembers of Legislative Assem

Initialize the connection to your database:

_(do not worry if you see a few warnings, it's just that the drivers are chatty about negotiating protocol versions with the DB.)_

In [10]:
# Open the Astra DB connection using the credentials defined in the setup cell.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

Create the LangChain embedding and LLM objects for later usage:

In [11]:
# Both OpenAI clients share the same API key.
# `embedding` is handed to the vector store; `llm` is used when answering questions.
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

  llm = OpenAI(openai_api_key=OPENAI_API_KEY)
  embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


Create your LangChain vector store ... backed by Astra DB!

In [12]:
# Vector store over the "QA_Mini_Demo" table, using the OpenAI embeddings above.
# session/keyspace are left as None — presumably they are then taken from the
# connection set up by cassio.init(); confirm against the LangChain Cassandra docs.
astra_vector_store = Cassandra(
    table_name="QA_Mini_Demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [13]:
from langchain.text_splitter import CharacterTextSplitter

# Chunking parameters, measured in characters because length_function is `len`.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

# Split the raw text on newlines into overlapping chunks so that each piece
# stays small and does not increase the token size sent per request.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [14]:
# Peek at the first five chunks to sanity-check the split.
texts[:5]

['Uttar Pradesh\ndrishtiias.com\n/printpdf/know-your-state-uttar-pradesh\nState:\n Uttar Pradesh (U.P.)\nFormed:\n 1 April 1937-as United province\nStatehood:\n 26 January 1950-rechristened as Uttar Pradesh\nCapital:\n Lucknow\nGovernor:\n Ram Naik\nChief Minister:\n Yogi Adityanath\nDeputy Chief Minister(s):\n Kesav Prasad Maurya, Dr. Dinesh Sharma\nOfficial language:\n Hindi\nOther languages: \nUrdu, Awadhi, Bundeli, Bhojpuri, etc.\nMajor Religions:\n Hinduism, Islam\nArea: \n2,40,928 Square km (4th largest in India)\nPopulation: \n19,9,812,341 (Most populous in India)\nPopulation density:\n 829 per square km\nSex Ratio:\n 912 per thousand\nLiteracy rate: \n69.72%\nMale literacy:\n 79.24%\nFemale literacy:\n 59.26%\n1/41GDP (2018-19):\n Rs 14.89 Lakh Crore\nHigh Court:\n Allahabad High Court\nDistricts:\n 75\nCities and Towns:\n 915',
 '912 per thousand\nLiteracy rate: \n69.72%\nMale literacy:\n 79.24%\nFemale literacy:\n 59.26%\n1/41GDP (2018-19):\n Rs 14.89 Lakh Crore\nHigh Court:\

### Load the dataset into the vector store



In [15]:

# Embed every chunk and store it in the vector store; add_texts returns the IDs
# of the stored entries, so the count below reflects what was actually written.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm.
inserted_ids = astra_vector_store.add_texts(texts)

# NOTE(review): the message says "headlines" although the items are PDF text
# chunks; the wording is kept as-is because the stored cell output depends on it.
print("Inserted %i headlines." % len(inserted_ids))

# Wrap the store so it can be queried together with an LLM in the QA loop.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 104 headlines.


### Run the QA cycle

Run the cell below and ask a question, or type `quit` to stop. (You can also stop execution with the "▪" button on the top toolbar.)

Here are some suggested questions:
- _What is the current GDP of UP?_
- _What will the agriculture target be increased to, and what will the focus be, in UP?_


In [16]:
# Interactive question/answer loop over the indexed PDF.
# Type 'quit' to leave; empty input is ignored and the prompt is shown again.
first_question = True
while True:
    # The wording of the prompt changes once the first question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )

    try:
        query_text = input(prompt).strip()
    except (EOFError, KeyboardInterrupt):
        # The toolbar stop button (or a closed stdin) interrupts input();
        # leave the loop quietly instead of ending the cell with a traceback.
        break

    if query_text.lower() == "quit":
        break

    if not query_text:
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    # Query the vector index with the question and let the LLM compose the answer.
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)


Enter your question (or type 'quit' to exit): What is the current GDP of UP?

QUESTION: "What is the current GDP of UP?"




ANSWER: "Rs 14.89 Lakh Crore"


What's your next question (or type 'quit' to exit): quit
