In [13]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_astradb import AstraDBVectorStore

from datasets import load_dataset
import os
from PyPDF2 import PdfReader

from dotenv import load_dotenv
load_dotenv()

True

In [5]:
pdf_reader = PdfReader("RIL-Integrated-Annual-Report-2022-23.pdf")

In [6]:
from typing_extensions import Concatenate

raw_text = ""
for i,page in enumerate(pdf_reader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [11]:
raw_text



In [20]:
llm = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
embedding = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

vstore = AstraDBVectorStore(
    embedding=embedding,
    collection_name="vector_storage",
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    namespace=os.environ["ASTRA_DB_KEYSPACE"],
)

In [21]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=200,
        length_function=len
)

texts = text_splitter.split_text(raw_text)

In [25]:
len(texts)

2599

In [22]:
vstore.add_texts(texts[:50])
astra_vector_index = VectorStoreIndexWrapper(vectorstore=vstore)


In [26]:
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()
    
    if query_text.lower() == "quit":
        break
    if query_text == "":
        continue

    first_question = False

    print("\nQuestion: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc,score in vstore.similarity_search_with_score(query_text, k=4):
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))



Question: "who is chairman"
ANSWER: "Mukesh D. Ambani"
FIRST DOCUMENTS BY RELEVANCE:
    [0.8639] "the right talent and through empowering 
our young leaders, I am confident that 
we  ..."
    [0.8547] "businesses is not possible without a 
robust governance structure. We, as 
a company ..."
    [0.8530] "to ensure our continued alignment to 
best ESG practices and compliance 
with applic ..."
    [0.8515] "is going to be a crucial milestone in 
our history. We, as a company, have 
a proven ..."
