Install the required dependencies:

In [1]:
!pip install -q cassio datasets langchain openai tiktoken

Import the packages you'll need:

In [2]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ---------------------------------------- 0.0/232.6 kB ? eta -:--:--
   ---------------------------------------- 232.6/232.6 kB 7.2 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
from PyPDF2 import PdfReader

### Setup

Replace the following with your Astra DB connection details and your OpenAI API key:

In [20]:
ASTRA_DB_APPLICATION_TOKEN = "" # enter the "AstraCS:..." string found in in your Token JSON file
ASTRA_DB_ID = "" # Enter your Database ID

OPENAI_API_KEY = "" # Enter your OpenAI key

In [21]:
# Provide the path of  pdf file.
pdfreader = PdfReader('')

In [22]:
from typing_extensions import Concatenate
# read text from pdf
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [23]:
raw_text



Initialize the connection to your database

In [24]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

Create the LangChain embedding and LLM objects for later usage:

In [25]:
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

Create your LangChain vector store ... backed by Astra DB!

In [26]:
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

In [27]:
from langchain.text_splitter import CharacterTextSplitter
# We need to split the text using Character Text Split such that it should not increase token size
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [28]:
texts[:50]



### Load the dataset into the vector store



In [29]:

astra_vector_store.add_texts(texts[:50])

print("Inserted %i headlines." % len(texts[:50]))

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 40 headlines.


### Run the QA cycle

Simply run the cells and ask a question -- or `quit` to stop. (you can also stop execution with the "▪" button on the top toolbar)





In [30]:
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


QUESTION: "What is RAG system"
ANSWER: "The RAG (Retrieval-Augmented Generation) system is a technology that utilizes text embeddings, vector storing, and similarity search to generate contextually rich responses in document interactions. It is powered by frameworks like LangChain and incorporates components like the Vectorstore Index and Chat API interfaces. It is also used in professional knowledge-based question answering systems."

FIRST DOCUMENTS BY RELEVANCE:
    [0.8994] "the document content.  
 
In a sophisticated symphony of technologies, the intellige ..."
    [0.8829] "availability of Embedding and Chat API interfaces from major foundation model compan ..."
    [0.8827] "of techniques for efficient document analysis and interaction. From traditional appr ..."
    [0.8701] "utilizing AstraDB for document vector storage, LangChain for efficient language proc ..."
