In [3]:
# Langchain Components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings

# Support for dataset retrieval with Huggingface 
from datasets import load_dataset

# CassIO is the engine powering the Astra DB integration in LangChain.
# It is also used below to initialize the DB connection.
import cassio

import os
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
# LangSmith tracing setup.
# The API key is read from the environment (load_dotenv() ran in the import
# cell). Assigning None into os.environ raises TypeError, so tracing is only
# switched on when a key is actually available.
os.environ['LANGCHAIN_PROJECT'] = 'Langchain - Text To Math Problem Solver and Data Search Assistant'

langsmith_api_key = os.getenv('LANGCHAIN_API_KEY')
if langsmith_api_key:
    os.environ['LANGCHAIN_API_KEY'] = langsmith_api_key
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    # Without a key, leave tracing off rather than failing the whole cell.
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled.")

In [4]:
from PyPDF2 import PdfReader

### SETUP

In [5]:
# Astra DB credentials are taken from the environment (or the .env file loaded
# in the import cell) so real secrets never have to be written into the
# notebook. The placeholder strings remain as a fallback when nothing is set.
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN", "enter-your-token")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID", "enter-your-id")

# Groq API key; None when the variable is not defined.
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

1. Read the Document


In [9]:
# Open the source PDF (path is relative to the current working directory).
pdf_path = 'LLM.pdf'
pdfreader = PdfReader(pdf_path)

In [10]:
from typing_extensions import Concatenate

# Read text from PDF.
# Pages for which extract_text() returns nothing are skipped; the remaining
# page texts are concatenated in page order with no separator between them.
# Joining once avoids repeatedly rebuilding the string with `+=`.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(page_text for page_text in page_texts if page_text)

Initialize the connection to the database.

In [12]:
# Initialize CassIO with the Astra DB token and database id configured earlier.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

Create the LangChain embedding and LLM objects

In [17]:
# Chat model accessed through Groq; passed to the index when answering queries.
# NOTE(review): confirm this model id is still served by Groq before re-running.
llm = ChatGroq(
    groq_api_key=GROQ_API_KEY,
    model="llama3-8b-8192",
)

# Hugging Face embedding model handed to the vector store.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")



Create the LangChain vector store, backed by Astra DB

In [16]:
# Vector store over the "qa_mini_demo" table, using the embedding model above.
# session and keyspace are passed as None (the defaults); presumably the
# connection registered through cassio.init is used instead — TODO confirm.
astra_vector_store = Cassandra(
    embedding=embedding,
    session=None,
    keyspace=None,
    table_name="qa_mini_demo",
)

2. Text Chunks

In [18]:
from langchain_text_splitters import CharacterTextSplitter

# Split the raw PDF text on newlines into chunks of up to 800 characters
# (measured with len), with 200 characters of overlap between chunks.
splitter_settings = {
    "separator": "\n",
    "chunk_size": 800,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_settings)

texts = text_splitter.split_text(raw_text)

3. Load the Dataset into the VectorStore

In [23]:
import hashlib

# Derive a stable id for every chunk from its position and content. With
# explicit ids, re-running this cell is expected to overwrite the same rows
# instead of inserting duplicates under fresh random ids.
chunk_ids = [
    hashlib.sha256(("%d:%s" % (position, chunk)).encode("utf-8", "replace")).hexdigest()
    for position, chunk in enumerate(texts)
]
astra_vector_store.add_texts(texts, ids=chunk_ids)

# The inserted items are PDF text chunks, not headlines.
print('Inserted %i text chunks.' % len(texts))

# Wrap the store so it can be queried together with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore = astra_vector_store)

Inserted 119 headlines.


4. Run the QA Cycle

In [1]:
# Interactive question/answer loop: keeps prompting until the user types 'quit'.
first_question = True
while True:
    # The wording of the prompt changes once a question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Blank input: ask again without counting it as a question.
        continue

    first_question = False
    print("\nQUESTION: \"%s\"" % query_text)

    # Querying the vector store index, with the LLM producing the answer
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    # Show the four closest stored chunks with their similarity scores.
    print("FIRST DOCUMENTS BY RELEVANCE:")
    ranked_hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for hit, relevance in ranked_hits:
        snippet = hit.page_content[:84]
        print("    [%0.4f] \"%s ...\"" % (relevance, snippet))