In [2]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings

from datasets import load_dataset

import cassio

In [4]:
!pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
from PyPDF2 import PdfReader

## Setup

In [6]:
import os 
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment so the
# os.getenv() lookups in later cells can find the Astra DB / Groq credentials.
load_dotenv()

True

In [10]:
# Astra DB credentials come from the environment (never hard-coded).
# Either value is None when the corresponding variable is not set.
astra_db_application_token = os.environ.get("ASTRA_DB_APPLICATION_TOKEN")
astra_db_id = os.environ.get("ASTRA_DB_ID")

In [11]:
### Embeddings
# Sentence-transformers model handed to the vector store further down.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [12]:
# Open the source PDF. A forward slash is used as the separator: the previous
# "\L" was an invalid escape sequence inside a normal string literal and only
# resolved to a real path on Windows. "/" works on Windows, Linux and macOS.
pdfreader = PdfReader("Research Papers/LLm.pdf")

In [13]:
from typing_extensions import Concatenate

# Concatenate the extracted text of every page into a single string.
# Pages for which extract_text() yields nothing (empty/None) contribute "".
# str.join builds the result in one pass instead of repeated `+=` copies.
raw_text = "".join(page.extract_text() or "" for page in pdfreader.pages)

In [14]:
# Initialise cassio with the Astra DB credentials read from the environment.
cassio.init(
    database_id=astra_db_id,
    token=astra_db_application_token,
)

In [18]:
# Chat model used to answer questions; the Groq API key is read from the
# environment rather than being written into the notebook.
llm = ChatGroq(
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model="openai/gpt-oss-20b",
)

In [19]:
# Vector store over the "qa_mini_db" table, embedding text with `embeddings`.
# session/keyspace are left as None — presumably resolved from the earlier
# cassio.init() call; verify against the Cassandra vector store docs.
astra_vector_stores = Cassandra(
    table_name="qa_mini_db",
    embedding=embeddings,
    keyspace=None,
    session=None,
)

In [20]:
from langchain_text_splitters import CharacterTextSplitter

# Split the PDF text on newlines into chunks of at most 2000 characters
# (measured with len), letting consecutive chunks overlap by 200 characters.
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    separator="\n",
    length_function=len,
)

# List of chunk strings that will be embedded and stored.
texts = text_splitter.split_text(raw_text)

In [22]:
# Sanity check: number of chunks produced by the splitter.
len(texts)

155

In [24]:
# Embed every chunk and insert it into the Astra DB table.
# NOTE(review): this cell is not idempotent — re-running it calls add_texts
# again and presumably stores the same chunks a second time; confirm before
# re-executing against a populated table.
astra_vector_stores.add_texts(texts=texts)

# The inserted items are PDF text chunks, not "headlines" (tutorial leftover).
print(f"Inserted {len(texts)} chunks.")

# Wrapper that exposes a question-answering `query()` over the vector store.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_stores)

Inserted 155 headlines.


# Run QA Cycle

In [25]:
# Interactive question/answer loop.
# Each round: read a question, answer it through the vector index + LLM, then
# list the four best-matching chunks with their similarity scores.
# Typing "quit" (any letter case) ends the loop; blank input re-prompts.
first_question = True
while True:
    # The wording of the prompt changes once a first question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if not query_text:
        # Nothing typed: ask again without counting it as a question.
        continue
    if query_text.lower() == "quit":
        break

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    hits = astra_vector_stores.similarity_search_with_score(query_text, k=4)
    for hit, relevance in hits:
        # Show only the first 84 characters of each chunk as a preview.
        print("    [%0.4f] \"%s ...\"" % (relevance, hit.page_content[:84]))


QUESTION: "What is self attention?"
ANSWER: "**Self‑attention** is the core operation that lets a transformer model “look at” every other token in the same sequence when computing the representation for a particular token.  

- **How it works**  
  1. For each token in the input, the model computes three vectors: a **query** (Q), a **key** (K) and a **value** (V).  
  2. The relevance of every other token to the current token is measured by the dot‑product of its query with all the keys:  
     \[
     \text{score}_{ij} = \frac{Q_i \cdot K_j}{\sqrt{d_k}}
     \]  
     (the division by \(\sqrt{d_k}\) stabilises gradients).  
  3. The scores are passed through a softmax to obtain attention weights that sum to one.  
  4. Each token’s new representation is the weighted sum of all values:  
     \[
     \text{output}_i = \sum_j \text{softmax}(\text{score}_{ij}) \, V_j
     \]  

- **Why it’s called “self‑attention”**  
  The queries, keys and values all come from the *same* block of toke