In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
import os

In [None]:
# Load the PDF documents found in a directory.
def pdf_file_loader(file):
    """Load PDF files from a directory as documents.

    Args:
        file: Path handed to ``DirectoryLoader`` as the location to scan.
            Despite the parameter name, the call site passes a directory
            (``'dataset/'``), not a single file.

    Returns:
        The result of ``DirectoryLoader.load()`` for entries matching the
        ``*.pdf`` glob, each one read with ``PyPDFLoader``.
    """
    return DirectoryLoader(file, glob=["*.pdf"], loader_cls=PyPDFLoader).load()


In [11]:
# Read every PDF under the local "dataset/" directory.
data = pdf_file_loader(file="dataset/")

In [None]:
# Show only the first few loaded documents; displaying the whole list floods
# the cell output and bloats the saved notebook.
data[:3]

In [13]:
# Text chunking

def text_chunk(data, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping character chunks.

    Args:
        data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value previously hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_chunker.split_documents(data)

In [16]:
# Chunk the loaded documents; this list is what a later cell passes to
# vstore.add_documents.
String_bits = text_chunk(data)

In [23]:
from langchain_huggingface import HuggingFaceEmbeddings 

In [28]:
def HGF_embedder(model: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Create a Hugging Face embedding model wrapper.

    Args:
        model: Model name forwarded to ``HuggingFaceEmbeddings`` as
            ``model_name``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model)

In [29]:
# Build the embedding model using HGF_embedder's default model name.
text_embedding_model = HGF_embedder()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
#from langchain.vectorstores.cassandra import Cassandra

In [33]:
# Astra DB credentials are read from the environment; each name is None when
# its variable is not set.
ASTRA_DB_TOKEN = os.getenv("ASTRA_DB_TOKEN")
ASTRA_DB_API = os.getenv("ASTRA_DB_API")

In [41]:
# Report only whether the token is present: printing the value itself would
# store the secret in the notebook's saved output.
print("ASTRA_DB_TOKEN is set" if ASTRA_DB_TOKEN else "ASTRA_DB_TOKEN is NOT set")

None


In [None]:
from langchain_astradb import AstraDBVectorStore

# Connect to the "DocStore" collection. Credentials are the values read from
# the environment into ASTRA_DB_TOKEN / ASTRA_DB_API, so no secret is written
# into the notebook. (The previous version passed empty strings and was
# missing the comma after `token`, which made the cell a SyntaxError.)
vstore = AstraDBVectorStore(
    collection_name="DocStore",
    embedding=text_embedding_model,
    token=ASTRA_DB_TOKEN,
    api_endpoint=ASTRA_DB_API,
)

In [39]:
# Store every chunk in the vector store; the cell output is the list of ids
# returned by add_documents.
# NOTE(review): presumably re-running this cell inserts the same chunks again
# under new ids — confirm before re-executing.
vstore.add_documents(String_bits)

['0fb50148acb54d059275d77e9de6c938',
 '0ac4d444fb4d42479ba2ad7f444d1981',
 'a786a86afe1c47dc9f0319c46d6a57b3',
 'cff49a9dc8444855905a94b56524899a',
 'b2265d95b9e84d9fb81198a138954eda',
 '74f07a917b35462eb54df91aaace6a33',
 'e56aab3d15884289b2cdc1788ef197ee',
 '8be1a1c546a24fe597632a73b33036c5',
 'cc0943b7fe414daeb8e8f944c2e1c383',
 'd276ad1a4bc8408dac301809b10c6733',
 'f97b7538698e4ad8ae95bc5ff0f6ebd6',
 'd0947e3a63054ad69427277d0273fee1',
 'aa32b0c6b6e34347b216093971799a9a',
 'e3fa5d635b204921826bbee1c26dbe45',
 '328eb3ae3fae45a5900be3974a9510a4',
 '98206aa3384f42a9805b22a872c1726b',
 '48f41f0b509445aa9936799404604c95',
 '5376a57dd3e347029598e250df366cc1',
 '357a118c6d0a47f3b0f435e4cc2ebe86',
 'b711b9244cd94044bf0c66c673b0ee40',
 '4ee021a6b77d4ae596beca4af3eae802',
 '163e501ea4c94192a3369d17d7d77a51',
 '84b2c0fc19b04a6293b12abb6e9aaa57',
 '25eed9e141124fd0bef830d8e30a13d5',
 '8b68e1efa1a640a0b6606b1a8f035132',
 '7a03f35ad8e343dab8d6cf3f713fc634',
 '5e1bb381ad4142aa96702f72136f3625',
 

In [None]:
import os
from dotenv import load_dotenv

# LangChain components
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

# Load environment variables (like GOOGLE_API_KEY)
load_dotenv()

# --- 1. Set up Google API Key ---
# LangChain will automatically look for GOOGLE_API_KEY environment variable.
# If not set, you can pass it directly:
# os.environ["GOOGLE_API_KEY"] = "YOUR_API_KEY"

# --- 2. Load Documents ---
# A small text file stands in for a real document so the example is
# self-contained: the content below is written to 'sample.txt' and then
# loaded back with TextLoader.
dummy_content = """
The quick brown fox jumps over the lazy dog. This is a classic pangram.
Artificial intelligence is rapidly advancing, with large language models
like Gemini playing a significant role in text generation and understanding.
Machine learning powers many modern applications, from recommendation systems
to autonomous vehicles. Natural Language Processing (NLP) is a subfield of AI
focused on the interaction between computers and human language.
Vector databases are essential for efficient retrieval in RAG applications.
"""
# Write and read with the same explicit encoding so the round trip does not
# depend on the platform's default text encoding.
with open("sample.txt", "w", encoding="utf-8") as f:
    f.write(dummy_content)

# To load an actual PDF instead, use PyPDFLoader:
# loader = PyPDFLoader("sample.pdf")
# documents = loader.load()

# For our dummy text file:
from langchain_community.document_loaders import TextLoader
loader = TextLoader("sample.txt", encoding="utf-8")
documents = loader.load()


# --- 3. Split Documents into Chunks ---
# Break the loaded documents into chunks and report how many chunks the
# input produced.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Max number of characters in a chunk
    chunk_overlap=200 # Overlap between chunks to maintain context
)
docs = text_splitter.split_documents(documents)
print(f"Split {len(documents)} document(s) into {len(docs)} chunks.")

# --- 4. Create Embeddings and Vector Store ---
# GoogleGenerativeAIEmbeddings is the class for Gemini embeddings.
# NOTE(review): "models/embedding-001" is the model chosen here — confirm it
# is still the one you want to use.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") # Recommended embedding model for Gemini

# Create a FAISS vector store (in-memory for this example) from the chunks
# and the embedding model above.
vector_store = FAISS.from_documents(docs, embeddings)
print("Vector store created and populated.")

# --- 5. Initialize the Gemini Chat Model ---
# ChatGoogleGenerativeAI is the class for Gemini chat models in LangChain
# model="gemini-pro" is the text-only Gemini model
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.0) # temperature=0.0 for deterministic answers

# Read the identifier back from `model`, the same field the constructor
# argument above populates.
# NOTE(review): `model_name` does not appear to be a declared field on this
# class — verify against the installed langchain-google-genai version.
print(f"Initialized Gemini model: {llm.model}")

# --- 6. Set up the QA Chain ---
# We'll use a standard QA chain for RAG.
# The prompt template below has two placeholders, {context} and {question},
# and instructs the model to answer from the supplied context only.
prompt_template = """
Answer the question based on the provided context only.
If the answer is not found in the context, politely state that you don't have enough information.

Context:
{context}

Question:
{question}

Answer:
"""
# input_variables must list exactly the placeholders used in the template.
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Load the QA chain, specifying the Gemini LLM and prompt
chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT) # 'stuff' combines all docs into one prompt

# --- 7. Perform QA ---
def _ask(query, banner, failure_hint=None):
    """Retrieve context for a question, run the QA chain, and print each step.

    Args:
        query: The question text; used both for the similarity search and as
            the chain's ``question`` input.
        banner: Text printed between ``---`` markers before the question.
        failure_hint: Optional extra line printed after the error message
            when the chain call fails.

    Returns:
        A ``(documents, answer)`` tuple: the documents returned by
        ``vector_store.similarity_search`` and the chain's answer, or ``None``
        for the answer when the chain call raised.
    """
    print(f"\n--- {banner} ---")
    print(f"Question: {query}")

    # Retrieve relevant documents from the vector store
    found = vector_store.similarity_search(query)
    print(f"Retrieved {len(found)} relevant document(s).")

    # Run the QA chain. `invoke` replaces the deprecated `run`; it returns a
    # dict, and the combined answer is stored under "output_text".
    try:
        result = chain.invoke({"input_documents": found, "question": query})
        answer = result["output_text"]
        print("\n--- Answer ---")
        print(answer)
    except Exception as e:
        # Best effort: report the failure and keep the notebook running.
        print(f"An error occurred during QA: {e}")
        if failure_hint:
            print(failure_hint)
        answer = None
    return found, answer


question = "What are large language models used for and what role does Gemini play?"
retrieved_docs, response = _ask(
    question,
    "Asking Question",
    failure_hint="Please ensure your GOOGLE_API_KEY is correct and the Gemini model is accessible.",
)


# --- Another Question ---
# This one cannot be answered from the sample text, so it exercises the
# prompt's "not enough information" instruction.
question_2 = "What is the capital of France according to the document?"
retrieved_docs_2, response_2 = _ask(question_2, "Asking Another Question")