In [1]:
# Note on the Hugging Face symlink warning (Windows):
#
# To suppress the warning, set the environment variable below in the first
# cell of the notebook (before any Hugging Face import), or set it permanently
# in your system environment:
#
# import os
# os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

In [2]:
# --- Standard Python Imports ---
import os
import warnings
import torch
from dotenv import load_dotenv

# --- LangChain/RAG Component Imports ---
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma # 1. Import ChromaDB
from langchain_anthropic import ChatAnthropic        # 2. Import Anthropic Chat Model
from langchain.chains import RetrievalQA             # 3. Import RAG Chain

# Optional: For Hugging Face login if needed for commercial models
from huggingface_hub import login
# Other imports from original code (kept but not used for RAG core logic)
from numpy.linalg import norm
from numpy import dot
from PIL import Image
import requests 

# Suppress all warnings for cleaner output.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# --- 1. Authentication & Configuration ---
# Credentials are read from the process environment; load_dotenv() is called
# first so values from a local .env file are available too.
load_dotenv()
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
HF_MODEL_NAME = "jinaai/jina-embeddings-v2-base-en"
PERSIST_DIR = "./chroma_db" # Directory to save the vector store

# Hugging Face login is optional: without a token only public access is used.
if huggingface_api_key:
    login(token=huggingface_api_key, add_to_git_credential=False)
    print("Hugging Face login successful.")
else:
    print("⚠ HUGGINGFACE_API_KEY not found. Proceeding with public access only.")

# The Anthropic key is mandatory for this cell. Raise instead of calling
# exit(): inside a notebook exit() asks the kernel to shut down (losing every
# variable), whereas an exception simply fails this cell and stops "Run All".
if not anthropic_api_key:
    raise RuntimeError("❌ ANTHROPIC_API_KEY not found in .env. Cannot initialize Anthropic LLM.")

# --- 2. Data Loading and Chunking ---
print("\n--- 2. Data Loading and Chunking ---")

# Path of the PDF to index, relative to the notebook's working directory.
PDF_PATH = 'attention.pdf'

try:
    loader = PyPDFLoader(PDF_PATH)
    docs = loader.load()
except Exception as e:
    # Re-raise with a hint instead of calling exit(): in a notebook exit()
    # shuts the kernel down, while an exception just fails this cell. The
    # original error stays attached as __cause__ for debugging.
    raise RuntimeError(
        f"❌ Error loading PDF: {e}. Ensure '{PDF_PATH}' is in the current directory."
    ) from e

# Split documents into overlapping chunks for RAG
# (1000-character chunks, 200 characters shared between neighbours).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

# Fail early with a clear message rather than building an empty vector store.
if not documents:
    raise RuntimeError(f"❌ No text chunks were produced from '{PDF_PATH}'.")

print(f"Loaded {len(docs)} pages and split into {len(documents)} chunks.")

# --- 3. Initialize Embeddings ---
print("\n--- 3. Initialize Embeddings ---")

# Run the embedding model on the GPU when PyTorch reports one, else on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Keyword arguments forwarded to the embedding model loader.
embedding_model_kwargs = {
    'device': device,
    'trust_remote_code': True,  # Necessary for Jina models
}

# Build the HuggingFaceEmbeddings wrapper around the Jina embedding model.
text_embeddings = HuggingFaceEmbeddings(
    model_name=HF_MODEL_NAME,
    model_kwargs=embedding_model_kwargs,
)
print(f"Embedding model *{HF_MODEL_NAME}* loaded successfully on {device.upper()}.")

# --- 4. Create and Persist Vector Store (ChromaDB) ---
print("\n--- 4. Creating Vector Store (ChromaDB) ---")

# Give every chunk a stable id derived from its source, page and position.
# Without explicit ids each run stores the chunks under fresh random ids, so
# re-running this cell against the same PERSIST_DIR keeps adding duplicates.
# With stable ids a re-run overwrites the existing entries instead.
# NOTE(review): entries written by earlier runs under random ids are not
# removed; delete PERSIST_DIR once to start from a clean store.
chunk_ids = [
    f"{chunk.metadata.get('source', 'doc')}-p{chunk.metadata.get('page', 'na')}-c{position}"
    for position, chunk in enumerate(documents)
]

# Embed the chunks with the HuggingFace embeddings and store them in Chroma.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=text_embeddings,
    ids=chunk_ids,
    persist_directory=PERSIST_DIR # Persist the database to disk
)

# NOTE: Chroma.from_documents automatically runs the embeddings and stores them.
print(f"ChromaDB created and persisted at '{PERSIST_DIR}' with {vectordb._collection.count()} documents.")

# --- 5. Initialize Anthropic LLM ---
print("\n--- 5. Initializing Anthropic LLM ---")

# Model id is configurable through the ANTHROPIC_MODEL environment variable so
# a newer model can be selected without editing the notebook. The default is
# the snapshot this notebook was written against.
# NOTE(review): dated model snapshots are retired over time; confirm the
# default is still served before relying on it.
ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-3-sonnet-20240229")

# Initialize the ChatAnthropic model using the API key read from the environment.
llm = ChatAnthropic(
    model=ANTHROPIC_MODEL,
    temperature=0.1, # Low temperature for more factual RAG answers
    anthropic_api_key=anthropic_api_key # Explicitly pass the key (optional if env var is set)
)
print(f"Anthropic LLM *{llm.model}* initialized.")

# --- 6. Set up the RAG Chain (RetrievalQA) ---
print("\n--- 6. Setting up RAG Chain ---")

# Expose the vector store as a retriever that asks for the 3 closest chunks.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

# Assemble the question-answering chain:
#   - chain_type "stuff" places all retrieved chunks into a single prompt
#   - return_source_documents keeps the retrieved chunks in the result
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
)
print("RetrievalQA Chain setup complete.")

# --- 7. Answer Queries ---
print("\n--- 7. Answering Queries with Anthropic LLM ---")


def _answer_query(chain, label, query):
    """Run one query through the chain, print the answer and its top source.

    Args:
        chain: Object exposing ``invoke({"query": ...})`` that returns a dict
            with a ``'result'`` entry and, optionally, ``'source_documents'``.
        label: Text printed before the question (e.g. ``"Query 1"``).
        query: The question to ask.

    Returns:
        The dict returned by ``chain.invoke``.
    """
    print(f"\n❓ {label}: {query}")
    result = chain.invoke({"query": query})
    print(f"✅ Answer: {result['result']}")

    # The retriever may return nothing; guard so an empty list does not
    # raise IndexError after the answer has already been produced.
    sources = result.get('source_documents') or []
    if sources:
        top_source = sources[0]
        print(f"Source Document (Page {top_source.metadata.get('page', 'N/A')}): {top_source.page_content[:150]}...")
    else:
        print("Source Document: none retrieved.")
    return result


# Example Query 1
query1 = "What is the key novelty introduced in this paper?"
result1 = _answer_query(qa_chain, "Query 1", query1)

# Example Query 2
query2 = "Describe the multi-head attention mechanism briefly."
result2 = _answer_query(qa_chain, "Query 2", query2)

KeyboardInterrupt: 

In [None]:
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

# --- LangChain RAG Component Imports ---
# Use the same components for data processing and vector store
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFacePipeline  # New LLM Wrapper
from langchain.chains import RetrievalQA

# --- Configuration ---
# Directory used for the on-disk Chroma vector store.
PERSIST_DIR = "./chroma_db"
# Open-source instruction-tuned model used as the LLM.
LLM_MODEL_ID = "NousResearch/Meta-Llama-3-8B-Instruct"
# Embedding model (Jina embeddings; a simple sentence-transformer would also do).
EMBEDDING_MODEL_ID = "jinaai/jina-embeddings-v2-base-en"
# Pick the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- 1. Model Initialization (REPLACED ANTHROPIC) ---
print("\n--- 1. Initializing Hugging Face LLM ---")

# Try to load the configured model with 4-bit quantization. On any failure the
# error is printed and a small fallback model is loaded instead, so the rest of
# the notebook can still run (with much lower answer quality).
try:
    # 1. Configuration for 4-bit Quantization (Essential for 8B models on consumer GPUs)
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for better performance
    )

    # 2. Load Tokenizer and Model
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_ID,
        quantization_config=nf4_config,
        device_map="auto" # Automatically maps model layers to available hardware
    )

    # 3. Create the Hugging Face Pipeline
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,      # Max tokens for the answer generation
        # temperature only takes effect when sampling is enabled; request it
        # explicitly rather than relying on the checkpoint's generation config.
        do_sample=True,
        temperature=0.1,
        return_full_text=False   # Ensures only the generated text is returned
    )

    # 4. Wrap the pipeline in LangChain's HuggingFacePipeline
    llm = HuggingFacePipeline(pipeline=pipe)
    print(f"Hugging Face LLM *{LLM_MODEL_ID}* initialized with 4-bit quantization on {device.upper()}.")

except Exception as e:
    # Deliberate best-effort fallback: report the failure, then continue with
    # a tiny model. HuggingFacePipeline is already imported at the top of the cell.
    print(f"❌ Error initializing Hugging Face model: {e}")
    print("Falling back to a small, CPU-friendly model (distilgpt2). Performance will be low.")
    llm = HuggingFacePipeline.from_model_id(
        model_id="distilgpt2",
        task="text-generation",
        pipeline_kwargs={"max_new_tokens": 100}
    )

# --- 2. Data Loading, Chunking, and Embeddings (Using the same logic) ---
print("\n--- 2. Setting up RAG Components ---")

# (Skipping PDF loading/chunking for brevity: a single hand-written document
# stands in for the PDF so this cell can run on its own.)
# --- Start of placeholder code for RAG components ---
from langchain_core.documents import Document
docs = [Document(page_content="The Transformer paper is called Attention Is All You Need and relies solely on attention mechanisms.", metadata={"page": 0})]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

# The Jina embedding model ships custom model code, so it must be loaded with
# trust_remote_code=True; the device is passed explicitly as well.
text_embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_ID,
    model_kwargs={
        'device': device,
        'trust_remote_code': True,
    },
)

# Keep the placeholder data in its own collection with stable ids:
#   - a dedicated collection_name stops the demo document from being mixed
#     with other data persisted under PERSIST_DIR in the default collection;
#   - fixed ids make re-running this cell overwrite the entry instead of
#     adding another copy on every run.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=text_embeddings,
    ids=[f"placeholder-{position}" for position in range(len(documents))],
    collection_name="placeholder_demo",
    persist_directory=PERSIST_DIR,
)
# --- End of placeholder code ---

# --- 3. Set up the RAG Chain (RetrievalQA) ---
print("\n--- 3. Setting up RAG Chain ---")

# Retriever over the vector store, asking for the 3 closest chunks per query.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

# Question-answering chain: retrieved chunks are "stuffed" into one prompt and
# the chunks themselves are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
)
print("RetrievalQA Chain setup complete.")


# --- 4. Answer Queries ---
print("\n--- 4. Answering Queries with Hugging Face LLM ---")

# Ask a single example question and show the generated answer.
query1 = "What is the key novelty introduced in this paper?"
print(f"\n❓ Query 1: {query1}")

result1 = qa_chain.invoke(dict(query=query1))
answer_text = result1['result']
print(f"✅ Answer: {answer_text}")