In [8]:

!pip install pdfplumber faiss-cpu sentence-transformers langchain-groq




In [32]:
import os
import pdfplumber
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq

# -----------------------------
# 1. Configuration
# -----------------------------
# Set API Key
# The key is read from the environment instead of being embedded in the
# notebook: a literal credential in source ends up in version control and in
# every shared export of this file.
# NOTE(review): any key that was previously hard-coded here should be treated
# as exposed and rotated in the Groq console.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it in the environment before running."
    )

# Initialize the ChatGroq interface
# temperature=0 requests the least random sampling for factual answers.
llm = ChatGroq(
    temperature=0,
    groq_api_key=groq_api_key,
    model_name="llama-3.3-70b-versatile"
)

# Path to your structured PDF data
pdf_file_path = "data.pdf"

# -----------------------------
# 2. Read and Process Structured PDF Data
# -----------------------------
def load_pdf_text(pdf_path):
    """Return the text of every page in the PDF, one newline after each page.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped entirely, so they contribute no blank line.
    """
    page_chunks = []
    with pdfplumber.open(pdf_path) as document:
        for pg in document.pages:
            extracted = pg.extract_text()
            if not extracted:
                continue
            page_chunks.append(extracted + "\n")
    return "".join(page_chunks)

def structure_text(text):
    """Split extracted text into a ``{title: content}`` mapping.

    A section begins at a line starting with ``"# "``. The rest of that line
    is the section title and every following line, up to the next such
    header, is its content. Deeper headers (``"## ..."``) are not split on
    and stay inside the content. Text that precedes the first header is
    treated as a section whose title is its own first line.

    Sections with an empty title or empty content are dropped. If two
    sections share a title, the later one replaces the earlier one.

    Args:
        text: Full document text with newline-separated lines.

    Returns:
        dict mapping stripped section titles to stripped section bodies, in
        document order.
    """
    # The split below only recognises headers that follow a newline, so a
    # header on the very first line would otherwise keep its "# " marker in
    # the title while every other title has it removed.
    if text.startswith("# "):
        text = text[2:]

    documents = {}
    for section in text.split("\n# "):
        # First line is the title; everything after the first newline is body.
        title, _, body = section.partition("\n")
        title = title.strip()
        content = body.strip()
        if title and content:
            documents[title] = content
    return documents

# Load and process the PDF
pdf_text = load_pdf_text(pdf_file_path)
structured_data = structure_text(pdf_text)

print("Structured sections extracted:", len(structured_data))

# Stop here with a clear message when nothing was extracted; otherwise the
# failure surfaces later as an IndexError on the empty embedding array's shape.
if not structured_data:
    raise ValueError(
        f"No '# '-delimited sections found in {pdf_file_path!r}; nothing to index."
    )

# -----------------------------
# 3. Create FAISS Index for Efficient Search
# -----------------------------
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare document keys and embeddings
# Keys and texts are parallel lists: position i in both refers to the same
# section, and row i of the index is that section's embedding. Only the
# section body is embedded, not its title.
document_keys = list(structured_data)
document_texts = [structured_data[key] for key in document_keys]
doc_embeddings = embedder.encode(document_texts, convert_to_numpy=True)
# FAISS expects a C-contiguous float32 matrix; this is a no-op when the
# encoder already returns one.
doc_embeddings = np.ascontiguousarray(doc_embeddings, dtype=np.float32)

# Create a FAISS index
# IndexFlatL2 performs exact (brute-force) search by L2 distance.
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeddings)
print("FAISS index built with", index.ntotal, "documents.")

# -----------------------------
# 4. Build the RAG Pipeline (with Token Limit Handling)
# -----------------------------
MAX_CONTEXT_LENGTH = 5000  # Cap on context size, measured in characters (not tokens)


def build_prompt(query, top_k=5):
    """Retrieve the sections nearest to *query* and build the LLM prompt.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level FAISS ``index``. Each hit is rendered as
    ``"<title>:\\n<content>"``; hits are joined with blank lines and the
    combined context is cut to ``MAX_CONTEXT_LENGTH`` characters, which may
    end mid-sentence.

    Args:
        query: The user's question.
        top_k: Maximum number of sections to retrieve. Values larger than the
            number of indexed sections are clamped to that number.

    Returns:
        The prompt string containing the instructions, the retrieved context
        and the question.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with -1, which negative indexing would silently resolve to the
    # last document.
    k = min(top_k, index.ntotal)

    query_embedding = embedder.encode(query, convert_to_numpy=True)
    # The index searches a 2-D batch, so wrap the single query vector.
    query_batch = np.asarray([query_embedding], dtype=np.float32)
    _, indices = index.search(query_batch, k)

    retrieved_sections = [
        document_keys[i] + ":\n" + document_texts[i]
        for i in indices[0]
        if i >= 0  # defensive: skip FAISS "no result" padding
    ]

    # Combine retrieved documents while ensuring the character limit
    context = "\n\n".join(retrieved_sections)
    if len(context) > MAX_CONTEXT_LENGTH:
        context = context[:MAX_CONTEXT_LENGTH]  # Truncate excess text

    prompt = f"""
    You are a highly factual AI assistant. Use only the provided context to answer accurately.
    Do NOT add false information. If details are missing, state only the available data.
    
    Context:\n{context}\n\n
    Question: {query}
    Answer:
    """
    return prompt

# -----------------------------
# 5. Test the RAG Chatbot with Improved Handling
# -----------------------------
# Sample question used to exercise the pipeline end to end.
query = "Can you tell me about vivek ?"

# Assemble the retrieval-augmented prompt and show exactly what is sent.
prompt = build_prompt(query, top_k=5)
print("=== Optimized Prompt Sent to Model ===", prompt, sep="\n")

# Send the prompt to the model and display its reply.
response = llm.invoke(prompt)
print("\n=== Generated Answer ===", response.content, sep="\n")


Structured sections extracted: 13
FAISS index built with 13 documents.
=== Optimized Prompt Sent to Model ===

    You are a highly factual AI assistant. Use only the provided context to answer accurately.
    Do NOT add false information. If details are missing, state only the available data.
    
    Context:
Recommendations:
## Recommendation 1
Recommendator's Name: Rakesh Jangam
Recommendator's Position: Was Engineer III & Mentor to Vivek
Recommendator's LinkedIn: https://www.linkedin.com/in/rakesh-j-87b4a7134/
Company: University of Phoenix
Recommendation:
"It has been an absolute pleasure mentoring Vivek Patel during his time as an IT intern in
University of Phoenix. Vivek displayed a genuine eagerness to learn and grow in the field. His
curiosity and dedication to mastering new technologies have been truly impressive. Throughout
his internship, he consistently demonstrated a proactive approach to learning, eagerly seeking
out challenges and opportunities to grow. He actively sou