In [36]:
# !pip install PyPDF2 pdfplumber langchain faiss-cpu sentence-transformers groq

In [3]:
import pdfplumber

In [8]:
import pdfplumber

def parse_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF as a single string.

    Each page is read with ``extract_text()``. Pages that yield no text
    (``None`` or an empty string) are skipped; every other page contributes
    its stripped text followed by a newline.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        The concatenated page texts ("" when no page yields any text).
    """
    with pdfplumber.open(pdf_path) as document:
        # Evaluated lazily, but fully consumed by join() before the file closes.
        raw_pages = (page.extract_text() for page in document.pages)
        return "".join(f"{raw.strip()}\n" for raw in raw_pages if raw)


In [55]:
# Raw text of the assignment PDF; the chunking cell consumes this.
pdf_text = parse_pdf(pdf_path="AICommunity_Assignment_25.pdf")

In [56]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(text: str, chunk_size=500, chunk_overlap=50) -> list[str]:
    """Split ``text`` into overlapping chunks with a recursive splitter.

    Args:
        text: The text to split.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The chunks produced by ``split_text``, in order.
    """
    # Tried in order: paragraph breaks, line breaks, spaces, then single characters.
    separator_priority = ["\n\n", "\n", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=separator_priority,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = text_splitter.split_text(text)
    return pieces

# Split the extracted PDF text using chunk_text's default size and overlap.
chunks = chunk_text(text=pdf_text)

In [57]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

def build_index(chunks: list[str]) -> FAISS:
    """Embed ``chunks`` and load them into a FAISS vector store.

    Args:
        chunks: Text chunks to index.

    Returns:
        The store created by ``FAISS.from_texts`` using
        "all-MiniLM-L6-v2" HuggingFace embeddings.
    """
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    index = FAISS.from_texts(chunks, embedder)
    return index

# Build the index from the chunks, then persist it to the "faiss_index" folder for reuse.
vector_db = build_index(chunks=chunks)
vector_db.save_local(folder_path="faiss_index")

In [49]:
def retrieve_context(query: str, vector_db: "FAISS", k=3) -> list:
    """Fetch the ``k`` chunks most similar to ``query`` from the vector store.

    Args:
        query: Natural-language question to search for.
        vector_db: Vector store to query (anything exposing
            ``similarity_search(query, k=...)``).
        k: Number of results to request.

    Returns:
        Whatever ``similarity_search`` returns — for a LangChain store, a list
        of document objects whose text lives in ``.page_content`` (not plain
        strings).
    """
    # Direct similarity search replaces as_retriever().get_relevant_documents(),
    # which is deprecated in recent LangChain releases; a default retriever
    # delegates to this same call.
    return vector_db.similarity_search(query, k=k)

In [29]:
from groq import Groq

def generate_answer(query: str, context: list, model: str = "llama3-70b-8192", api_key=None) -> str:
    """
    Answer a question with a Groq-hosted chat model, restricted to the given context.

    Args:
        query: User question.
        context: Retrieved chunks. Each item is either an object exposing
            ``page_content`` (e.g. a LangChain document) or a plain string.
        model: Groq model id (confirm exact name via Groq console).
        api_key: Groq API key. When omitted, the ``GROQ_API_KEY`` environment
            variable is used so that no secret is stored in the notebook.

    Returns:
        The generated answer string. On any failure (missing key, API error,
        malformed response) a string starting with "Error generating answer: "
        is returned instead of raising.
    """
    import os  # local import keeps this cell runnable on its own

    try:
        key = api_key or os.environ.get("GROQ_API_KEY")
        if not key:
            raise ValueError("Groq API key missing: set GROQ_API_KEY or pass api_key")
        client = Groq(api_key=key)

        # Accept both document objects and raw strings, separated by blank lines.
        context_str = "\n\n".join(
            c.page_content if hasattr(c, "page_content") else str(c) for c in context
        )

        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": f"""Answer ONLY using the context below. If unsure, say "I don't know".
                    
                    Context:
                    {context_str}
                    """
                },
                {"role": "user", "content": query}
            ],
            model=model,
            temperature=0.3,  # Balance creativity/factuality
            max_tokens=512     # Control response length
        )

        return response.choices[0].message.content

    except Exception as e:
        # Deliberate best-effort: callers receive the error as text.
        return f"Error generating answer: {str(e)}"

In [58]:
# Question to ask, and the stored chunks retrieved for it.
query = "what is the first question in the document?"
context_chunks = retrieve_context(query=query, vector_db=vector_db)

In [59]:
# Ask the model, grounded in the retrieved chunks, and show its reply.
answer = generate_answer(query=query, context=context_chunks)
print(answer)

I don't know. The document does not explicitly state the first question. It provides general guidelines and instructions for the assignment.


In [34]:
# client = Groq(api_key=os.environ["GROQ_API_KEY"])  # key redacted: load it from the environment and revoke the one previously committed here
# models = client.models.list()

# for model in models.data:
#     print(model.id)