## Imports

In [10]:
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage
from PyPDF2 import PdfReader
#import sqlite3

## Groq API key

In [11]:
# Let python-dotenv populate the process environment, then read the Groq key
# (the lookup yields None when the variable is not set).
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

## RAG pipeline

In [12]:

def load_pdfs(folder_path):
    """Read every PDF in a folder and return the extracted text of each one.

    Args:
        folder_path: Directory to scan (not recursive). Files are selected by
            a case-insensitive ``.pdf`` extension.

    Returns:
        A list of strings, one per PDF, in sorted filename order. Each string
        is the concatenation of the text of all pages of that PDF; a PDF with
        no extractable text contributes an empty string.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    pdf_texts = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and therefore the chunk order) reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(folder_path, filename))
        # A page without a text layer may yield None (or ""), which would break
        # plain string concatenation, so treat it as empty text. join() also
        # avoids repeated string re-allocation on long documents.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        pdf_texts.append(text)
    return pdf_texts

def split_into_chunks(texts, chunk_size=1000, chunk_overlap=200):
    """Split each text with a RecursiveCharacterTextSplitter and flatten the result.

    Args:
        texts: Iterable of strings to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        A single flat list holding the pieces of every input text, in input order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return [piece for document in texts for piece in splitter.split_text(document)]

def setup_chroma(chunks, model_name="sentence-transformers/all-mpnet-base-v2", persist_directory="chroma_data"):
    """Embed text chunks and store them in a Chroma vector store on disk.

    Args:
        chunks: Iterable of text chunks to index. Must not be empty.
        model_name: Model name handed to ``HuggingFaceEmbeddings``.
        persist_directory: Directory handed to Chroma for on-disk storage.

    Returns:
        The Chroma vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty (for example, the folder held no
            PDFs, or none of them had extractable text).
    """
    chunks = list(chunks)
    if not chunks:
        # Fail before loading the embedding model: an empty index is never
        # useful, and the failure from inside the vector store is far less clear.
        raise ValueError(
            "No text chunks to index; check that the PDF folder is not empty "
            "and that the PDFs contain extractable text."
        )

    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = Chroma.from_texts(texts=chunks, embedding=embedding_model, persist_directory=persist_directory)

    # NOTE(review): explicit persist() is reported as deprecated/removed in newer
    # Chroma wrappers, which write to persist_directory automatically - confirm
    # against the installed version. Calling it only when present keeps both working.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()  # Save to disk
    return vectorstore

def query_chroma(vectorstore, query, k=3):
    """Return the result of ``vectorstore.similarity_search`` for ``query``.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        query: Search text.
        k: Number of results to request.
    """
    return vectorstore.similarity_search(query, k=k)
    
def setup_llm(model_name="llama-3.1-70b-versatile", temperature=0):
    """Build a ChatGroq chat model.

    Args:
        model_name: Model identifier passed to ChatGroq as ``model``.
            NOTE(review): confirm the default id is still served by Groq.
        temperature: Sampling temperature passed to ChatGroq.

    Returns:
        The configured ChatGroq instance.
    """
    chat_model = ChatGroq(temperature=temperature, model=model_name)
    return chat_model

def rag_workflow(query, vectorstore, llm, k=3):
    """Answer a query using retrieved chunks as context for the chat model.

    Args:
        query: The user's question.
        vectorstore: Vector store searched through ``query_chroma``.
        llm: Chat model exposing ``invoke``.
        k: Number of chunks to retrieve.

    Returns:
        The ``content`` of the model's response.
    """
    # Retrieve relevant documents
    docs = query_chroma(vectorstore, query, k=k)
    context = "\n\n".join(doc.page_content for doc in docs)

    # Generate response
    prompt = f"You are a helpful assistant. Use the following context to answer the query.\n\nContext:\n{context}\n\nQuery: {query}"
    # Chat models take a string, a prompt value, or a *list* of messages; a bare
    # message object is rejected as an invalid input type, so wrap it in a list.
    response = llm.invoke([HumanMessage(content=prompt)])
    return response.content


if __name__ == "__main__":
    # Directory holding the source PDFs; point this at your own data.
    folder_path = "./data"

    # Ingest: text per PDF -> chunks -> Chroma vector store.
    pdf_texts = load_pdfs(folder_path)
    chunks = split_into_chunks(pdf_texts)
    vectorstore = setup_chroma(chunks)

    # Chat model (Groq) that writes the final answer.
    llm = setup_llm()

    # Run one question through retrieval + generation and show the answer.
    query = "What are the key points in the documents?"
    response = rag_workflow(query, vectorstore, llm)

    print("Response:", response, sep="\n")

RuntimeError: Your system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0.
Please visit https://docs.trychroma.com/troubleshooting#sqlite to learn how to upgrade.