## Imports

In [1]:
import os
import time
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage
from PyPDF2 import PdfReader
#import sqlite3

## Groq API

In [2]:
# Load environment variables.
# load_dotenv() runs before the lookup below so that a key supplied through a
# .env file is already in the process environment when os.getenv reads it.
load_dotenv()
# os.getenv returns None when GROQ_API_KEY is not set.
# NOTE(review): this constant is not referenced again in the visible code;
# presumably ChatGroq reads the key from the environment itself — confirm.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

## Code

In [None]:
def load_pdfs(folder_path):
    """Extract the text of every PDF file located directly in ``folder_path``.

    Entries are visited in sorted filename order so the returned list is
    reproducible between runs. The ``.pdf`` extension is matched
    case-insensitively, and entries that are not regular files (for example a
    sub-directory whose name ends in ``.pdf``) are skipped instead of being
    handed to ``PdfReader``.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A list with one string per PDF: the text of all of its pages,
        concatenated in page order.

    Side effects:
        Prints the number of PDFs loaded and the elapsed wall-clock time.
    """
    start_time = time.time()
    pdf_texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # A directory named like a PDF cannot be parsed; ignore it.
            continue
        reader = PdfReader(file_path)
        # Join once instead of growing a string page by page. ``or ""`` keeps
        # a page that yields no text (None) from breaking the concatenation.
        pdf_texts.append(
            "".join(page.extract_text() or "" for page in reader.pages)
        )
    end_time = time.time()
    print(f"Loaded {len(pdf_texts)} PDF files in {end_time - start_time:.2f} seconds.")
    return pdf_texts

def split_into_chunks(texts, chunk_size=1000, chunk_overlap=200):
    """Split every text into chunks and return them as a single flat list.

    Args:
        texts: Iterable of strings to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        A list holding the chunks of all texts, in input order.

    Side effects:
        Prints the number of chunks produced and the elapsed wall-clock time.
    """
    start_time = time.time()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    # One splitter instance is reused for every text; the per-text results are
    # flattened into a single list.
    chunks = [chunk for text in texts for chunk in text_splitter.split_text(text)]
    end_time = time.time()
    print(f"Split texts into {len(chunks)} chunks in {end_time - start_time:.2f} seconds.")
    return chunks

def setup_chroma(chunks, model_name="sentence-transformers/all-mpnet-base-v2", persist_directory="chroma_data"):
    """Build a Chroma vector store from ``chunks``.

    Args:
        chunks: Texts passed to ``Chroma.from_texts``.
        model_name: Model name given to ``HuggingFaceEmbeddings``.
        persist_directory: Passed to ``Chroma.from_texts`` as
            ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_texts``.

    Side effects:
        Prints the chunk count and the elapsed wall-clock time.
    """
    began = time.time()
    store = Chroma.from_texts(
        texts=chunks,
        embedding=HuggingFaceEmbeddings(model_name=model_name),
        persist_directory=persist_directory,
    )
    elapsed = time.time() - began
    print(f"ChromaDB setup complete with {len(chunks)} chunks in {elapsed:.2f} seconds.")
    return store

def query_chroma(vectorstore, query, k=3):
    """Run a similarity search on ``vectorstore`` and return its results.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        query: Search text handed to the store.
        k: Value forwarded as the ``k`` keyword of ``similarity_search``.

    Returns:
        Whatever ``vectorstore.similarity_search`` returns, unchanged.

    Side effects:
        Prints the number of results and the elapsed wall-clock time.
    """
    began = time.time()
    hits = vectorstore.similarity_search(query, k=k)
    elapsed = time.time() - began
    print(f"Query returned {len(hits)} results in {elapsed:.2f} seconds.")
    return hits

def setup_llm(model_name="llama-3.1-70b-versatile", temperature=0):
    """Create the ``ChatGroq`` chat model used for answer generation.

    Args:
        model_name: Passed to ``ChatGroq`` as ``model``.
        temperature: Passed to ``ChatGroq`` as ``temperature``.

    Returns:
        The constructed ``ChatGroq`` instance.

    Side effects:
        Prints the elapsed wall-clock time of the construction.
    """
    began = time.time()
    chat_model = ChatGroq(model=model_name, temperature=temperature)
    elapsed = time.time() - began
    print(f"LLM setup complete in {elapsed:.2f} seconds.")
    return chat_model

def rag_workflow(query, vectorstore, llm, k=3):
    """Answer ``query`` using documents retrieved from ``vectorstore``.

    The documents returned by ``query_chroma`` are joined (blank line between
    them) into a context block, which is embedded together with the query in a
    single prompt and sent to ``llm.invoke``.

    Args:
        query: The user question.
        vectorstore: Store handed to ``query_chroma``.
        llm: Object exposing ``invoke(prompt)`` whose result has a
            ``content`` attribute.
        k: Forwarded to ``query_chroma`` as ``k``.

    Returns:
        The ``content`` attribute of the value returned by ``llm.invoke``.

    Side effects:
        Prints the elapsed wall-clock time (in addition to what
        ``query_chroma`` prints).
    """
    began = time.time()

    # Retrieval step: collect the page contents of the matching documents.
    retrieved = query_chroma(vectorstore, query, k=k)
    context = "\n\n".join(doc.page_content for doc in retrieved)

    # Generation step: one prompt carrying both the context and the question.
    prompt = (
        "You are a helpful assistant. Use the following context to answer the query.\n\n"
        f"Context:\n{context}\n\n"
        f"Query: {query}"
    )
    answer = llm.invoke(prompt)

    elapsed = time.time() - began
    print(f"RAG workflow executed in {elapsed:.2f} seconds.")
    return answer.content


# End-to-end driver: load PDFs, chunk them, index the chunks in Chroma, create
# the LLM, then answer one sample query and print the answer.
if __name__ == "__main__":
    # Step 1: Load PDFs from a folder
    folder_path = "./data"  # Update this path to your folder containing PDFs
    pdf_texts = load_pdfs(folder_path)
    
    # Step 2: Split into chunks (default chunk_size / chunk_overlap)
    chunks = split_into_chunks(pdf_texts)
    
    # Step 3: Setup ChromaDB (default embedding model and persist directory)
    vectorstore = setup_chroma(chunks)
    
    # Step 4: Initialize the Groq-hosted chat model (default model name)
    llm = setup_llm()
    
    # Step 5: Run RAG — retrieve context for the query and generate the answer
    query = "What does the document tell us about RAG-Systems?"
    response = rag_workflow(query, vectorstore, llm)
    
    print("Response:")
    print(response)

Loaded 1 PDF files in 1.33 seconds.
Split texts into 85 chunks in 0.00 seconds.


  embedding_model = HuggingFaceEmbeddings(model_name=model_name)


RuntimeError: Your system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0.
Please visit https://docs.trychroma.com/troubleshooting#sqlite to learn how to upgrade.