In [44]:
import os
import re
import fitz  # PyMuPDF
import faiss
import numpy as np
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq

In [46]:
# Load environment variables
load_dotenv()

True

In [47]:
# Ensure the GROQ API key is loaded
api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    raise ValueError("GROQ_API_KEY not found. Make sure it's set in your .env file.")

In [48]:
# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text")
    return text

In [49]:
# Function to extract text from all PDFs in a folder
def extract_text_from_folder(folder_path):
    all_text = {}
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, file)
            print(f"Processing: {pdf_path}")
            all_text[file] = extract_text_from_pdf(pdf_path)
    return all_text

In [50]:
# Function to clean text
def clean_text(text):
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = re.sub(r'Page\s+\d+\s+(of\s+\d+)?', '', text, flags=re.IGNORECASE)  # Remove page numbers
    text = re.sub(r'\n+', '\n', text)  # Remove multiple newlines
    text = re.sub(r'\s+', ' ', text).strip()  # Remove excess spaces
    lines = text.split("\n")
    cleaned_lines = [line for line in lines if len(line.strip()) > 10]  # Remove short lines
    return "\n".join(cleaned_lines)

In [None]:
# Function to clean all texts
def clean_all_texts(pdf_texts):
    cleaned_texts = {}
    for pdf_name, text in pdf_texts.items():
        print(f"Cleaning text for: {pdf_name}")
        cleaned_texts[pdf_name] = clean_text(text)
    return cleaned_texts

In [51]:
# Function to chunk text into smaller sections
def chunk_text(text, max_tokens=512):
    words = text.split()
    chunks = []
    current_chunk = []
    current_token = 0

    for word in words:
        if current_token + len(word) + 1 > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_token = 0
        current_chunk.append(word)
        current_token += len(word) + 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

In [52]:
# Function to chunk all cleaned texts
def chunk_all_texts(cleaned_texts, max_length=500):
    chunked_texts = {}
    for pdf_name, text in cleaned_texts.items():
        print(f"Chunking text for: {pdf_name}")
        chunked_texts[pdf_name] = chunk_text(text, max_length)
    return chunked_texts

In [53]:
# Load and process PDF folder
folder_path = r"C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data"
pdf_texts = extract_text_from_folder(folder_path)
cleaned_pdf_texts = clean_all_texts(pdf_texts)
chunked_texts = chunk_all_texts(cleaned_pdf_texts)

Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1501.05039v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1602.00203v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1607.00858v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1705.03921v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1711.03577v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1805.03551v2.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1805.04825v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\Ge

In [54]:
# Flatten the chunked texts
flat_chunks = [chunk for pdf_chunks in chunked_texts.values() for chunk in pdf_chunks]
print(f"Total chunks created: {len(flat_chunks)}")


Total chunks created: 3703


In [55]:
# Embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(flat_chunks, show_progress_bar=True)
print(f"Generated embeddings for {len(flat_chunks)} chunks.")

Batches: 100%|██████████| 116/116 [01:04<00:00,  1.79it/s]

Generated embeddings for 3703 chunks.





In [56]:
# FAISS Index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))
print("FAISS index created and populated with embeddings.")

FAISS index created and populated with embeddings.


In [57]:
# Function to retrieve relevant chunks
def retrieve_relevant_chunks(query, embedding_model, index, chunks, top_k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), k=top_k)
    return [chunks[i] for i in indices[0]]


In [58]:
# GROQ LLM initialization
llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

In [59]:
# Function to combine chunks
def combine_chunks(relevant_chunks, max_length=3000):
    combined_text = ""
    for chunk in relevant_chunks:
        if len(combined_text) + len(chunk) <= max_length:
            combined_text += chunk + "\n"
        else:
            break
    return combined_text

In [65]:
# Function to generate response using GROQ LLM
def generate_response(query, context):
    prompt = f"""
    You are a helpful AI assistant. Use the context provided to answer the question accurately. 
    If you do not have information to answer the question, say 'I don't have enough information to answer this question'.
    
    Context:
    {context}
    
    Question: {query}
    Answer:
    """
    response = llm.invoke(input=prompt, max_tokens=300)
    return response.content.strip()

In [66]:
# Full RAG pipeline
def query_rag_system(query, embedding_model, llm, max_context_length=3000):
    relevant_chunks = retrieve_relevant_chunks(query, embedding_model, index, flat_chunks, top_k=5)
    context = combine_chunks(relevant_chunks, max_length=max_context_length)
    response = generate_response(query, context)
    return response

In [67]:
query = "What are the main components of the transformer architecture?"
response = query_rag_system(query, embedding_model, llm)
print("Generated Response:")
print(response)

Generated Response:
The main components of the Transformer architecture include:

1. Attention Mechanism: This is the core component of the Transformer model, which calculates the attentions from the input sequence to determine the aggregation.

2. Positional Embedding: This component is used to encode the token positions in the input sequence, as Transformers make least assumptions on the structural information of data.

3. Linear Embedding: In the context of Vision Transformer (ViT), each image is split into fixed-size patches, and then each patch is linearly embedded.

4. Transformer Encoder: The resulting sequence of vectors from the above components is fed into a standard Transformer encoder for sequence modeling.

5. Self-Attention Mechanism: This dynamically computes the connection weights between every two tokens in the input sequence.

6. Feature Dimension: This measures the number of parameters required to closely approximate the optimization problem.

7. Intrinsic-Dimension 