In [1]:
import requests
import xmltodict
import fitz  # PyMuPDF
import os
import re
from dotenv import load_dotenv
# from langchain.embeddings import 
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [2]:
## load enviroment variables from the .env files
load_dotenv()

True

In [3]:
## get the api key from the .env file
api_key = os.getenv("GROQ_API_KEY")

In [4]:
def extrct_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text")
    return text

In [5]:
def extract_text_from_folder(folder_path):
    all_text = {}
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, file)
            print(f"Processing: {pdf_path}")
            all_text[file] = extrct_text_from_pdf(pdf_path)
    return all_text            

In [6]:
folder_path = r"C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data"
pdf_texts = extract_text_from_folder(folder_path)

Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1501.05039v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1602.00203v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1607.00858v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1705.03921v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1711.03577v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1805.03551v2.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\GenerativeAI-Projects\ResearchPaper-Query-RAG\ResearchPaper-Data\1805.04825v1.pdf
Processing: C:\Users\soura\OneDrive\Desktop\Projects\Ge

In [7]:
pdf_texts

{'1501.05039v1.pdf': "Defining Data Science \nBeyond the study of the rules of the natural world as reflected by data \n \nYangyong Zhu and Yun Xiong \nSchool of Computer Science, Fudan University, Shanghai, China  \nShanghai Key Laboratory of Data Science, Fudan University, China. \n{yyzhu, yunx}@fudan.edu.cn \n \nData science has received widespread attention in academic and industrial circles. New data \nscience research institutes and organizations have continued to emerge on the scene, such as the \nColumbia University Institute for Data Sciences and Engineering and New York University \nCenter for Data Science. The University of California at Berkeley, Columbia University, Fudan \nUniversity, and other universities have launched data science courses and degree programs. \nCleveland and Smith proposed that data science should be considered an independent \ndiscipline2, 8. Facebook, Google, EMC, IBM, and other companies have established employment \npositions for data scientists. A

In [8]:
def clean_text(text):
    # Remove non-ASCII characters 
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    
    # Remove page numbers or lines that look like "Page X of Y"
    text = re.sub(r'Page\s+\d+\s+(of\s+\d+)?', '', text, flags=re.IGNORECASE)
    
    # Remove multiple newlines
    text = re.sub(r'\n+', '\n', text)
    
    # Remove excessive spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove lines with very few characters (e.g., headers/footers)
    lines = text.split("\n")
    cleaned_lines = [line for line in lines if len(line.strip()) > 10]
    
    # Join the cleaned lines back together
    text = "\n".join(cleaned_lines)
    
    return text

In [9]:
def clean_all_texts(pdf_texts):
    """
    Clean the extracted text for all PDFs in a dictionary.
    """
    cleaned_texts = {}
    for pdf_name, text in pdf_texts.items():
        print(f"Cleaning text for: {pdf_name}")
        cleaned_texts[pdf_name] = clean_text(text)
    return cleaned_texts

In [10]:
# Clean the extracted text for all PDFs
cleaned_pdf_texts = clean_all_texts(pdf_texts)

Cleaning text for: 1501.05039v1.pdf
Cleaning text for: 1602.00203v1.pdf
Cleaning text for: 1607.00858v1.pdf
Cleaning text for: 1705.03921v1.pdf
Cleaning text for: 1711.03577v1.pdf
Cleaning text for: 1805.03551v2.pdf
Cleaning text for: 1805.04825v1.pdf
Cleaning text for: 1805.08355v1.pdf
Cleaning text for: 1806.01756v1.pdf
Cleaning text for: 1812.05448v4.pdf
Cleaning text for: 1901.02354v2.pdf
Cleaning text for: 1901.04195v1.pdf
Cleaning text for: 1901.09388v2.pdf
Cleaning text for: 1908.02130v1.pdf
Cleaning text for: 2002.05658v1.pdf
Cleaning text for: 2007.03606v1.pdf
Cleaning text for: 2010.05125v2.pdf
Cleaning text for: 2106.00120v3.pdf
Cleaning text for: 2108.01468v1.pdf
Cleaning text for: 2108.11510v1.pdf
Cleaning text for: 2112.01590v3.pdf
Cleaning text for: 2201.05852v1.pdf
Cleaning text for: 2201.05867v1.pdf
Cleaning text for: 2303.01980v1.pdf
Cleaning text for: 2303.02715v1.pdf
Cleaning text for: 2306.13586v1.pdf
Cleaning text for: 2306.16177v3.pdf
Cleaning text for: 2308.0489

## Text Chunking

In [11]:
def chunk_text(text, max_tokens=512):
    """
    Split the text into chunks of maximum `max_tokens` length.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_token = 0

    for word in words:
        if current_token + len(word) + 1 > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_token = 0
        current_chunk.append(word)
        current_token += len(word) + 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


In [12]:
def chunk_all_texts(cleaned_texts, max_length=500):
    """
    Chunk all cleaned texts into smaller sections.
    """
    chunked_texts = {}
    for pdf_name, text in cleaned_texts.items():
        print(f"Chunking text for: {pdf_name}")
        chunked_texts[pdf_name] = chunk_text(text, max_length)
    return chunked_texts

In [13]:
# Chunk the cleaned text
chunked_texts = chunk_all_texts(cleaned_pdf_texts)

Chunking text for: 1501.05039v1.pdf
Chunking text for: 1602.00203v1.pdf
Chunking text for: 1607.00858v1.pdf
Chunking text for: 1705.03921v1.pdf
Chunking text for: 1711.03577v1.pdf
Chunking text for: 1805.03551v2.pdf
Chunking text for: 1805.04825v1.pdf
Chunking text for: 1805.08355v1.pdf
Chunking text for: 1806.01756v1.pdf
Chunking text for: 1812.05448v4.pdf
Chunking text for: 1901.02354v2.pdf
Chunking text for: 1901.04195v1.pdf
Chunking text for: 1901.09388v2.pdf
Chunking text for: 1908.02130v1.pdf
Chunking text for: 2002.05658v1.pdf
Chunking text for: 2007.03606v1.pdf
Chunking text for: 2010.05125v2.pdf
Chunking text for: 2106.00120v3.pdf
Chunking text for: 2108.01468v1.pdf
Chunking text for: 2108.11510v1.pdf
Chunking text for: 2112.01590v3.pdf
Chunking text for: 2201.05852v1.pdf
Chunking text for: 2201.05867v1.pdf
Chunking text for: 2303.01980v1.pdf
Chunking text for: 2303.02715v1.pdf
Chunking text for: 2306.13586v1.pdf
Chunking text for: 2306.16177v3.pdf
Chunking text for: 2308.0489

In [14]:
# Flatten the chunked texts for embedding
flat_chunks = [chunk for pdf_chunks in chunked_texts.values() for chunk in pdf_chunks]
print(f"Total chunks created: {len(flat_chunks)}")

Total chunks created: 3703


In [43]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each chunk
embeddings = model.encode(flat_chunks, show_progress_bar=True)

print(f"Generated embeddings for {len(flat_chunks)} chunks.")


Batches: 100%|██████████| 116/116 [00:57<00:00,  2.00it/s]

Generated embeddings for 3703 chunks.





In [16]:
embeddings

array([[-0.06688278,  0.00978954,  0.02699181, ..., -0.06405541,
        -0.00827712, -0.03206822],
       [-0.01601519, -0.06354769,  0.00032897, ..., -0.09346824,
         0.04706018, -0.02880853],
       [-0.01615418,  0.01419142,  0.00591523, ..., -0.03564106,
        -0.01791636, -0.03747967],
       ...,
       [-0.02278017,  0.02977859, -0.01137762, ..., -0.00439937,
        -0.00036648, -0.00719787],
       [-0.08764973,  0.01078109, -0.0598324 , ..., -0.02534723,
         0.00824314,  0.05721295],
       [ 0.00350639,  0.04448356, -0.01410194, ...,  0.00064699,
        -0.04002129, -0.01773401]], dtype=float32)

## Vector Store

In [40]:
import faiss
import numpy as np

# Define the dimension of the embeddings
dimension = embeddings.shape[1]

# Create a FAISS index
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(np.array(embeddings))
print("FAISS index created and populated with embeddings.")

FAISS index created and populated with embeddings.


In [41]:
def retrieve_relevant_chunks(query, model, index, chunks, top_k=5):
    """
    Retrieve the most relevant text chunks for a given query.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(np.array(query_embedding), k=top_k)
    return [chunks[i] for i in indices[0]]


In [42]:
# Example Query
query = "What are the key components of a transformer model?"
relevant_chunks = retrieve_relevant_chunks(query, model, index, flat_chunks)

print("Relevant Chunks:")
for chunk in relevant_chunks:
    print(chunk)

AttributeError: 'ChatGroq' object has no attribute 'encode'

In [26]:
from langchain_groq import ChatGroq

# Initialize the GROQ chat model
llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

In [21]:
def combine_chunks(relevant_chunks, max_length=3000):
    """
    Combine text chunks into a single context string within a token limit.
    """
    combined_text = ""
    for chunk in relevant_chunks:
        if len(combined_text) + len(chunk) <= max_length:
            combined_text += chunk + "\n"
        else:
            break
    return combined_text


In [27]:
def generate_response(query, context):
    """
    Generate a response using LangChain and the retrieved context.
    """
    prompt = f"""
    You are a helpful AI assistant. Use the context provided to answer the question accurately. 
    If you do not have information to answer the question, you can say 'I don't have enough information to answer this question'.
    
    Context:
    {context}
    
    Question: {query}
    Answer:
    """
    # Invoke the language model
    response = llm.invoke(prompt=prompt, max_tokens=300)
    return response.content.strip()


In [32]:
def query_rag_system(query,model, max_context_length=3000):
    """
    Full RAG system pipeline: retrieves context and generates a response.
    """
    # Step 1: Retrieve relevant chunks
    relevant_chunks = retrieve_relevant_chunks(query, model, index, top_k=5)  # Your retrieval logic here
    
    # Step 2: Combine retrieved chunks into context
    context = combine_chunks(relevant_chunks, max_length=max_context_length)
    
    # Step 3: Generate a response
    response = generate_response(query, context)
    
    return response

In [33]:
# Example usage
query = "Explain the concept of self-attention in transformers."
response = query_rag_system(query, model=llm, max_context_length=3000)
print("Generated Response:")
print(response)


NameError: name 'chunks' is not defined