# RAG Pipeline

#### Importing Modules

In [1]:
import random

import fitz
import pandas as pd
from spacy.lang.en import English
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Formatting text

In [2]:
def text_format(text: str) -> str:
    """Flatten a block of text onto a single line.

    Every newline character is swapped for one space, then leading and
    trailing whitespace is trimmed. Interior runs of spaces are left as-is.

    Args:
        text: Raw text, e.g. the text extracted from one page.

    Returns:
        The single-line, trimmed text.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

In [3]:
def get_text_from_source(path: str)->list[dict]:
    """Extract the text of every page in a document plus rough size statistics.

    Args:
        path: Location of the document to open with PyMuPDF (``fitz``).

    Returns:
        One dict per page, in page order, with the keys:

        * ``"Page No."``: zero-based page index.
        * ``"page_char_count"``: length of the cleaned page text.
        * ``"page_word_count"``: number of whitespace-delimited words.
        * ``"page_sentence_count (Not accurate)"``: number of pieces obtained
          by splitting the text on ``"."``.
        * ``"page_token_count"``: character count divided by 4 (rough estimate).
        * ``"text"``: the page text after ``text_format`` has been applied.
    """
    pages_text = []

    # Open through a context manager so the document is closed even if
    # extraction fails part-way through.
    with fitz.open(path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count
        # explicitly to get a real progress bar instead of a bare counter.
        for pageno, pagecontent in tqdm(enumerate(doc), total=len(doc)):
            text = text_format(text=pagecontent.get_text())
            pages_text.append({
                "Page No.": pageno,
                "page_char_count": len(text),
                # split() without an argument ignores runs of spaces (left
                # behind when newlines are replaced) and gives 0 for an
                # empty page, where split(" ") would report 1.
                "page_word_count": len(text.split()),
                "page_sentence_count (Not accurate)": len(text.split(".")),
                "page_token_count": len(text) / 4,
                "text": text,
            })

    return pages_text

#### Using our custom function


In [4]:
# Source document for the pipeline; change this to run on another file.
PDF_PATH = "test.pdf"

text_info = get_text_from_source(path=PDF_PATH)
# Spot-check one randomly chosen page record.
random.sample(text_info, k=1)


8it [00:00, 262.12it/s]


[{'Page No.': 7,
  'page_char_count': 608,
  'page_word_count': 108,
  'page_sentence_count (Not accurate)': 8,
  'page_token_count': 152.0,
  'text': '(the Box) is a pointer to that value on the heap. When the Box is eventually dropped, that memory is freed. If you forget to deallocate heap memory, it will stick around forever, and your application will eventually eat up all the memory on your machine. This is called leaking memory and is usually something you want to avoid. However, there are some cases where you explicitly want to leak memory. For example, say you have a read-only configuration that the entire program should be able to access. You can allocate that on the heap and explicitly leak it with Box::leak to get a ‘static reference to it.'}]

#### Converting to DataFrame

In [5]:
# Load the per-page records into a DataFrame and summarise the numeric columns.
data = pd.DataFrame(data=text_info)
data.describe().round(decimals=2)

Unnamed: 0,Page No.,page_char_count,page_word_count,page_sentence_count (Not accurate),page_token_count
count,8.0,8.0,8.0,8.0,8.0
mean,3.5,3045.88,493.5,26.75,761.47
std,2.45,1152.18,207.38,8.92,288.04
min,0.0,608.0,108.0,8.0,152.0
25%,1.75,2737.75,408.75,25.25,684.44
50%,3.5,3102.0,455.5,26.5,775.5
75%,5.25,4014.25,695.5,34.0,1003.56
max,7.0,4063.0,716.0,36.0,1015.75


#### Splitting text into sentences

In [6]:
# Create an instance of spaCy's English language class to use as the pipeline.
obj = English()

# Add the "sentencizer" component to the pipeline so that processed text
# exposes its sentences through `.sents`.
obj.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x21468f0c550>

In [7]:
# Split every page's text into sentences and record how many were found.
for items in tqdm(text_info):
    # Run the text through the pipeline and keep each sentence as a plain string.
    items["sentences"] = [str(sentence) for sentence in obj(items["text"]).sents]
    # The key spelling below is kept exactly as-is: it becomes a DataFrame column name.
    items["senetences_count_spacy"] = len(items["sentences"])

100%|██████████| 8/8 [00:00<00:00, 190.50it/s]


In [8]:
# Inspect one randomly chosen page record.
random.sample(text_info, 1)

[{'Page No.': 0,
  'page_char_count': 3397,
  'page_word_count': 474,
  'page_sentence_count (Not accurate)': 27,
  'page_token_count': 849.25,
  'text': 'Introduction With the increasing demand for high-performance computing in domains such as deep learning, scientific simulations, and real-time rendering, Graphics Processing Units (GPUs) have become essential for accelerating large-scale parallel workloads. Unlike Central Processing Units (CPUs), GPUs are designed with a large number of lightweight threads and high memory bandwidth, making them highly suitable for data-parallel computation. To exploit this capability, various GPU programming frameworks have emerged, targeting different hardware architectures and levels of portability. This project presents a comparative study of four prominent GPU programming models: CUDA, HIP on NVIDIA (CUDA-supported), HIP on AMD (ROCm-supported), and OpenCL. The report begins with an overview of GPU and CPU architectures, the fundamentals of paral

In [9]:
# Rebuild the DataFrame from the current page records and summarise the numeric columns.
data = pd.DataFrame(data=text_info)
data.describe().round(decimals=2)

Unnamed: 0,Page No.,page_char_count,page_word_count,page_sentence_count (Not accurate),page_token_count,senetences_count_spacy
count,8.0,8.0,8.0,8.0,8.0,8.0
mean,3.5,3045.88,493.5,26.75,761.47,24.25
std,2.45,1152.18,207.38,8.92,288.04,7.63
min,0.0,608.0,108.0,8.0,152.0,7.0
25%,1.75,2737.75,408.75,25.25,684.44,23.75
50%,3.5,3102.0,455.5,26.5,775.5,25.0
75%,5.25,4014.25,695.5,34.0,1003.56,28.5
max,7.0,4063.0,716.0,36.0,1015.75,32.0


#### Chunking sentences into groups of 10 or fewer

In [13]:
# Default maximum number of items placed in one chunk.
chunk_size = 10


def create_chunk(big_list: list[str], split_size: int=chunk_size)->list[list[str]]:
    """Split a list into consecutive sub-lists of at most ``split_size`` items.

    Order is preserved; every sub-list except possibly the last has exactly
    ``split_size`` items. An empty input yields an empty list.

    Args:
        big_list: The items to group.
        split_size: Maximum number of items per sub-list. Defaults to the
            module-level ``chunk_size`` as it was when this function was defined.

    Returns:
        The list of sub-lists.

    Raises:
        ValueError: If ``split_size`` is not a positive integer.
    """
    # A negative step would make range() empty and silently discard every
    # item, so reject non-positive sizes up front.
    if split_size <= 0:
        raise ValueError(f"split_size must be a positive integer, got {split_size}")
    return [big_list[i:i+split_size] for i in range(0, len(big_list), split_size)]

In [18]:
# Group each page's sentences into chunks of at most `chunk_size` sentences.
for items in tqdm(text_info):
    items["text_chunks"] = create_chunk(items["sentences"], chunk_size)
    # Despite its name, the "chunk_size" key stores the number of chunks on the page.
    items["chunk_size"] = len(items["text_chunks"])

100%|██████████| 8/8 [00:00<00:00, 45160.74it/s]


In [20]:
# Inspect two randomly chosen page records.
random.sample(text_info, 2)

[{'Page No.': 0,
  'page_char_count': 3397,
  'page_word_count': 474,
  'page_sentence_count (Not accurate)': 27,
  'page_token_count': 849.25,
  'text': 'Introduction With the increasing demand for high-performance computing in domains such as deep learning, scientific simulations, and real-time rendering, Graphics Processing Units (GPUs) have become essential for accelerating large-scale parallel workloads. Unlike Central Processing Units (CPUs), GPUs are designed with a large number of lightweight threads and high memory bandwidth, making them highly suitable for data-parallel computation. To exploit this capability, various GPU programming frameworks have emerged, targeting different hardware architectures and levels of portability. This project presents a comparative study of four prominent GPU programming models: CUDA, HIP on NVIDIA (CUDA-supported), HIP on AMD (ROCm-supported), and OpenCL. The report begins with an overview of GPU and CPU architectures, the fundamentals of paral