# RAG Pipeline

#### Importing Modules

In [21]:
import fitz
from tqdm.auto import tqdm
import pandas as pd
from spacy.lang.en import English

#### Formatting text

In [22]:
def text_format(text: str)->str :
    cleaner_text = text.replace("\n", " ").strip()

    return cleaner_text

In [23]:
def get_text_from_source(path: str)->list[dict]:
    doc = fitz.open(path)
    pages_text = []

    for pageno, pagecontent in tqdm(enumerate(doc)):
        text = pagecontent.get_text()
        text = text_format(text=text)
        pages_text.append({"Page No.": pageno, "page_char_count": len(text), "page_word_count": len(text.split(" ")), "page_sentence_count (Not accurate)": len(text.split(".")), "page_token_count": len(text)/4, "text": text})
        
    return pages_text

#### Using our custom function


In [24]:
import random

text_info = get_text_from_source(path="Rust.pdf")
text_info


0it [00:00, ?it/s]

457it [00:00, 539.26it/s]


[{'Page No.': 0,
  'page_char_count': 82,
  'page_word_count': 16,
  'page_sentence_count (Not accurate)': 1,
  'page_token_count': 20.5,
  'text': 'M A N N I N G Timothy Samuel McNamara Systems programming concepts  and techniques'},
 {'Page No.': 1,
  'page_char_count': 2687,
  'page_word_count': 441,
  'page_sentence_count (Not accurate)': 24,
  'page_token_count': 671.75,
  'text': "Raw Pointer The cousins mut T and * *const T are the free radicals of the pointer world. Lightning fast, but wildly unsafe. Powers • Speed • Can interact with the outside world Weaknesses • Unsafe Box<T> Store anything in a box. Accepts almost any type for long-term storage. The workhorse of a new, safe programming era. Powers • Store a value in central storage in a location called “the heap” Weaknesses • Size increase Rc<T> The reference counted pointer, Rc<T> is Rust's competent, yet miserly bookkeeper. It knows who has borrowed what and when. Powers • Shared access to values Weaknesses • Size increas

#### Converting to DataFrame

In [25]:
data = pd.DataFrame(text_info)
data.describe().round(2)

Unnamed: 0,Page No.,page_char_count,page_word_count,page_sentence_count (Not accurate),page_token_count
count,457.0,457.0,457.0,457.0,457.0
mean,228.0,1935.68,376.77,26.86,483.92
std,132.07,613.66,133.46,41.53,153.41
min,0.0,0.0,1.0,1.0,0.0
25%,114.0,1647.0,315.0,18.0,411.75
50%,228.0,1982.0,388.0,26.0,495.5
75%,342.0,2362.0,448.0,32.0,590.5
max,456.0,3458.0,960.0,874.0,864.5


#### Splitting text. Conversion of sentences

In [26]:
# Instance of English
obj = English()

#adding pipeling
obj.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1ab98a30890>

In [27]:
for items in tqdm(text_info):
    items["sentences"] = list(obj(items["text"]).sents)
    items["sentences"]= [str(sentence) for sentence in items["sentences"]]
    items["sentences_count_spacy"] = len(items["sentences"])

100%|██████████| 457/457 [00:01<00:00, 411.01it/s]


In [28]:
random.sample(text_info, k=1)

[{'Page No.': 407,
  'page_char_count': 2159,
  'page_word_count': 394,
  'page_sentence_count (Not accurate)': 34,
  'page_token_count': 539.75,
  'text': '382 CHAPTER 11 Kernel 17   Red = 0x4,      BrightRed = 0xC, 18   Magenta = 0x5,  BrightMagenta = 0xD, 19   Brown = 0x6,    Yellow = 0xE, 20   Gray = 0x7,     DarkGray = 0x8 21 } 11.5.2 Controlling the in-memory representation of enums We’ve been content to allow the compiler to determine how an enum is represented. But there are times when we need to pull in the reins. External systems often demand that our data matches their requirements.  Listing 11.13 provides an example of fitting the colors from the VGA-compatible text mode palette enum into a single u8. It removes any discretion from the compiler about which bit pattern (formally called the discriminant) to associate with particular variants. To prescribe a representation, add the repr attribute. You are then able to specify any integer type (i32, u8, i16, u16,…), as well as 

In [29]:
data = pd.DataFrame(text_info)
data.describe().round(2)

Unnamed: 0,Page No.,page_char_count,page_word_count,page_sentence_count (Not accurate),page_token_count,sentences_count_spacy
count,457.0,457.0,457.0,457.0,457.0,457.0
mean,228.0,1935.68,376.77,26.86,483.92,15.25
std,132.07,613.66,133.46,41.53,153.41,8.44
min,0.0,0.0,1.0,1.0,0.0,0.0
25%,114.0,1647.0,315.0,18.0,411.75,9.0
50%,228.0,1982.0,388.0,26.0,495.5,15.0
75%,342.0,2362.0,448.0,32.0,590.5,21.0
max,456.0,3458.0,960.0,874.0,864.5,39.0


#### Chunking sentences into group of 10 or less

In [30]:
chunk_size = 10
def create_chunk(big_list: list[str], split_size: int=chunk_size)->list[list[str]]:
    return [big_list[i:i+split_size] for i in range(0, len(big_list), split_size)]

In [31]:
#Chunk size
for items in tqdm(text_info):
    items["text_chunks"] = create_chunk(big_list=items["sentences"], split_size=chunk_size)
    items["chunk_size"] = len(items["text_chunks"])

100%|██████████| 457/457 [00:00<00:00, 370825.48it/s]


In [32]:
text_info

[{'Page No.': 0,
  'page_char_count': 82,
  'page_word_count': 16,
  'page_sentence_count (Not accurate)': 1,
  'page_token_count': 20.5,
  'text': 'M A N N I N G Timothy Samuel McNamara Systems programming concepts  and techniques',
  'sentences': ['M A N N I N G Timothy Samuel McNamara Systems programming concepts  and techniques'],
  'sentences_count_spacy': 1,
  'text_chunks': [['M A N N I N G Timothy Samuel McNamara Systems programming concepts  and techniques']],
  'chunk_size': 1},
 {'Page No.': 1,
  'page_char_count': 2687,
  'page_word_count': 441,
  'page_sentence_count (Not accurate)': 24,
  'page_token_count': 671.75,
  'text': "Raw Pointer The cousins mut T and * *const T are the free radicals of the pointer world. Lightning fast, but wildly unsafe. Powers • Speed • Can interact with the outside world Weaknesses • Unsafe Box<T> Store anything in a box. Accepts almost any type for long-term storage. The workhorse of a new, safe programming era. Powers • Store a value in cen

#### Splitting Chunks for ease of embedding

In [33]:
import re

page_chunk = []
for item in tqdm(text_info):
    for parts in item["text_chunks"]:  # each chunk is already a list of sentences
        chunk_store = {}
        chunk_store["page_number"] = item['Page No.']

        # Merge sentences into one paragraph
        joined_sentence_chunk = " ".join(parts).replace(" ", " ").strip()

        # Optional: Ensure space after a period when followed by capital letter
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_store["sentence_chunk"] = joined_sentence_chunk
        chunk_store["sentence_chunk_size"] = len(joined_sentence_chunk)  # char count
        chunk_store["sentence_chunk_word_count"] = len(joined_sentence_chunk.split())  # word count
        chunk_store["sentence_chunk_tokens"] = len(joined_sentence_chunk) / 4  # approx token count

        page_chunk.append(chunk_store)


100%|██████████| 457/457 [00:00<00:00, 30926.55it/s]


In [34]:
random.sample(page_chunk, k=1)

[{'page_number': 364,
  'sentence_chunk': "339 Spawning threads  1 use std::{thread,time};  2   3 fn main() {  4   let pause = time::Duration::from_millis(20);  5   let handle1 = thread::spawn(|| {  6     thread::sleep(pause);  7   });  8   let handle2 = thread::spawn(|| {  9     thread::sleep(pause); 10   }); 11  12   handle1.join(); 13   handle2.join(); 14 } If we run listing 10.12, we’ll receive a verbose—and surprisingly helpful—error message: $ cargo run -q  error[E0373]: closure may outlive the current function, but it borrows `pause`, which is owned by the current function  --> src/main.rs:5:33   | 5 |     let handle1 = thread::spawn(|| {   |                                 ^^ may outlive borrowed value `pause` 6 |         thread::sleep(pause);   |                       ----- `pause` is borrowed here   | note: function requires argument type to outlive `'static`  --> src/main.rs:5:19   | 5 |       let handle1 = thread::spawn(|| {   |  ___________________^ 6 | |         thread::s

In [35]:
df = pd.DataFrame(page_chunk)
df.describe().round(2)

Unnamed: 0,page_number,sentence_chunk_size,sentence_chunk_word_count,sentence_chunk_tokens
count,911.0,911.0,911.0,911.0
mean,226.63,970.58,150.25,242.65
std,128.85,530.56,76.61,132.64
min,0.0,12.0,2.0,3.0
25%,112.0,670.5,109.0,167.62
50%,224.0,928.0,148.0,232.0
75%,335.5,1236.0,192.5,309.0
max,456.0,3458.0,464.0,864.5


### Filtering out texts with low token count

In [36]:
minimum_token_length = 25
for row in df[df["sentence_chunk_tokens"]<=minimum_token_length].sample(10).iterrows():
    print(f"Chunk token count: {row[1]["sentence_chunk_tokens"]} | Text: {row[1]["sentence_chunk"]}")

Chunk token count: 8.0 | Text: SIGKILL is particularly vicious.
Chunk token count: 13.25 | Text: Executes the  target/debug/clock  executable directly
Chunk token count: 10.75 | Text: All of your comments were read. Many of the
Chunk token count: 19.25 | Text: Using a  reference within this block allows us to  sidestep ownership issues.
Chunk token count: 8.0 | Text: prints to standard out (stdout).
Chunk token count: 3.25 | Text: Still hidden!
Chunk token count: 3.5 | Text: Rust in Action
Chunk token count: 8.25 | Text: Listing 5.6 Inspecting endianness
Chunk token count: 11.25 | Text: To everyone aspiring to write safer software.
Chunk token count: 20.75 | Text: Performs addition to calculate the line number, avoiding calculations at every step


In [53]:
page_chunk_min_token_filter = df[df["sentence_chunk_tokens"]>minimum_token_length].to_dict(orient='records')
len(page_chunk_min_token_filter)

880

In [54]:
random.sample(page_chunk_min_token_filter, k=2)

[{'page_number': 140,
  'sentence_chunk': 'Revisiting our original code from list- ing 4.3, we can see that sat_a starts its life with ownership over a CubeSat object: fn main() {   let sat_a = CubeSat { id: 0 };   // ... The CubeSat object is then passed into check_status() as an argument. This moves ownership to the local variable sat_id: fn main() {   let sat_a = CubeSat { id: 0 };   // ...   let a_status = check_status(sat_a);   // ... Another possibility is that sat_a relinquishes its ownership to another variable within main(). That would look something like this: 2 Within the Rust community, the term variable binding  is preferred because it is more technically correct.',
  'sentence_chunk_size': 642,
  'sentence_chunk_word_count': 109,
  'sentence_chunk_tokens': 160.5},
 {'page_number': 181,
  'sentence_chunk': '\uf0a1Modules can be nested arbitrarily. \uf0a1All members of a module including its submodules are private by default. Pri- vate items can be accessed within the modul

#### Embedding our Data

In [45]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cuda")

In [68]:
%%time

embedding_model.to("cuda")

#embedding chunks 
for items in tqdm(page_chunk_min_token_filter):
    item["embedding"]=embedding_model.encode("sentence_chunk")

100%|██████████| 880/880 [00:14<00:00, 61.97it/s]

CPU times: total: 14.1 s
Wall time: 14.2 s





#### Running Encoding in Batch Mode

In [69]:
%%time

text_chunks_batch = [item["sentence_chunk"] for item in page_chunk_min_token_filter]
len(text_chunks_batch)

CPU times: total: 0 ns
Wall time: 306 μs


880

In [71]:
%%time

# encodding in batch
embedding_model.to("cuda")
text_chunks_batch_encoding = embedding_model.encode(text_chunks_batch, batch_size=32, convert_to_tensor=True)

CPU times: total: 55.2 s
Wall time: 16.8 s
