# RAG Pipeline

#### Importing Modules

In [1]:
import fitz
from tqdm.auto import tqdm
import pandas as pd
from spacy.lang.en import English

  from .autonotebook import tqdm as notebook_tqdm


#### Formatting text

In [2]:
def text_format(text: str)->str :
    cleaner_text = text.replace("\n", " ").strip()

    return cleaner_text

In [3]:
def get_text_from_source(path: str)->list[dict]:
    doc = fitz.open(path)
    pages_text = []

    for pageno, pagecontent in tqdm(enumerate(doc)):
        text = pagecontent.get_text()
        text = text_format(text=text)
        pages_text.append({"Page No.": pageno, "page_char_count": len(text), "page_word_count": len(text.split(" ")), "page_sentence_count (Not accurate)": len(text.split(".")), "page_token_count": len(text)/4, "text": text})
        
    return pages_text

#### Using our custom function


In [4]:
import random

text_info = get_text_from_source(path="Test.pdf")
text_info


9it [00:00, 225.35it/s]


[{'Page No.': 0,
  'page_char_count': 4082,
  'page_word_count': 609,
  'page_sentence_count (Not accurate)': 24,
  'page_token_count': 1020.5,
  'text': 'Building LLMs Understanding Large Langauge Models Large language models (LLMs), such as those offered in OpenAI’s ChatGPT, are deep neural network models that have been developed over the past few years. They ushered in a new era for natural language processing (NLP). Before the advent of LLMs, traditional methods excelled at categorization tasks such as email spam classification and straightforward pattern recognition that could be captured with handcrafted rules or simpler models. However, they typically underperformed in language tasks that demanded complex understanding and generation abilities, such as parsing detailed instructions, conducting contextual analysis, and creating coherent and contextually appropriate original text. For example, previous generations of language models could not write an email from a list of keywords

#### Converting to DataFrame

In [5]:
data = pd.DataFrame(text_info)
data.describe().round(2)

Unnamed: 0,Page No.,page_char_count,page_word_count,page_sentence_count (Not accurate),page_token_count
count,9.0,9.0,9.0,9.0,9.0
mean,4.0,3886.0,591.89,29.11,971.5
std,2.74,525.97,75.14,5.82,131.49
min,0.0,2651.0,420.0,18.0,662.75
25%,2.0,3729.0,561.0,26.0,932.25
50%,4.0,4082.0,609.0,30.0,1020.5
75%,6.0,4223.0,649.0,33.0,1055.75
max,8.0,4350.0,655.0,37.0,1087.5


#### Splitting text. Conversion of sentences

In [6]:
# Instance of English
obj = English()

#adding pipeling
obj.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1d2b18e42d0>

In [7]:
for items in tqdm(text_info):
    items["sentences"] = list(obj(items["text"]).sents)
    items["sentences"]= [str(sentence) for sentence in items["sentences"]]
    items["sentences_count_spacy"] = len(items["sentences"])

100%|██████████| 9/9 [00:00<00:00, 155.19it/s]


In [8]:
random.sample(text_info, k=1)

[{'Page No.': 5,
  'page_char_count': 4350,
  'page_word_count': 655,
  'page_sentence_count (Not accurate)': 34,
  'page_token_count': 1087.5,
  'text': 'The next-word prediction task is a form of self-supervised learning, which is a form of self-labeling. This means that we don’t need to collect labels for the training data explicitly but can use the structure of the data itself: we can use the next word in a sentence or document as the label that the model is supposed to predict. Since this next-word prediction task allows us to create labels “on the fly,” it is possible to use massive unlabeled text datasets to train LLMs. Compared to the original transformer architecture, the general GPT architecture is relatively simple. Essentially, it’s just the decoder part without the encoder. Since decoder-style models like GPT generate text by predicting text one word at a time, they are considered a type of autoregressive model. Autoregressive models incorporate their previous outputs as i

In [9]:
data = pd.DataFrame(text_info)
data.describe().round(2)

Unnamed: 0,Page No.,page_char_count,page_word_count,page_sentence_count (Not accurate),page_token_count,sentences_count_spacy
count,9.0,9.0,9.0,9.0,9.0,9.0
mean,4.0,3886.0,591.89,29.11,971.5,27.44
std,2.74,525.97,75.14,5.82,131.49,5.88
min,0.0,2651.0,420.0,18.0,662.75,17.0
25%,2.0,3729.0,561.0,26.0,932.25,24.0
50%,4.0,4082.0,609.0,30.0,1020.5,27.0
75%,6.0,4223.0,649.0,33.0,1055.75,32.0
max,8.0,4350.0,655.0,37.0,1087.5,36.0


#### Chunking sentences into group of 10 or less

In [10]:
chunk_size = 10
def create_chunk(big_list: list[str], split_size: int=chunk_size)->list[list[str]]:
    return [big_list[i:i+split_size] for i in range(0, len(big_list), split_size)]

In [11]:
#Chunk size
for items in tqdm(text_info):
    items["text_chunks"] = create_chunk(big_list=items["sentences"], split_size=chunk_size)
    items["chunk_size"] = len(items["text_chunks"])

100%|██████████| 9/9 [00:00<00:00, 28793.85it/s]


In [12]:
text_info

[{'Page No.': 0,
  'page_char_count': 4082,
  'page_word_count': 609,
  'page_sentence_count (Not accurate)': 24,
  'page_token_count': 1020.5,
  'text': 'Building LLMs Understanding Large Langauge Models Large language models (LLMs), such as those offered in OpenAI’s ChatGPT, are deep neural network models that have been developed over the past few years. They ushered in a new era for natural language processing (NLP). Before the advent of LLMs, traditional methods excelled at categorization tasks such as email spam classification and straightforward pattern recognition that could be captured with handcrafted rules or simpler models. However, they typically underperformed in language tasks that demanded complex understanding and generation abilities, such as parsing detailed instructions, conducting contextual analysis, and creating coherent and contextually appropriate original text. For example, previous generations of language models could not write an email from a list of keywords

#### Splitting Chunks for ease of embedding

In [13]:
import re

page_chunk = []
for item in tqdm(text_info):
    for parts in item["text_chunks"]:  # each chunk is already a list of sentences
        chunk_store = {}
        chunk_store["page_number"] = item['Page No.']

        # Merge sentences into one paragraph
        joined_sentence_chunk = " ".join(parts).replace(" ", " ").strip()

        # Optional: Ensure space after a period when followed by capital letter
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_store["sentence_chunk"] = joined_sentence_chunk
        chunk_store["sentence_chunk_size"] = len(joined_sentence_chunk)  # char count
        chunk_store["sentence_chunk_word_count"] = len(joined_sentence_chunk.split())  # word count
        chunk_store["sentence_chunk_tokens"] = len(joined_sentence_chunk) / 4  # approx token count

        page_chunk.append(chunk_store)


100%|██████████| 9/9 [00:00<00:00, 6410.04it/s]


In [14]:
random.sample(page_chunk, k=1)

[{'page_number': 5,
  'sentence_chunk': 'GPT-3 has 96 transformer layers and 175 billion parameters in total. GPT-3 was introduced in 2020, which, by the standards of deep learning and large language model development, is considered a long time ago. However, more recent architectures, such as Meta’s Llama models, are still based on the same underlying concepts, introducing only minor modifications. Hence, understanding GPT remains as relevant as ever, so I focus on implementing the prominent architecture behind GPT while providing pointers to specific tweaks employed by alternative LLMs. Although the original transformer model, consisting of encoder and decoder blocks, was explicitly designed for language translation, GPT models—despite their larger yet simpler decoder-only architecture aimed at next-word prediction—are also capable of performing translation tasks. This capability was initially unexpected to researchers, as it emerged from a model primarily trained on a next-word predi

In [15]:
df = pd.DataFrame(page_chunk)
df.describe().round(2)

Unnamed: 0,page_number,sentence_chunk_size,sentence_chunk_word_count,sentence_chunk_tokens
count,29.0,29.0,29.0,29.0
mean,3.9,1205.59,183.86,301.4
std,2.48,440.98,64.7,110.25
min,0.0,237.0,37.0,59.25
25%,2.0,1058.0,161.0,264.5
50%,4.0,1301.0,196.0,325.25
75%,6.0,1431.0,224.0,357.75
max,8.0,1926.0,278.0,481.5


### Filtering out texts with low token count

In [16]:
minimum_token_length = 25
# for row in df[df["sentence_chunk_tokens"]<=minimum_token_length].sample().iterrows():
#     print(f"Chunk token count: {row[1]["sentence_chunk_tokens"]} | Text: {row[1]["sentence_chunk"]}")

In [17]:
page_chunk_min_token_filter = df[df["sentence_chunk_tokens"]>minimum_token_length].to_dict(orient='records')
len(page_chunk_min_token_filter)

29

In [18]:
random.sample(page_chunk_min_token_filter, k=2)

[{'page_number': 5,
  'sentence_chunk': 'The next-word prediction task is a form of self-supervised learning, which is a form of self-labeling. This means that we don’t need to collect labels for the training data explicitly but can use the structure of the data itself: we can use the next word in a sentence or document as the label that the model is supposed to predict. Since this next-word prediction task allows us to create labels “on the fly,” it is possible to use massive unlabeled text datasets to train LLMs. Compared to the original transformer architecture, the general GPT architecture is relatively simple. Essentially, it’s just the decoder part without the encoder. Since decoder-style models like GPT generate text by predicting text one word at a time, they are considered a type of autoregressive model. Autoregressive models incorporate their previous outputs as inputs for future predictions. Consequently, in GPT, each new word is chosen based on the sequence that precedes it

#### Embedding our Data

In [19]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cuda")

In [20]:
%%time

embedding_model.to("cuda")

#embedding chunks 
for item in tqdm(page_chunk_min_token_filter):
    item["embedding"]=embedding_model.encode(item['sentence_chunk'])

100%|██████████| 29/29 [00:01<00:00, 25.01it/s]

CPU times: total: 6.97 s
Wall time: 1.16 s





#### Running Encoding in Batch Mode

In [21]:
%%time

text_chunks_batch = [item["sentence_chunk"] for item in page_chunk_min_token_filter]
len(text_chunks_batch)

CPU times: total: 0 ns
Wall time: 37.4 μs


29

In [22]:
%%time

# encodding in batch
embedding_model.to("cuda")
text_chunks_batch_encoding = embedding_model.encode(text_chunks_batch, batch_size=32, convert_to_tensor=True)
text_chunks_batch_encoding

CPU times: total: 2.36 s
Wall time: 760 ms


tensor([[ 0.0497,  0.0489, -0.0100,  ...,  0.0106, -0.0750, -0.0096],
        [ 0.0550,  0.0377, -0.0213,  ...,  0.0049, -0.0822, -0.0202],
        [ 0.0629, -0.0123,  0.0018,  ...,  0.0186, -0.0843, -0.0322],
        ...,
        [ 0.0039,  0.0779,  0.0072,  ..., -0.0410, -0.0429, -0.0501],
        [ 0.0220, -0.0329, -0.0213,  ..., -0.0156, -0.0374, -0.0483],
        [ 0.0111,  0.0677, -0.0058,  ...,  0.0233, -0.0760, -0.0426]],
       device='cuda:0')

#### Saving the data in a file

In [23]:
# Saving in file
text_chunk_data = pd.DataFrame(page_chunk_min_token_filter)
emded_savepath = "text_chunk_data.csv"
text_chunk_data.to_csv(emded_savepath, index=False)

#### Importing our csv file

In [24]:
import numpy as np
import pandas as pd
import torch


device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [25]:
dataset = pd.read_csv('text_chunk_data.csv')

pages_chunker = dataset.to_dict(orient='records')

In [26]:
pages_chunker

[{'page_number': 0,
  'sentence_chunk': 'Building LLMs Understanding Large Langauge Models Large language models (LLMs), such as those offered in OpenAI’s ChatGPT, are deep neural network models that have been developed over the past few years. They ushered in a new era for natural language processing (NLP). Before the advent of LLMs, traditional methods excelled at categorization tasks such as email spam classification and straightforward pattern recognition that could be captured with handcrafted rules or simpler models. However, they typically underperformed in language tasks that demanded complex understanding and generation abilities, such as parsing detailed instructions, conducting contextual analysis, and creating coherent and contextually appropriate original text. For example, previous generations of language models could not write an email from a list of keywords—a task that is trivial for contemporary LLMs. LLMs have remarkable capabilities to understand, generate, and inte

In [27]:
text_chunk_data["embedding"]

0     [0.04973646, 0.04890698, -0.009988132, 0.07886...
1     [0.05504632, 0.037693933, -0.021266555, 0.0655...
2     [0.06285553, -0.012305287, 0.0018492913, 0.041...
3     [0.029825618, -0.0035607142, -0.05730359, 0.01...
4     [0.07315906, 0.020924203, -0.032348167, 0.0298...
5     [0.036764275, -0.015411747, -0.00660755, 0.021...
6     [0.020559799, -0.038756255, -0.02460149, 0.018...
7     [0.03291103, -0.039424673, -0.020620914, 0.016...
8     [0.059157982, -0.0071736774, -0.006206, 0.0478...
9     [0.005167837, -0.056243856, -0.005987081, 0.05...
10    [0.017086511, -0.057033505, -0.016533313, 0.04...
11    [0.026564995, -0.026551738, 0.001147631, 0.009...
12    [0.02036939, 0.0055063316, -0.0025264546, 0.02...
13    [0.0513227, 0.0010891206, -0.021530451, 0.0456...
14    [0.04133433, 0.048474442, -0.003493334, 0.0317...
15    [0.06332711, -0.019899074, 0.005012609, 0.0350...
16    [0.031515833, 0.024714928, 0.012086132, 0.0295...
17    [0.042928156, 0.063535064, -0.020745669, 0

In [28]:
embeddings_chunk = torch.tensor(text_chunk_data["embedding"].to_list(), dtype=torch.float32).to(device)

  embeddings_chunk = torch.tensor(text_chunk_data["embedding"].to_list(), dtype=torch.float32).to(device)


In [29]:
embeddings_chunk.dtype

torch.float32

#### Creating queries

In [30]:
#Creating models
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

In [31]:
query = "Attention mechanism"
print(f"Query: {query}")

embed_query = embedding_model.encode(query, convert_to_tensor=True).to(device=device)
embed_query.dtype

Query: Attention mechanism


torch.float32

In [32]:
from time import perf_counter as timer
from sentence_transformers import util

start_time = timer()
dort_prod = util.dot_score(a=embed_query, b=embeddings_chunk)[0]
end_time = timer()

print(f"Total time taken: {end_time-start_time:.5f} seconds")
top_results = torch.topk(dort_prod, k=5)
top_results

Total time taken: 0.00265 seconds


torch.return_types.topk(
values=tensor([0.5039, 0.4482, 0.4347, 0.3842, 0.3732], device='cuda:0'),
indices=tensor([24, 25, 10, 28, 27], device='cuda:0'))

In [33]:
page_chunk

[{'page_number': 0,
  'sentence_chunk': 'Building LLMs Understanding Large Langauge Models Large language models (LLMs), such as those offered in OpenAI’s ChatGPT, are deep neural network models that have been developed over the past few years. They ushered in a new era for natural language processing (NLP). Before the advent of LLMs, traditional methods excelled at categorization tasks such as email spam classification and straightforward pattern recognition that could be captured with handcrafted rules or simpler models. However, they typically underperformed in language tasks that demanded complex understanding and generation abilities, such as parsing detailed instructions, conducting contextual analysis, and creating coherent and contextually appropriate original text. For example, previous generations of language models could not write an email from a list of keywords—a task that is trivial for contemporary LLMs. LLMs have remarkable capabilities to understand, generate, and inte

#### Pretty printer for our data

In [34]:
import textwrap

def pretty_printer(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [35]:
print(f"Query: {query}")
print("Results:\n")
for score, idx in zip(top_results[0],top_results[1]):
    print(f"Score: {score}\n")
    print("Text: \n")
    text_def = page_chunk[idx]["sentence_chunk"]
    pretty_printer(text_def)
    print(f"Page Number: {page_chunk[idx]["page_number"]}")
    print("\n")

Query: Attention mechanism
Results:

Score: 0.5038601756095886

Text: 

padded tokens. Thus, the specific token chosen for padding becomes
inconsequential. Moreover, the tokenizer used for GPT models also doesn’t use an
<|unk|> token for out-of-vocabulary words. Instead, GPT models use a byte pair
encoding tokenizer, which breaks words down into subword units. Attention in
LLMs In machine learning, attention is a method that determines the importance
of each component in a sequence relative to the other components in that
sequence. In natural language processing, importance is represented by “soft”
weights assigned to each word in a sentence. More generally, attention encodes
vectors called token embeddings across a fixed-width sequence that can range
from tens to millions of tokens in size. Unlike “hard” weights, which are
computed during the backwards training pass, “soft” weights exist only in the
forward pass and therefore change with every step of the input. Earlier designs
implem

#### Functioning our semantic pipeline

In [36]:
# main goal is to just integrate everything into one singular pipeline
def get_semantic_result(query: str, embeddings: torch.Tensor, EMmodel: SentenceTransformer=embedding_model, return_resources: int = 1, time_bool: bool=True)-> tuple[torch.Tensor, torch.Tensor]:

    if not query.strip():
        print("[WARN] Empty query given, returning no results.")
        return None, None
    
    query_embeddings = EMmodel.encode(query, convert_to_tensor=True).to(device=device)

    start_time = timer()
    dort_prod = util.dot_score(query_embeddings, embeddings_chunk)[0]
    end_time = timer()

    if time_bool:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} = {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dort_prod, k=return_resources)

    return scores, indices
    

In [37]:
# for printing top result
def pretty_printer_top(query: str, embeddings: torch.Tensor, page_chunk_dict: list[dict]=page_chunk):
    scores, indices = get_semantic_result(query,  embeddings=embed_query, return_resources= 5)

    print("\n")
    print(f"Query: {query}")
    print("Results:\n")
    for score, idx in zip(scores, indices):
        print(f"Score: {score:0.4f}\n")
        print("Text: \n")
        text_def = page_chunk[idx]["sentence_chunk"]
        pretty_printer(text_def)
        print(f"Page Number: {page_chunk[idx]["page_number"]}")
        print("\n")

In [38]:
pretty_printer_top("Formula 1", embeddings=embed_query)

[INFO] Time taken to get scores on 768 = 0.00018 seconds.


Query: Formula 1
Results:

Score: 0.1147

Text: 

This local implementation can significantly decrease latency and reduce server-
related costs. Furthermore, custom LLMs grant developers complete autonomy,
allowing them to control updates and modifications to the model as needed. The
general process of creating an LLM includes pretraining and fine-tuning. The
“pre” in “pretraining” refers to the initial phase where a model like an LLM is
trained on a large, diverse dataset to develop a broad understanding of
language. This pretrained model then serves as a foundational resource that can
be further refined through fine-tuning, a process where the model is
specifically trained on a narrower dataset that is more specific to particular
tasks or domains. The first step in creating an LLM is to train it on a large
corpus of text data, sometimes referred to as raw text. Here, “raw” refers to
the fact that this data is just regular te

In [39]:
import torch

gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes/(2**30))

print(f"Available GPU Memory: {gpu_memory_gb} GB")

Available GPU Memory: 4 GB


#### Loading an LLM locally

In [40]:
import os
os.environ["TORCH_COMPILE_DISABLE"] = "1"  # must be set before importing torch

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

torch.set_float32_matmul_precision("high")

In [53]:
attn_implementation = "sdpa"  
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# from transformers import BitsAndBytesConfig
# quant_Config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

print(f"[INFO] Using model: {model_name}")

tokeniser = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

[INFO] Using model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


##### Loading our model

In [55]:
llm = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name, 
                                           torch_dtype=torch.bfloat16, 
                                           device_map="auto" 
                                           )



In [56]:
llm

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [57]:
# Helper functions
def get_model_param(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

def get_model_memory(model: torch.nn.Module):
   mem_params = sum([param.nelement()*param.element_size() for param in model.parameters()])
   mem_buffers = sum([buf.nelement()*buf.element_size() for buf in model.buffers()])

   total_bytes = mem_buffers+mem_params
   total_bytes_MB = total_bytes/(1024**2)

   return total_bytes_MB/(1024)


In [58]:
print(f"Total Parameters: {get_model_param(llm)}")
print(f"Total memory size: {get_model_memory(llm):0.3f} GB")

Total Parameters: 1100048384
Total memory size: 2.049 GB


#### Generating text from LLMs

In [116]:
input_text = "Generate a haiku about programming"
print(f"Input text: {input_text}")

Input text: Generate a haiku about programming


In [117]:
messages = [
    {
        "role": "user",
        "content": input_text,
    },
]

#### Using LLMs for Text Generation

In [119]:
inputs = tokeniser.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
)
                                                

outputs = llm.generate(**inputs, max_new_tokens=50).to(device)

print(outputs)

tensor([[  529, 29989,  1792, 29989, 29958,    13,  5631,   403,   263,   447,
         18282,  1048,  8720,     2, 29871,    13, 29966, 29989,   465, 22137,
         29989, 29958,    13,   797, 18925, 24496, 29892,    13, 29909,   775,
           304,   443,   908, 29892,    13, 29909,  4086,   304,  4653, 29892,
            13,  9283,  4056, 29915, 29879,  1095,  2222,  4972, 29889,     2]],
       device='cuda:0')


In [120]:
output_text = tokeniser.decode(outputs[0], skip_special_tokens=True)

In [121]:
print(f"Text\n: {output_text}")

Text
: <|user|>
Generate a haiku about programming 
<|assistant|>
Infinite possibilities,
A code to unlock,
A language to express,
Programming's endless flow.
