 **Create and Run a local RAG pipeline from scratch**

## 1. Document/text preprocessing and embedding creation

In [1]:
import os
import requests

# Get PDF document path
pdf_path = "human-nutrition-text.pdf"

# Download
if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist, downloading...")

    # Enter the URL of the PDF
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # THe Local filename to save the downloaded file
    filename = pdf_path

    # Send a GET request to the URL
    responce = requests.get(url)

    # Check if the request was successfull
    if responce.status_code == 200:
        # Open the file and save it
        with open(filename , 'wb') as file:
            file.write(responce.content)
        print(f"[INFO] The file has been download and saved as {filename}")
    else:
        print(f"[INFO] Falied to download the file . Status code {responce.status_code}")

else:
    print(f"File {pdf_path} exists.")


File human-nutrition-text.pdf exists.


In [2]:
import fitz
from tqdm import tqdm


def text_formaatter(text: str) -> str:
    """ Performs minor formatting on text """
    cleaned_text = text.replace("\n" , " ").strip()

    # Pottentially more text formating functions can go here
    return cleaned_text

def open_and_read_pdf(pdf_path:str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for page_number , page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formaatter(text=text)
        pages_and_text.append({"page_number" : page_number - 41 , 
                               "page_char_count": len(text) , 
                               "page_word_count": len(text.split(" ")) , 
                               "page_sentence_count_raw": len(text.split(". ")) , 
                               "page_token_count": len(text) / 4  , # 1 token = 4 character.
                               "text": text})
    
    return pages_and_text

pages_and_text = open_and_read_pdf(pdf_path=pdf_path)
pages_and_text[:2]


1208it [00:01, 818.78it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
import random
random.sample(pages_and_text , k = 4)

[{'page_number': 462,
  'page_char_count': 2020,
  'page_word_count': 336,
  'page_sentence_count_raw': 14,
  'page_token_count': 505.0,
  'text': 'Metabolism Overview  Metabolism is defined as the sum of all chemical reactions required  to support cellular function and hence the life of an organism.  Metabolism is either categorized as catabolism, referring to all  metabolic processes involved in molecule breakdown, or anabolism,  which includes all metabolic processes involved in building bigger  molecules. Generally, catabolic processes release energy and  anabolic processes consume energy. The overall goals of  metabolism are energy transfer and matter transport. Energy is  transformed from food macronutrients into cellular energy, which  is used to perform cellular work. Metabolism transforms the matter  of macronutrients into substances a cell can use to grow and  reproduce and also into waste products. For example, enzymes are  proteins and their job is to catalyze chemical reac

In [4]:
import pandas as pd 

df = pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,198.299669,9.972682,287.001035
std,348.86387,560.382275,95.759336,6.187226,140.095569
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.875
75%,864.25,1603.5,271.0,14.0,400.875
max,1166.0,2308.0,429.0,32.0,577.0


**Further text processing (splitting pages into sentences)**

In [8]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This is another sentenece. I like elephants.")
assert len(list(doc.sents)) == 3

# Print out our sentence split
list(doc.sents)

[This is a sentence., This is another sentenece., I like elephants.]

In [9]:
for item in tqdm(pages_and_text):
    item['sentences'] = list(nlp(item["text"]).sents)

    # Make sure all sentences are string (the default type is a spacyy datatype)

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the senteneces
    item["page_sentence_count_spacy"] = len(item["sentences"])


100%|██████████| 1208/1208 [00:01<00:00, 944.78it/s] 


In [10]:
random.sample(pages_and_text , k = 1)

[{'page_number': 888,
  'page_char_count': 744,
  'page_word_count': 120,
  'page_sentence_count_raw': 4,
  'page_token_count': 186.0,
  'text': 'Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=469  \xa0 888  |  Adolescence',
  'sentences': ['Learning Activities  Technology Note: The second edition of the Human  Nutrition 

In [13]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


### Chucking our sentence together

In [14]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function to split a list of texts recursively into chunk size

def split_list(input_list: list[str] , slice_size : int = num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i : i + slice_size] for i in range(0 , len(input_list) , slice_size)]

test_list = list(range(25))
split_list(test_list)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [15]:
# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"] , slice_size=num_sentence_chunk_size)

    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 238086.52it/s]


In [18]:
random.sample(pages_and_text , k = 1)

[{'page_number': 392,
  'page_char_count': 1544,
  'page_word_count': 273,
  'page_sentence_count_raw': 13,
  'page_token_count': 386.0,
  'text': 'cells and nerve cells) do not regenerate at any appreciable levels.  Tissue regeneration is the creation of new cells (cell division), which  requires many different proteins including enzymes that synthesize  RNA and proteins, transport proteins, hormones, and collagen. In  a hair follicle, cells divide and a hair grows in length. Hair growth  averages 1 centimeter per month and fingernails about 1 centimeter  every one hundred days. The cells lining the intestine regenerate  every three to five days. Protein-inadequate diets impair tissue  regeneration, causing many health problems including impairment  of nutrient digestion and absorption and, most visibly, hair and nail  growth.  Energy Production  Some of the amino acids in proteins can be disassembled and used  to make energy (Figure 6.14 “Amino Acids Used for Energy”). Only  about 10

In [19]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### Splitting each chunk into its own item

In [24]:
import re

# Split each chunk into own item
pages_and_chunks = []
for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka join the list of sentences into one paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  " , " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])' , r'. \1' , joined_sentence_chunk) # ".A" => ". A" (will work for any capital letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 chars

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 21577.95it/s]


1843

In [27]:
random.sample(pages_and_chunks , k = 1)

[{'page_number': 869,
  'sentence_chunk': 'often mimic their behavior and eating habits. Parents must continue to help their school-aged children and adolescents establish healthy eating habits and attitudes toward food. Their primary role is to bring a wide variety of health-promoting foods into the home, so that their children can make good choices. \xa0 Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mobile devices, however, for the best user experience it is strongly recommended that users complete these activities using a desktop or laptop computer and in Google Chrome. \xa0 An interactive or media element has been excluded from this version of the text. You can view 

In [28]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


### Filter chunks of text for short chunks

In [29]:
min_token_length = 30

In [30]:
# Filter our DataFrame for rows with under 30 tokens
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient = "records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [37]:
random.sample(pages_and_chunks_over_min_token_len , k = 1)

[{'page_number': 251,
  'sentence_chunk': 'More Resources Visit this online database to discover the glycemic indices of foods. Foods are listed by category and also by low, medium, or high glycemic index. http://www.gilisting.com/ Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mobile devices, however, for the best user experience it is strongly recommended that users complete these activities using a desktop or laptop computer and in Google Chrome. \xa0 Digestion and Absorption of Carbohydrates | 251',
  'chunk_char_count': 777,
  'chunk_word_count': 112,
  'chunk_token_count': 194.25}]

## Embedding our text chunks

In [42]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path = "all-mpnet-base-v2" , device = "mps")

# Create a list of sentences
sentences = ["The Sentence Transformer library provides an easy way to create embeddings.",
"Sentences can be embedded one by one or in a list.",
"I like horses!"]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences , embeddings))

# See the embeddings
for sentence , embedding in embeddings_dict.items():
    print(f"Sentences: {sentences}")
    print(f"Embedding: {embedding}")
    print("")

Sentences: ['The Sentence Transformer library provides an easy way to create embeddings.', 'Sentences can be embedded one by one or in a list.', 'I like horses!']
Embedding: [-3.44285034e-02  2.95328740e-02 -2.33643129e-02  5.57257049e-02
 -2.19098348e-02 -6.47064066e-03  1.02849510e-02 -6.57804459e-02
  2.29718108e-02 -2.61120982e-02  3.80421579e-02  5.61402477e-02
 -3.68746743e-02  1.52788563e-02  4.37020771e-02 -5.19723520e-02
  4.89479378e-02  3.58104357e-03 -1.29750213e-02  3.54386633e-03
  4.23262045e-02  3.52606587e-02  2.49401722e-02  2.99177412e-02
 -1.99381504e-02 -2.39753220e-02 -3.33373318e-03 -4.30450179e-02
  5.72013743e-02 -1.32517647e-02 -3.54478061e-02 -1.13935862e-02
  5.55561855e-02  3.61094647e-03  8.88527211e-07  1.14027150e-02
 -3.82229984e-02 -2.43550329e-03  1.51313534e-02 -1.32587040e-04
  5.00659235e-02 -5.50876968e-02  1.73444599e-02  5.00958897e-02
 -3.75959873e-02 -1.04462970e-02  5.08322604e-02  1.24860760e-02
  8.67376551e-02  4.64143828e-02 -2.10689977e-

In [44]:
embeddings[0].shape

(768,)

In [46]:
%%time

embedding_model.to("mps")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 1680/1680 [02:57<00:00,  9.48it/s]

CPU times: user 6min 24s, sys: 1min 11s, total: 7min 36s
Wall time: 2min 57s





In [47]:
%%time

text_chunks = [item['sentence_chunk'] for item in pages_and_chunks_over_min_token_len]
text_chunks[523]

CPU times: user 258 μs, sys: 2.75 ms, total: 3.01 ms
Wall time: 3.69 ms


'through food. Genetic factors may also influence the way a person’s body modifies cholesterol. The 2015-2020 US Dietary Guidelines suggest limiting saturated fats, thereby indirectly limiting dietary cholesterol since foods that are high in cholesterol tend to be high in saturated fats also. A Prelude to Disease If left unchecked, improper dietary fat consumption can lead down a path to severe health problems. An increased level of lipids, triglycerides, and cholesterol in the blood is called hyperlipidemia. Hyperlipidemia is inclusive of several conditions but more commonly refers to high cholesterol and triglyceride levels. When blood lipid levels are high, any number of adverse health problems may ensue. Consider the following: • Cardiovascular disease. According to the AHA, cardiovascular disease encompasses a variety of problems, many of which are related to the process of atherosclerosis. Over time the arteries thicken and harden with plaque buildup, causing restricted or at tim

In [48]:
len(text_chunks)

1680

In [50]:
%%time

# Embed all texts in batches
text_chunks_embeddings = embedding_model.encode(text_chunks , batch_size=32 ,convert_to_tensor=True)

text_chunks_embeddings


CPU times: user 30.3 s, sys: 16.6 s, total: 46.9 s
Wall time: 2min 36s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='mps:0')

# Save embeddings to file


In [51]:
pages_and_chunks_over_min_token_len[412]

{'page_number': 273,
 'sentence_chunk': 'Foods Total Carbohydrates Sugars Fiber Added Sugars Banana 27 (1 medium) 14.40 3.1 0 Lentils 40 (1 c.) 3.50 16.0 0 Snap beans 8.7 (1 c.) 1.60 4.0 0 Green pepper 5.5 (1 medium) 2.90 2.0 0 Corn tortilla 10.7 (1) 0.20 1.5 0 Bread, wheat bran 17.2 (1 slice) 3.50 1.4 3.4 Bread, rye 15.5 (1 slice) 1.20 1.9 1.0 Bagel (plain) 53 (1 medium) 5.30 2.3 4.8 Brownie 36 (1 square) 20.50 1.2 20.0 Oatmeal cookie 22.3 (1 oz.) 12.00 2.0 7.7 Cornflakes 23 (1 c.) 1.50 0.3 1.5 Pretzels 47 (10 twists) 1.30 1.7 0 Popcorn (homemade) 58 (100 g) 0.50 10.0 0 Skim milk 12 (1 c.) 12.00 0 0 Cream (half and half) 0.65 (1 Tbs.) 0.02 0 0 Cream substitute 1.0 (1 tsp.) 1.00 0 1.0 Cheddar cheese 1.3 (1 slice) 0.50 0 0 Yogurt (with fruit) 32.3 (6 oz.) 32.30 0 19.4 Caesar dressing 2.8 (1 Tbs.) 2.80 0 2.4 Sources: • National Nutrient Database for Standard Reference. US Department of Agriculture.http://www.nal.usda.gov/fnic/ foodcomp/search/. Updated December 7, 2011. Accessed Septembe

In [52]:
# Save embeddings to file 
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path , index=False)

In [53]:
# Import saved file and view
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242824e-02 9.02282149e-02 -5.09550050e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156195e-02 5.92139065e-02 -1.66167226e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,[ 2.79801600e-02 3.39814052e-02 -2.06426680e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,[ 6.82566985e-02 3.81274670e-02 -8.46854784e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264606e-02 -8.49768240e-03 9.57159232e-...
