## 1. Import the PDF Document
* Can be any type of text document


In [66]:
import os
import requests

#pdf path
pdf_path = "human-nutrition-text.pdf"

#download the pdf Document if the file does not exist
if not os.path.exists(pdf_path):
    print("File Doesn't Exist, Downloading...")
    #That's it's url
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    
    filename = pdf_path
    #Send a get request
    response = requests.get(url)
    
    if response.status_code == 200:
        #success
        print("Downloaded Successfully")
        # Open a file in binary write mode and save the content to it
        with open(filename,"wb") as file:
            file.write(response.content)
        print(f"The file has been saved as {filename}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")
    


File human-nutrition-text.pdf exists.


In [67]:
#text cleaning
def text_format(text:str) -> str:
    cleaned_text = text.replace("\n"," ").strip()
    #maybe more formatting here
    return cleaned_text

## Read PDF

In [68]:
import fitz
from tqdm.auto import tqdm

def open_read_pdf(pdf_path: str)-> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for p_num,page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_format(text)
        #In this book page 1 starts at 43
        pages_and_text.append({
            "page_number": p_num-41,
            "page_char_count": len(text),
            "page_word_count":len(text.split(" ")),
            "page_sentence_count":len(text.split(". ")),
            "page_token_count": len(text)/4, # 1 token = 4 characters
            "text": text
                               })
    return pages_and_text

pages_and_text = open_read_pdf(pdf_path=pdf_path)
pages_and_text[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [69]:
import random

random.sample(pages_and_text,k = 3)

[{'page_number': 776,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 902,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 248,
  'page_char_count': 273,
  'page_word_count': 39,
  'page_sentence_count': 4,
  'page_token_count': 68.25,
  'text': 'Table 4.1 The Glycemic Index: Foods In Comparison To Glucose  Health Implications. Journal of the American College of  Nutrition, 28(4),  446S–49S.https://www.ncbi.nlm.nih.gov/pubmed/ 20234031. Accessed September 27, 2017.  248  |  Digestion and Absorption of Carbohydrates'}]

In [70]:
import pandas as pd

df = pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [71]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,199.499172,10.519868,287.001035
std,348.86387,560.382275,95.830681,6.548495,140.095569
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.875
75%,864.25,1603.5,272.0,15.0,400.875
max,1166.0,2308.0,430.0,39.0,577.0


In [72]:
# We should think about the number of tokens per page as:
# LLMS and embedding models do not work with infinite tokens

# Now we will continue text pre-processing
## Split text to chunks. Each chunk = 10 sentences
### Two ways of doing that:
* 1. split on ". "
* 2. use NLP libraries such as spaCY & NLTK

In [73]:
from spacy.lang.en import English

nlp = English()

#Add a sentencizer : turns text into sentences
nlp.add_pipe("sentencizer")

#create a test instance
doc = nlp("This is a sentence. That's another. Also I like dogs")

print(list(doc.sents))

[This is a sentence., That's another., Also I like dogs]


In [74]:
print(pages_and_text[600])

{'page_number': 559, 'page_char_count': 863, 'page_word_count': 138, 'page_sentence_count': 9, 'page_token_count': 215.75, 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5. Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.   https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitamins  |  559'}


In [75]:
for page in pages_and_text:
    # Will add antoher attribute to the dictionary of each page called sentence having the sentencized text.
    page["sentences"] = list(nlp(page["text"]).sents)
    #the default datatype is spaCY we want it as string
    page["sentences"] = [str(sentence) for sentence in page["sentences"]]
    #count the number of sentences from staCY
    page["page_sentence_count_staCY"] = len(page["sentences"])

In [76]:
print(pages_and_text[600])
print(len(pages_and_text[600]["sentences"]))

{'page_number': 559, 'page_char_count': 863, 'page_word_count': 138, 'page_sentence_count': 9, 'page_token_count': 215.75, 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5. Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.   https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitamins  |  559', 's

In [77]:
df1 = pd.DataFrame(pages_and_text)
df1.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_staCY
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


# Text chunking/splitting

### Is splitting larger pieces of text to smaller chunks

* There is no 100% correct way of doing so
* we will split each chunk into a group of 10 sentences (Not a contant number can be 5, 6, 7, 8, Whatever you like)
* Can use Langchain but we will be using pure python

## Why do we use this
1. so that our text chunks can fit into out context window of the embedding model
2. So that our text is easier to filter (smaller groups of text can be easier to inspect that large passages of text)
3. Also so that out context passed is more focused & specific

In [78]:
#Define split size to turn groups of sentences into chuncks
chunk_size = 10

# [20] -> [10,10] | [25] -> [10,10,5]
def split_list_chunks(inp_list : list[str],slice_size = chunk_size) -> list[list[str]]:
    
    return [inp_list[i:i+slice_size] for i in range(0, len(inp_list),slice_size) ]

split_list_chunks(list(range(1,26)),chunk_size)

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 [21, 22, 23, 24, 25]]

In [79]:
# loop through pages and texts and split senteces into chunks

for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list_chunks(inp_list=item["sentences"], slice_size=chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [80]:
random.sample(pages_and_text, k = 1)

[{'page_number': 239,
  'page_char_count': 474,
  'page_word_count': 79,
  'page_sentence_count': 4,
  'page_token_count': 118.5,
  'text': 'recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=175    An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=175  Introduction  |  239',
  'sentences': ['recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.',
   '   An interactive or media element has been  excluded from this version of the text.',
   'You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=175    An interactive or media element has been  excluded from this version o

In [81]:
df2 = pd.DataFrame(pages_and_text)
df2.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_staCY,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


## Splitting each chunk to it's own item

That will give us a good level of granularity

In [82]:
import re

#split each chunk into its own item

pages_and_chunks = []

for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = dict()
        chunk_dict["page_number"] = item["page_number"]
        
        #Now each chunk has a list of 10 sentences, we want them combined in 1 paragraph
        joined_chunk =  "".join(sentence_chunk).replace("  "," ").strip()
        joined_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_chunk)
        chunk_dict["text"] =joined_chunk
        
        #some chunk stats
        chunk_dict["chunk_char_count"] = len(joined_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_chunk)/4 # 1 token = ~ 4 words
        
        pages_and_chunks.append(chunk_dict)
        
print(len(pages_and_chunks))
        
        

  0%|          | 0/1208 [00:00<?, ?it/s]

1843


In [83]:
random.sample(pages_and_chunks, k = 1)

[{'page_number': 702,
  'text': 'Fluoride Fluoridated water, foods prepared in fluoridated water, seafood 3-4 mg/day Component of mineralized bone, provides structure and microarchitecture, stimulates new bone growth Increased risk of dental caries Po w flu w Manganese Legumes, nuts, leafy green vegetables 1.8-2.3 mg/ day Glucose synthesis, amino-acid catabolism Impaired growth, skeletal abnormalities, abnormal glucose metabolism N Molybdenum Milk, grains, legumes 45 mcg/day Cofactor for a number of enzymes Unknown N Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities. These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mobile devices, however, for the best user experience it is strongly recommended that users complete these activit

In [84]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,745.0,115.0,186.25
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


In [94]:
df.head()

Unnamed: 0,page_number,text,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5
4,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25


In [117]:
min_token_len = 20

for row in df[df["chunk_token_count"]<= min_token_len].sample(5).iterrows():
    print(f"Chunk count : {row[1]["chunk_token_count"]}  |  Text: {row[1]["text"]}")

Chunk count : 12.0  |  Text: PART V CHAPTER 5. LIPIDS Chapter 5. Lipids | 289
Chunk count : 3.0  |  Text: Iodine | 681
Chunk count : 11.0  |  Text: Accessed October 5, 2017. Introduction | 433
Chunk count : 11.25  |  Text: Carbohydrates and Personal Diet Choices | 275
Chunk count : 16.5  |  Text: Table 4.6 Sweeteners Carbohydrates and Personal Diet Choices | 281


In [136]:
#Filter our Dataframes for rows with under 30 tokens

pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_len].to_dict(orient = "records")

random.sample(pages_and_chunks_over_min_token_len, k = 1)



[{'page_number': 279,
  'text': 'Benefits of Sugar Substitutes Consuming foods and beverages containing sugar substitutes may benefit health by reducing the consumption of simple sugars, which are higher in calories, cause tooth decay, and are potentially linked to chronic disease. Artificial sweeteners are basically non-nutrients though not all are completely calorie-free. However, because they are so intense in sweetness they are added in very small amounts to foods and beverages. Artificial sweeteners and sugar alcohols are not “fermentable sugars” and therefore they do not cause tooth decay. Chewing gum with artificial sweeteners is the only proven way that artificial sweeteners promote oral health. The American Dental Association (ADA) allows manufacturers of chewing gum to label packages with an ADA seal if they have convincing scientific evidence demonstrating their product either reduces plaque acids, cavities, or gum disease, or promotes tooth remineralization. There is limite

# Embedding our text chunks

## What is Embeddings
* Turn our chunks into numbers
* A useful numerical represntation as computers only understand numbers
* The most useful part of the embeddings is that it is learned a represntation

Example:
```
"the" : 0,
"a" : 1
etc...
```

Works with words or sentences