In [1]:
%pip install -r requirements.txt




In [2]:
%pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.


In [3]:
!nvidia-smi

Sat Nov 09 13:09:48 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 457.49       Driver Version: 457.49       CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 1650   WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   52C    P8     3W /  N/A |    624MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

[Info](https://whimsical.com/simple-local-rag-workflow-39kToR3yNf7E8kY4sS2tjV)

In [4]:
import os


dataDir = "./data/human-nutrition-text.pdf"

if not os.path.exists(dataDir):
    print(f"Doesn't exist {dataDir}")
else:
    print(f"File exists {dataDir}")


File exists ./data/human-nutrition-text.pdf


In [5]:
import fitz 
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    return text.replace('\n', ' ').strip()


def read_pdf(path: str) -> list[dict]:
    doc = fitz.open(path)
    #print(f"Number of pages: {len(doc)}")
    pages_and_text = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_text.append({"page_number":page_number,
                               "page_char_count":len(text),
                               "page_word_count":len(text.split(" ")),
                               "page_sentence_count":len(text.split(". ")),
                               "page_token_count":len(text) / 4,
                               "page_text":text})
    return pages_and_text

In [6]:
pages_and_text = read_pdf(path = dataDir)


0it [00:00, ?it/s]

In [7]:

pages_and_text[:2]

[{'page_number': 0,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count': 1,
  'page_token_count': 7.25,
  'page_text': 'Human Nutrition: 2020 Edition'},
 {'page_number': 1,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'page_token_count': 0.0,
  'page_text': ''}]

In [8]:
import random

random.sample(pages_and_text, 2)

[{'page_number': 187,
  'page_char_count': 1501,
  'page_word_count': 269,
  'page_sentence_count': 7,
  'page_token_count': 375.25,
  'page_text': 'Image by Cassie Matias on unsplash.com / CC0  Learning Objectives  By the end of this chapter you will be able to:  •  Describe the importance of water intake for the  body  •  Describe the major aspects of water regulation in  the body  •  Describe the function, balance, sources, and  consequences of the imbalance of [electrolytes  •  Describe the effects and use of popular beverage  choices  Maintaining the right level of water in your body is crucial to  survival, as either too little or too much water in your body will  result in less-than-optimal functioning. One mechanism to help  ensure the body maintains water balance is thirst. Thirst is the  result of your body’s physiology telling your brain to initiate the  thought to take a drink. Sensory proteins detect when your mouth is  dry, your blood volume too low, or blood electrolyte 

In [9]:
import pandas as pd 

df = pd.DataFrame(pages_and_text)

df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_text
0,0,29,4,1,7.25,Human Nutrition: 2020 Edition
1,1,0,1,1,0.0,
2,2,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,3,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,4,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [10]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,603.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,0.0,0.0,1.0,1.0,0.0
25%,301.75,762.0,134.0,5.0,190.5
50%,603.5,1231.5,216.0,10.0,307.88
75%,905.25,1603.5,272.0,15.0,400.88
max,1207.0,2308.0,430.0,39.0,577.0


In [11]:
# from spacy.lang.uk import Ukrainian
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is the first sentence. This is the second sentence. I like dogs")   

assert len(list(doc.sents)) == 3

print(list(doc.sents))


  r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


[This is the first sentence., This is the second sentence., I like dogs]


In [12]:
for item in tqdm(pages_and_text):
    item["sentences"] = list(nlp(item["page_text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
# Inspect an example
random.sample(pages_and_text, k=1)

[{'page_number': 476,
  'page_char_count': 852,
  'page_word_count': 139,
  'page_sentence_count': 5,
  'page_token_count': 213.0,
  'page_text': '• Mixture (carbonated mixers speed absorption)  • Medications may increase the bioavailability of alcohol  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.  These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=279    Introduction  |  435'

In [14]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,603.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,0.0,0.0,1.0,1.0,0.0,0.0
25%,301.75,762.0,134.0,5.0,190.5,5.0
50%,603.5,1231.5,216.0,10.0,307.88,10.0
75%,905.25,1603.5,272.0,15.0,400.88,15.0
max,1207.0,2308.0,430.0,39.0,577.0,28.0


In [15]:
num_sentence_chunk_size = 10

def split_list(input_list: list[str], 
               slice_size: int = num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))

split_list(test_list)



[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [16]:
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list = item["sentences"],
                                        slice_size = num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [17]:
random.sample(pages_and_text, k=1)

[{'page_number': 16,
  'page_char_count': 925,
  'page_word_count': 166,
  'page_sentence_count': 3,
  'page_token_count': 231.25,
  'page_text': 'Chromium  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  686  Manganese  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  689  Molybdenum  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  692  Fluoride  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  695  Summary of Trace Minerals  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  700  Part XII. Chapter 12. Nutrition Applications  Introduction  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  707  Understanding Dietary Reference Intakes  University of Hawai‘i at Mānoa Foo

In [18]:
df = pd.DataFrame(pages_and_text)

df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,603.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,301.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,603.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,905.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1207.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [19]:
df.head(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_text,sentences,page_sentence_count_spacy,sentence_chunks,num_chunks
0,0,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1,[[Human Nutrition: 2020 Edition]],1
1,1,0,1,1,0.0,,[],0,[],0


In [32]:
import re

pages_and_chunks = []

for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict ={}
        chunk_dict["page_number "] = item["page_number"]
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)


  0%|          | 0/1208 [00:00<?, ?it/s]

In [33]:
random.sample(pages_and_chunks,1)

[{'page_number ': 843,
  'sentence_chunk': 'The most common food aversions are milk, meats, pork, and liver. For most women, it is not harmful 11. Reid, R. L., & Lorenzo, M. (2018). SCUBA Diving in Pregnancy. Journal of Obstetrics and Gynaecology Canada, 40(11), 1490–1496.https://doi.org/10.1016/ j.jogc.2017.11.024) 802 | Pregnancy',
  'chunk_char_count': 289,
  'chunk_word_count': 42,
  'chunk_token_count': 72.25}]

In [29]:
len(pages_and_chunks)

1843

In [40]:
df = pd.DataFrame(pages_and_chunks)

df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,624.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,0.0,12.0,3.0,3.0
25%,321.5,315.0,45.0,78.75
50%,627.0,745.0,115.0,186.25
75%,931.0,1118.0,173.0,279.5
max,1207.0,1830.0,297.0,457.5


In [41]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 19.25 | Text: The function of the anticoagulant drug warfarin is 544 | Fat-Soluble Vitamins
Chunk token count: 25.0 | Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=521  996 | The Major Types of Foodborne Illness
Chunk token count: 10.0 | Text: Table 3.6 Salt Alternatives 186 | Sodium
Chunk token count: 21.0 | Text: Updated September 2003. Accessed November 28,2017. Discovering Nutrition Facts | 735
Chunk token count: 29.5 | Text: 2011. https://www.ers.usda.gov/publications/pub- details/?pubid=44909. Accessed April 15, 2018. 1138 | Food Insecurity


In [43]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient = "records")

pages_and_chunks_over_min_token_len[:2]

[{'page_number ': 2,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number ': 3,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

### Embedding