In [3]:
import os
import requests

In [4]:
# downloading the human-nutrition-text

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading...")

    url = "https://pressbooks.oer.hawaii.edu/humannutrition/open/download?type=pdf"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    response = requests.get(url, headers=headers, stream=True, timeout=60)

    if response.status_code == 200:
        with open(pdf_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        print(f"[INFO] File downloaded successfully as {pdf_path}")
    else:
        print(f"[ERROR] Failed to download. Status code: {response.status_code}")

else:
    print("[INFO] File already exists")


[INFO] File already exists


In [5]:
# text-formatting
def  text_formatter(text:str)->str:
  cleaned_text=text.replace("/n"," ").strip()
  return cleaned_text

In [9]:
!pip install tqdm
!pip uninstall fitz -y
!pip install PyMuPDF

Found existing installation: fitz 0.0.1.dev2
Uninstalling fitz-0.0.1.dev2:
  Successfully uninstalled fitz-0.0.1.dev2
Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.7


In [10]:
import fitz
from re import split
from tqdm.auto import tqdm

def open_and_read_pdf(pdf_path:str)->list[dict]:
  pages_and_text=[]

  for page_number,page_text in tqdm(enumerate(fitz.open(pdf_path))):
    text=page_text.get_text()
    text=text_formatter(text)
    pages_and_text.append({
        "page_number":page_number-41,
        "page_char_count":len(text),
        "page_word_count":len(text.split(" ")),
        "page_sentence_count":len(text.split(".")),
        "text":text
    })

  return pages_and_text

pages_and_text=open_and_read_pdf(pdf_path=pdf_path)
pages_and_text[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 15,
  'page_word_count': 2,
  'page_sentence_count': 1,
  'text': 'Human Nutrition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'text': ''}]

In [11]:
import random
random.sample(pages_and_text,k=3)

[{'page_number': 618,
  'page_char_count': 1146,
  'page_word_count': 108,
  'page_sentence_count': 57,
  'text': 'no. ERR-125. 2011. https://www.ers.usda.gov/publications/pub-details/?pubid=44909. Accessed April 15, 2018.\n11.\nColeman-Jensen A. Household Food Security in the United States in 2010. US Department of Agriculture, Economic Research Report,\nno. ERR-125. 2011. https://www.ers.usda.gov/publications/pub-details/?pubid=44909. Accessed April 15, 2018.\n12.\nColeman-Jensen A. Household Food Security in the United States in 2010. US Department of Agriculture, Economic Research Report,\nno. ERR-125. 2011. https://www.ers.usda.gov/publications/pub-details/?pubid=44909. Accessed April 15, 2018.\n13.\nColeman-Jensen A. Household Food Security in the United States in 2010. US Department of Agriculture, Economic Research Report,\nno. ERR-125. 2011. https://www.ers.usda.gov/publications/pub-details/?pubid=44909. Accessed April 15, 2018.\n14.\nNational School Lunch Program. US Departme

In [17]:
import pandas as pd
df=pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,text,sentences,page_sentence_spacy_count
0,-41,15,2,1,Human Nutrition,[Human Nutrition],1
1,-40,0,1,1,,[],0
2,-39,188,22,1,Human Nutrition\nUNIVERSITY OF HAWAI‘I AT MĀNO...,[Human Nutrition\nUNIVERSITY OF HAWAI‘I AT MĀN...,1
3,-38,607,95,9,Human Nutrition by University of Hawai‘i at Mā...,[Human Nutrition by University of Hawai‘i at M...,5
4,-37,827,75,4,Contents\nPreface\nxi\nAbout the Contributors\...,[Contents\nPreface\nxi\nAbout the Contributors...,4


In [18]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_sentence_spacy_count
count,667.0,667.0,667.0,667.0,667.0
mean,292.0,1756.907046,243.116942,21.503748,16.206897
std,192.690598,1211.252292,174.745258,19.469185,13.638702
min,-41.0,0.0,1.0,1.0,0.0
25%,125.5,774.5,94.0,9.0,6.0
50%,292.0,1584.0,216.0,17.0,14.0
75%,458.5,2750.5,382.0,29.0,23.0
max,625.0,4555.0,716.0,153.0,101.0


In [19]:
# text preprocessing divide the whole text into sentences
from spacy.lang.en import English
nlp=English()
nlp.add_pipe("sentencizer")
for item in tqdm(pages_and_text):
  item["sentences"]=list(nlp(item["text"]).sents)
  item["sentences"]=[str(sentence) for sentence in item["sentences"]]
  item["page_sentence_spacy_count"]=len(item["sentences"])


  0%|          | 0/667 [00:00<?, ?it/s]

In [20]:
random.sample(pages_and_text,k=1)

[{'page_number': -28,
  'page_char_count': 507,
  'page_word_count': 73,
  'page_sentence_count': 5,
  'text': 'Students\nALLISON CALABRESE\nAllison Calabrese is currently a MS graduate student in the Nutritional Sciences Program at University of Hawai‘i\nat Mānoa. She obtained her BS from California Lutheran University in Exercise Science with an emphasis in Health\nProfessions. Her research interests include the relationship between diet and optimal health.\nCHERYL GIBBY\nCheryl Gibby was born and raised in Hawai‘i and is a wife and mother of three. She received her BA, MS in Nutritional\nAbout the Contributors | xiv',
  'sentences': ['Students\nALLISON CALABRESE\nAllison Calabrese is currently a MS graduate student in the Nutritional Sciences Program at University of Hawai‘i\nat Mānoa.',
   'She obtained her BS from California Lutheran University in Exercise Science with an emphasis in Health\nProfessions.',
   'Her research interests include the relationship between diet and optima

In [21]:
df=pd.DataFrame(pages_and_text)
# df.head()
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_sentence_spacy_count
count,667.0,667.0,667.0,667.0,667.0
mean,292.0,1756.91,243.12,21.5,16.21
std,192.69,1211.25,174.75,19.47,13.64
min,-41.0,0.0,1.0,1.0,0.0
25%,125.5,774.5,94.0,9.0,6.0
50%,292.0,1584.0,216.0,17.0,14.0
75%,458.5,2750.5,382.0,29.0,23.0
max,625.0,4555.0,716.0,153.0,101.0


In [22]:
num_sentence_chunk_size=10
def split_list(input_list:list[str],slice_size:int=num_sentence_chunk_size)->list[list[str]]:
  return [input_list[i:i+slice_size] for i in range(0,len(input_list),slice_size)]

In [23]:
test_list=list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [24]:
for item in tqdm(pages_and_text):
  item["sentence_chunks"]=split_list(input_list=item["sentences"],slice_size=num_sentence_chunk_size)
  item["num_chunks"]=len(item["sentence_chunks"])

  0%|          | 0/667 [00:00<?, ?it/s]

In [25]:
random.sample(pages_and_text,k=1)

[{'page_number': 611,
  'page_char_count': 3565,
  'page_word_count': 491,
  'page_sentence_count': 29,
  'text': 'Undernutrition, Overnutrition, and Malnutrition\nFor many, the word “malnutrition” produces an image of a child in a third-world country with a bloated belly, and skinny\narms and legs. However, this image alone is not an accurate representation of the state of malnutrition. For example,\nsomeone who is 150 pounds overweight can also be malnourished.\nMalnutrition refers to one not receiving proper nutrition and does not distinguish between the consequences of too\nmany nutrients or the lack of nutrients, both of which impair overall health. Undernutrition is characterized by a lack of\nnutrients and insufficient energy supply, whereas overnutrition is characterized by excessive nutrient and energy intake.\nOvernutrition can result in obesity, a growing global health threat. Obesity is defined as a metabolic disorder that leads\nto an overaccumulation of fat tissue.\nAltho

In [47]:
import re
pages_and_chunks=[]

for item in tqdm(pages_and_text):
  for sentence_chunk in item["sentence_chunks"]:
    chunks_dict={}
    chunks_dict["page_number"]=item["page_number"]
    join_sentence_chunk="".join(sentence_chunk).replace(" "," ").strip()
    join_sentence_chunk=re.sub(r'\.([A-Z])',r'. \1',join_sentence_chunk)
    chunks_dict["sentence_chunk"]=join_sentence_chunk
    chunks_dict["chunks_char_count"]=len(join_sentence_chunk)
    chunks_dict["chunks_word_count"]=len(join_sentence_chunk.split(" "))
    chunks_dict["chunks_token_count"]=len(join_sentence_chunk)/4

    pages_and_chunks.append(chunks_dict)

  0%|          | 0/667 [00:00<?, ?it/s]

In [48]:
len(pages_and_chunks)

1409

In [50]:
random.sample(pages_and_chunks,k=1)

[{'page_number': 496,
  'sentence_chunk': 'Nutrient\nMales, Ages 14–18\nFemales, Ages 14–18\nVitamin A (mcg)\n900.0\n700.0\nVitamin B6 (mg)\n1.3\n1.2\nVitamin B12 (mcg)\n2.4\n2.4\nVitamin C (mg)\n75.0\n65.0\nVitamin D (mcg)\n5.0\n5.0\nVitamin E (mg)\n15.0\n15.0\nVitamin K (mcg)\n75.0\n75.0\nCalcium (mg)\n1,300.0\n1,300.0\nFolate mcg)\n400.0\n400.0\nIron (mg)\n11.0\n15.0\nMagnesium (mg)\n410.0\n360.0\nNiacin (B3) (mg)\n16.0\n14.0\nPhosphorus (mg)\n1,250.0\n1,250.0\nRiboflavin (B2) (mg)\n1.3\n1.0\nSelenium (mcg)\n55.0\n55.0\nThiamine (B1) (mg)\n1.2\n1.0\nZinc (mg)\n11.0\n9.0\nSource: Institute of Medicine.2006. Dietary Reference Intakes: The Essential Guide to Nutrient Requirements.\nWashington, DC: The National Academies Press.https://doi.org/10.17226/11537. Accessed December 10, 2017.\nEating Disorders\nMany teens struggle with an eating disorder, which can have a detrimental effect on diet and health. A study published\nby North Dakota State University estimates that these conditions 

In [51]:
df=pd.DataFrame(pages_and_chunks)
df.describe()

Unnamed: 0,page_number,chunks_char_count,chunks_word_count,chunks_token_count
count,1409.0,1409.0,1409.0,1409.0
mean,315.141235,830.816182,114.929028,207.704045
std,194.957686,535.264253,78.575424,133.816063
min,-41.0,3.0,1.0,0.75
25%,146.0,345.0,37.0,86.25
50%,313.0,851.0,116.0,212.75
75%,490.0,1223.0,179.0,305.75
max,625.0,3060.0,436.0,765.0


In [59]:
min_token_len=30
for row in df[df["chunks_token_count"]<=min_token_len].sample(5).iterrows():
  print(f'chunk_token_count: {row[1]["chunks_token_count"]} | text: {row[1]["sentence_chunk"]}')


chunk_token_count: 4.75 | text: 501 | Toddler Years
chunk_token_count: 18.75 | text: Greater than 40–60 mg/dL
•
triglycerides.10–150 mg/dL
•
VLDL.2–38 mg/dL
206
chunk_token_count: 26.75 | text: For example, 12 micrograms of fruit- or vegetable-based beta-carotene will yield
319 | Fat-Soluble Vitamins
chunk_token_count: 8.5 | text: CHAPTER 18. NUTRITIONAL ISSUES
603
chunk_token_count: 7.0 | text: CHAPTER 2. THE HUMAN BODY
29


In [61]:
pages_and_chunks_over_min_token_len=df[df["chunks_token_count"]>min_token_len].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition\nUNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN\nNUTRITION PROGRAM\nALAN TITCHENAL, ALLISON CALABRESE, CHERYL GIBBY, MARIE\nKAINOA FIALKOWSKI REVILLA, AND WILLIAM MEINKE',
  'chunks_char_count': 188,
  'chunks_word_count': 22,
  'chunks_token_count': 47.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons\nAttribution 4.0 International License, except where otherwise noted.\nThis Human Nutrition OER textbook includes content from a number of OER sources. All new content added to this book is licensed\nunder a Creative Commons CC BY 4.0 license, while select chapters have been used and are shared under a CC BY-NC-SA license. All\nother content not under a CC is used fairly and is labeled as such.\nThis book was produced using Pressbooks.com, and PDF rendering was done by PrinceXML.',
  'chunks_char_cou

In [62]:
from sentence_transformers import SentenceTransformer
embedding_model=SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

sentences=["nikhil is very nice guy.","sometimes bro behave like a poor guy","any ways nikhil is good"]
embeddings=embedding_model.encode(sentences)
embedding_dict=dict(zip(sentences,embeddings))
for s,e in embedding_dict.items():
  print(f"sentences : {s}")
  print(f"embeddings : {e}")
  print(" ")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

sentences : nikhil is very nice guy.
embeddings : [-1.26460036e-02 -1.95201859e-02 -1.19406208e-02  5.10186367e-02
 -2.59513836e-02 -4.12686728e-02 -1.52346008e-02 -4.12986092e-02
  8.99924617e-03  3.72502953e-02 -5.72146708e-03 -2.61850934e-03
 -2.42796782e-02 -3.15206200e-02  1.31004136e-02 -2.37675430e-03
  4.17955145e-02  3.07486597e-02 -1.36279864e-02  1.94614474e-02
  3.71075347e-02 -3.38850101e-03 -2.39769816e-02  1.12151727e-03
  2.26126835e-02 -1.29513291e-03  1.29272304e-02  9.91150644e-03
 -8.54200870e-03  1.95735171e-02 -3.86401862e-02 -1.04666455e-02
 -1.74427666e-02  1.72149278e-02  1.52004793e-06  1.21186590e-02
  2.32108925e-02  2.57023447e-03  2.57338248e-02 -5.70092276e-02
  6.18896671e-02  5.96457757e-02 -1.74824130e-02  1.10479305e-02
 -1.51949301e-02  7.29276538e-02  5.71517134e-03  3.85445431e-02
 -3.87634290e-03  3.61582488e-02  9.94958356e-03 -3.51701416e-02
 -1.24387257e-02 -6.49093790e-03  1.69908609e-02  4.26636934e-02
  4.54508263e-04 -2.81013884e-02 -5.0183

In [63]:
%%time
for item in tqdm(pages_and_chunks_over_min_token_len):
  item["embedding"]=embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1303 [00:00<?, ?it/s]

CPU times: user 27.7 s, sys: 144 ms, total: 27.8 s
Wall time: 29.6 s


In [69]:
%%time
text_chunks=[item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

CPU times: user 252 µs, sys: 0 ns, total: 252 µs
Wall time: 255 µs


In [70]:
text_chunks_embeddings=embedding_model.encode(text_chunks,batch_size=32,convert_to_tensor=True)
text_chunks_embeddings

tensor([[ 0.0329,  0.0385, -0.0121,  ..., -0.0130, -0.0069,  0.0218],
        [ 0.0506,  0.0029, -0.0081,  ..., -0.0045, -0.0112,  0.0093],
        [ 0.0295,  0.0251, -0.0068,  ...,  0.0206, -0.0170, -0.0087],
        ...,
        [ 0.0336,  0.0173,  0.0061,  ..., -0.0271, -0.0371, -0.0292],
        [ 0.0709, -0.0490,  0.0073,  ..., -0.0166, -0.0213, -0.0243],
        [ 0.0395, -0.0063,  0.0031,  ..., -0.0223, -0.0465, -0.0305]],
       device='cuda:0')

In [72]:
random.sample(pages_and_chunks_over_min_token_len,k=1)

[{'page_number': 225,
  'sentence_chunk': 'PLoS One.\n2011; 6(6), e20456.http://dx.plos.org/10.1371/journal.pone.0020456. Accessed September 30, 2017.\n2.\nProtein: The Bottom Line. Harvard School of Public Health. The Nutrition Source.http://www.hsph.harvard.edu utritionsource/\nwhat-should-you-eat/protein/. Published 2012. Accessed September 28, 2017.',
  'chunks_char_count': 308,
  'chunks_word_count': 26,
  'chunks_token_count': 77.0,
  'embedding': array([ 4.21455465e-02,  2.75437720e-02,  3.98950418e-03, -5.03458828e-02,
          7.31119048e-03, -8.07334110e-03, -3.95533703e-02,  1.24506094e-02,
          1.67017372e-03, -3.36753461e-03,  9.36092734e-02,  1.22377882e-02,
          5.65549694e-02,  5.30069843e-02,  2.38159932e-02, -4.29929309e-02,
          6.77416974e-04,  6.43924251e-02,  4.23802622e-02,  2.22812742e-02,
         -2.35169623e-02,  1.56468451e-02, -1.64501555e-02,  1.44153899e-02,
         -5.51119214e-03,  5.43571897e-02,  4.23616879e-02, -4.11424525e-02,
     

In [75]:
text_chunks_and_embeddings_df=pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path="texts_chunks_and_embeddings.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path,index=False)

In [76]:
text_chunks_and_embeddings_df_load=pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunks_char_count,chunks_word_count,chunks_token_count,embedding
0,-39,Human Nutrition\nUNIVERSITY OF HAWAI‘I AT MĀNO...,188,22,47.0,[ 3.28588448e-02 3.85478772e-02 -1.21062482e-...
1,-38,Human Nutrition by University of Hawai‘i at Mā...,607,95,151.75,[ 5.06095067e-02 2.92800670e-03 -8.06175638e-...
2,-37,Contents\nPreface\nxi\nAbout the Contributors\...,827,75,206.75,[ 2.95152571e-02 2.50655767e-02 -6.83110673e-...
3,-36,Electrolytes Important for Fluid Balance\n111\...,1024,96,256.0,[ 5.23688234e-02 -3.01076123e-03 -7.41574401e-...
4,-35,Chapter 7. Alcohol\nIntroduction\n265\nAlcohol...,738,46,184.5,[ 4.84842025e-02 1.61825772e-02 -1.43093839e-...
