In [2]:
import torch

In [3]:
!nvidia-smi

Sat Mar 30 14:09:50 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.86                 Driver Version: 551.86         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   45C    P8             18W /  200W |     729MiB /   8192MiB |     24%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import os
import requests

# Get PDF document path

pdf_path = "human-nutrition-text.pdf"

# Download

if not os.path.exists(pdf_path):
    print(f"[INFO] PDF file doesn't exist, download...");

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # The Local filename to save the downloaded file

    filename = pdf_path
    response = requests.get(url);

    if response.status_code == 200:
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been dowloaded and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists")    



File human-nutrition-text.pdf exists


In [5]:
import fitz # requires: !pip install PyMuPDF
from tqdm.auto import tqdm # pip install tqdm

def text_formatter(text: str) -> str:
    """Performs mino formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()

    # Potentially more text formatting functions can go here
    return cleaned_text;

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~4 characters
                                "text": text})
        
    return pages_and_texts;

pages_and_texts = open_and_read_pdf(pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [6]:
import random

random.sample(pages_and_texts, k = 3)

[{'page_number': 720,
  'page_char_count': 2308,
  'page_word_count': 430,
  'page_sentence_count_raw': 17,
  'page_token_count': 577.0,
  'text': 'crackers equals one serving and that the whole box contains 10  servings. All other values listed thereafter, from the calories to the  dietary fiber, are based on this one serving. On the panel, the  serving size is followed by the number of calories and then a list  of selected nutrients. You will also see “Percent Daily Value” on the  far right-hand side. This helps you determine if the food is a good  source of a particular nutrient or not.  The Daily Value (DV) represents the recommended amount of  a given nutrient based on the RDA of that nutrient in a  2,000-kilocalorie diet. The DV was updated as part of the new  nutrition facts label announced in May 2016. The updated DV applies  to packaged food, beverages and dietary supplements that contain  ingredients with a DV. The percentage of Daily Value (percent DV)  represents the propor

In [7]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [8]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


In [9]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")


for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    item["page_sentence_count_spacy"] = len(item["sentences"])



  0%|          | 0/1208 [00:00<?, ?it/s]

In [10]:
random.sample(pages_and_texts, k=1)

[{'page_number': 33,
  'page_char_count': 755,
  'page_word_count': 126,
  'page_sentence_count_raw': 5,
  'page_token_count': 188.75,
  'text': 'Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.  These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=57    Achieving a Healthy Diet  |  33',
  'sentences': ['Learning Activities  Technology Note: The second edition of the Human  Nutritio

In [11]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


In [12]:
num_sentence_chunk_size = 10

def split_list(input_list: list[str], slice_size:int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

split_list(list(range(25)))

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [13]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [14]:
random.sample(pages_and_texts, k=1)

[{'page_number': 950,
  'page_char_count': 1695,
  'page_word_count': 302,
  'page_sentence_count_raw': 13,
  'page_token_count': 423.75,
  'text': 'Fuel Sources  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  The human body uses carbohydrate, fat and protein in food and  from body stores as energy. These essential nutrients are needed  regardless of the intensity of activity you are doing. If you are lying  down reading a book or running the the Honolulu Marathon, these  macronutrients are always needed in the body. However, in order  for these nutrients to be used as fuel for the body, their energy must  be transferred into the high energy molecule known as Adenosine  Triphosphate (ATP). ATP is the body’s immediate fuel source of  energy that can be generated either with the presences of oxygen  known as aerobic metabolism or without the presence of oxygen  by anaerobic metabolism. The type of metabolism that is  predominately us

In [15]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [16]:
import re

pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)


  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [17]:
random.sample(pages_and_chunks, k = 1)

[{'page_number': 176,
  'sentence_chunk': 'Sodium UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM Sodium is vital not only for maintaining fluid balance but also for many other essential functions. In contrast to many minerals, sodium absorption in the small intestine is extremely efficient and in a healthy individual all excess sodium is excreted by the kidneys. In fact, very little sodium is required in the diet (about 200 milligrams) because the kidneys actively reabsorb sodium. Kidney reabsorption of sodium is hormonally controlled, allowing for a relatively constant sodium concentration in the blood. Other Functions of Sodium in the Body The second notable function of sodium is in nerve impulse transmission. Nerve impulse transmission results from the transport of sodium cations into a nerve cell, which creates a charge difference (or voltage) between the nerve cell and its extracellular environment. Similar to how a current move

In [18]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,745.0,115.0,186.25
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


In [19]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 25.5 | Text: http://www.ajcn.org/cgi/ pmidlookup?view=long&pmid=10197575. Accessed October 6, 2017. 640 | Magnesium
Chunk token count: 19.5 | Text: Honor your health – gentle nutrition    Calories In Versus Calories Out | 1075
Chunk token count: 16.5 | Text: Table 4.6 Sweeteners Carbohydrates and Personal Diet Choices | 281
Chunk token count: 29.5 | Text: Water As a Temperature Regulator Another homeostatic function of the body, termed 158 | Water’s Importance to Vitality
Chunk token count: 20.75 | Text: Centers for Disease Control and Prevention.http://www.cdc.gov/nutrition/ Iron | 661


In [20]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [21]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 890,
  'sentence_chunk': 'As during puberty, growth and development during adolescence differs in males than in females. In teenage girls, fat assumes a larger percentage of body weight, while teenage boys experience greater muscle and bone increases. For both, primary and secondary sex characteristics have fully developed and the rate of growth slows with the end of puberty. Also, the motor functions of an older adolescent are comparable to those of an adult.2 Again, adequate nutrition and healthy choices support this stage of growth and development. 2. Polan EU, Taylor DR. (2003). Journey Across the Life Span: Human Development and Health Promotion. Philadelphia: F. A. Davis Company, 171–173. 890 | Late Adolescence',
  'chunk_char_count': 700,
  'chunk_word_count': 109,
  'chunk_token_count': 175.0}]

In [22]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",device="cpu")

sentences = ["The Sentence Transformer library provides an easy way to create embeddings", "Sentences can be embeedded one by one or in a list.", "I like horses!"]

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print("")

Sentence: The Sentence Transformer library provides an easy way to create embeddings
Embedding: [-3.17511596e-02  3.37268598e-02 -2.52437927e-02  5.22287898e-02
 -2.35248730e-02 -6.19118195e-03  1.35026434e-02 -6.25500381e-02
  7.50828721e-03 -2.29684077e-02  2.98146289e-02  4.57554311e-02
 -3.26700546e-02  1.39847202e-02  4.18013670e-02 -5.92970550e-02
  4.26309966e-02  5.04661538e-03 -2.44552549e-02  3.98591766e-03
  3.55897881e-02  2.78741904e-02  1.84098370e-02  3.67700197e-02
 -2.29961090e-02 -3.01796645e-02  5.99534018e-04 -3.64503190e-02
  5.69104515e-02 -7.49939587e-03 -3.70004438e-02 -3.04354634e-03
  4.64353710e-02  2.36151251e-03  9.06849380e-07  7.00034108e-03
 -3.92289385e-02 -5.95695619e-03  1.38653023e-02  1.87109411e-03
  5.34202047e-02 -6.18613847e-02  2.19613463e-02  4.86050434e-02
 -4.25697602e-02 -1.69858932e-02  5.04178889e-02  1.54734347e-02
  8.12859908e-02  5.07106893e-02 -2.27497071e-02 -4.35721017e-02
 -2.18390999e-03 -2.14092191e-02 -2.01757699e-02  3.0683280

In [23]:
embeddings[0].shape

(768,)

In [24]:
embedding = embedding_model.encode('My favourite animal is the cow!')
embedding


array([-1.45472819e-02,  7.66727403e-02, -2.85872445e-02, -3.31283845e-02,
        3.65210250e-02,  4.78570461e-02, -7.08107427e-02,  1.62833892e-02,
        1.93443913e-02, -2.80482098e-02, -2.91747078e-02,  5.11309430e-02,
       -3.28720659e-02, -8.98755156e-03, -1.03672454e-02, -3.15488279e-02,
        4.22783792e-02, -9.13278013e-03, -1.94017179e-02,  4.35688719e-02,
       -2.31997631e-02,  4.29883003e-02, -1.72393788e-02, -2.01372579e-02,
       -3.13573964e-02,  8.08166154e-03, -2.06724983e-02, -2.27869563e-02,
        2.44812742e-02,  1.71968136e-02, -6.26672879e-02, -7.54796639e-02,
        3.57421972e-02, -5.46570402e-03,  1.24730320e-06, -7.63201574e-03,
       -3.53222154e-02,  1.91327240e-02,  3.99045683e-02,  2.11734185e-03,
        1.64566208e-02,  9.84051824e-03, -1.80700570e-02,  9.33829602e-03,
        3.23483571e-02,  5.84785677e-02,  4.23187464e-02,  1.62091944e-02,
       -9.14911404e-02,  1.82305165e-02, -5.25728893e-03, -7.81024154e-03,
       -3.47644500e-02, -

In [25]:
# %%time

# embedding_model.to("cpu")

# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"]  = embedding_model.encode(item["sentence_chunk"])

In [26]:
%%time

embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"]  = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: total: 1min 53s
Wall time: 19.2 s


In [28]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
text_chunks[419]

len(text_chunks)

CPU times: total: 0 ns
Wall time: 0 ns


1680

In [31]:
%%time

text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)

text_chunk_embeddings

CPU times: total: 55.9 s
Wall time: 12.1 s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

In [32]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"

text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [33]:
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242377e-02 9.02281404e-02 -5.09549072e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156344e-02 5.92139550e-02 -1.66167226e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5,[ 2.79802009e-02 3.39813977e-02 -2.06426531e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25,[ 6.82566836e-02 3.81274670e-02 -8.46855342e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264531e-02 -8.49766750e-03 9.57159698e-...
