In [1]:
import os
import requests

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print("File doesn't exist, downloading it.....")

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    filename = pdf_path

    response = requests.get(url)

    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {filename}")

    else:
        print(f"Failed to download the file. status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


In [3]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", " ").strip()

    return cleaned_text

def open_and_read(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [4]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 9,
  'page_char_count': 1320,
  'page_word_count': 215,
  'page_sentence_count_raw': 4,
  'page_token_count': 330.0,
  'text': 'Minerals  Major Functions  Macro  Sodium  Fluid balance, nerve transmission, muscle contraction  Chloride  Fluid balance, stomach acid production  Potassium  Fluid balance, nerve transmission, muscle contraction  Calcium  Bone and teeth health maintenance, nerve transmission,  muscle contraction, blood clotting  Phosphorus  Bone and teeth health maintenance, acid-base balance  Magnesium  Protein production, nerve transmission, muscle  contraction  Sulfur  Protein production  Trace  Iron  Carries oxygen, assists in energy production  Zinc  Protein and DNA production, wound healing, growth,  immune system function  Iodine  Thyroid hormone production, growth, metabolism  Selenium  Antioxidant  Copper  Coenzyme, iron metabolism  Manganese  Coenzyme  Fluoride  Bone and teeth health maintenance, tooth decay  prevention  Chromium  Assists insulin in 

In [5]:
import pandas as pd
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [6]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


## Further text processing (splitting pages into sentences)

In [11]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

list(doc.sents)

[This is a sentence., This another sentence.]

In [14]:
for item in tqdm(pages_and_texts):
    item['sentences'] = list(nlp(item['text']).sents)

    item['sentences'] = [str(sentence) for sentence in item['sentences']]

    item["pages_sentence_count_spacy"] = len(item['sentences'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [16]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1138,
  'page_char_count': 1416,
  'page_word_count': 224,
  'page_sentence_count_raw': 15,
  'page_token_count': 354.0,
  'text': 'programs and other forms of public assistance if their income is less  than a certain percentage of the federal poverty level, along with  other qualifications.  USDA Food Assistance Programs  Government food and nutrition assistance programs that are  organized and operated by the USDA work to increase food security.  They provide low-income households with access to food, the tools  for consuming a healthy diet, and education about nutrition. The  USDA monitors the extent and severity of food insecurity via an  annual survey. This contributes to the efficiency of food assistance  programs as well as the effectiveness of private charities and other  initiatives aimed at reducing food insecurity.10  The Supplemental Nutrition Assistance Program  Formerly known as the Food Stamp Program, the Supplemental  Nutrition Assistance Program (SNAP)

In [22]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,pages_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


### Chunking our sentences together


In [25]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [26]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1065,
  'page_char_count': 1427,
  'page_word_count': 258,
  'page_sentence_count_raw': 27,
  'page_token_count': 356.75,
  'text': 'reviews of randomized clinical trials reported that on average,  obesity treatments cause weight gain.56  This additional weight gain  leads to an increase in the set point, making it more difficult for an  individual to lose weight in the future.   Others reported a 3-5 %  weight loss was possible 4 years later if participants continued all  aspects of treatment.7  For a 200 pound person, this represents a  6-10 pound weight loss.  The health benefits of this modest weight  loss are unclear and it is far less what is expected or desired when  following a diet.  In conclusion, the diet industry makes money from  a product that is proven not to work.  5. Mann, T., Tomiyama, A. J., Westling, E., Lew, A.-M.,  Samuels, B., & Chatman, J. (2007). Medicare’s search for  effective obesity treatments: Diets are not the answer.  The American Psycho

In [27]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,pages_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [28]:
import re

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item['sentence_chunks']:
        chunk_dict = {}
        chunk_dict['page_number'] = item['page_number']

        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
        chunk_dict['sentence_chunk'] = joined_sentence_chunk

        chunk_dict['chunk_char_count'] = len(joined_sentence_chunk)
        chunk_dict['chunk_word_count'] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict['chunk_token_count'] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [29]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 511,
  'sentence_chunk': 'http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=317  An interactive or media element has been excluded from this version of the text. You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=317 Dietary, Behavioral, and Physical Activity Recommendations for Weight Management | 511',
  'chunk_char_count': 311,
  'chunk_word_count': 36,
  'chunk_token_count': 77.75}]

In [30]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,745.0,115.0,186.25
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


In [31]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 3.0 | Text: Iodine | 681
Chunk token count: 11.25 | Text: Accessed March 17, 2011. 212 | Water Concerns
Chunk token count: 20.5 | Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=84  The Digestive System | 81
Chunk token count: 6.5 | Text: Fat-Soluble Vitamins | 537
Chunk token count: 24.25 | Text: view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=130  Introduction | 149


In [32]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]