In [14]:
import fitz 
from tqdm.auto import tqdm 
import re

pdf_path = "../raw data/human-nutrition-text.pdf"

def text_formatter(text: str) -> str:

    # Replace new lines with space
    cleaned_text = text.replace("\n", " ")

    # Remove extra spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Remove special characters (optional, depending on use case)
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,;!?\'\"-]', '', cleaned_text)

    # Fix common OCR issues (e.g., replacing 'ﬁ' with 'fi')
    cleaned_text = re.sub(r'ﬁ', 'fi', cleaned_text)
    cleaned_text = re.sub(r'ﬂ', 'fl', cleaned_text)

    # Convert text to lowercase for consistency
    cleaned_text = cleaned_text.lower().strip()

    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)

        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})  # lowercase text for consistency
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts

1208it [00:00, 1240.79it/s]


[{'page_number': -41,
  'page_char_count': 28,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.0,
  'text': 'human nutrition 2020 edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -39,
  'page_char_count': 305,
  'page_word_count': 42,
  'page_sentence_count_raw': 1,
  'page_token_count': 76.25,
  'text': 'human nutrition 2020 edition university of hawaii at mnoa food science and human nutrition program alan titchenal, skylar hara, noemi arceo caacbay, william meinke-lau, ya-yun yang, marie kainoa fialkowski revilla, jennifer draper, gemady langfelder, cheryl gibby, chyna nicole chun, and allison calabrese'},
 {'page_number': -38,
  'page_char_count': 207,
  'page_word_count': 30,
  'page_sentence_count_raw': 1,
  'page_token_count': 51.75,
  'text': 'human nutrition 2020 edition by university of hawaii at mnoa food science and human 

In [10]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 768,
  'page_char_count': 1595,
  'page_word_count': 258,
  'page_sentence_count_raw': 8,
  'page_token_count': 398.75,
  'text': 'Understanding the Bigger  Picture of Dietary Guidelines  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  The first US dietary recommendations were set by the National  Academy of Sciences in 1941. The recommended dietary allowances  (RDA) were first established out of concern that America’s overseas  World War II troops were not consuming enough daily nutrients  to maintain good health. The first Food and Nutrition Board was  created in 1941, and in the same year set recommendations for the  adequate intakes of caloric energy and eight essential nutrients.  These were disseminated to officials responsible for food relief for  armed forces and civilians supporting the war effort. Since 1980,  the dietary guidelines have been reevaluated and updated every  five years by the advisory commit

### NLP Analisis

We must have in mind that Llama3 model have 4,096 tokens per interaction, therefore we must be shure that how much pages we can pass through the model for the retraival augmentation.  

In [11]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


Okay, looks like our average token count per page is 287.

For this particular use case, it meant that we can use a buch of pages as an input for the Llama3 model. We will tuk an amout of 8 pages for the retraival augmentation

In [2]:
from langchain_community.llms import Ollama

ollama = Ollama(model="llama3")
ollama.invoke("Hello, how are you?")

"Hello! I'm just a language model, so I don't have feelings or emotions like humans do. However, I'm functioning properly and ready to assist you with any questions or tasks you may have. How can I help you today?"