 **Create and Run a local RAG pipeline from scratch**

## 1. Document/text preprocessing and embedding creation

In [3]:
import os
import requests

# Local path where the textbook PDF is (or will be) stored
pdf_path = "human-nutrition-text.pdf"

# Download the PDF only when it is not already on disk
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading...")

    # Source URL of the PDF
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # The local filename to save the downloaded file to
    filename = pdf_path

    # Send a GET request to the URL. The timeout stops this cell from
    # blocking indefinitely if the server never answers or stalls.
    responce = requests.get(url, timeout=60)

    # Check if the request was successful
    if responce.status_code == 200:
        # Write the response body to disk in binary mode
        with open(filename, "wb") as file:
            file.write(responce.content)
        print(f"[INFO] The file has been download and saved as {filename}")
    else:
        print(f"[INFO] Falied to download the file . Status code {responce.status_code}")

else:
    print(f"File {pdf_path} exists.")


File human-nutrition-text.pdf exists.


In [6]:
import fitz
from tqdm import tqdm


def text_formaatter(text: str) -> str:
    """Flatten a block of text onto a single line.

    Each newline character is replaced by one space, then leading and
    trailing whitespace is stripped from the result.
    """
    # Additional clean-up steps can be chained here later.
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_and_read_pdf(pdf_path: str, page_offset: int = 41) -> list[dict]:
    """Read every page of a PDF and collect its text plus rough statistics.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Value subtracted from the zero-based page index to
            produce the stored ``page_number``. Defaults to 41 (presumably
            the number of front-matter pages in the nutrition textbook, so
            that the book's first numbered page maps to 0 -- TODO confirm).

    Returns:
        One dict per page, in document order, with the keys:
        ``page_number`` (page index minus ``page_offset``),
        ``page_char_count`` (length of the formatted text),
        ``page_word_count`` (pieces after splitting on a single space, so
        an empty page counts as 1),
        ``page_sentence_count_raw`` (pieces after splitting on ". "),
        ``page_token_count`` (character count / 4, a float estimate) and
        ``text`` (the formatted page text).
    """
    pages_and_text = []
    # The context manager closes the document once all pages are read,
    # even if extracting a page raises.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count
        # explicitly to get a real progress bar instead of a bare counter.
        for page_index, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formaatter(text=page.get_text())
            pages_and_text.append({"page_number": page_index - page_offset,
                                   "page_char_count": len(text),
                                   "page_word_count": len(text.split(" ")),
                                   "page_sentence_count_raw": len(text.split(". ")),
                                   "page_token_count": len(text) / 4,  # rule of thumb: 1 token ~= 4 characters
                                   "text": text})

    return pages_and_text

# Parse the PDF once and keep the per-page records for later cells
pages_and_text = open_and_read_pdf(pdf_path=pdf_path)
# Preview the first two page records
pages_and_text[:2]


1208it [00:01, 869.27it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [7]:
import random
# Spot-check four page records; no seed is set in this cell, so the pages shown can differ between runs
random.sample(pages_and_text , k = 4)

[{'page_number': -6,
  'page_char_count': 544,
  'page_word_count': 96,
  'page_sentence_count_raw': 4,
  'page_token_count': 136.0,
  'text': 'Christina Gar Lai Young is currently an undergraduate student  in Dietetics at the College of Tropical Agriculture and Human  Resources at the University of Hawaiʻi at Mānoa. She plans to  become a Registered Dietitian after obtaining her undergraduate  degree and completing her supervised practice in dietetics.  Staff  Ty Lim  Ty Lim is an instructional designer who received his MA in  Education and BA in Anthropology.\xa0 He is excited about helping to  create open educational resources for FSHN and CTAHR.  xxxvi  |  About the Contributors'},
 {'page_number': 1029,
  'page_char_count': 749,
  'page_word_count': 121,
  'page_sentence_count_raw': 4,
  'page_token_count': 187.25,
  'text': 'Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning acti