In [1]:
# pip install PyMuPDF

In [2]:
# pip install tqdm

In [3]:
import fitz
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

In [5]:
pdf_path="DT.pdf"

In [6]:
# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

394it [00:00, 431.11it/s]


[{'page_number': -41,
  'page_char_count': 347,
  'page_word_count': 52,
  'page_sentence_count_raw': 2,
  'page_token_count': 86.75,
  'text': 'Page 1 / 394 Exported on: 04/04/2024 Clinical guidelines - Diagnosis and treatment manual For curative programmes in hospitals and dispensaries Guidance for prescribing \xa0 \xa0 © Médecins Sans Frontières All rights reserved for all countries. No reproduction, translation and adaptation may be done without the prior permission of the Copyright owner.'},
 {'page_number': -40,
  'page_char_count': 126,
  'page_word_count': 18,
  'page_sentence_count_raw': 3,
  'page_token_count': 31.5,
  'text': 'Page 2 / 394 ISBN Médecins Sans Frontières. Clinical guidelines - Diagnosis and treatment manual. March 2024 978-2-37585-253-8'}]

In [7]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 26,
  'page_char_count': 150,
  'page_word_count': 20,
  'page_sentence_count_raw': 1,
  'page_token_count': 37.5,
  'text': 'Page 68  / 394 Other upper respiratory tract infections \xa0 \xa0 \xa0 Laryngotracheitis and laryngotracheobronchitis (croup) Epiglottitis Bacterial tracheitis'},
 {'page_number': -39,
  'page_char_count': 511,
  'page_word_count': 71,
  'page_sentence_count_raw': 1,
  'page_token_count': 127.75,
  'text': 'Page 3 / 394 Table of contents Authors/Contributors Preface Abbreviations and acronyms Chapter 1: A few symptoms and syndromes Chapter 2: Respiratory diseases Chapter 3: Gastrointestinal disorders Chapter 4: Skin diseases Chapter 5: Eye diseases Chapter 6: Parasitic diseases Chapter 7: Bacterial diseases Chapter 8: Viral diseases Chapter 9: Genito-urinary diseases Chapter 10: Medical and minor surgical procedures Chapter 11: Mental disorders in adults Chapter 12: Other conditions Appendices Main references'},
 {'page_number': 318,
  'page_cha

In [8]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,347,52,2,86.75,Page 1 / 394 Exported on: 04/04/2024 Clinical ...
1,-40,126,18,3,31.5,Page 2 / 394 ISBN Médecins Sans Frontières. Cl...
2,-39,511,71,1,127.75,Page 3 / 394 Table of contents Authors/Contrib...
3,-38,1301,165,4,325.25,Page 4 / 394 Authors/Contributors The Clin...
4,-37,1697,263,13,424.25,Page 5 / 394 Preface This guide is designe...


In [9]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,394.0,394.0,394.0,394.0,394.0
mean,155.5,1688.52,256.39,12.91,422.13
std,113.88,917.47,142.92,8.07,229.37
min,-41.0,15.0,5.0,1.0,3.75
25%,57.25,886.25,134.5,6.0,221.56
50%,155.5,1814.0,272.5,13.0,453.5
75%,253.75,2463.5,372.0,18.0,615.88
max,352.0,3724.0,608.0,42.0,931.0


In [10]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [11]:
# pip install spacy

In [12]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 394/394 [00:00<00:00, 438.56it/s]


In [13]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'page_number': 311,
  'page_char_count': 1273,
  'page_word_count': 203,
  'page_sentence_count_raw': 11,
  'page_token_count': 318.25,
  'text': 'Page 353  / 394 Insomnia Last updated: November 2021 \xa0 Complaints may be: difficulty falling or remaining asleep, waking up too early in the morning, nightmares, or fatigue. Symptoms occur at least three times a week for at least one month. Management If insomnia is related to an organic cause, treat the cause (e.g. administer analgesics for pain). \xa0 If insomnia is related to the use of alcohol, drugs or a medication , management depends on the substance involved. \xa0 If insomnia is related to a particular life event (e.g. bereavement), a short term treatment with a sedative may be useful: promethazine\xa0PO: 25 mg once daily at bedtime for 7 to 10 days or, if promethazine is not available, hydroxyzine PO: 25 mg once daily at bedtime for 7 to 10 days or, as a last resort (risk of addiction), diazepam\xa0PO: 2 to 5 mg once daily\xa0a

In [14]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,394.0,394.0,394.0,394.0,394.0,394.0
mean,155.5,1688.52,256.39,12.91,422.13,12.96
std,113.88,917.47,142.92,8.07,229.37,7.99
min,-41.0,15.0,5.0,1.0,3.75,1.0
25%,57.25,886.25,134.5,6.0,221.56,6.0
50%,155.5,1814.0,272.5,13.0,453.5,13.0
75%,253.75,2463.5,372.0,18.0,615.88,19.0
max,352.0,3724.0,608.0,42.0,931.0,41.0


In [15]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 394/394 [00:00<00:00, 3129840.48it/s]


In [16]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': 217,
  'page_char_count': 1617,
  'page_word_count': 245,
  'page_sentence_count_raw': 14,
  'page_token_count': 404.25,
  'sentences': ['Page 259  / 394 Dengue Last update: October 2022 \xa0 \xa0 Dengue fever is an arbovirus transmitted to humans by the bite of a mosquito (Aedes).',
   'Transmission by transfusion of contaminated blood and transplacental transmission to the foetus have also been reported.',
   'Four different serotypes of dengue have been described.',
   'Infection with one serotype provides a lifelong immunity to that specific serotype, but only partial, short-term immunity to other serotypes.',
   'There is no specific antiviral treatment.',
   '\xa0 Dengue is a mainly urban disease, present in tropical and subtropical regions , in particular in Asia, Central and South America and the Caribbean.',
   'Outbreaks have been described in Eastern Africa.',
   '\xa0 Primary infection may be asymptomatic or present as mild or occasionally severe dengue fev

In [17]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,394.0,394.0,394.0,394.0,394.0,394.0,394.0
mean,155.5,1688.52,256.39,12.91,422.13,12.96,1.79
std,113.88,917.47,142.92,8.07,229.37,7.99,0.78
min,-41.0,15.0,5.0,1.0,3.75,1.0,1.0
25%,57.25,886.25,134.5,6.0,221.56,6.0,1.0
50%,155.5,1814.0,272.5,13.0,453.5,13.0,2.0
75%,253.75,2463.5,372.0,18.0,615.88,19.0,2.0
max,352.0,3724.0,608.0,42.0,931.0,41.0,5.0


In [18]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

100%|██████████| 394/394 [00:00<00:00, 33219.20it/s]


706

In [19]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 329,
  'sentence_chunk': 'Page 371 / 394 Children ≥ 40 kg and adults: Ratio 8:1: 3000 mg daily (2 tab of 500/62.5 mg 3 times daily) Ratio 7:1: 2625 mg daily (1 tab of 875/125 mg 3 times\xa0daily) Patients over 2 years without acute anaemia can continue treatment as outpatients. Patients under 2 years or with acute anaemia or who cannot be monitored and treated at home by their family should complete PO antibiotherapy in hospital. Acute severe haemolysis Aplastic crisis Splenic sequestration \xa0 Note: splenectomy is contra-indicated (high operative mortality). Stroke If the patient is not improving, continue ceftriaxone until the patient is afebrile, then, change to PO treatment. Monitor for acute anaemia. Admit to hospital. Treat malaria if present. Transfuse packed red blood cells if Hb < 5 g/dl or drop of 2 g/dl below the patient’s baseline. Target a Hb level of 9 g/dl. d e \xa0 Start with 10 to 15 ml/kg in 3 to 4 hours.',
  'chunk_char_count': 902,
  'chunk_word_co

In [20]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,706.0,706.0,706.0,706.0
mean,156.18,939.62,140.88,234.91
std,114.99,558.89,87.89,139.72
min,-41.0,5.0,1.0,1.25
25%,55.0,460.25,66.0,115.06
50%,155.5,959.0,143.0,239.75
75%,256.0,1324.0,200.75,331.0
max,352.0,2782.0,436.0,695.5


In [21]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 28.0 | Text: No 8, 2018, 93, 73–96.https://www.who.int/immunization/policy/position_papers/bcg/en/ [Accessed 21 October 2019]
Chunk token count: 2.75 | Text: Management:
Chunk token count: 27.0 | Text: Page 132 / 394 Bacterial skin infections         Impetigo Furuncles and carbuncles Erysipelas and cellulitis
Chunk token count: 14.5 | Text: Close contacts: isoniazid preventive therapy for 6 months.
Chunk token count: 20.5 | Text: Neurological Aetiologies: History and clinical Positive malaria test: see Malaria,


In [22]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -41,
  'sentence_chunk': 'Page 1 / 394 Exported on: 04/04/2024 Clinical guidelines - Diagnosis and treatment manual For curative programmes in hospitals and dispensaries Guidance for prescribing \xa0 \xa0 © Médecins Sans Frontières All rights reserved for all countries. No reproduction, translation and adaptation may be done without the prior permission of the Copyright owner.',
  'chunk_char_count': 347,
  'chunk_word_count': 52,
  'chunk_token_count': 86.75},
 {'page_number': -40,
  'sentence_chunk': 'Page 2 / 394 ISBN Médecins Sans Frontières. Clinical guidelines - Diagnosis and treatment manual. March 2024 978-2-37585-253-8',
  'chunk_char_count': 126,
  'chunk_word_count': 18,
  'chunk_token_count': 31.5}]

In [23]:
# pip install sentence-transformers

In [24]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cpu") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")


Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07982454e-02  3.03164795e-02 -2.01217998e-02  6.86484873e-02
 -2.55256053e-02 -8.47688317e-03 -2.07198813e-04 -6.32377788e-02
  2.81607024e-02 -3.33353691e-02  3.02634370e-02  5.30721545e-02
 -5.03526777e-02  2.62288805e-02  3.33313681e-02 -4.51577641e-02
  3.63044813e-02 -1.37119333e-03 -1.20170908e-02  1.14947148e-02
  5.04510887e-02  4.70856912e-02  2.11913809e-02  5.14606573e-02
 -2.03746818e-02 -3.58889475e-02 -6.67780987e-04 -2.94393767e-02
  4.95859310e-02 -1.05639622e-02 -1.52014000e-02 -1.31759909e-03
  4.48197350e-02  1.56023223e-02  8.60379259e-07 -1.21392915e-03
 -2.37978622e-02 -9.09366121e-04  7.34487362e-03 -2.53930339e-03
  5.23370504e-02 -4.68043424e-02  1.66214965e-02  4.71579507e-02
 -4.15599123e-02  9.01957566e-04  3.60278077e-02  3.42214517e-02
  9.68227386e-02  5.94829172e-02 -1.64984465e-02 -3.51248831e-02
  5.92511427e-03 -7.07933388e-04 -2.410

In [25]:
# pip install tf-keras

In [26]:
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

Sentence: Yo! How cool are embeddings?
Embedding:
[-1.97447948e-02 -4.51075844e-03 -4.98485751e-03  6.55444860e-02
 -9.87673178e-03  2.72836369e-02  3.66426371e-02 -3.30219069e-03
  8.50076228e-03  8.24953150e-03 -2.28498019e-02  4.02430296e-02
 -5.75200543e-02  6.33692220e-02  4.43207324e-02 -4.49506566e-02
  1.25284549e-02 -2.52011698e-02 -3.55292819e-02  1.29559152e-02
  8.67022853e-03 -1.92917623e-02  3.55634978e-03  1.89505499e-02
 -1.47128273e-02 -9.39846318e-03  7.64178485e-03  9.62185580e-03
 -5.98921441e-03 -3.90169099e-02 -5.47824912e-02 -5.67457452e-03
  1.11644482e-02  4.08067219e-02  1.76319122e-06  9.15305223e-03
 -8.77257995e-03  2.39382796e-02 -2.32784394e-02  8.04999769e-02
  3.19176950e-02  5.12598967e-03 -1.47708310e-02 -1.62525158e-02
 -6.03213087e-02 -4.35689762e-02  4.51211743e-02 -1.79053359e-02
  2.63366643e-02 -3.47867049e-02 -8.89174361e-03 -5.47675528e-02
 -1.24372616e-02 -2.38606837e-02  8.33496824e-02  5.71241863e-02
  1.13328351e-02 -1.49595067e-02  9.2037

In [27]:
%%time

# Send the model to the GPU
embedding_model.to("cuda") # requires a GPU installed, for reference on my local machine, I'm using a NVIDIA RTX 4090

# Create embeddings one by one on the GPU
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 655/655 [00:30<00:00, 21.39it/s]

CPU times: total: 3min 4s
Wall time: 30.9 s





In [28]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [29]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: total: 27.1 s
Wall time: 28.4 s


tensor([[ 0.0417, -0.0506,  0.0163,  ..., -0.0141, -0.0438, -0.0239],
        [ 0.0271, -0.0526,  0.0169,  ..., -0.0097, -0.0154, -0.0099],
        [ 0.0285,  0.0049,  0.0172,  ...,  0.0116,  0.0287, -0.0495],
        ...,
        [ 0.0478, -0.0780,  0.0040,  ...,  0.0007, -0.0786,  0.0087],
        [ 0.0259, -0.1053,  0.0016,  ..., -0.0130, -0.0834, -0.0103],
        [ 0.0315, -0.0328,  0.0055,  ...,  0.0077, -0.0650, -0.0279]],
       device='cuda:0')

In [30]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [31]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-41,Page 1 / 394 Exported on: 04/04/2024 Clinical ...,347,52,86.75,[ 4.17054705e-02 -5.05610220e-02 1.63177233e-...
1,-40,Page 2 / 394 ISBN Médecins Sans Frontières. Cl...,126,18,31.5,[ 2.70563308e-02 -5.25943972e-02 1.68960840e-...
2,-39,Page 3 / 394 Table of contents Authors/Contrib...,511,71,127.75,[ 2.85286382e-02 4.88796597e-03 1.72483865e-...
3,-38,Page 4 / 394 Authors/Contributors The Clin...,1298,162,324.5,[ 5.59403449e-02 -3.28842252e-02 1.94522981e-...
4,-37,Page 5 / 394 Preface This guide is designe...,1217,186,304.25,[ 4.81360741e-02 -8.10960084e-02 1.82636604e-...


In [32]:
import random

import torch
import numpy as np 
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([655, 768])

In [1]:
text_chunks_and_embedding_df.head()

NameError: name 'text_chunks_and_embedding_df' is not defined