In [26]:
import pdfplumber

### Extract the text of the book's pages into a list

In [27]:
text = []

# Read every page of the PDF and collect the text of each page for which
# extract_text() returned something truthy (pages with no text are skipped).
with pdfplumber.open("Boericke_materia_medica.pdf") as pdf:
    extracted = (page.extract_text() for page in pdf.pages)
    text.extend(page_text for page_text in extracted if page_text)


In [33]:
# Number of pages for which text extraction returned non-empty text.
len(text)

596

In [30]:
# Print the last ten extracted pages, each followed by a separator rule.
for item in text[-10:]:
    print(item, "-----------------", sep="\n")

William Boericke Homoeopathic Materia Medica
Marsh-Tea.....................................335MOSCHUS.................37, 82, 155, 377
May-apple......................................442Mountain Grape.............................107
Meadow Parsnip.............................575Mucuna.................................220, 488
Medicago Sativa...............................27Mugwort..........................................78
Medical Lake..................................500Mullein..................................552, 561
MEDORRHINUM.............................359MUREX PURPUREA.........................379
Medusa.................................360, 553Muriate of Magnesia.......................352
MEL CUM SALE...............................361Muriatic Acid............................40, 378
Melilot, Yellow................................361MURIATICUM ACIDUM...........378, 398
MELILOTUS OFFICINALIS...............361Musk......................25, 377, 404, 524
MENISPERMUM CANADENSE..........

### Remove the cover, the back pages, and any other page that lacks the running header and a page number

Every content page is guaranteed to contain at least the running header and a page number.

In [31]:
def remove_non_material_pages(pages, header="William Boericke Homoeopathic Materia Medica"):
    """Keep only the pages whose first non-blank line is the running header.

    Args:
        pages: Iterable of page texts (strings). Falsy entries (``None`` or
            ``""``) are skipped.
        header: Exact text the first non-blank, stripped line of a page must
            equal for the page to be kept. Defaults to the running header
            used by this book, so existing calls behave as before.

    Returns:
        A list of the retained pages, each with leading and trailing
        whitespace stripped, in their original order.
    """
    retained_pages = []

    for page in pages:
        if not page:
            continue

        page_text = page.strip()
        lines = [line.strip() for line in page_text.split("\n") if line.strip()]

        # A page needs at least two non-blank lines (header plus something
        # else) to be considered; a lone header line is discarded.
        if len(lines) < 2:
            continue

        if lines[0] != header:
            continue

        retained_pages.append(page_text)

    return retained_pages

In [32]:
# Keep only pages that begin with the running header, then report how many remain.
material_pages = remove_non_material_pages(text)

print(len(material_pages))

593


In [34]:
# Inspect the first ten retained pages.
material_pages[:10]

['William Boericke Homoeopathic Materia Medica\nPREFACE TO THE NINTH EDITION\nWilliam Boericke\n1849-1929, US\nIn preparing the ninth edition of this work, I have followed the lines laid out for\nall the previous editions, namely, to present in a condensed form the\nhomœopathic Materia Medica for practical use.\nThe book contains the well known verified characteristic symptoms of all our\nmedicines besides other less important symptoms aiding the selection of the\ncurative remedy, All the new medicines and essentials of the published clinical\nexperience of the school have been added. In its present compact form it contains\nthe maximum number of reliable Materia Medica facts in the minimum space.\nI have tried to give a succinct resume of the symptomatology of every medicine\nused in Homœopathy, including also clinical suggestions of many drugs so far\nnot yet based on provings, thus offering the opportunity to experiment with\nthese and by future provings discover their distinctive u

### Remove header and page no from all pages

In [35]:
def clean_pages_header_page_num(page_text):
    """Return the page text without its first and last non-blank lines.

    The page is split on newlines, each line is stripped, and blank lines are
    discarded. The first and last remaining lines are then dropped (the
    notebook treats these as the running header and the page number) and the
    rest is re-joined with newlines.

    Returns an empty string when the page has fewer than three non-blank
    lines, i.e. when nothing sits between the first and last line.
    """
    stripped = (raw.strip() for raw in page_text.split("\n"))
    non_blank = [entry for entry in stripped if entry]

    # With fewer than three lines there is no body to keep.
    body = non_blank[1:-1] if len(non_blank) >= 3 else []
    return "\n".join(body)



In [36]:
# Strip the first and last lines from every retained page and drop pages that
# are empty afterwards. The cleaner is applied once per page; the result is
# reused for both the emptiness check and the stored value.
cleaned_pages = [
    cleaned
    for cleaned in map(clean_pages_header_page_num, material_pages)
    if cleaned
]
print(len(cleaned_pages))

592


In [37]:
# Inspect the last ten cleaned pages.
cleaned_pages[-10:]

["IGNATIA AMARA............................290Jerusalem Oak...............................165\nIkshugandha.........................124, 544JOANESIA ASOCA...........................304\nILEX AQUIFOLIUM..........................292Juglans cinerea...........................304p.\nIllicium............................................50JUGLANS REGIA.............................305\nIndian Tobacco..............................342JUNCUS EFFUSUS...........................305\nIndigo.......................12, 92, 236, 293Juniper Berries...............................305\nINDIGO TINCTORIA.......................293JUNIPERUS COMMUNIS..................305\nIndigo, Wild.....................................92Justicia adhatoda...................306, 499\nIndium..........................................292KALIUM ARSENICOSUM..................306\nINDIUM METALLICUM....................292KALIUM BICHROMICUM..................307\nINDOLUM......................................293KALIUM BROMATUM................

In [38]:
# Inspect the cleaned pages at indices 10 through 14.
cleaned_pages[10:15]

["Fever.––Cold stage most marked. Cold sweat and icy coldness of face. Coldness\nand heat alternate. Evening chilliness soon after going to bed. Cold waves pass\nthrough him. Thirst and restlessness always present. Chilly if uncovered or\ntouched. Dry heat, red face. Most valuable febrifuge with mental anguish,\nrestlessness, etc. Sweat drenching, on parts lain on; relieving all symptoms.\nModalities.––Better in open air; worse in warm room, in evening and night;\nworse lying on affected side, from music, from tobacco-smoke, dry, cold winds.\nVinegar in large doses is antidotal to poisonous effects.\nRelationship.––Acids, wine and coffee, lemonade, and acid fruits modify its\naction.\nNot indicated in malarial and low fevers or hectic and pyæmic conditions, and in\ninflammations when they localize themselves. Sulphur often follows it. Compare\nCham and Coffea in intense pain and sleeplessness.\nAgrostis acts like Acon in fever and inflammations, also Spiranthes.\nComplementary: Coffea;

### Filter out the appendix/index pages based on dot density (their entries use long dot leaders)

In [39]:
def dot_density(text):
    """Return the fraction of characters in ``text`` that are '.'.

    An empty ``text`` yields 0.0: the denominator is clamped to 1 so the
    division never fails.
    """
    total_chars = len(text) or 1  # clamp to 1 to avoid dividing by zero
    return text.count('.') / total_chars

In [40]:
import numpy as np

In [41]:
# Dot density of every cleaned page, as a float64 array in page order.
dot_density_arr = np.array(
    list(map(dot_density, cleaned_pages)),
    dtype=np.float64,
)

In [42]:
# Summary statistics of the per-page dot density; the percentile grid is
# denser near the top so that a jump in the upper tail is visible.
mean, median = np.mean(dot_density_arr), np.median(dot_density_arr)
percentiles = np.percentile(
    dot_density_arr, [1, 5, 10, 25, 50, 75, 90, 95, 96, 97, 98, 99]
)

mean, median, percentiles

(np.float64(0.04322570013443084),
 np.float64(0.02572443889458518),
 array([0.01127795, 0.01883601, 0.0203372 , 0.02331373, 0.02572444,
        0.02855898, 0.03143036, 0.03421888, 0.03521837, 0.53648153,
        0.56426483, 0.58883302]))

### Dot density jumps sharply between the 96th (≈0.035) and 97th (≈0.54) percentiles, so 0.2 is a safe threshold

In [43]:
threshold = 0.2

# Keep only the pages whose dot density is strictly below the threshold.
cleaned_pages = list(
    filter(lambda page: dot_density(page) < threshold, cleaned_pages)
)
print(len(cleaned_pages))

573


In [44]:
# Inspect the first ten pages that remain after the dot-density filter.
cleaned_pages[:10]

['PREFACE TO THE NINTH EDITION\nWilliam Boericke\n1849-1929, US\nIn preparing the ninth edition of this work, I have followed the lines laid out for\nall the previous editions, namely, to present in a condensed form the\nhomœopathic Materia Medica for practical use.\nThe book contains the well known verified characteristic symptoms of all our\nmedicines besides other less important symptoms aiding the selection of the\ncurative remedy, All the new medicines and essentials of the published clinical\nexperience of the school have been added. In its present compact form it contains\nthe maximum number of reliable Materia Medica facts in the minimum space.\nI have tried to give a succinct resume of the symptomatology of every medicine\nused in Homœopathy, including also clinical suggestions of many drugs so far\nnot yet based on provings, thus offering the opportunity to experiment with\nthese and by future provings discover their distinctive use and so enlarging our\narmamentarium.\nI am 

In [45]:
# Inspect the last ten pages that remain after the dot-density filter.
cleaned_pages[-10:]

['over. Compressing pain in feet.\nModalities.––Worse, winter, cold, stormy weather; in bed. Movement; lying on\nleft side.\nRelationship.––Compare: Secale: Convallar; Bry; Puls; Rhodod. Guipsine -active\nprinciple––(exalts the hypotensive properties of Viscum). Hedera Helix-Ivy––\n(Intercranial pressure).\nDose.––Tincture and lower potencies.\nWYETHIA HELENOIDES\nPoison-weed\nHas marked effects on the throat, and has proven an excellent remedy in\npharyngitis, especially the follicular form. Irritable throats of singers and public\nspeakers. Useful also in hæmorrhoids. Hay-fever symptoms; itching in posterior\nnares.\nHead.––Nervous, uneasy, depressed. Dizzy. Rush of blood to head. Sharp pain\nin forehead.\nMouth.––Feels as if scalded; sensation of heat down œsophagus. Itching of the\npalate.\nThroat.––Constant clearing and hemming. Dry, posterior nares; no relief from\nclearing. Throat feels swollen; epiglottis dry and burning. Difficult swallowing.\nConstant desire to swallow saliva

### Remove the preface pages (everything before the first remedy entry)

In [46]:
# Drop the preface, i.e. every page before the first remedy entry. The start
# page is located by its heading rather than by slicing a fixed number of
# pages, so re-running this cell is a no-op instead of silently removing more
# pages each time.
FIRST_REMEDY_HEADING = "ABIES CANADENSIS"

first_remedy_idx = next(
    (
        idx
        for idx, page in enumerate(cleaned_pages)
        if page.startswith(FIRST_REMEDY_HEADING)
    ),
    None,
)
if first_remedy_idx is None:
    raise ValueError(f"No page starts with {FIRST_REMEDY_HEADING!r}")

cleaned_pages = cleaned_pages[first_remedy_idx:]


In [48]:
# Inspect the first ten pages after the preface pages were removed.
cleaned_pages[:10]

["ABIES CANADENSIS\nPinus canadensis\nHemlock Spruce\nMucous membranes are affected by Abies can and gastric symptoms are most\nmarked, and a catarrhal condition of the stomach is produced. There are peculiar\ncravings and chilly sensations that are very characteristic, especially for women\nwith uterine displacement, probably due to defective nutrition with debility.\nRespiration and heart action labored. Wants to lie down all the time; skin cold\nand clammy, hands cold; very faint. Right lung and liver feel small and hard.\nGleet.\nHead.––Feels light-headed, tipsy. Irritable.\nStomach.––Canine hunger with torpid liver. Gnawing, hungry, faint feeling at\nthe epigastrium. Great appetite, craving for meat, pickles, radishes, turnips,\nartichokes, coarse food. Tendency to eat far beyond capacity for digestion.\nBurning and distention of stomach and abdomen with palpitation. Flatulence\ndisturbs the heart's action. Pain in right shoulder-blade, and constipation, with\nburning in rectum.\n

In [47]:
# Page count after the preface pages were removed.
len(cleaned_pages)

571

In [49]:
# Show the first remaining page in full.
cleaned_pages[0]

"ABIES CANADENSIS\nPinus canadensis\nHemlock Spruce\nMucous membranes are affected by Abies can and gastric symptoms are most\nmarked, and a catarrhal condition of the stomach is produced. There are peculiar\ncravings and chilly sensations that are very characteristic, especially for women\nwith uterine displacement, probably due to defective nutrition with debility.\nRespiration and heart action labored. Wants to lie down all the time; skin cold\nand clammy, hands cold; very faint. Right lung and liver feel small and hard.\nGleet.\nHead.––Feels light-headed, tipsy. Irritable.\nStomach.––Canine hunger with torpid liver. Gnawing, hungry, faint feeling at\nthe epigastrium. Great appetite, craving for meat, pickles, radishes, turnips,\nartichokes, coarse food. Tendency to eat far beyond capacity for digestion.\nBurning and distention of stomach and abdomen with palpitation. Flatulence\ndisturbs the heart's action. Pain in right shoulder-blade, and constipation, with\nburning in rectum.\nF

### All text

In [50]:
# Concatenate all remaining pages into a single string, separated by newlines.
full_final_text = "\n".join(cleaned_pages)

In [51]:
# Preview only the beginning of the joined text; echoing the whole string
# would store the entire book in this cell's output.
full_final_text[:1000]

'ABIES CANADENSIS\nPinus canadensis\nHemlock Spruce\nMucous membranes are affected by Abies can and gastric symptoms are most\nmarked, and a catarrhal condition of the stomach is produced. There are peculiar\ncravings and chilly sensations that are very characteristic, especially for women\nwith uterine displacement, probably due to defective nutrition with debility.\nRespiration and heart action labored. Wants to lie down all the time; skin cold\nand clammy, hands cold; very faint. Right lung and liver feel small and hard.\nGleet.\nHead.––Feels light-headed, tipsy. Irritable.\nStomach.––Canine hunger with torpid liver. Gnawing, hungry, faint feeling at\nthe epigastrium. Great appetite, craving for meat, pickles, radishes, turnips,\nartichokes, coarse food. Tendency to eat far beyond capacity for digestion.\nBurning and distention of stomach and abdomen with palpitation. Flatulence\ndisturbs the heart\'s action. Pain in right shoulder-blade, and constipation, with\nburning in rectum.\n

In [52]:
import re

In [53]:
# Re-flow hard-wrapped lines: a newline directly followed by a lowercase
# letter is treated as a mid-sentence wrap and replaced with a space.
# str.islower() is used rather than the ASCII class [a-z] so continuation
# lines that begin with a non-ASCII lowercase letter (e.g. "œsophagus") are
# joined as well. Every other newline is left unchanged.
full_final_text = re.sub(
    r'\n(?=(\S))',
    lambda m: ' ' if m.group(1).islower() else m.group(0),
    full_final_text,
)


In [54]:
# Preview only the beginning of the re-flowed text; echoing the whole string
# would store the entire book in this cell's output.
full_final_text[:1000]

'ABIES CANADENSIS\nPinus canadensis\nHemlock Spruce\nMucous membranes are affected by Abies can and gastric symptoms are most marked, and a catarrhal condition of the stomach is produced. There are peculiar cravings and chilly sensations that are very characteristic, especially for women with uterine displacement, probably due to defective nutrition with debility.\nRespiration and heart action labored. Wants to lie down all the time; skin cold and clammy, hands cold; very faint. Right lung and liver feel small and hard.\nGleet.\nHead.––Feels light-headed, tipsy. Irritable.\nStomach.––Canine hunger with torpid liver. Gnawing, hungry, faint feeling at the epigastrium. Great appetite, craving for meat, pickles, radishes, turnips, artichokes, coarse food. Tendency to eat far beyond capacity for digestion.\nBurning and distention of stomach and abdomen with palpitation. Flatulence disturbs the heart\'s action. Pain in right shoulder-blade, and constipation, with burning in rectum.\nFemale.–

In [55]:
output_file = "boericke_full_text.txt"

# Write the final text to disk as UTF-8, replacing any existing file.
with open(output_file, mode="w", encoding="utf-8") as out_handle:
    out_handle.write(full_final_text)

print(f"Saved full text to {output_file}")

Saved full text to boericke_full_text.txt


### The important design decisions are the chunking strategy and whether metadata is stored inside or outside each chunk