In [1]:
import os

files = os.listdir('/kaggle/input/abstract-files-diseases/abstract_files')
files

['apol1-mediated_kidney_disease.json',
 'minimal_change_disease.json',
 'cystinosis.json',
 'henoch-schönlein_purpura.json',
 'thrombotic_thrombocytopenic_purpura.json',
 'fabry_disease.json',
 'vasculitis.json',
 'iga_nephropathy.json',
 'glomerulonephritis.json',
 'hemolytic_uremic_syndrome.json',
 'cardiovascular-kidney-metabolic_syndrome.json',
 'ahus.json',
 'granulomatosis_with_polyangiitis.json',
 'amyloidosis.json',
 'congenital_abnormalities_of_the_kidneys_and_urinary_tract.json',
 'alport_syndrome.json',
 'polycystic_kidney_disease.json',
 'primary_hyperoxaluria_and_oxalate.json',
 'focal_segmental_glomerulosclerosis.json',
 'complement_3_glomerulopathy_.json',
 'interstitial_nephritis.json',
 'lupus_nephritis.json',
 'goodpasture_syndrome.json']

In [2]:
# Units for common clinical measurements
units = {
    "Age (years)": "years (y)",
    "GFR (Glomerular Filtration Rate)": "mL/min/1.73 m²",
    "Proteinuria": ["mg/day", "mg/g creatinine"],
    "Creatinine (serum)": ["mg/dL", "µmol/L"],
    "BUN (Blood Urea Nitrogen)": ["mg/dL", "mmol/L"],
    "BP (Blood Pressure)": "mmHg",
    "Hematuria": ["RBCs/HPF", "positive/negative (dipstick)"],
    "Weight": ["kg", "lb"],
    "Calcium (in blood)": ["mg/dL", "mmol/L"],
    "Cholesterol (total)": ["mg/dL", "mmol/L"]
}

In [3]:
units_variations = {
    "Age (years)": [
        "years", "yrs", "yr", "y", "yo", "age in years", "years-old", "year-old",
        "aged", "age:", "y/o", "yoa", "years of age"
    ],
    "GFR (Glomerular Filtration Rate)": [
        "mL/min/1.73 m²", "ml/min/1.73 m²", "ml/min/1.73m2", "ml/min/1.73m²", 
        "mL per min per 1.73 m²", "ml per min per 1.73m2", "mL/min per 1.73 m²",
        "ml/min", "mL/min", "mls/min", "mL/minute", "mL/min/1.73m²", 
        "glomerular filtration rate", "GFR", "GFR value", "L/hr", "L/h", "L/min",
        "ml/s", "ml per second", "cc/min"
    ],
    "Proteinuria": [
        "mg/day", "mg/d", "mg per day", "mg/24h", "mg per 24 h", "mg/24 hours",
        "mg/g creatinine", "mg per g creatinine", "g/day", "g/d", "g per day", 
        "g/24h", "g per 24 h", "g/24 hours", "g/L", "g per liter", "mg/L", "mg per liter",
        "protein excretion", "protein/creatinine ratio", "protein per creatinine ratio",
        "PCR", "UPCR", "g/g", "mg/mg"
    ],
    "Creatinine (serum)": [
        "mg/dL", "mg per dL", "µmol/L", "umol/L", "µmol per L", "umol per L", 
        "micromol/L", "micromoles per liter", "serum creatinine", "creatinine conc", 
        "creatinine concentration", "mg%", "mmol/L", "mmol per L", "μmol/L"
    ],
    "BUN (Blood Urea Nitrogen)": [
        "mg/dL", "mg per dL", "mg%", "mg/L", "mmol/L", "mmol per L", 
        "blood urea nitrogen", "urea nitrogen", "BUN value", "serum urea", "urea conc"
    ],
    "BP (Blood Pressure)": [
        "mmHg", "mm Hg", "millimeters of mercury", "blood pressure", "BP", 
        "systolic", "diastolic", "BP reading", "cmHg", "kPa", "Pa"
    ],
    "Hematuria": [
        "RBCs/HPF", "RBCs per HPF", "RBCs/hpf", "red cells/HPF", 
        "positive hematuria", "hematuria detected", "red blood cells in urine", 
        "urine dipstick positive", "urine dipstick", "RBCs/uL", "RBCs/mL",
        "RBC/hpf", "blood in urine"
    ],
    "Weight": [
        "kg", "kgs", "kilogram", "kilograms", "lb", "lbs", "pounds", "pound", 
        "body weight", "weight:", "g", "gram", "grams", "oz", "ounce", "ounces", 
        "stone", "stones", "st", "wt"
    ],
    "Calcium (in blood)": [
        "mg/dL", "mg per dL", "mg%", "mg/L", "mmol/L", "mmol per L",
        "serum calcium", "blood calcium", "calcium conc", "calcium concentration", 
        "total calcium"
    ],
    "Cholesterol (total)": [
        "mg/dL", "mg per dL", "mg%", "mg/L", "mmol/L", "mmol per L",
        "total cholesterol", "cholesterol level", "serum cholesterol", "cholesterol conc",
        "TC"
    ]
}

In [4]:
import re

def contains_unit_with_value(sentence: str) -> str:
    """
    Checks if a sentence contains a numerical value followed by a unit 
    and returns the entity name if found, else an empty string.
    """
    for entity, variations in units_variations.items():
        # Deduplicate and sort by length for better matching
        variations = sorted(set(variations), key=len, reverse=True)
        for unit in variations:
            # Regex handles:
            # - optional ~, <, >, ≤, ≥ symbols
            # - integers, decimals, fractions (e.g., 1/2)
            # - ranges like 120-130, 120/80
            # - optional spaces or hyphens before unit
            # - unit may be case-insensitive
            pattern = (
                rf'(~|≈|<|>|≤|≥)?\s*'              # optional comparison symbols
                rf'(\d+(\.\d+)?(/\d+)?(\-\d+(\.\d+)?)?)' # number, fraction, range
                rf'[\s\-]*'                         # optional space/hyphen
                rf'({re.escape(unit)})\b'           # unit word boundary
            )
            if re.search(pattern, sentence, re.IGNORECASE):
                return entity
    return ""


In [5]:
sents = [
    # Age
    "Patient age is 35 yrs.",
    "He is 72 year-old male.",
    # GFR
    "Estimated GFR is 65 ml/min/1.73 m2.",
    "GFR reported at 80 mL/min/1.73 m².",
    # Proteinuria
    "Urinary protein excretion was 300 mg/day.",
    "Proteinuria quantified as 1.2 mg/g creatinine.",
    # Creatinine
    "Creatinine value is 2.0 mg/dL.",
    "Serum creatinine measured at 95 µmol/L.",
    # BUN
    "BUN found to be 22 mg/dL.",
    "Blood urea nitrogen level is 7 mmol/L.",
    # BP
    "BP recorded at 140/90 mmHg.",
    "Blood pressure is 118 millimeters of mercury.",
    # Hematuria
    "Microscopic hematuria: RBCs/HPF 15.",
    "Urine analysis shows 8 RBCs/HPF.",
    # Weight
    "Body weight measured as 68 kg.",
    "Patient weighs 170 lbs.",
    # Calcium
    "Serum calcium reported at 10.1 mg/dL.",
    "Blood calcium concentration is 2.4 mmol/L.",
    # Cholesterol
    "Cholesterol reading is 210 mg/dL.",
    "Total cholesterol level: 6.0 mmol/L."
]

for sent in sents:
    print(f"{contains_unit_with_value(sent)}")

Age (years)
Age (years)
GFR (Glomerular Filtration Rate)
GFR (Glomerular Filtration Rate)
Proteinuria
Proteinuria
Creatinine (serum)
Creatinine (serum)
Creatinine (serum)
Creatinine (serum)
BP (Blood Pressure)
BP (Blood Pressure)

Hematuria
Weight
Weight
Creatinine (serum)
Creatinine (serum)
Creatinine (serum)
Creatinine (serum)


In [6]:
import os
import json
import re
import spacy
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split
import nltk

# Download punkt tokenizer if not available
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
count_per_entity = 200
positive_sents = {entity: [] for entity in units_variations.keys()}
negative_sents = []

data_dir = "/kaggle/input/abstract-files-diseases/abstract_files"

# Keep track of remaining entities
remaining_entities = set(units_variations.keys())
max_negatives = count_per_entity

for file in os.listdir(data_dir):
    if not file.endswith(".json"):
        continue

    file_path = os.path.join(data_dir, file)
    with open(file_path, "r", encoding="utf8") as f:
        try:
            records = json.load(f)
        except json.JSONDecodeError:
            continue

    for rec in records:
        # If everything collected, break early
        if not remaining_entities and len(negative_sents) >= max_negatives:
            break

        abstract = rec.get("abstract", "")
        if not abstract:
            continue

        # Tokenize only if needed
        for sent in sent_tokenize(abstract):
            if not remaining_entities and len(negative_sents) >= max_negatives:
                break

            sent = sent.strip()
            if not sent:
                continue

            entity = contains_unit_with_value(sent)
            if entity:
                if entity in remaining_entities:
                    positive_sents[entity].append(sent)
                    if len(positive_sents[entity]) >= count_per_entity:
                        remaining_entities.remove(entity)
            else:
                if len(negative_sents) < max_negatives:
                    negative_sents.append(sent)

    # If everything collected, break file loop early
    if not remaining_entities and len(negative_sents) >= max_negatives:
        break
    
positive_pairs = [(sent, entity) for entity, sents in positive_sents.items() for sent in sents]

print("Positive sentences collected per entity:")
for entity, sents in positive_sents.items():
    print(f"{entity}: {len(sents)}")

print("Negative sentences collected:", len(negative_sents))

KeyboardInterrupt: 

In [8]:
positive_pairs = [(sent, entity) for entity, sents in positive_sents.items() for sent in sents]

print("Positive sentences collected per entity:")
for entity, sents in positive_sents.items():
    print(f"{entity}: {len(sents)}")

print("Negative sentences collected:", len(negative_sents))

Positive sentences collected per entity:
Age (years): 188
GFR (Glomerular Filtration Rate): 9
Proteinuria: 17
Creatinine (serum): 12
BUN (Blood Urea Nitrogen): 0
BP (Blood Pressure): 14
Hematuria: 0
Weight: 27
Calcium (in blood): 0
Cholesterol (total): 0
Negative sentences collected: 200


In [9]:
positive_list = [{"sentence": sent, "entity": entity} for sent, entity in positive_pairs]

data_to_save = {
    "positives": positive_list,
    "negatives": negative_sents
}

In [10]:
output_file = "filtered_sentences.json"

with open(output_file, "w", encoding="utf8") as f:
    json.dump(data_to_save, f, ensure_ascii=False, indent=2)

print(f"Saved {len(positive_list)} positive sentences and {len(negative_sents)} negative sentences to {output_file}")

Saved 267 positive sentences and 200 negative sentences to filtered_sentences.json
