### Install essential Python libraries for Natural Language Processing (NLP) tasks

In [1]:
!pip install transformers
!pip install evaluate
!pip install datasets
!pip install seqeval
!pip install accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [2]:
import transformers

print(transformers.__version__)

4.46.3


### Load the dataset from Hugging Face

In [3]:
from datasets import load_dataset

dataset = load_dataset("ktgiahieu/maccrobat2018_2020" ,  trust_remote_code=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

data.jsonl:   0%|          | 0.00/3.80M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

### Inspect the structure of DataSet and retrieve a sample from the training split.

In [4]:
# Inspect the dataset structure
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 400
    })
})
{'tokens': ['A', '68', '-', 'year', '-', 'old', 'female', 'nonsmoker', ',', 'nondrinker', 'with', 'a', 'medical', 'history', 'of', 'hypertension', 'presented', 'with', 'new', '-', 'onset', 'painless', 'jaundice', 'and', 'pruritus', ',', 'a', 'three', '-', 'month', 'history', 'of', '9.9', 'kg', 'weight', 'loss', 'and', 'chronic', 'diarrhea', 'with', 'four', 'to', 'five', 'loose', 'bowel', 'movements', 'per', 'day', '.', '\n', 'Medications', 'included', 'vitamin', 'D', ',', 'amlodipine', 'and', 'eprosartan', '.', '\n', 'Physical', 'examination', 'was', 'normal', 'except', 'for', 'jaundice', 'and', 'muscle', 'wasting', '.', '\n', 'Recent', 'colonoscopy', 'had', 'been', 'normal', '.', '\n', 'Total', 'and', 'direct', 'bilirubin', 'levels', 'were', '6.84', 'mg', '/', 'dL', '(', '116.96', 'μmol', '/', 'L', ')', 'and', '9.18', 'mg', '/', 'dL', '(', '156.98', 'μmol', '/', 'L', ')', ',', 'res

### Display the schema or structure of the features in each dataset record

In [5]:
print(dataset["train"].features)


{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}


In [6]:
from datasets import Dataset, ClassLabel, Value, DatasetDict

# Define the ClassLabel feature with the label names
label_list = ['B-Activity', 'B-Administration', 'B-Age', 'B-Area', 'B-Biological_attribute', 'B-Biological_structure', 'B-Clinical_event', 'B-Color', 'B-Coreference', 'B-Date', 'B-Detailed_description', 'B-Diagnostic_procedure', 'B-Disease_disorder', 'B-Distance', 'B-Dosage', 'B-Duration', 'B-Family_history', 'B-Frequency', 'B-Height', 'B-History', 'B-Lab_value', 'B-Mass', 'B-Medication', 'B-Nonbiological_location', 'B-Occupation', 'B-Other_entity', 'B-Other_event', 'B-Outcome', 'B-Personal_background', 'B-Qualitative_concept', 'B-Quantitative_concept', 'B-Severity', 'B-Sex', 'B-Shape', 'B-Sign_symptom', 'B-Subject', 'B-Texture', 'B-Therapeutic_procedure', 'B-Time', 'B-Volume', 'B-Weight', 'I-Activity', 'I-Administration', 'I-Age', 'I-Area', 'I-Biological_attribute', 'I-Biological_structure', 'I-Clinical_event', 'I-Color', 'I-Coreference', 'I-Date', 'I-Detailed_description', 'I-Diagnostic_procedure', 'I-Disease_disorder', 'I-Distance', 'I-Dosage', 'I-Duration', 'I-Family_history', 'I-Frequency', 'I-Height', 'I-History', 'I-Lab_value', 'I-Mass', 'I-Medication', 'I-Nonbiological_location', 'I-Occupation', 'I-Other_entity', 'I-Other_event', 'I-Outcome', 'I-Personal_background', 'I-Qualitative_concept', 'I-Quantitative_concept', 'I-Severity', 'I-Shape', 'I-Sign_symptom', 'I-Subject', 'I-Texture', 'I-Therapeutic_procedure', 'I-Time', 'I-Volume', 'I-Weight', 'O']
label = ClassLabel(names=label_list)
# Define a new feature with the numeric labels
numeric_labels_feature = Value("int32")

In [7]:
dataset = dataset.map(lambda example: {"tokens": example['tokens'],
    "tags": example["tags"],
    "numeric_tags": [label.encode_example(x) for x in example["tags"]],
})

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'numeric_tags'],
        num_rows: 400
    })
})

In [9]:
print(dataset["train"][0])

{'tokens': ['A', '68', '-', 'year', '-', 'old', 'female', 'nonsmoker', ',', 'nondrinker', 'with', 'a', 'medical', 'history', 'of', 'hypertension', 'presented', 'with', 'new', '-', 'onset', 'painless', 'jaundice', 'and', 'pruritus', ',', 'a', 'three', '-', 'month', 'history', 'of', '9.9', 'kg', 'weight', 'loss', 'and', 'chronic', 'diarrhea', 'with', 'four', 'to', 'five', 'loose', 'bowel', 'movements', 'per', 'day', '.', '\n', 'Medications', 'included', 'vitamin', 'D', ',', 'amlodipine', 'and', 'eprosartan', '.', '\n', 'Physical', 'examination', 'was', 'normal', 'except', 'for', 'jaundice', 'and', 'muscle', 'wasting', '.', '\n', 'Recent', 'colonoscopy', 'had', 'been', 'normal', '.', '\n', 'Total', 'and', 'direct', 'bilirubin', 'levels', 'were', '6.84', 'mg', '/', 'dL', '(', '116.96', 'μmol', '/', 'L', ')', 'and', '9.18', 'mg', '/', 'dL', '(', '156.98', 'μmol', '/', 'L', ')', ',', 'respectively', '.', '\n', 'Other', 'results', 'included', 'an', 'international', 'normalized', 'ratio', 'of'

In [10]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [11]:
show_random_elements(dataset["train"])

Unnamed: 0,tokens,tags,numeric_tags
0,"[A, 76, -, year, old, woman, presented, with, a, 24, -, month, history, of, enlarging, mass, involving, the, back, history, of, trauma, ., \n, Physical, examination, showed, a, mass, of, an, 3x4, cm, in, diameter, ,, localized, in, the, right, inter, -, scapular, region, ., \n, The, mass, was, ulcerative, helophytic, ,, grayish, in, colour, ,, hard, in, consistency, and, easily, bleeding, on, manipulation, ., \n, The, remainder, of, the, examination, was, unremarkable, ;, no, lymphadenopathy, and, no, abdominal, masses, were, felt, ., \n, After, resection, ,, the, histological, examinations, of, the, specimens, have, concluded, for, basal, cell, carcinoma, ., ...]","[O, B-Age, I-Age, I-Age, I-Age, B-Sex, B-Clinical_event, O, O, B-Duration, I-Duration, I-Duration, I-Duration, O, O, B-Sign_symptom, O, O, B-Biological_structure, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...]","[81, 2, 43, 43, 43, 32, 6, 81, 81, 15, 56, 56, 56, 81, 81, 34, 81, 81, 5, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, ...]"
1,"[The, patient, is, an, 18, -, year, -, old, Han, female, admitted, to, Peking, Union, Medical, College, Hospital, due, to, adrenal, crisis, triggered, by, pneumonia, ., \n, She, has, developed, recurrent, respiratory, infections, since, age, 5, ,, and, failed, to, respond, to, multiple, hepatitis, B, virus, (, HBV, ), vaccinations, ., \n, Reduced, serum, cortisol, and, ACTH, levels, were, discovered, at, 16, when, glucocorticoid, replacement, was, initiated, ., \n, In, addition, ,, hair, loss, started, from, age, 4, ,, and, absence, of, pubic, and, axillary, hair, was, noticed, after, development, of, regular, menstruation, ., \n, Her, history, includes, nephrotic, syndrome, ...]","[O, O, O, O, B-Age, I-Age, I-Age, I-Age, I-Age, B-Personal_background, B-Sex, B-Clinical_event, O, B-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, O, O, O, O, O, O, B-Disease_disorder, O, O, O, O, O, O, O, B-Disease_disorder, O, B-Date, I-Date, O, O, O, O, O, O, O, O, O, O, O, O, O, B-Medication, O, O, O, O, B-Diagnostic_procedure, O, B-Diagnostic_procedure, O, O, O, O, B-Date, O, B-Medication, I-Medication, O, O, O, O, O, O, O, B-Sign_symptom, I-Sign_symptom, O, O, B-Date, I-Date, O, O, B-Sign_symptom, I-Sign_symptom, I-Sign_symptom, I-Sign_symptom, I-Sign_symptom, I-Sign_symptom, O, O, O, B-Other_event, I-Other_event, I-Other_event, I-Other_event, O, O, O, O, O, B-Disease_disorder, I-Disease_disorder, ...]","[81, 81, 81, 81, 2, 43, 43, 43, 43, 28, 32, 6, 81, 23, 64, 64, 64, 64, 81, 81, 81, 81, 81, 81, 12, 81, 81, 81, 81, 81, 81, 81, 12, 81, 9, 50, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 22, 81, 81, 81, 81, 11, 81, 11, 81, 81, 81, 81, 9, 81, 22, 63, 81, 81, 81, 81, 81, 81, 81, 34, 74, 81, 81, 9, 50, 81, 81, 34, 74, 74, 74, 74, 74, 81, 81, 81, 26, 67, 67, 67, 81, 81, 81, 81, 81, 12, 53, ...]"
2,"[A, 68, -, year, -, old, man, referred, to, the, Internal, Medicine, Department, of, Razi, Hospital, in, Rasht, (, a, city, in, the, north, of, Iran, ), with, a, hypogastric, region, discomfort, ,, especially, in, the, right, lower, quadrant, for, one, month, ., \n, The, pain, was, a, colicky, form, which, had, a, few, episodes, each, day, ,, each, episode, lasting, for, 4–5, minutes, ., \n, The, pain, radiated, to, the, back, and, was, alleviated, by, resting, to, one, side, ., \n, No, association, between, the, pain, ,, defecation, ,, and, eating, were, reported, ., \n, Furthermore, ,, the, patient, ...]","[O, B-Age, I-Age, I-Age, I-Age, I-Age, B-Sex, B-Clinical_event, O, O, B-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, O, B-Nonbiological_location, I-Nonbiological_location, O, B-Nonbiological_location, O, B-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, O, O, O, O, O, B-Sign_symptom, O, O, O, O, B-Biological_structure, I-Biological_structure, I-Biological_structure, O, B-Duration, I-Duration, O, O, O, B-Coreference, O, O, B-Detailed_description, I-Detailed_description, O, O, B-Frequency, I-Frequency, I-Frequency, I-Frequency, I-Frequency, O, B-Detailed_description, I-Detailed_description, I-Detailed_description, I-Detailed_description, I-Detailed_description, I-Detailed_description, O, O, O, B-Coreference, O, O, O, B-Nonbiological_location, O, O, O, O, B-Activity, I-Activity, I-Activity, I-Activity, O, O, B-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, O, O, O, O, O, O, O, O, ...]","[81, 2, 43, 43, 43, 43, 32, 6, 81, 81, 23, 64, 64, 81, 23, 64, 81, 23, 81, 23, 64, 64, 64, 64, 64, 64, 81, 81, 81, 81, 81, 34, 81, 81, 81, 81, 5, 46, 46, 81, 15, 56, 81, 81, 81, 8, 81, 81, 10, 51, 81, 81, 17, 58, 58, 58, 58, 81, 10, 51, 51, 51, 51, 51, 81, 81, 81, 8, 81, 81, 81, 23, 81, 81, 81, 81, 0, 41, 41, 41, 81, 81, 19, 60, 60, 60, 60, 60, 60, 60, 60, 60, 81, 81, 81, 81, 81, 81, 81, 81, ...]"
3,"[A, 58, -, year, -, old, cotton, farmer, was, presented, to, the, West, China, Hospital, of, Sichuan, University, because, of, an, over, 1, -, month, history, of, recurrent, fever, (, between, 38, and, 40, , °, C, ), ,, productive, cough, ,, and, dyspnea, ., \n, Prior, to, admission, ,, he, was, diagnosed, of, pneumonia, and, treated, with, latamoxef, ,, ofloxacin, ,, vancomycin, ,, and, voriconazole, at, local, hospital, ., \n, However, ,, no, remission, of, symptoms, was, observed, ., \n, Moreover, ,, he, was, a, hepatitis, B, virus, carrier, with, a, 10, pack, -, years, smoking, history, ., \n, ...]","[O, B-Age, I-Age, I-Age, I-Age, I-Age, B-Occupation, I-Occupation, O, O, O, O, B-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, O, O, O, B-Duration, I-Duration, I-Duration, I-Duration, O, O, O, B-Sign_symptom, O, B-Lab_value, I-Lab_value, I-Lab_value, I-Lab_value, I-Lab_value, I-Lab_value, I-Lab_value, O, O, O, B-Sign_symptom, O, O, B-Sign_symptom, O, O, O, O, O, O, O, O, O, O, B-Disease_disorder, O, O, O, B-Medication, O, B-Medication, O, B-Medication, O, O, B-Medication, O, B-Nonbiological_location, I-Nonbiological_location, O, O, O, O, O, B-Sign_symptom, I-Sign_symptom, I-Sign_symptom, O, O, O, O, O, O, O, O, O, B-History, I-History, I-History, I-History, O, O, B-History, I-History, I-History, I-History, I-History, I-History, O, O, ...]","[81, 2, 43, 43, 43, 43, 24, 65, 81, 81, 81, 81, 23, 64, 64, 64, 64, 64, 81, 81, 81, 15, 56, 56, 56, 81, 81, 81, 34, 81, 20, 61, 61, 61, 61, 61, 61, 81, 81, 81, 34, 81, 81, 34, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 12, 81, 81, 81, 22, 81, 22, 81, 22, 81, 81, 22, 81, 23, 64, 81, 81, 81, 81, 81, 34, 74, 74, 81, 81, 81, 81, 81, 81, 81, 81, 81, 19, 60, 60, 60, 81, 81, 19, 60, 60, 60, 60, 60, 81, 81, ...]"
4,"[A, 36, -, yr, -, old, previously, healthy, Sri, Lankan, male, who, takes, care, of, a, horse, presented, to, the, medical, casualty, ward, with, fever, ,, arthralgia, and, myalgia, for, one, day, ., \n, He, complained, of, mild, dysuria, but, had, normal, urine, output, ., \n, He, did, not, have, chest, pain, or, shortness, of, breath, ., \n, Further, inquiry, revealed, that, he, was, treated, for, leptospirosis, during, a, febrile, illness, in, the, past, ., \n, On, examination, ,, patient, was, afebrile, ,, anicteric, ., \n, His, blood, pressure, was, 90/60, mmHg, and, pulse, rate, 76, bpm, ., \n, Rest, ...]","[O, B-Age, I-Age, I-Age, I-Age, I-Age, B-History, I-History, B-Personal_background, I-Personal_background, B-Sex, O, B-Occupation, I-Occupation, I-Occupation, I-Occupation, I-Occupation, B-Clinical_event, O, O, B-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, O, B-Sign_symptom, O, B-Sign_symptom, O, B-Sign_symptom, O, B-Duration, I-Duration, O, O, O, O, O, O, B-Sign_symptom, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, O, O, O, O, O, O, B-Sign_symptom, O, B-Sign_symptom, I-Sign_symptom, I-Sign_symptom, O, O, O, O, O, O, O, O, B-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, O, O, O, B-Diagnostic_procedure, O, O, O, B-Sign_symptom, O, B-Sign_symptom, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, B-Lab_value, I-Lab_value, O, B-Diagnostic_procedure, I-Diagnostic_procedure, B-Lab_value, I-Lab_value, O, O, O, ...]","[81, 2, 43, 43, 43, 43, 19, 60, 28, 69, 32, 81, 24, 65, 65, 65, 65, 6, 81, 81, 23, 64, 64, 81, 34, 81, 34, 81, 34, 81, 15, 56, 81, 81, 81, 81, 81, 81, 34, 81, 81, 81, 11, 52, 81, 81, 81, 81, 81, 81, 81, 34, 81, 34, 74, 74, 81, 81, 81, 81, 81, 81, 81, 81, 19, 60, 60, 60, 60, 60, 60, 60, 60, 60, 81, 81, 81, 11, 81, 81, 81, 34, 81, 34, 81, 81, 81, 11, 52, 81, 20, 61, 81, 11, 52, 20, 61, 81, 81, 81, ...]"
5,"[A, 53, year, old, female, without, significant, past, medical, history, developed, severe, viral, pneumonia, ,, with, rapid, ,, progressive, deterioration, in, her, respiratory, status, ., \n, She, developed, ARDS, and, mechanical, ventilatory, management, using, ARDS, protocol, were, unable, to, maintain, adequate, oxygenation, ., \n, As, a, result, ,, bedside, VV, -, ECMO, was, planned, ., \n, Transesophageal, echocardiography, (, TEE, ), was, performed, to, visualize, proper, positioning, of, the, guidewire, and, cannula, ., \n, Using, the, Seldinger, technique, ,, the, right, internal, jugular, vein, was, accessed, and, a, guide, wire, was, placed, ., \n, Placement, of, the, guidewire, into, the, ...]","[O, B-Age, I-Age, I-Age, B-Sex, B-History, I-History, I-History, I-History, I-History, O, O, O, B-Disease_disorder, O, O, O, O, O, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, O, O, O, B-Disease_disorder, O, B-Therapeutic_procedure, I-Therapeutic_procedure, I-Therapeutic_procedure, O, B-Detailed_description, I-Detailed_description, O, O, O, O, O, B-Diagnostic_procedure, O, O, O, O, O, O, O, O, O, B-Therapeutic_procedure, O, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, B-Diagnostic_procedure, O, O, O, O, O, O, B-Therapeutic_procedure, I-Therapeutic_procedure, I-Therapeutic_procedure, I-Therapeutic_procedure, O, B-Therapeutic_procedure, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-Coreference, I-Coreference, I-Coreference, I-Coreference, O, O, O, O, O, O, O, O, ...]","[81, 2, 43, 43, 32, 19, 60, 60, 60, 60, 81, 81, 81, 12, 81, 81, 81, 81, 81, 81, 81, 81, 11, 52, 81, 81, 81, 81, 12, 81, 37, 77, 77, 81, 10, 51, 81, 81, 81, 81, 81, 11, 81, 81, 81, 81, 81, 81, 81, 81, 81, 37, 81, 81, 81, 81, 11, 52, 81, 11, 81, 81, 81, 81, 81, 81, 37, 77, 77, 77, 81, 37, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 8, 49, 49, 49, 81, 81, 81, 81, 81, 81, 81, 81, ...]"
6,"[A, 68, -, year, -, old, man, referred, to, the, Internal, Medicine, Department, of, Razi, Hospital, in, Rasht, (, a, city, in, the, north, of, Iran, ), with, a, hypogastric, region, discomfort, ,, especially, in, the, right, lower, quadrant, for, one, month, ., \n, The, pain, was, a, colicky, form, which, had, a, few, episodes, each, day, ,, each, episode, lasting, for, 4–5, minutes, ., \n, The, pain, radiated, to, the, back, and, was, alleviated, by, resting, to, one, side, ., \n, No, association, between, the, pain, ,, defecation, ,, and, eating, were, reported, ., \n, Furthermore, ,, the, patient, ...]","[O, B-Age, I-Age, I-Age, I-Age, I-Age, B-Sex, B-Clinical_event, O, O, B-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, O, B-Nonbiological_location, I-Nonbiological_location, O, B-Nonbiological_location, O, B-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, I-Nonbiological_location, O, O, O, O, O, B-Sign_symptom, O, O, O, O, B-Biological_structure, I-Biological_structure, I-Biological_structure, O, B-Duration, I-Duration, O, O, O, B-Coreference, O, O, B-Detailed_description, I-Detailed_description, O, O, B-Frequency, I-Frequency, I-Frequency, I-Frequency, I-Frequency, O, B-Detailed_description, I-Detailed_description, I-Detailed_description, I-Detailed_description, I-Detailed_description, I-Detailed_description, O, O, O, B-Coreference, O, O, O, B-Nonbiological_location, O, O, O, O, B-Activity, I-Activity, I-Activity, I-Activity, O, O, B-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, O, O, O, O, O, O, O, O, ...]","[81, 2, 43, 43, 43, 43, 32, 6, 81, 81, 23, 64, 64, 81, 23, 64, 81, 23, 81, 23, 64, 64, 64, 64, 64, 64, 81, 81, 81, 81, 81, 34, 81, 81, 81, 81, 5, 46, 46, 81, 15, 56, 81, 81, 81, 8, 81, 81, 10, 51, 81, 81, 17, 58, 58, 58, 58, 81, 10, 51, 51, 51, 51, 51, 81, 81, 81, 8, 81, 81, 81, 23, 81, 81, 81, 81, 0, 41, 41, 41, 81, 81, 19, 60, 60, 60, 60, 60, 60, 60, 60, 60, 81, 81, 81, 81, 81, 81, 81, 81, ...]"
7,"[An, 18, -, year, -, old, male, was, diagnosed, with, attention, -, deficit, hyperactivity, disorder, (, ADHD, ), in, 2005, ., \n, He, was, overweight, with, a, body, mass, index, (, BMI, ), of, 40, ., \n, He, was, started, on, quetiapine, fumarate, (, Seroquel, ®, ), 900, mg, daily, in, April, 2005, and, methylphenidate, (, Concerta, ®, ), 54, mg, daily, in, September, 2005, ., \n, In, the, beginning, of, August, 2006, he, was, admitted, to, his, local, hospital, with, severe, dyspnoea, ,, tachypnea, ,, tachycardia, ,, and, cyanosis, ., \n, On, admission, the, blood, pressure, was, 120/80, mmHg, ...]","[O, B-Age, I-Age, I-Age, I-Age, I-Age, B-Sex, O, O, O, B-Disease_disorder, I-Disease_disorder, I-Disease_disorder, I-Disease_disorder, I-Disease_disorder, O, B-Disease_disorder, O, O, B-Date, O, O, O, O, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, I-Diagnostic_procedure, O, B-Diagnostic_procedure, O, O, B-Lab_value, O, O, O, O, O, O, B-Medication, I-Medication, B-Medication, I-Medication, I-Medication, I-Medication, B-Dosage, I-Dosage, I-Dosage, O, B-Date, I-Date, O, B-Medication, B-Medication, I-Medication, I-Medication, I-Medication, B-Dosage, I-Dosage, I-Dosage, O, B-Date, I-Date, O, O, O, O, B-Date, I-Date, I-Date, I-Date, O, O, B-Clinical_event, O, O, B-Nonbiological_location, I-Nonbiological_location, O, B-Severity, B-Sign_symptom, O, B-Sign_symptom, O, B-Sign_symptom, O, O, B-Sign_symptom, O, O, O, B-Clinical_event, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, B-Lab_value, I-Lab_value, ...]","[81, 2, 43, 43, 43, 43, 32, 81, 81, 81, 12, 53, 53, 53, 53, 81, 12, 81, 81, 9, 81, 81, 81, 81, 81, 81, 81, 11, 52, 52, 81, 11, 81, 81, 20, 81, 81, 81, 81, 81, 81, 22, 63, 22, 63, 63, 63, 14, 55, 55, 81, 9, 50, 81, 22, 22, 63, 63, 63, 14, 55, 55, 81, 9, 50, 81, 81, 81, 81, 9, 50, 50, 50, 81, 81, 6, 81, 81, 23, 64, 81, 31, 34, 81, 34, 81, 34, 81, 81, 34, 81, 81, 81, 6, 81, 11, 52, 81, 20, 61, ...]"
8,"[This, is, the, case, of, a, 58, -, year, -, old, white, Hispanic, woman, with, a, history, of, uveal, melanoma, in, her, right, eye, (, Fig.1, ), ., \n, She, was, admitted, to, the, hospital, with, jaundice, and, abdominal, pain, for, 10, days, ., \n, On, admission, ,, laboratory, tests, were, obtained, (, a, complete, blood, count, was, within, normal, limits, ,, amylase, :, 136, U, /, L, ,, total, bilirubin, :, 6.37, mg, /, dL, with, a, direct, fraction, of, 5.30, mg, /, dL, ), ., \n, Cross, -, sectional, ,, abdominal, computed, tomography, (, CT, ), with, contrast, ...]","[O, O, O, O, O, O, B-Age, I-Age, I-Age, I-Age, I-Age, B-Personal_background, B-Personal_background, B-Sex, O, O, B-History, I-History, I-History, I-History, I-History, I-History, I-History, I-History, O, O, O, O, O, O, O, B-Clinical_event, O, O, O, O, B-Sign_symptom, O, O, B-Sign_symptom, O, B-Duration, I-Duration, O, O, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, I-Diagnostic_procedure, O, O, O, O, O, B-Diagnostic_procedure, O, O, O, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, B-Lab_value, I-Lab_value, I-Lab_value, I-Lab_value, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, B-Lab_value, I-Lab_value, I-Lab_value, I-Lab_value, O, O, O, O, O, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, B-Diagnostic_procedure, O, B-Detailed_description, I-Detailed_description, ...]","[81, 81, 81, 81, 81, 81, 2, 43, 43, 43, 43, 28, 28, 32, 81, 81, 19, 60, 60, 60, 60, 60, 60, 60, 81, 81, 81, 81, 81, 81, 81, 6, 81, 81, 81, 81, 34, 81, 81, 34, 81, 15, 56, 81, 81, 81, 81, 81, 11, 52, 81, 81, 81, 81, 11, 52, 52, 81, 81, 81, 81, 81, 11, 81, 81, 81, 81, 81, 81, 11, 52, 81, 20, 61, 61, 61, 81, 81, 11, 52, 81, 20, 61, 61, 61, 81, 81, 81, 81, 81, 81, 81, 81, 11, 52, 81, 11, 81, 10, 51, ...]"
9,"[n, March, 2015, ,, a, 62, -, year, -, old, woman, was, admitted, to, our, hospital, ., \n, She, complained, of, progressive, visual, disturbance, ,, which, began, about, 4, years, ago, and, was, treated, as, cataract, in, local, hospital, ,, but, no, relief, was, seen, ., \n, On, the, contrary, ,, the, symptoms, aggravated, half, a, year, ago, ,, together, with, headache, ,, left, eye, pain, ,, tearing, and, increased, secretions, ,, and, the, computed, tomography, (, CT, ), scan, of, the, brain, in, local, hospital, showed, a, sellar, region, lesion, ., \n, Besides, ,, 2, years, earlier, ,, the, ...]","[O, B-Date, I-Date, O, O, B-Age, I-Age, I-Age, I-Age, I-Age, B-Sex, O, B-Clinical_event, O, O, B-Nonbiological_location, O, O, O, O, O, O, B-Sign_symptom, I-Sign_symptom, O, O, O, O, B-Date, I-Date, I-Date, O, O, O, O, B-Disease_disorder, O, B-Nonbiological_location, I-Nonbiological_location, O, O, O, B-Sign_symptom, O, O, O, O, O, O, O, O, O, O, O, B-Date, I-Date, I-Date, I-Date, O, O, O, B-Sign_symptom, O, O, O, B-Sign_symptom, O, B-Sign_symptom, O, O, B-Sign_symptom, O, O, O, B-Diagnostic_procedure, I-Diagnostic_procedure, O, B-Diagnostic_procedure, O, O, O, O, B-Biological_structure, O, B-Nonbiological_location, I-Nonbiological_location, O, O, O, O, B-Sign_symptom, O, O, O, O, B-Date, I-Date, I-Date, O, O, ...]","[81, 9, 50, 81, 81, 2, 43, 43, 43, 43, 32, 81, 6, 81, 81, 23, 81, 81, 81, 81, 81, 81, 34, 74, 81, 81, 81, 81, 9, 50, 50, 81, 81, 81, 81, 12, 81, 23, 64, 81, 81, 81, 34, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 9, 50, 50, 50, 81, 81, 81, 34, 81, 81, 81, 34, 81, 34, 81, 81, 34, 81, 81, 81, 11, 52, 81, 11, 81, 81, 81, 81, 5, 81, 23, 64, 81, 81, 81, 81, 34, 81, 81, 81, 81, 9, 50, 50, 81, 81, ...]"


### Split the dataset into : training, validation, and test sets and print first elements of the split datasets

In [12]:
# Train, validation, test split
dataset = dataset["train"].train_test_split(test_size=0.2)
train_val_split = dataset["train"].train_test_split(test_size=0.1)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]
test_dataset = dataset["test"]

print(f"Train: {len(train_dataset)}, Validation: {len(val_dataset)}, Test: {len(test_dataset)}")

Train: 288, Validation: 32, Test: 80


In [13]:
print(train_dataset[0])

{'tokens': ['Our', '24', '-', 'year', '-', 'old', 'non', '-', 'smoking', 'male', 'patient', 'presented', 'with', 'repeated', 'hemoptysis', 'in', 'May', '2008', 'with', '4', 'days', 'of', 'concomitant', 'right', 'thoracic', 'pain', 'which', 'intensified', 'while', 'breathing', '.', '\n', 'During', 'holidays', 'in', 'his', 'home', 'country', ',', 'this', 'Cuban', 'patient', 'suffered', 'from', 'a', 'cold', 'with', 'fever', 'and', 'a', 'strong', 'cough', '.', '\n', 'The', 'strong', 'dry', 'cough', 'persisted', 'after', 'recovery', 'from', 'the', 'cold', '.', '\n', 'The', 'patient', 'did', 'not', 'report', 'any', 'loss', 'of', 'weight', '.', '\n', 'The', 'initial', 'CT', 'scan', 'of', 'the', 'thorax', 'showed', 'a', '12', '×', '4', 'cm', 'solid', 'mass', 'paravertebral', 'right', 'in', 'the', 'lower', 'thorax', 'without', 'any', 'signs', 'of', 'metastases', '(', 'Figure', '1', ')', '.', '\n', 'The', 'bronchoscopy', '(', 'Figure', '\u200b2', ')', 'with', 'non', '-', 'bleeding', 'biopsy', 'r

In [14]:
print(test_dataset[0])

{'tokens': ['A', '51', '-', 'year', '-', 'old', 'G1P1', 'Caucasian', 'female', 'with', 'lifelong', 'neurogenic', 'bladder', 'secondary', 'to', 'spina', 'bifida', 'occulta', 'was', 'referred', 'for', 'symptoms', 'of', 'constipation', 'and', '(', 'FI', ')', '.', '\n', 'She', 'averaged', 'one', 'Bristol', 'Type', '1–2', 'stool', 'every', '5', 'days', 'requiring', 'frequent', 'manual', 'disimpaction', '.', '\n', 'Additionally', ',', 'she', 'reported', 'twice', 'weekly', 'episodes', 'of', 'urgent', 'fecal', 'seepage', ',', 'which', 'required', 'the', 'use', 'of', 'daily', 'continence', 'pads', '.', '\n', 'Her', 'symptoms', 'did', 'not', 'improve', 'with', 'the', 'addition', 'of', 'psyllium', 'and', 'bisacodyl', 'suppositories', '.', '\n', 'A', 'defecography', 'suggested', 'atrophy', 'of', 'the', 'puborectalis', 'and', 'poor', 'squeeze', 'with', 'EAS', 'muscle', 'atrophy', '.', '\n', 'Anorectal', 'manometry', '(', 'ARM', ')', 'showed', 'a', 'normal', 'resting', 'pressure', 'with', 'no', 'aug

In [15]:
print(val_dataset[0])

{'tokens': ['We', 'present', 'a', 'case', 'of', 'pancreatic', 'tumor', 'without', 'a', 'history', 'of', 'trauma', 'or', 'panceratitis', '.', '\n', 'A', '47', '-', 'year', '-', 'old', 'Tunisian', 'man', 'with', 'a', 'history', 'of', 'Crohn', "'s", 'disease', 'was', 'admitted', 'to', 'the', 'University', 'Hospital', 'in', '2015', 'because', 'of', 'fluid', 'chronic', 'diarrhea', 'with', '4', 'stools', 'per', 'day', 'daytime', 'only', 'with', 'out', 'ooddebr', 'is', 'associated', 'with', 'vomiting', 'with', 'out', 'abdominal', 'pain', 'or', 'fever', 'with', 'a', 'weight', 'loss', 'not', 'encrypted', 'dating', 'from', '6', 'months', '.', '\n', 'Laboratory', 'tests', 'were', 'normal', '.', '\n', 'Nonspecific', 'elevations', 'of', 'serum', 'pancreatic', 'enzymes', '.', '\n', 'Patient', 'underwent', 'an', 'abdominal', 'ultrasound', 'and', 'computed', 'tomography', '(', 'CT', ')', 'that', 'revealed', ':', 'Aspect', 'of', 'ileitis', 'of', 'the', 'last', 'ileal', 'loop', 'extended', 'by', '300', 

In [16]:
input = (train_dataset[0])['tokens']
print(input)

['Our', '24', '-', 'year', '-', 'old', 'non', '-', 'smoking', 'male', 'patient', 'presented', 'with', 'repeated', 'hemoptysis', 'in', 'May', '2008', 'with', '4', 'days', 'of', 'concomitant', 'right', 'thoracic', 'pain', 'which', 'intensified', 'while', 'breathing', '.', '\n', 'During', 'holidays', 'in', 'his', 'home', 'country', ',', 'this', 'Cuban', 'patient', 'suffered', 'from', 'a', 'cold', 'with', 'fever', 'and', 'a', 'strong', 'cough', '.', '\n', 'The', 'strong', 'dry', 'cough', 'persisted', 'after', 'recovery', 'from', 'the', 'cold', '.', '\n', 'The', 'patient', 'did', 'not', 'report', 'any', 'loss', 'of', 'weight', '.', '\n', 'The', 'initial', 'CT', 'scan', 'of', 'the', 'thorax', 'showed', 'a', '12', '×', '4', 'cm', 'solid', 'mass', 'paravertebral', 'right', 'in', 'the', 'lower', 'thorax', 'without', 'any', 'signs', 'of', 'metastases', '(', 'Figure', '1', ')', '.', '\n', 'The', 'bronchoscopy', '(', 'Figure', '\u200b2', ')', 'with', 'non', '-', 'bleeding', 'biopsy', 'revealed', '

### Pre Processing the data for BaseLine Models

In [17]:
# Flatten labels and features for baseline
def flatten_labels_and_features(dataset_split):
    texts = []
    labels = []
    for tokens, label_seq in zip(dataset_split["tokens"], dataset_split["numeric_tags"]):
        for token, label in zip(tokens, label_seq):
            texts.append(token)  # Use individual tokens for baseline
            labels.append(label)  # Add corresponding label
    return texts, labels

train_tokens, train_labels = flatten_labels_and_features(train_dataset)
test_tokens, test_labels = flatten_labels_and_features(test_dataset)
val_tokens, val_labels = flatten_labels_and_features(val_dataset)

print(f"Number of tokens in train dataset: {len(train_tokens)}")
print(f"Number of labels in train dataset: {len(train_labels)}")


Number of tokens in train dataset: 159514
Number of labels in train dataset: 159514


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, lowercase=True, analyzer='word')

# Fit on training tokens and transform tokens for all splits
X_train = tfidf_vectorizer.fit_transform(train_tokens)
X_val = tfidf_vectorizer.transform(val_tokens)
X_test = tfidf_vectorizer.transform(test_tokens)


In [19]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
logistic_model = LogisticRegression(max_iter=1000 , class_weight="balanced")
logistic_model.fit(X_train, train_labels)


In [20]:
from sklearn.metrics import classification_report

# Predict on validation and test sets
val_predictions = logistic_model.predict(X_val)
test_predictions = logistic_model.predict(X_test)

print("Validation Set Metrics:")
print(classification_report(val_labels, val_predictions, zero_division=0))


print("Test Set Metrics:")
print(classification_report(test_labels, test_predictions, zero_division=0))


Validation Set Metrics:
              precision    recall  f1-score   support

           0       0.10      0.23      0.14        13
           1       0.09      0.86      0.17         7
           2       0.16      0.71      0.26        31
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         0
           5       0.21      0.53      0.30       126
           6       0.50      0.91      0.64        67
           7       0.00      0.00      0.00         0
           8       0.09      0.42      0.15        26
           9       0.35      0.37      0.36       109
          10       0.08      0.34      0.14        76
          11       0.50      0.44      0.47       536
          12       0.21      0.36      0.27       138
          13       0.03      0.20      0.06         5
          14       0.12      0.22      0.16        36
          15       0.09      0.18      0.12        34
          16       0.02      0.33      0.04         9
   

In [21]:
print("Unique true labels:", set(test_labels))
print("Unique predicted labels:", set(test_predictions))


Unique true labels: {0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81}
Unique predicted labels: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81}


In [22]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the Naive Bayes model
naive_bayes_model = MultinomialNB()

# Train the model on the training data
naive_bayes_model.fit(X_train, train_labels)


In [23]:
from sklearn.metrics import classification_report

# Make predictions on validation and test sets
val_predictions = naive_bayes_model.predict(X_val)
test_predictions = naive_bayes_model.predict(X_test)

# Print classification report for validation set
print("Validation Set Metrics:")
print(classification_report(val_labels, val_predictions, zero_division=0))

# Print classification report for test set
print("Test Set Metrics:")
print(classification_report(test_labels, test_predictions, zero_division=0))


Validation Set Metrics:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00        31
           3       0.00      0.00      0.00         3
           5       0.75      0.02      0.05       126
           6       0.00      0.00      0.00        67
           8       0.00      0.00      0.00        26
           9       0.00      0.00      0.00       109
          10       0.00      0.00      0.00        76
          11       0.64      0.13      0.22       536
          12       0.50      0.01      0.01       138
          13       0.00      0.00      0.00         5
          14       0.00      0.00      0.00        36
          15       0.00      0.00      0.00        34
          16       0.00      0.00      0.00         9
          17       0.00      0.00      0.00        14
          18       0.00      0.00      0.00         1
   

In [24]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT" , add_prefix_space=True)
model = AutoModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [25]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [26]:
input = train_dataset[0]['tokens']
print(input)

['Our', '24', '-', 'year', '-', 'old', 'non', '-', 'smoking', 'male', 'patient', 'presented', 'with', 'repeated', 'hemoptysis', 'in', 'May', '2008', 'with', '4', 'days', 'of', 'concomitant', 'right', 'thoracic', 'pain', 'which', 'intensified', 'while', 'breathing', '.', '\n', 'During', 'holidays', 'in', 'his', 'home', 'country', ',', 'this', 'Cuban', 'patient', 'suffered', 'from', 'a', 'cold', 'with', 'fever', 'and', 'a', 'strong', 'cough', '.', '\n', 'The', 'strong', 'dry', 'cough', 'persisted', 'after', 'recovery', 'from', 'the', 'cold', '.', '\n', 'The', 'patient', 'did', 'not', 'report', 'any', 'loss', 'of', 'weight', '.', '\n', 'The', 'initial', 'CT', 'scan', 'of', 'the', 'thorax', 'showed', 'a', '12', '×', '4', 'cm', 'solid', 'mass', 'paravertebral', 'right', 'in', 'the', 'lower', 'thorax', 'without', 'any', 'signs', 'of', 'metastases', '(', 'Figure', '1', ')', '.', '\n', 'The', 'bronchoscopy', '(', 'Figure', '\u200b2', ')', 'with', 'non', '-', 'bleeding', 'biopsy', 'revealed', '

In [27]:
output = tokenizer(input,is_split_into_words=True)
print(output)

{'input_ids': [101, 1412, 1572, 118, 1214, 118, 1385, 1664, 118, 9987, 2581, 5351, 2756, 1114, 4892, 23123, 4184, 2340, 4863, 1107, 1336, 1369, 1114, 125, 1552, 1104, 14255, 8178, 5168, 2227, 1268, 24438, 6533, 6617, 1665, 2489, 1134, 16744, 1229, 4943, 119, 1219, 13413, 1107, 1117, 1313, 1583, 117, 1142, 16408, 7167, 5351, 3421, 1121, 170, 2504, 1114, 10880, 1105, 170, 2012, 21810, 119, 1103, 2012, 3712, 21810, 20702, 1170, 7593, 1121, 1103, 2504, 119, 1103, 5351, 1225, 1136, 2592, 1251, 2445, 1104, 2841, 119, 1103, 3288, 172, 1204, 14884, 1104, 1103, 24438, 25632, 2799, 170, 1367, 240, 125, 3975, 4600, 3367, 18311, 12986, 15581, 4412, 1268, 1107, 1103, 2211, 24438, 25632, 1443, 1251, 5300, 1104, 27154, 8419, 8830, 113, 2482, 122, 114, 119, 1103, 9304, 1320, 8401, 11428, 5005, 113, 2482, 123, 114, 1114, 1664, 118, 9793, 25128, 12685, 3090, 170, 3367, 1104, 1103, 2211, 1268, 9304, 1320, 15548, 1134, 1117, 2430, 7810, 1193, 1105, 13280, 13601, 2728, 27516, 2430, 7810, 1193, 2136, 2554, 

In [28]:
example = dataset["train"][4]
print(example["tokens"])

['A', '26', '-', 'year', '-', 'old', 'man', 'of', 'Portuguese', 'descent', 'with', 'no', 'significant', 'past', 'medical', 'history', 'presents', 'with', 'subacute', 'onset', 'of', 'right', '-', 'sided', 'hemiparesis', 'and', 'aphasia', ',', 'with', 'marked', 'expressive', 'aphasia', ',', 'word', '-', 'finding', 'difficulty', ',', 'and', 'rare', 'paraphrasic', 'errors', '.', '\n', 'Examination', 'demonstrated', 'right', 'central', 'facial', 'weakness', ',', '2–3/5', 'strength', 'in', 'the', 'right', 'arm', 'and', 'leg', ',', 'and', 'dysmetria', 'in', 'the', 'right', 'upper', 'and', 'lower', 'extremities', '.', '\n', 'Reflexes', 'were', 'hyperactive', 'on', 'the', 'right', 'with', 'a', 'right', '-', 'sided', 'Babinski', 'response', '.', '\n', 'MRI', 'of', 'the', 'brain', 'showed', 'multiple', 'bilateral', 'concentric', 'ring', '-', 'like', 'structures', 'in', 'the', 'centrum', 'semiovale', 'and', 'the', 'corona', 'radiata', 'on', 'T2', 'imaging', '(', 'figure', ',', 'A', ')', ',', 'cons

In [29]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'a', '26', '-', 'year', '-', 'old', 'man', 'of', 'port', '##ug', '##ues', '##e', 'descent', 'with', 'no', 'significant', 'past', 'medical', 'history', 'presents', 'with', 'sub', '##ac', '##ute', 'onset', 'of', 'right', '-', 'sided', 'hem', '##ip', '##ares', '##is', 'and', 'a', '##pha', '##sia', ',', 'with', 'marked', 'expressive', 'a', '##pha', '##sia', ',', 'word', '-', 'finding', 'difficulty', ',', 'and', 'rare', 'para', '##ph', '##ras', '##ic', 'errors', '.', 'examination', 'demonstrated', 'right', 'central', 'facial', 'weakness', ',', '2', '–', '3', '/', '5', 'strength', 'in', 'the', 'right', 'arm', 'and', 'leg', ',', 'and', 'd', '##ys', '##met', '##ria', 'in', 'the', 'right', 'upper', 'and', 'lower', 'ex', '##tre', '##mit', '##ies', '.', 'reflex', '##es', 'were', 'h', '##yper', '##active', 'on', 'the', 'right', 'with', 'a', 'right', '-', 'sided', 'b', '##abi', '##ns', '##ki', 'response', '.', 'm', '##ri', 'of', 'the', 'brain', 'showed', 'multiple', 'bilateral', 'con', '#

In [30]:
len(example[f"numeric_tags"]), len(tokenized_input["input_ids"])

(417, 541)

In [31]:
print(tokenized_input.word_ids())

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 18, 18, 19, 20, 21, 22, 23, 24, 24, 24, 24, 25, 26, 26, 26, 27, 28, 29, 30, 31, 31, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 51, 51, 51, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 68, 68, 69, 71, 71, 72, 73, 73, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 82, 82, 82, 83, 84, 86, 86, 87, 88, 89, 90, 91, 92, 93, 93, 94, 95, 96, 97, 98, 99, 100, 100, 101, 101, 101, 102, 103, 104, 104, 105, 105, 105, 106, 107, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 120, 121, 121, 122, 122, 122, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 136, 137, 138, 139, 139, 140, 142, 142, 142, 143, 144, 145, 146, 146, 146, 147, 148, 149, 150, 150, 150, 151, 152, 153, 154, 154, 155, 156, 157, 157, 157, 157, 157, 158, 159, 160, 161, 162, 163, 163, 163, 163, 163, 163, 164, 165, 166, 167, 169, 170, 1

In [32]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"numeric_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

541 541


In [33]:
# output.word_ids()

In [34]:
# tokenizer.convert_ids_to_tokens(output.input_ids)

In [35]:
# output.word_ids(batch_index=0)

In [36]:
label_all_tokens = True

In [37]:
# Tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, max_length=512)

    labels = []
    for i, label in enumerate(examples[f"numeric_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/288 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [38]:
print(tokenized_train_dataset)

Dataset({
    features: ['tokens', 'tags', 'numeric_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 288
})


In [39]:
print(tokenized_test_dataset)

Dataset({
    features: ['tokens', 'tags', 'numeric_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 80
})


In [40]:
print(tokenized_val_dataset)

Dataset({
    features: ['tokens', 'tags', 'numeric_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 32
})


In [41]:
print(tokenized_train_dataset[0])
print(tokenized_test_dataset[0])
print(tokenized_val_dataset[0])

{'tokens': ['Our', '24', '-', 'year', '-', 'old', 'non', '-', 'smoking', 'male', 'patient', 'presented', 'with', 'repeated', 'hemoptysis', 'in', 'May', '2008', 'with', '4', 'days', 'of', 'concomitant', 'right', 'thoracic', 'pain', 'which', 'intensified', 'while', 'breathing', '.', '\n', 'During', 'holidays', 'in', 'his', 'home', 'country', ',', 'this', 'Cuban', 'patient', 'suffered', 'from', 'a', 'cold', 'with', 'fever', 'and', 'a', 'strong', 'cough', '.', '\n', 'The', 'strong', 'dry', 'cough', 'persisted', 'after', 'recovery', 'from', 'the', 'cold', '.', '\n', 'The', 'patient', 'did', 'not', 'report', 'any', 'loss', 'of', 'weight', '.', '\n', 'The', 'initial', 'CT', 'scan', 'of', 'the', 'thorax', 'showed', 'a', '12', '×', '4', 'cm', 'solid', 'mass', 'paravertebral', 'right', 'in', 'the', 'lower', 'thorax', 'without', 'any', 'signs', 'of', 'metastases', '(', 'Figure', '1', ')', '.', '\n', 'The', 'bronchoscopy', '(', 'Figure', '\u200b2', ')', 'with', 'non', '-', 'bleeding', 'biopsy', 'r

In [42]:
tokenize_and_align_labels(dataset['train'][:5])

{'input_ids': [[101, 170, 1406, 118, 1214, 118, 1385, 2331, 8071, 26181, 6997, 1182, 1299, 2756, 1106, 1412, 2704, 1114, 1126, 19700, 1105, 1956, 1104, 190, 16996, 1596, 4035, 2093, 20695, 13200, 23610, 119, 1995, 1552, 2988, 1106, 1142, 8685, 117, 1119, 1125, 4531, 5199, 3105, 24716, 2489, 1105, 26979, 1158, 117, 1105, 1119, 1125, 1151, 5165, 1107, 170, 2425, 1920, 3695, 1111, 1515, 170, 1692, 1104, 12104, 13316, 13782, 11745, 6620, 119, 1117, 3288, 8006, 4725, 132, 1649, 117, 1119, 6044, 1245, 1126, 8212, 1665, 1105, 4267, 21484, 9080, 1174, 119, 1173, 1119, 1108, 3175, 1106, 1412, 2704, 1111, 1748, 2635, 119, 1120, 8685, 1106, 1412, 2704, 117, 1119, 1108, 8669, 24346, 117, 20806, 117, 1105, 4267, 21484, 9080, 1174, 119, 1119, 1108, 27629, 8992, 1643, 1673, 1596, 1114, 5190, 14426, 2184, 119, 10496, 18081, 5048, 14494, 1108, 1675, 132, 1649, 117, 1117, 179, 9610, 5552, 1396, 23901, 2997, 1108, 1136, 2120, 119, 1117, 8561, 1108, 11150, 11883, 120, 2517, 117, 1117, 1892, 2997, 1108, 81

### Fine-Tuning the model

**Conditional Random Fields (CRF) Model**

In [43]:
pip install sklearn-crfsuite


Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


In [44]:
# Train, validation, test split
dataset = dataset["train"].train_test_split(test_size=0.2)
train_val_split = dataset["train"].train_test_split(test_size=0.1)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]
test_dataset = dataset["test"]


In [45]:
def flatten_labels_and_features(dataset_split, label_list):
    texts = []
    labels = []
    for tokens, label_seq in zip(dataset_split["tokens"], dataset_split["numeric_tags"]):
        sentence_tokens = []
        sentence_labels = []
        for token, label in zip(tokens, label_seq):
            sentence_tokens.append(token)  # Keep tokens as is
            sentence_labels.append(label_list[label])  # Map numeric label to string label
        texts.append(sentence_tokens)
        labels.append(sentence_labels)
    return texts, labels

In [46]:
# Map the numeric labels to string labels using the label_list
label_list = ['B-Activity', 'B-Administration', 'B-Age', 'B-Area', 'B-Biological_attribute', 'B-Biological_structure', 'B-Clinical_event', 'B-Color', 'B-Coreference', 'B-Date', 'B-Detailed_description', 'B-Diagnostic_procedure', 'B-Disease_disorder', 'B-Distance', 'B-Dosage', 'B-Duration', 'B-Family_history', 'B-Frequency', 'B-Height', 'B-History', 'B-Lab_value', 'B-Mass', 'B-Medication', 'B-Nonbiological_location', 'B-Occupation', 'B-Other_entity', 'B-Other_event', 'B-Outcome', 'B-Personal_background', 'B-Qualitative_concept', 'B-Quantitative_concept', 'B-Severity', 'B-Sex', 'B-Shape', 'B-Sign_symptom', 'B-Subject', 'B-Texture', 'B-Therapeutic_procedure', 'B-Time', 'B-Volume', 'B-Weight', 'I-Activity', 'I-Administration', 'I-Age', 'I-Area', 'I-Biological_attribute', 'I-Biological_structure', 'I-Clinical_event', 'I-Color', 'I-Coreference', 'I-Date', 'I-Detailed_description', 'I-Diagnostic_procedure', 'I-Disease_disorder', 'I-Distance', 'I-Dosage', 'I-Duration', 'I-Family_history', 'I-Frequency', 'I-Height', 'I-History', 'I-Lab_value', 'I-Mass', 'I-Medication', 'I-Nonbiological_location', 'I-Occupation', 'I-Other_entity', 'I-Other_event', 'I-Outcome', 'I-Personal_background', 'I-Qualitative_concept', 'I-Quantitative_concept', 'I-Severity', 'I-Shape', 'I-Sign_symptom', 'I-Subject', 'I-Texture', 'I-Therapeutic_procedure', 'I-Time', 'I-Volume', 'I-Weight', 'O']

train_tokens, train_labels = flatten_labels_and_features(train_dataset, label_list)
val_tokens, val_labels = flatten_labels_and_features(val_dataset, label_list)
test_tokens, test_labels = flatten_labels_and_features(test_dataset, label_list)

print(f"Number of tokens in train dataset: {len(train_tokens)}")
print(f"Number of labels in train dataset: {len(train_labels)}")

Number of tokens in train dataset: 230
Number of labels in train dataset: 230


In [47]:
# Define a extraction function for CRF
def extract_features(tokens):
    features = []
    for i in range(len(tokens)):
        token = tokens[i]
        features.append({
            'word': token,  # Basic feature: the token itself
            'is_capitalized': token[0].upper() == token[0],
            'is_digit': token.isdigit(),
            'prefix': token[:3],
            'suffix': token[-3:],
        })
    return features

# Extract features for train, validation, and test datasets
X_train_crf = [extract_features(tokens) for tokens in train_tokens]
X_val_crf = [extract_features(tokens) for tokens in val_tokens]
X_test_crf = [extract_features(tokens) for tokens in test_tokens]

In [48]:
from sklearn_crfsuite import CRF

crf_model = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

crf_model.fit(X_train_crf, train_labels)

# Predict on validation and test datasets
val_predictions_crf = crf_model.predict(X_val_crf)
test_predictions_crf = crf_model.predict(X_test_crf)

In [49]:
# Import classification_report for evaluation
from sklearn_crfsuite.metrics import flat_classification_report

print("Validation Set Metrics (CRF):")
print(flat_classification_report(val_labels, val_predictions_crf))

Validation Set Metrics (CRF):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

              B-Activity       0.00      0.00      0.00        11
        B-Administration       0.00      0.00      0.00         4
                   B-Age       0.96      0.93      0.94        27
                  B-Area       1.00      0.20      0.33         5
  B-Biological_structure       0.49      0.30      0.37       142
        B-Clinical_event       0.67      0.33      0.44        42
           B-Coreference       0.00      0.00      0.00        10
                  B-Date       0.74      0.68      0.71        62
  B-Detailed_description       0.19      0.05      0.08        63
  B-Diagnostic_procedure       0.55      0.44      0.49       415
      B-Disease_disorder       0.50      0.16      0.24       114
              B-Distance       0.50      0.10      0.17        10
                B-Dosage       0.50      0.39      0.44        18
              B-Duration       0.00      0.00      0.00        18
        B

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
print("Test Set Metrics (CRF):")
print(flat_classification_report(test_labels, test_predictions_crf))

Test Set Metrics (CRF):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

              B-Activity       0.00      0.00      0.00        13
        B-Administration       0.00      0.00      0.00        11
                   B-Age       0.97      0.88      0.92        66
                  B-Area       1.00      0.20      0.33        10
  B-Biological_structure       0.48      0.23      0.32       379
        B-Clinical_event       0.71      0.31      0.43       155
                 B-Color       0.00      0.00      0.00         2
           B-Coreference       0.00      0.00      0.00        55
                  B-Date       0.57      0.50      0.53       164
  B-Detailed_description       0.27      0.05      0.09       186
  B-Diagnostic_procedure       0.58      0.45      0.51      1012
      B-Disease_disorder       0.43      0.15      0.22       293
              B-Distance       0.50      0.09      0.15        11
                B-Dosage       0.60      0.48      0.53        67
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
from transformers import AutoConfig, AutoModelForTokenClassification

config = AutoConfig.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",  num_labels=len(label_list))
model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", config=config)


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [53]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [54]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")
# label_names = dataset["train"].features["ner_labels"].feature.names

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [55]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="finetuned_ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Train the model
trainer.train()





  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:


Abort: 

In [None]:
trainer.evaluate()

In [None]:
predictions, labels, _ = trainer.predict(tokenized_test_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
from transformers import AutoConfig, AutoModelForTokenClassification

config = AutoConfig.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",  num_labels=len(label_list))
model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", config=config)