### 1. Load and Split data

In [12]:
!pip install -Uqq datasets fsspec transformers evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
from datasets import load_dataset

data = load_dataset("gretelai/gretel-patient-events-v1")

README.md:   0%|          | 0.00/6.70k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7348 [00:00<?, ? examples/s]

In [4]:
data = data["train"].train_test_split(test_size=0.2, seed=42)
data

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'first_name', 'last_name', 'date_of_birth', 'sex', 'race', 'weight', 'height', 'event_id', 'event_type', 'event_date', 'event_name', 'provider_name', 'reason', 'result', 'details', 'notes'],
        num_rows: 5878
    })
    test: Dataset({
        features: ['patient_id', 'first_name', 'last_name', 'date_of_birth', 'sex', 'race', 'weight', 'height', 'event_id', 'event_type', 'event_date', 'event_name', 'provider_name', 'reason', 'result', 'details', 'notes'],
        num_rows: 1470
    })
})

### 2. Preprocess

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

In [6]:
def preprocess(samples):
    texts = []
    for i in range(len(samples["patient_id"])):
        fields = [
            f"Patient ID: {samples['patient_id'][i]}",
            f"Name: {samples['first_name'][i]} {samples['last_name'][i]}",
            f"DOB: {samples['date_of_birth'][i]}",
            f"Sex: {samples['sex'][i]}",
            f"Race: {samples['race'][i]}",
            f"Weight: {samples['weight'][i]} kg",
            f"Height: {samples['height'][i]} cm",
            f"Event ID: {samples['event_id'][i]}",
            f"Event Type: {samples['event_type'][i]}",
            f"Event Date: {samples['event_date'][i]}",
            f"Event Name: {samples['event_name'][i]}",
            f"Provider: {samples['provider_name'][i]}",
            f"Reason: {samples['reason'][i]}",
            f"Result: {samples['result'][i]}",
            f"Details: {samples['details'][i]}",
            f"Notes: {samples['notes'][i]}"
        ]
        # Only include non-empty fields
        text = ", ".join([f for f in fields if f and f != 'nan'])
        texts.append(text)
    model_inputs = tokenizer(texts)
    # model_inputs["text"] = texts
    return model_inputs


tokenized_data = data.map(preprocess, batched=True, remove_columns=data["train"].column_names)

Map:   0%|          | 0/5878 [00:00<?, ? examples/s]

Map:   0%|          | 0/1470 [00:00<?, ? examples/s]

In [13]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5878
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1470
    })
})

In [14]:
print(tokenized_data["train"][0])

{'input_ids': [1, 18276, 4843, 4576, 35, 4751, 438, 12, 401, 27873, 28621, 12, 134, 6, 10704, 35, 6939, 15738, 3578, 6, 14010, 387, 35, 12087, 73, 996, 73, 1646, 4432, 6, 15516, 35, 23172, 6, 8989, 35, 735, 6, 17515, 35, 10572, 4, 288, 14091, 6, 37099, 35, 5138, 4, 288, 25434, 6, 11373, 4576, 35, 132, 6, 11373, 7773, 35, 3067, 37361, 6, 11373, 10566, 35, 15140, 73, 1092, 73, 844, 1922, 6, 11373, 10704, 35, 12464, 23063, 6, 31314, 35, 7312, 17129, 824, 6, 31613, 35, 31424, 6, 33868, 35, 23867, 16003, 46224, 6, 10574, 35, 25522, 48268, 17152, 35, 12464, 14194, 1487, 2849, 8293, 9799, 46224, 5542, 678, 7482, 34049, 38951, 4, 17638, 1720, 18719, 9, 1416, 8, 55, 2167, 3044, 13, 33945, 25934, 16836, 13310, 12673, 4, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:
from transformers import DataCollatorForLanguageModeling

# collate_fn with masking and dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

### 3. Prepare and Train model

In [9]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [17]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

model = AutoModelForMaskedLM.from_pretrained("microsoft/deberta-base")
model

In [17]:
from peft import LoraConfig, TaskType, get_peft_model

# specify how to build peft model
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["in_proj"],  # inject for every in_proj layer
    lora_dropout=0.05
)

# freeze everything but adapters
model = get_peft_model(model, peft_config)

# unfreeze untrained cls head for mlm
for name, param in model.named_parameters():
    if "cls.predictions." in name:
        param.requires_grad = True

# Define a new “patched” forward that removes 'num_items_in_batch'
# _original_forward = model.forward

# def _forward_without_num_items(*args, **kwargs):
#     kwargs.pop("num_items_in_batch", None)
#     return _original_forward(*args, **kwargs)

# model.forward = _forward_without_num_items

model.print_trainable_parameters()

trainable params: 937,305 || all params: 139,539,033 || trainable%: 0.6717


In [27]:
import evaluate
from transformers import EvalPrediction

# Load metric from evaluate
metric = evaluate.load("perplexity")

# Custom compute_metrics function
def compute_metrics(eval_pred: EvalPrediction):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    results = metric.compute(predictions=preds, references=labels)
    return results

In [18]:
train_args = TrainingArguments("results2",
                               eval_strategy="no",
                               max_steps= 2000,
                               # eval_steps= 500,
                               logging_steps=200,
                               per_device_train_batch_size=4,
                               per_device_eval_batch_size=4,
                               weight_decay=0.01,
                               remove_unused_columns=False,
                               report_to="none")

trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_data["train"],
                  # eval_dataset=tokenized_data["test"],
                  data_collator=data_collator,
                  # compute_metrics=compute_metrics
                 )
trainer.train()

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
200,5.0937
400,2.842
600,2.4976
800,2.3099
1000,2.1924
1200,2.1345
1400,2.0616
1600,1.9609
1800,1.9522
2000,2.0165




TrainOutput(global_step=2000, training_loss=2.506134017944336, metrics={'train_runtime': 356.6675, 'train_samples_per_second': 44.86, 'train_steps_per_second': 5.607, 'total_flos': 1521689573089764.0, 'train_loss': 2.506134017944336, 'epoch': 2.7210884353741496})

### 4. Test inference

In [24]:
tokenizer.decode(tokenized_data['test'][0]['input_ids'])

'[CLS]Patient ID: pmc-6186366-2, Name: Yasir Shah, DOB: 03/18/1961, Sex: Male, Race: Asian, Weight: 176.0 kg, Height: 68.0 cm, Event ID: 4, Event Type: Surgery, Event Date: 06/28/2022, Event Name: Biopsy Procedure, Provider: Specialist Dr. Liu, Reason: To confirm biopsy recommendation, Result: Esophageal tissue samples obtained, Details: {"intensity":null,"location":"esophagus"}, Notes: Biopsy conducted, awaiting histopathological results to determine presence of Barrett\'s or other changes.[SEP]'

In [21]:
text = 'Patient ID: pmc-6186366-2, Name: Yasir Shah, DOB: 03/18/1961, Sex: Male, Race: Asian, Weight: 176.0 kg, Height: 68.0 cm, Event ID: 4, Event Type: Surgery, Event Date: 06/28/2022, Event Name: [MASK] Procedure, Provider: Specialist Dr. Liu, Reason: To confirm biopsy recommendation, Result: Esophageal tissue samples obtained, Details: {"intensity":null,"location":"esophagus"}, Notes: Biopsy conducted, awaiting histopathological results to determine presence of Barrett\'s or other changes.'
inputs = tokenizer(text, return_tensors="pt").to(model.device)

with torch.inference_mode():
    y_logits = model(**inputs).logits
    print(y_logits.shape)

sequence = []
for token in y_logits.squeeze(0):
    sequence.append(tokenizer.decode(token.argmax(0)))
print(sequence)