In [1]:
from datasets import load_dataset

# Read the cleaned dialogue/summary records from the JSON file as one "train" split
cleaned_json_path = "../data/cleaned_mts_dialogue_summary.json"
dataset = load_dataset(
    "json",
    data_files=cleaned_json_path,
    split="train",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Rename the raw columns to the "input"/"target" names used by the cells below.
column_renames = {
    "dialogue": "input",
    "section_text": "target"
}

# Only rename columns that still carry their original name, so re-running this
# cell after a successful rename is a no-op instead of raising on the missing
# source columns.
pending_renames = {
    old: new for old, new in column_renames.items() if old in dataset.column_names
}
if pending_renames:
    dataset = dataset.rename_columns(pending_renames)

# Fail loudly if the expected columns are absent (e.g. the source file changed).
missing_columns = set(column_renames.values()) - set(dataset.column_names)
assert not missing_columns, f"missing expected columns: {sorted(missing_columns)}"

In [3]:
# Display the first record as a sanity check of the loaded fields
dataset[0]

{'ID': 0,
 'section_header': 'GENHX',
 'target': 'Symptoms: no fever, no chills, no cough, no congestion, no nausea, no vomiting, no chest pain, no chest pressure.\nDiagnosis: hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis, kidney stones\nHistory of Patient: 76-year-old white female, presents to the clinic today originally for hypertension and a med check, followed by Dr. Kumar, issues stable\nPlan of Action: N/A',
 'input': 'Doctor: What brings you back into the clinic today, miss? \nPatient: I came in for a refill of my blood pressure medicine. \nDoctor: It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.  Have you noticed any changes or do you have any concerns regarding these issues?  \nPatient: No. \nDoctor: Have you had any fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure?\nPatient: No.  \nDoctor: Grea

In [4]:
# Split raw dataset (not tokenized yet!)
# A fixed seed keeps the 90/10 train/eval split identical across kernel
# restarts, so the saved tokenized splits are reproducible.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
raw_train = split_dataset["train"]
raw_eval = split_dataset["test"]

In [5]:
# Load Tokenizer

from transformers import AutoTokenizer

# Checkpoint identifier handed to AutoTokenizer.from_pretrained below
model_checkpoint = "facebook/bart-base"
# Shared tokenizer instance; tokenize_function reads it as a global
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Tokenize the Dataset

def tokenize_function(example):
    """Tokenize source text and target text into model inputs with labels.

    Works for a single example (string fields) and for a batch (lists of
    strings), which is what `Dataset.map(..., batched=True)` passes in.

    Args:
        example: Mapping with an "input" entry (source text) and a "target"
            entry (summary text).

    Returns:
        The encoding returned by the global `tokenizer` for the source text,
        padded/truncated to 512 tokens, with an added "labels" entry holding
        the target token ids padded/truncated to 128 tokens.
    """
    model_inputs = tokenizer(
        example["input"],
        max_length=512,
        padding="max_length",
        truncation=True
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the label side.
    labels = tokenizer(
        text_target=example["target"],
        max_length=128,
        padding="max_length",
        truncation=True
    )
    # Use the ids exactly as returned. In batched mode this is a list of
    # per-example id lists, so an element-wise int() conversion would raise
    # TypeError on the inner lists.
    model_inputs["labels"] = labels["input_ids"]
    # NOTE(review): labels are padded to max_length with the tokenizer's pad id;
    # confirm the training step masks those positions (e.g. with -100) so they
    # do not contribute to the loss.
    return model_inputs


In [7]:
# Run the batched tokenizer over the train split, then the eval split
tokenized_train, tokenized_eval = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (raw_train, raw_eval)
)

Map: 100%|██████████| 1161/1161 [00:00<00:00, 2038.21 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1909.37 examples/s]


In [8]:
# Persist each tokenized split to its own directory under ../data

for tokenized_split, output_dir in (
    (tokenized_train, "../data/tokenized_mts_summarizer_train"),
    (tokenized_eval, "../data/tokenized_mts_summarizer_eval"),
):
    tokenized_split.save_to_disk(output_dir)


Saving the dataset (1/1 shards): 100%|██████████| 1161/1161 [00:00<00:00, 46392.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 129/129 [00:00<00:00, 5055.50 examples/s]
