In [16]:
import pandas as pd                         # For handling CSV files and dataframes
from sklearn.model_selection import train_test_split  # For splitting data into training and evaluation sets
from datasets import Dataset                # For converting dataframes to Hugging Face datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments  # Hugging Face tools

In [17]:
data = {
    'text': [
        "Patient diagnosed with Type 2 Diabetes. Prescribed Metformin.",
        "Blood pressure is elevated. Recommend increasing dosage of Lisinopril.",
        "MRI shows a herniated disc at L4-L5. Consider surgical consultation.",
        "Follow-up required for abnormal liver enzyme levels.",
        "Patient reports increased shortness of breath. Check for possible COPD exacerbation.",
        "Hemoglobin levels are low. Suggest iron supplements.",
        "Cholesterol levels are high. Initiate statin therapy.",
        "Signs of early-stage kidney disease. Advise on low-protein diet.",
        "Recommend regular blood sugar monitoring.",
        "Patient experiencing joint pain. Consider Rheumatology referral."
    ],
    'label': [
        "Diabetes Management", "Hypertension Management", "Spinal Surgery",
        "Liver Function", "COPD Management", "Anemia Management",
        "Hyperlipidemia", "Kidney Function", "Diabetes Monitoring", "Arthritis Management"
    ]
}


In [18]:
# Step 2: Define a label mapping (convert string labels to integers)
label_mapping = {
    "Diabetes Management": 0,
    "Hypertension Management": 1,
    "Spinal Surgery": 2,
    "Liver Function": 3,
    "COPD Management": 4,
    "Anemia Management": 5,
    "Hyperlipidemia": 6,
    "Kidney Function": 7,
    "Diabetes Monitoring": 8,
    "Arthritis Management": 9
}

# Step 3: Apply the label mapping
df = pd.DataFrame(data)
df['label'] = df['label'].map(label_mapping)
dataset = Dataset.from_pandas(df)

In [19]:

# Load the pre-trained BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")




In [20]:
def tokenize_function(example):
    return tokenizer(
        example['text'], 
        padding='max_length',        # Pad all sentences to the model's max length
        truncation=True,             # Truncate longer sequences
        max_length=128               # Set max length to avoid excessive padding
    )

In [21]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [22]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


In [23]:
model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1", num_labels=len(label_mapping)
)

  return torch.load(checkpoint_file, map_location=map_location)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Start the training process
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9824172258377075, 'eval_runtime': 0.678, 'eval_samples_per_second': 2.95, 'eval_steps_per_second': 1.475, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.006671905517578, 'eval_runtime': 0.5112, 'eval_samples_per_second': 3.912, 'eval_steps_per_second': 1.956, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.021620750427246, 'eval_runtime': 1.2343, 'eval_samples_per_second': 1.62, 'eval_steps_per_second': 0.81, 'epoch': 3.0}
{'train_runtime': 28.8492, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'train_loss': 2.3633880615234375, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=2.3633880615234375, metrics={'train_runtime': 28.8492, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'train_loss': 2.3633880615234375, 'epoch': 3.0})