In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the FAERS adalimumab CSV (named "cleaned" in its filename) into one frame.
# low_memory=False parses the file in a single pass rather than in chunks.
faers_csv = 'data/faers_adalimumab_2020-2024_ungrouped_cleaned_2.csv'
df = pd.read_csv(faers_csv, low_memory=False)

In [3]:
# Prepare labeled dataset only: rows with a known SOC, one row per distinct `pt`
# (first occurrence wins), and a `pt` that is neither missing nor blank.
df_labeled = df[df['SOC'].notna()].drop_duplicates(subset=['pt'])
pt_is_usable = df_labeled['pt'].notna() & (df_labeled['pt'].str.strip() != '')
# Copy *after* the final filter so df_labeled is an independent frame. A later
# cell adds a 'label' column to it, and assigning into a filtered slice can
# trigger pandas' SettingWithCopyWarning.
df_labeled = df_labeled[pt_is_usable].copy()

In [4]:
# Encode labels: fit the encoder on the SOC strings, then store the integer
# class id of every row in a new 'label' column.
le = LabelEncoder().fit(df_labeled['SOC'])
df_labeled['label'] = le.transform(df_labeled['SOC'])
# Number of distinct SOC classes seen by the encoder.
num_labels = le.classes_.shape[0]

In [5]:
# Hold out 20% of the labeled rows for validation. Stratifying on the encoded
# label keeps class proportions similar in both parts; the fixed random_state
# makes the split repeatable.
train_df, val_df = train_test_split(
    df_labeled,
    test_size=0.2,
    random_state=42,
    stratify=df_labeled['label'],
)

In [6]:
# Tokenize: load the tokenizer published with the BioBERT checkpoint.
biobert_checkpoint = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = AutoTokenizer.from_pretrained(biobert_checkpoint)

def tokenize(example):
    """Tokenize the 'pt' field of a dataset example (or batch of examples).

    The text is truncated and padded to a fixed length of 128 tokens using the
    module-level `tokenizer`. Returns whatever the tokenizer call returns.
    """
    pt_text = example['pt']
    encoded = tokenizer(
        pt_text,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded

# Columns exposed to the model as torch tensors once tokenization is done.
model_columns = ['input_ids', 'attention_mask', 'label']

# Training split: pandas -> Dataset -> tokenized in batches.
train_ds = Dataset.from_pandas(train_df[['pt', 'label']])
train_ds = train_ds.map(tokenize, batched=True)

# Validation split goes through the same pipeline.
val_ds = Dataset.from_pandas(val_df[['pt', 'label']])
val_ds = val_ds.map(tokenize, batched=True)

for split_ds in (train_ds, val_ds):
    split_ds.set_format(type='torch', columns=model_columns)

Map: 100%|██████████| 6522/6522 [00:00<00:00, 14864.54 examples/s]
Map: 100%|██████████| 1631/1631 [00:00<00:00, 16212.22 examples/s]


In [7]:
num_labels = len(le.classes_)

# Store the SOC names in the model config. Without these mappings the saved
# model only knows generic LABEL_<n> names, and the LabelEncoder itself is not
# persisted anywhere, so class ids could not be turned back into SOC strings
# after a kernel restart. str() keeps the config JSON-serialisable.
id2label = {idx: str(soc) for idx, soc in enumerate(le.classes_)}
label2id = {soc: idx for idx, soc in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    'dmis-lab/biobert-base-cased-v1.1',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def compute_metrics(pred):
    """Compute classification accuracy from a Trainer evaluation result.

    Parameters
    ----------
    pred : object exposing `predictions` and `label_ids`
        `predictions` is a 2-D array of per-class scores (samples x classes),
        or a tuple whose first element is that array. `label_ids` holds the
        integer ground-truth class ids.

    Returns
    -------
    dict
        `{'accuracy': value}` where `value` is a built-in float: the fraction
        of samples whose highest-scoring class equals the true label.
    """
    scores = pred.predictions
    # Some models return extra outputs next to the logits; the logits come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    labels = np.asarray(pred.label_ids)
    preds = np.argmax(scores, axis=1)
    # Return a plain Python float instead of a NumPy scalar so the metric is
    # safe to log and serialise.
    return {'accuracy': float((preds == labels).mean())}

In [9]:
# Train the model
# Evaluation and checkpointing both run once per epoch, so the checkpoint with
# the highest validation accuracy can be reloaded when training finishes.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='data/results_biobert_finetune',
    logging_dir='data/logs',
    logging_steps=10,
    # Optimisation hyperparameters
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Per-epoch evaluation, checkpointing and best-model selection
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9692,0.780633,0.83691
2,0.3033,0.465603,0.900674
3,0.1188,0.388739,0.917229
4,0.0741,0.361457,0.926426
5,0.1214,0.357949,0.927039


TrainOutput(global_step=2040, training_loss=0.5661450909194993, metrics={'train_runtime': 11046.7822, 'train_samples_per_second': 2.952, 'train_steps_per_second': 0.185, 'total_flos': 2145494358950400.0, 'train_loss': 0.5661450909194993, 'epoch': 5.0})

In [10]:
# Write the fine-tuned model and the tokenizer files into the same directory.
export_dir = 'biobert-finetuned'
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

('biobert-finetuned\\tokenizer_config.json',
 'biobert-finetuned\\special_tokens_map.json',
 'biobert-finetuned\\vocab.txt',
 'biobert-finetuned\\added_tokens.json',
 'biobert-finetuned\\tokenizer.json')

In [11]:
# Evaluate the model on the validation split.
outputs = trainer.predict(val_ds)
y_val_pred = np.argmax(outputs.predictions, axis=1)
y_val_true = val_df['label'].values

# Pass every encoded class id explicitly. Without `labels`, the report infers
# the classes from y_true/y_pred and raises a ValueError when that count does
# not match len(target_names), e.g. when a rare SOC is absent from the
# validation split and never predicted. zero_division=0 reports such classes
# as 0 without emitting warnings.
all_label_ids = np.arange(len(le.classes_))
print(classification_report(
    y_val_true,
    y_val_pred,
    labels=all_label_ids,
    target_names=le.classes_,
    zero_division=0,
))

                                                                     precision    recall  f1-score   support

                               Blood and lymphatic system disorders       1.00      0.78      0.88         9
                                                  Cardiac disorders       0.95      0.95      0.95        37
                         Congenital, familial and genetic disorders       0.88      0.82      0.85        17
                                        Ear and labyrinth disorders       1.00      0.89      0.94         9
                                                Endocrine disorders       0.90      0.90      0.90        10
                                                      Eye disorders       0.85      0.95      0.90        59
                                         Gastrointestinal disorders       0.95      0.92      0.94       114
               General disorders and administration site conditions       0.86      0.88      0.87       112
                  

In [12]:
# Test data preprocessing: rows whose SOC is missing, one row per distinct
# `pt`, keeping only `pt` values that are neither missing nor blank.
test_df = df.loc[df['SOC'].isna()].drop_duplicates(subset=['pt']).copy()
pt_ok = test_df['pt'].notna() & (test_df['pt'].str.strip() != '')
test_df = test_df.loc[pt_ok].reset_index(drop=True)

# Build an unlabeled Dataset from the `pt` strings and tokenize it with the
# same function used for the training data.
test_ds = Dataset.from_dict({'pt': test_df['pt'].tolist()})
test_ds = test_ds.map(tokenize, batched=True)
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask'])

Map: 100%|██████████| 1132/1132 [00:00<00:00, 6043.39 examples/s]


In [13]:
# # Predict
# test_outputs = trainer.predict(test_ds)
# test_pred_labels = np.argmax(test_outputs.predictions, axis=1)

# test_pred_socs = le.inverse_transform(test_pred_labels)
# test_df['SOC_biobert'] = test_pred_socs

In [14]:
# # Save predictions to CSV
# test_df[['pt', 'SOC_biobert']].to_csv('data/4.2 BioBERT_predictions.csv', index=False)