In [None]:
# 1. Basic manipulations
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled texts; "utf-8-sig" transparently drops a leading BOM if present.
df = pd.read_csv("sampled_labels_text_label_merged.csv", encoding="utf-8-sig")

# Keep only the two columns used downstream and drop rows missing either one.
df = df[["text", "label"]].copy()
df = df.dropna(subset=["text", "label"])

# Yes/No to 1/0
label_map = {"Yes": 1, "No": 0}

# Tolerate stray whitespace around a label, but fail loudly on any other value:
# an unmapped label would otherwise become NaN, silently turn `label_id` into
# floats, and corrupt both the stratified split and the training targets.
clean_label = df["label"].astype(str).str.strip()
unexpected_labels = sorted(set(clean_label) - set(label_map))
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values {unexpected_labels}; expected one of {sorted(label_map)}"
    )
df["label_id"] = clean_label.map(label_map).astype("int64")

print("Label distribution:\n", df["label_id"].value_counts())

# Stratified 80/20 train/validation split; the fixed seed makes it reproducible.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label_id"],
)

# Sanity check: the split must partition the cleaned frame exactly.
assert len(train_df) + len(valid_df) == len(df)

print("Train size:", len(train_df), "Valid size:", len(valid_df))

Label distribution:
 label_id
1    745
0    596
Name: count, dtype: int64
Train size: 1072 Valid size: 269


In [None]:
# 2. Hugging Face Datasets encapsulation
from datasets import Dataset

# The split frames carry a shuffled, non-default index. Without
# preserve_index=False, from_pandas would store it as an extra
# "__index_level_0__" column in each dataset.
train_ds = Dataset.from_pandas(train_df[["text", "label_id"]], preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df[["text", "label_id"]], preserve_index=False)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# 3. loading ModernBERT model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "answerdotai/ModernBERT-base"

# Human-readable class names stored in the model config, so the saved model
# reports "No"/"Yes" instead of the generic LABEL_0/LABEL_1. The ids mirror
# the Yes -> 1 / No -> 0 encoding used to build `label_id`.
id2label = {0: "No", 1: "Yes"}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The classification head is newly initialised on top of the pretrained
# encoder, so the model must be fine-tuned before it is used for prediction.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 4. Define the tokenization function and tokenize the datasets
max_length = 256


def tokenize_function(batch):
    """Tokenize the ``"text"`` entries of a batch with the global ``tokenizer``.

    The tokenizer is asked to truncate and to pad with the ``"max_length"``
    strategy, using the module-level ``max_length`` as the limit.

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize each split in batches, then rename the target column to "labels".
train_ds_tokenized = train_ds.map(tokenize_function, batched=True).rename_column(
    "label_id", "labels"
)
valid_ds_tokenized = valid_ds.map(tokenize_function, batched=True).rename_column(
    "label_id", "labels"
)

# Return only the model inputs and the targets, formatted as torch tensors.
model_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (train_ds_tokenized, valid_ds_tokenized):
    tokenized_split.set_format(type="torch", columns=list(model_columns))

Map: 100%|██████████| 1072/1072 [00:00<00:00, 1938.02 examples/s]
Map: 100%|██████████| 269/269 [00:00<00:00, 1621.19 examples/s]


In [None]:
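# NOTE: superseded first attempt, kept for reference only. It fails with the TypeError
# shown below because the installed transformers release does not accept `evaluation_strategy`.
# # 5. define parameters and Trainer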
# from transformers import TrainingArguments, Trainer
# import evaluate
# import numpy as np

# accuracy_metric = evaluate.load("accuracy")
# f1_metric = evaluate.load("f1")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = np.argmax(logits, axis=-1)
#     return {
#         "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
#         "f1": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"],
#     }

# training_args = TrainingArguments(
#     output_dir="./modernbert_scientific_activity",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=32,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     evaluation_strategy="epoch", 
#     save_strategy="epoch",
#     logging_steps=50,
#     load_best_model_at_end=True,
#     metric_for_best_model="f1",
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_ds_tokenized,
#     eval_dataset=valid_ds_tokenized,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# 5. Define parameters and Trainer (adapted to the installed transformers version, which rejects `evaluation_strategy`)
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

# Load the two `evaluate` metrics used by compute_metrics: accuracy first, then F1.
accuracy_metric, f1_metric = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predictions = np.argmax(scores, axis=-1)

    accuracy_result = accuracy_metric.compute(
        predictions=predictions, references=references
    )
    f1_result = f1_metric.compute(
        predictions=predictions, references=references, average="macro"
    )
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases (the old name raises a TypeError there). Pick whichever keyword the
# installed version accepts, so per-epoch evaluation and best-model selection
# are not lost.
import inspect

_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./modernbert_scientific_activity",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    # Evaluate and checkpoint once per epoch. The two strategies must match
    # for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics (macro-averaged F1).
    metric_for_best_model="f1",
    **{_eval_strategy_key: "epoch"},
)

# Newer transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=`. Use whichever keyword the installed version
# accepts, so the cell runs without the deprecation warning and keeps working
# once the old name is removed.
import inspect

_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_tokenized,
    eval_dataset=valid_ds_tokenized,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

  trainer = Trainer(


In [None]:
# 6. Fine-tune the model on the training split passed to the Trainer
trainer.train()

# 7. Evaluate on the Trainer's eval_dataset (the validation split) and print
#    the resulting metrics dict
metrics = trainer.evaluate()
print("Eval metrics:", metrics)



The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Step,Training Loss
50,0.5179


: 

In [None]:
# 8. Save the fine-tuned model
save_dir = "./modernbert_scientific_activity_final"

# Write the fine-tuned model first, then the tokenizer, into the same directory
# so the folder can be reloaded on its own.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(save_dir)

print("Model saved to", save_dir)