In [1]:
import os
# Specify the working directory
os.chdir('/Users/david/Desktop/FinetuneEmbed')
import pickle
import numpy as np
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback

from mod.mod_text import *

# prepare the input data
with open("./data/DosageSensitivity/train_data.pkl", "rb") as f:
    train_data = pickle.load(f)
with open("./data/DosageSensitivity/test_data.pkl", "rb") as f:
    test_data = pickle.load(f)

# Prepare datasets
train_texts_all, train_labels_all = train_data['desc'], train_data['labels']
test_texts, test_labels = test_data['desc'], test_data['labels']

# Load model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

  Referenced from: <9A4710B9-0DA3-36BB-9129-645F282E64B2> /Users/david/anaconda3/envs/myenv/lib/python3.10/site-packages/torchvision/image.so
  warn(


In [2]:
n_splits = 5  # Number of folds
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=7)

# Initialize a list to store AUC scores for each fold and on the test data
val_auc_scores = []
test_auc_scores = []
output_dirs = []  # Track output directories for each fold

# Create test dataset
test_dataset = TextDataset(test_texts, test_labels, tokenizer)

In [3]:
# Loop over each fold
for fold, (train_index, val_index) in enumerate(kf.split(train_texts_all, train_labels_all)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Split data into training and validation for this fold
    train_texts, val_texts = [train_texts_all[i] for i in train_index], [train_texts_all[i] for i in val_index]
    train_labels, val_labels = [train_labels_all[i] for i in train_index], [train_labels_all[i] for i in val_index]

    # Create PyTorch datasets
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)

    # Define output directory for this fold
    output_dir = f"./results/Sensitivity/fold_{fold + 1}"
    os.makedirs(output_dir, exist_ok=True)
    output_dirs.append(output_dir)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch", # Save the model after each epoch
        load_best_model_at_end=True, # Load the best model at the end of each fold
        save_total_limit=1, # Keep only the best model checkpoint
        learning_rate=1e-5, 
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=20,
        max_grad_norm=1.0,
        warmup_ratio=0.1,
        weight_decay=0.01,
        metric_for_best_model="AUC",
        greater_is_better=True
    )

    # Initialize the model and Trainer for this fold
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        eval_metric="AUC",
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
    )

    # Train the model on this fold
    trainer.train()

    # Evaluate on the validation set and save the best model's AUC
    val_results = trainer.evaluate()
    val_auc = val_results["eval_AUC"]
    print(f"Fold {fold + 1} Validation AUC: {val_auc}")
    val_auc_scores.append(val_auc)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/5


  0%|          | 0/880 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6264938712120056, 'eval_AUC': 0.8174931129476585, 'eval_runtime': 1.7566, 'eval_samples_per_second': 50.098, 'eval_steps_per_second': 6.262, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4956623911857605, 'eval_AUC': 0.8415977961432507, 'eval_runtime': 1.052, 'eval_samples_per_second': 83.652, 'eval_steps_per_second': 10.457, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.42081698775291443, 'eval_AUC': 0.8519283746556474, 'eval_runtime': 1.0292, 'eval_samples_per_second': 85.501, 'eval_steps_per_second': 10.688, 'epoch': 3.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.38763362169265747, 'eval_AUC': 0.8753443526170799, 'eval_runtime': 1.0654, 'eval_samples_per_second': 82.599, 'eval_steps_per_second': 10.325, 'epoch': 4.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3692934215068817, 'eval_AUC': 0.8884297520661157, 'eval_runtime': 1.0235, 'eval_samples_per_second': 85.976, 'eval_steps_per_second': 10.747, 'epoch': 5.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3478052318096161, 'eval_AUC': 0.8960055096418733, 'eval_runtime': 1.0263, 'eval_samples_per_second': 85.747, 'eval_steps_per_second': 10.718, 'epoch': 6.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3457548916339874, 'eval_AUC': 0.9022038567493114, 'eval_runtime': 1.0815, 'eval_samples_per_second': 81.37, 'eval_steps_per_second': 10.171, 'epoch': 7.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3341217339038849, 'eval_AUC': 0.902892561983471, 'eval_runtime': 1.0431, 'eval_samples_per_second': 84.362, 'eval_steps_per_second': 10.545, 'epoch': 8.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3595989942550659, 'eval_AUC': 0.8932506887052342, 'eval_runtime': 1.0611, 'eval_samples_per_second': 82.936, 'eval_steps_per_second': 10.367, 'epoch': 9.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.37102916836738586, 'eval_AUC': 0.8898071625344354, 'eval_runtime': 1.0267, 'eval_samples_per_second': 85.714, 'eval_steps_per_second': 10.714, 'epoch': 10.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3653239905834198, 'eval_AUC': 0.8911845730027549, 'eval_runtime': 1.0566, 'eval_samples_per_second': 83.286, 'eval_steps_per_second': 10.411, 'epoch': 11.0}
{'loss': 0.3903, 'grad_norm': 0.6354420781135559, 'learning_rate': 4.7979797979797985e-06, 'epoch': 11.36}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.38598373532295227, 'eval_AUC': 0.8960055096418733, 'eval_runtime': 1.0557, 'eval_samples_per_second': 83.356, 'eval_steps_per_second': 10.42, 'epoch': 12.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.40537071228027344, 'eval_AUC': 0.8953168044077136, 'eval_runtime': 1.0939, 'eval_samples_per_second': 80.445, 'eval_steps_per_second': 10.056, 'epoch': 13.0}
{'train_runtime': 112.3768, 'train_samples_per_second': 62.29, 'train_steps_per_second': 7.831, 'train_loss': 0.37212568229728643, 'epoch': 13.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1 Validation AUC: 0.902892561983471
Fold 2/5


  0%|          | 0/880 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.634581983089447, 'eval_AUC': 0.8326446280991735, 'eval_runtime': 1.1001, 'eval_samples_per_second': 79.995, 'eval_steps_per_second': 9.999, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5066567063331604, 'eval_AUC': 0.8767217630853994, 'eval_runtime': 1.0816, 'eval_samples_per_second': 81.358, 'eval_steps_per_second': 10.17, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.43230876326560974, 'eval_AUC': 0.8829201101928374, 'eval_runtime': 1.0851, 'eval_samples_per_second': 81.1, 'eval_steps_per_second': 10.137, 'epoch': 3.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.40145790576934814, 'eval_AUC': 0.8898071625344354, 'eval_runtime': 1.1349, 'eval_samples_per_second': 77.539, 'eval_steps_per_second': 9.692, 'epoch': 4.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.384097695350647, 'eval_AUC': 0.8994490358126722, 'eval_runtime': 1.084, 'eval_samples_per_second': 81.181, 'eval_steps_per_second': 10.148, 'epoch': 5.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.37548306584358215, 'eval_AUC': 0.9152892561983471, 'eval_runtime': 1.1247, 'eval_samples_per_second': 78.24, 'eval_steps_per_second': 9.78, 'epoch': 6.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3767509460449219, 'eval_AUC': 0.8994490358126722, 'eval_runtime': 1.1129, 'eval_samples_per_second': 79.072, 'eval_steps_per_second': 9.884, 'epoch': 7.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.38557395339012146, 'eval_AUC': 0.8829201101928374, 'eval_runtime': 1.1582, 'eval_samples_per_second': 75.983, 'eval_steps_per_second': 9.498, 'epoch': 8.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.39226868748664856, 'eval_AUC': 0.884297520661157, 'eval_runtime': 1.1305, 'eval_samples_per_second': 77.839, 'eval_steps_per_second': 9.73, 'epoch': 9.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4123501777648926, 'eval_AUC': 0.9139118457300275, 'eval_runtime': 1.1637, 'eval_samples_per_second': 75.623, 'eval_steps_per_second': 9.453, 'epoch': 10.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.416269987821579, 'eval_AUC': 0.9173553719008265, 'eval_runtime': 1.1327, 'eval_samples_per_second': 77.692, 'eval_steps_per_second': 9.711, 'epoch': 11.0}
{'loss': 0.383, 'grad_norm': 5.624505043029785, 'learning_rate': 4.7979797979797985e-06, 'epoch': 11.36}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.42816245555877686, 'eval_AUC': 0.9132231404958677, 'eval_runtime': 1.1277, 'eval_samples_per_second': 78.032, 'eval_steps_per_second': 9.754, 'epoch': 12.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.45542553067207336, 'eval_AUC': 0.9132231404958678, 'eval_runtime': 1.1559, 'eval_samples_per_second': 76.129, 'eval_steps_per_second': 9.516, 'epoch': 13.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.44463634490966797, 'eval_AUC': 0.9173553719008265, 'eval_runtime': 1.1311, 'eval_samples_per_second': 77.8, 'eval_steps_per_second': 9.725, 'epoch': 14.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.45761987566947937, 'eval_AUC': 0.9173553719008265, 'eval_runtime': 1.1611, 'eval_samples_per_second': 75.79, 'eval_steps_per_second': 9.474, 'epoch': 15.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.46537676453590393, 'eval_AUC': 0.915977961432507, 'eval_runtime': 1.1236, 'eval_samples_per_second': 78.322, 'eval_steps_per_second': 9.79, 'epoch': 16.0}
{'train_runtime': 130.2428, 'train_samples_per_second': 53.746, 'train_steps_per_second': 6.757, 'train_loss': 0.33179908990859985, 'epoch': 16.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 2 Validation AUC: 0.9173553719008265
Fold 3/5


  0%|          | 0/880 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6380318403244019, 'eval_AUC': 0.7878787878787878, 'eval_runtime': 1.1573, 'eval_samples_per_second': 76.038, 'eval_steps_per_second': 9.505, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5436145067214966, 'eval_AUC': 0.7961432506887052, 'eval_runtime': 1.1187, 'eval_samples_per_second': 78.666, 'eval_steps_per_second': 9.833, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4642440676689148, 'eval_AUC': 0.8202479338842975, 'eval_runtime': 1.1764, 'eval_samples_per_second': 74.803, 'eval_steps_per_second': 9.35, 'epoch': 3.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4294477701187134, 'eval_AUC': 0.8333333333333333, 'eval_runtime': 1.1413, 'eval_samples_per_second': 77.108, 'eval_steps_per_second': 9.638, 'epoch': 4.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4252989888191223, 'eval_AUC': 0.8168044077134986, 'eval_runtime': 1.1576, 'eval_samples_per_second': 76.02, 'eval_steps_per_second': 9.502, 'epoch': 5.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.40686747431755066, 'eval_AUC': 0.8684573002754821, 'eval_runtime': 1.1238, 'eval_samples_per_second': 78.306, 'eval_steps_per_second': 9.788, 'epoch': 6.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4235215187072754, 'eval_AUC': 0.8546831955922866, 'eval_runtime': 1.1591, 'eval_samples_per_second': 75.921, 'eval_steps_per_second': 9.49, 'epoch': 7.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.40978679060935974, 'eval_AUC': 0.8767217630853995, 'eval_runtime': 1.1319, 'eval_samples_per_second': 77.746, 'eval_steps_per_second': 9.718, 'epoch': 8.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4153749644756317, 'eval_AUC': 0.8746556473829201, 'eval_runtime': 1.1674, 'eval_samples_per_second': 75.383, 'eval_steps_per_second': 9.423, 'epoch': 9.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4351847469806671, 'eval_AUC': 0.8808539944903581, 'eval_runtime': 1.1269, 'eval_samples_per_second': 78.088, 'eval_steps_per_second': 9.761, 'epoch': 10.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4557287395000458, 'eval_AUC': 0.8746556473829201, 'eval_runtime': 1.0855, 'eval_samples_per_second': 81.07, 'eval_steps_per_second': 10.134, 'epoch': 11.0}
{'loss': 0.3844, 'grad_norm': 12.879369735717773, 'learning_rate': 4.7979797979797985e-06, 'epoch': 11.36}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.46161648631095886, 'eval_AUC': 0.8732782369146006, 'eval_runtime': 1.1187, 'eval_samples_per_second': 78.664, 'eval_steps_per_second': 9.833, 'epoch': 12.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.46311986446380615, 'eval_AUC': 0.8842975206611571, 'eval_runtime': 1.1268, 'eval_samples_per_second': 78.095, 'eval_steps_per_second': 9.762, 'epoch': 13.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.46962645649909973, 'eval_AUC': 0.8829201101928374, 'eval_runtime': 1.1716, 'eval_samples_per_second': 75.114, 'eval_steps_per_second': 9.389, 'epoch': 14.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4834193289279938, 'eval_AUC': 0.8863636363636364, 'eval_runtime': 1.1284, 'eval_samples_per_second': 77.984, 'eval_steps_per_second': 9.748, 'epoch': 15.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5074056386947632, 'eval_AUC': 0.8739669421487604, 'eval_runtime': 1.1753, 'eval_samples_per_second': 74.874, 'eval_steps_per_second': 9.359, 'epoch': 16.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5204262733459473, 'eval_AUC': 0.8705234159779615, 'eval_runtime': 1.1578, 'eval_samples_per_second': 76.007, 'eval_steps_per_second': 9.501, 'epoch': 17.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.515810489654541, 'eval_AUC': 0.8815426997245179, 'eval_runtime': 1.12, 'eval_samples_per_second': 78.574, 'eval_steps_per_second': 9.822, 'epoch': 18.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5063533186912537, 'eval_AUC': 0.8822314049586777, 'eval_runtime': 1.0764, 'eval_samples_per_second': 81.752, 'eval_steps_per_second': 10.219, 'epoch': 19.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5096075534820557, 'eval_AUC': 0.8856749311294766, 'eval_runtime': 1.1823, 'eval_samples_per_second': 74.429, 'eval_steps_per_second': 9.304, 'epoch': 20.0}
{'train_runtime': 165.4455, 'train_samples_per_second': 42.31, 'train_steps_per_second': 5.319, 'train_loss': 0.3075168176130815, 'epoch': 20.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 3 Validation AUC: 0.8863636363636364
Fold 4/5


  0%|          | 0/880 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.640727162361145, 'eval_AUC': 0.804895104895105, 'eval_runtime': 1.1212, 'eval_samples_per_second': 77.599, 'eval_steps_per_second': 9.811, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.528918981552124, 'eval_AUC': 0.8741258741258742, 'eval_runtime': 1.1379, 'eval_samples_per_second': 76.457, 'eval_steps_per_second': 9.667, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.44044557213783264, 'eval_AUC': 0.8867132867132866, 'eval_runtime': 1.0687, 'eval_samples_per_second': 81.41, 'eval_steps_per_second': 10.293, 'epoch': 3.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4006654918193817, 'eval_AUC': 0.8692307692307691, 'eval_runtime': 1.0797, 'eval_samples_per_second': 80.581, 'eval_steps_per_second': 10.188, 'epoch': 4.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3798763155937195, 'eval_AUC': 0.8944055944055944, 'eval_runtime': 1.1964, 'eval_samples_per_second': 72.717, 'eval_steps_per_second': 9.194, 'epoch': 5.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3474891185760498, 'eval_AUC': 0.8783216783216783, 'eval_runtime': 1.085, 'eval_samples_per_second': 80.183, 'eval_steps_per_second': 10.138, 'epoch': 6.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.34058186411857605, 'eval_AUC': 0.8811188811188811, 'eval_runtime': 1.1551, 'eval_samples_per_second': 75.319, 'eval_steps_per_second': 9.523, 'epoch': 7.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3450172543525696, 'eval_AUC': 0.8825174825174825, 'eval_runtime': 1.138, 'eval_samples_per_second': 76.452, 'eval_steps_per_second': 9.666, 'epoch': 8.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.33641713857650757, 'eval_AUC': 0.8874125874125874, 'eval_runtime': 1.0643, 'eval_samples_per_second': 81.744, 'eval_steps_per_second': 10.335, 'epoch': 9.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3495103716850281, 'eval_AUC': 0.8874125874125874, 'eval_runtime': 1.0864, 'eval_samples_per_second': 80.082, 'eval_steps_per_second': 10.125, 'epoch': 10.0}
{'train_runtime': 82.8727, 'train_samples_per_second': 84.708, 'train_steps_per_second': 10.619, 'train_loss': 0.40675707730379973, 'epoch': 10.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 4 Validation AUC: 0.8944055944055944
Fold 5/5


  0%|          | 0/880 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6463003158569336, 'eval_AUC': 0.8510489510489511, 'eval_runtime': 1.1052, 'eval_samples_per_second': 78.722, 'eval_steps_per_second': 9.953, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5141425132751465, 'eval_AUC': 0.8993006993006993, 'eval_runtime': 1.1558, 'eval_samples_per_second': 75.272, 'eval_steps_per_second': 9.517, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4230550527572632, 'eval_AUC': 0.9006993006993007, 'eval_runtime': 1.126, 'eval_samples_per_second': 77.267, 'eval_steps_per_second': 9.769, 'epoch': 3.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3699590861797333, 'eval_AUC': 0.9146853146853147, 'eval_runtime': 1.1808, 'eval_samples_per_second': 73.679, 'eval_steps_per_second': 9.316, 'epoch': 4.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3499019742012024, 'eval_AUC': 0.913986013986014, 'eval_runtime': 1.137, 'eval_samples_per_second': 76.519, 'eval_steps_per_second': 9.675, 'epoch': 5.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.33215266466140747, 'eval_AUC': 0.9083916083916085, 'eval_runtime': 1.0985, 'eval_samples_per_second': 79.199, 'eval_steps_per_second': 10.014, 'epoch': 6.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.32188040018081665, 'eval_AUC': 0.8916083916083917, 'eval_runtime': 1.0842, 'eval_samples_per_second': 80.246, 'eval_steps_per_second': 10.146, 'epoch': 7.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.31591010093688965, 'eval_AUC': 0.8881118881118881, 'eval_runtime': 1.1177, 'eval_samples_per_second': 77.837, 'eval_steps_per_second': 9.841, 'epoch': 8.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.3191523849964142, 'eval_AUC': 0.8846153846153846, 'eval_runtime': 1.1167, 'eval_samples_per_second': 77.911, 'eval_steps_per_second': 9.851, 'epoch': 9.0}
{'train_runtime': 72.6013, 'train_samples_per_second': 96.693, 'train_steps_per_second': 12.121, 'train_loss': 0.4242021194612137, 'epoch': 9.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Fold 5 Validation AUC: 0.9146853146853147


In [4]:
# Calculate mean and standard deviation for validation AUC scores
mean_val_auc = np.mean(val_auc_scores)
std_val_auc = np.std(val_auc_scores)

# Print the results
print(f"Validation AUC: Mean = {mean_val_auc:.4f}, Standard Deviation = {std_val_auc:.4f}")

Validation AUC: Mean = 0.9031, Standard Deviation = 0.0118


In [5]:
best_fold_idx = np.argmax(val_auc_scores)
best_model_dir = output_dirs[best_fold_idx]  # Directory of the best model
print(f"Best model found in fold {best_fold_idx + 1} with Validation AUC: {val_auc_scores[best_fold_idx]}")


Best model found in fold 2 with Validation AUC: 0.9173553719008265


In [6]:
# use the best model and do the final training
best_model_dir = './results/Sensitivity/fold_2/checkpoint-484'
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_dir)

full_train_dataset = TextDataset(train_texts_all, train_labels_all, tokenizer)

# Define training arguments for the final training phase
final_training_args = TrainingArguments(
    output_dir="./Sensitivity/final_model",       # Directory to save the final model
    evaluation_strategy="no",         # No evaluation during training
    save_strategy="no",            # Save the model at each epoch
    save_total_limit=1,               # Keep only the last checkpoint to save storage
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10000,              # Minimize logging output
    report_to="none"                  # Disable logging to external tools
)

# Initialize the Trainer with the full dataset and final training arguments
trainer = Trainer(
    model=best_model,
    args=final_training_args,
    train_dataset=full_train_dataset
)

# Evaluate on the test set
test_results = trainer.predict(test_dataset)
# Calculate AUC on the test data
test_probs = torch.nn.functional.softmax(torch.tensor(test_results.predictions), dim=1)[:, 1].numpy()
test_auc = roc_auc_score(test_results.label_ids, test_probs)
print(f"Test AUC with the best model: {test_auc}")



  0%|          | 0/7 [00:00<?, ?it/s]

Test AUC with the best model: 0.8445945945945945
