In [None]:
import pandas as pd
import time
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
from datasets import Dataset
import evaluate

from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback, AutoModelForSequenceClassification

# ----------------------------------------------------
# Step 1. Read CSV file and basic preprocessing
# ----------------------------------------------------
df = pd.read_csv("output_file2.csv")

# Remove URL-like tokens: "http" followed by any run of non-whitespace.
df["text"] = df["text"].str.replace(r"http\S+", "", regex=True)

# Collapse the label space. Order matters: label 2 is folded into 1 first,
# and only afterwards is label 3 renumbered to 2.
df["label"] = df["label"].replace(2, 1).replace(3, 2)

# ----------------------------------------------------
# Step 2. Automatically compute class weights
# ----------------------------------------------------
# Samples per class, ordered by label value so that position i in the
# weight tensor below belongs to the i-th smallest label.
# NOTE(review): the weighted loss indexes weights by label id, so this
# presumes labels are exactly 0..num_classes-1 - confirm for new data.
label_counts = df["label"].value_counts().sort_index()
print("Label counts:\n", label_counts)

num_classes = len(label_counts)
total_samples = len(df)

# Inverse-frequency weighting: weight_c = N / (K * n_c).
computed_weights = total_samples / (num_classes * label_counts)
print("Computed class weights:\n", computed_weights)

class_weights = torch.tensor(computed_weights.values, dtype=torch.float)
print("Tensor class weights:", class_weights)

# ----------------------------------------------------
# Step 3. Load model, configuration, and tokenizer
# ----------------------------------------------------
model_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
output_dir = "./results"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from the checkpoint's configuration, then size the classification
# head to the number of classes found in the data.
config = RobertaConfig.from_pretrained(model_checkpoint)
config.num_labels = num_classes

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

# from_tf=True loads the checkpoint's TensorFlow weights;
# ignore_mismatched_sizes=True tolerates a head whose shape differs from
# the one stored in the checkpoint.
load_options = {
    "config": config,
    "from_tf": True,
    "ignore_mismatched_sizes": True,
}
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint, **load_options)

# NOTE(review): within this cell `model` is only used for the shape print
# below; each CV fold loads its own fresh model. Confirm whether keeping
# this copy on the device is still needed.
model.to(device)

print("Model loaded successfully!")
print("Classifier layer shape:", model.classifier.out_proj.weight.shape)

# ----------------------------------------------------
# Step 4. Tokenize the dataset
# ----------------------------------------------------
def tokenize_function(examples):
    """Tokenize one batch of rows from the dataset.

    Args:
        examples: Batch mapping with a "text" column. Every entry is passed
            through ``str`` first, so non-string values are tokenized as
            their string representation.

    Returns:
        The tokenizer output for the batch, truncated to at most 512
        tokens per example. No padding is applied here.
    """
    as_strings = list(map(str, examples["text"]))
    return tokenizer(as_strings, truncation=True, max_length=512)

# Wrap the DataFrame as a `datasets.Dataset` and tokenize it batch by batch.
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# ----------------------------------------------------
# Step 5. Create custom Trainer with weighted loss
# ----------------------------------------------------
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain "labels" alongside the
                model inputs. It is forwarded to the model unchanged.
            return_outputs: When true, return ``(loss, outputs)`` instead
                of the loss alone.
            **kwargs: Accepted and ignored, for compatibility with callers
                that pass extra keyword arguments.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        scores = outputs.logits
        # The weights live wherever they were created; move them next to the
        # logits before they are used in the loss.
        per_class_weight = class_weights.to(scores.device)
        loss = torch.nn.functional.cross_entropy(scores, targets, weight=per_class_weight)
        if return_outputs:
            return loss, outputs
        return loss


# ----------------------------------------------------
# Step 6. Metrics computation for evaluation
# ----------------------------------------------------
# Accuracy implementation from the `evaluate` library, loaded once up front.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and weighted P/R/F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "weighted_f1".
        Precision, recall and F1 are support-weighted averages over the
        classes; undefined per-class scores count as 0.
    """
    logits, references = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Options shared by the three sklearn scorers below.
    weighted = {"average": "weighted", "zero_division": 0}

    scores = {
        "accuracy": accuracy_metric.compute(
            predictions=predicted, references=references
        )["accuracy"],
    }
    scores["precision"] = precision_score(references, predicted, **weighted)
    scores["recall"] = recall_score(references, predicted, **weighted)
    scores["weighted_f1"] = f1_score(references, predicted, **weighted)
    return scores

# ----------------------------------------------------
# Step 7. Setup Stratified K-Fold Cross-Validation and training
# ----------------------------------------------------
n_splits = 10

labels = df['label'].values

# Fixed seed so the fold assignment is reproducible between runs.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# fold number -> metrics dict from trainer.evaluate(), plus "time_elapsed_secs".
fold_metrics = {}
# One entry per fold: fold number, key hyperparameters, and that fold's metrics.
results_report = []

# Pads each batch dynamically; tokenization itself applies no padding.
data_collator = DataCollatorWithPadding(tokenizer)

print(f"Starting {n_splits}-fold Cross-Validation with StratifiedKFold...\n")

# StratifiedKFold only needs the labels to build the splits, so a dummy
# feature array of the right length stands in for X.
for fold, (train_index, test_index) in enumerate(skf.split(np.zeros(len(labels)), labels), start=1):
    print(f"Starting fold {fold}...")
    start_fold_time = time.time()

    train_split = tokenized_dataset.select(train_index.tolist())
    val_split = tokenized_dataset.select(test_index.tolist())

    # Evaluate and checkpoint once per epoch, and restore the checkpoint with
    # the best eval_weighted_f1 when training ends. With batch size 8 and
    # 5 accumulation steps, each optimizer step sees 40 samples per device.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-05,
        weight_decay=0.1,
        logging_steps=10,
        disable_tqdm=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_weighted_f1",
        greater_is_better=True,
        no_cuda=False,
        warmup_steps=500,
        gradient_accumulation_steps=5,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        logging_dir='./logs',
        run_name="roberta_weighted_loss_stratified",
        save_total_limit=3,
    )

    # Every fold starts from a fresh copy of the pretrained checkpoint so no
    # weights leak between folds. ignore_mismatched_sizes mirrors the initial
    # model load: it lets the head be re-initialised if num_classes differs
    # from the checkpoint's label count instead of raising.
    fold_model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_classes,
        from_tf=True,
        ignore_mismatched_sizes=True,
    ).to(device)

    trainer = WeightedLossTrainer(
        model=fold_model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=val_split,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    # Metric keys returned here carry an "eval_" prefix (e.g. "eval_accuracy").
    metrics = trainer.evaluate()

    end_fold_time = time.time()
    time_elapsed = end_fold_time - start_fold_time
    metrics["time_elapsed_secs"] = time_elapsed

    print(f"Metrics for fold {fold}: {metrics}\n")
    fold_metrics[fold] = metrics

    fold_report = {
        "fold": fold,
        "hyperparameters": {
            "num_train_epochs": training_args.num_train_epochs,
            "per_device_train_batch_size": training_args.per_device_train_batch_size,
            "learning_rate": training_args.learning_rate,
            "weight_decay": training_args.weight_decay,
        },
        "metrics": metrics
    }
    results_report.append(fold_report)

    # Keep model and tokenizer in the SAME directory so the fold can be
    # reloaded with a single from_pretrained() path.
    fold_save_dir = f"./binary_roberta_final_fold_{fold}"
    trainer.model.save_pretrained(fold_save_dir)
    tokenizer.save_pretrained(fold_save_dir)

# `trainer` still refers to the last fold here, so the "final" model is the
# one trained on the last fold's training split.
trainer.save_model("./final_finetuned_model")
tokenizer.save_pretrained("./final_finetuned_model")
# The per-fold metric dicts store accuracy under "eval_accuracy" (the keys
# carry an "eval_" prefix), so that is the key to average over.
accuracy_key = "eval_accuracy"
fold_accuracies = [m[accuracy_key] for m in fold_metrics.values() if accuracy_key in m]

# Only report an average when every fold contributed a value (and there is
# at least one fold), so the mean is never computed over a partial set.
if fold_metrics and len(fold_accuracies) == len(fold_metrics):
    avg_accuracy = np.mean(fold_accuracies)
    print(f"Average eval accuracy over {n_splits} folds: {avg_accuracy:.4f}")
else:
    print(f"Some folds did not return '{accuracy_key}' metric.")

print("\nFinal Cross-Validation Report:")
for fold_report in results_report:
    print(f"Fold {fold_report['fold']}:")
    print("  Hyperparameters:")
    for hp, val in fold_report["hyperparameters"].items():
        print(f"    {hp}: {val}")
    print("  Metrics:")
    for metric, value in fold_report["metrics"].items():
        # Wall-clock time reads better with a unit and two decimals; every
        # other metric is printed with four decimals.
        if metric == "time_elapsed_secs":
            print(f"    {metric}: {value:.2f} seconds")
        else:
            print(f"    {metric}: {value:.4f}")
    print("")


  from .autonotebook import tqdm as notebook_tqdm



Label counts:
 label
0     2884
1     3846
2    12356
Name: count, dtype: int64
Computed class weights:
 label
0    2.205964
1    1.654186
2    0.514892
Name: count, dtype: float64
Tensor class weights: tensor([2.2060, 1.6542, 0.5149])



All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


Model loaded successfully!
Classifier layer shape: torch.Size([3, 768])


Map: 100%|██████████| 19086/19086 [00:03<00:00, 6055.80 examples/s]


Starting 10-fold Cross-Validation with StratifiedKFold...

Starting fold 1...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
0,0.4455,0.495441,0.811943,0.849332,0.811943,0.820918
1,0.3246,0.331663,0.876899,0.879987,0.876899,0.877715
2,0.2749,0.335327,0.888423,0.899047,0.888423,0.890573
3,0.1979,0.340052,0.903091,0.913377,0.903091,0.905243
4,0.0716,0.311322,0.933473,0.933073,0.933473,0.93315
5,0.032,0.332763,0.939759,0.940771,0.939759,0.94012
6,0.05,0.326009,0.941854,0.942928,0.941854,0.94203
7,0.0255,0.339222,0.944474,0.944752,0.944474,0.944589
8,0.0123,0.345773,0.947093,0.947287,0.947093,0.947158
9,0.0105,0.346155,0.947617,0.947851,0.947617,0.9477


Metrics for fold 1: {'eval_loss': 0.3461545705795288, 'eval_accuracy': 0.9476165531691986, 'eval_precision': 0.9478505819056045, 'eval_recall': 0.9476165531691986, 'eval_weighted_f1': 0.9477001468421047, 'eval_runtime': 6.0227, 'eval_samples_per_second': 316.965, 'eval_steps_per_second': 39.683, 'epoch': 9.998603351955307, 'time_elapsed_secs': 2156.668385028839}

Starting fold 2...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
0,0.3659,0.352166,0.872708,0.880702,0.872708,0.875105
1,0.3352,0.300409,0.899948,0.90406,0.899948,0.900922
2,0.1958,0.322024,0.888947,0.900761,0.888947,0.891344
3,0.0919,0.35154,0.910424,0.919317,0.910424,0.912263
4,0.0951,0.305053,0.935045,0.93519,0.935045,0.9346
5,0.0602,0.343808,0.929282,0.931357,0.929282,0.929945
6,0.0123,0.30083,0.943426,0.943183,0.943426,0.943271
7,0.0219,0.35501,0.942378,0.942641,0.942378,0.942484
8,0.0324,0.353812,0.940807,0.941299,0.940807,0.940985


Metrics for fold 2: {'eval_loss': 0.30083024501800537, 'eval_accuracy': 0.9434258774227344, 'eval_precision': 0.9431832236309632, 'eval_recall': 0.9434258774227344, 'eval_weighted_f1': 0.9432711235298249, 'eval_runtime': 31.1055, 'eval_samples_per_second': 61.372, 'eval_steps_per_second': 7.684, 'epoch': 8.998603351955307, 'time_elapsed_secs': 4073.7829988002777}

Starting fold 3...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
0,0.4601,0.433537,0.833944,0.858515,0.833944,0.839237
1,0.2604,0.292305,0.900471,0.90352,0.900471,0.90138
2,0.2508,0.278545,0.906757,0.90861,0.906757,0.907394
3,0.1914,0.261319,0.924044,0.924314,0.924044,0.924155
4,0.0567,0.305064,0.942378,0.942156,0.942378,0.942163
5,0.0496,0.282355,0.944474,0.944663,0.944474,0.944558
6,0.0316,0.30537,0.949712,0.949995,0.949712,0.949834
7,0.0381,0.298898,0.951283,0.951016,0.951283,0.950995
8,0.0094,0.286704,0.953379,0.953103,0.953379,0.953182
9,0.0176,0.286094,0.954426,0.954169,0.954426,0.954241


Metrics for fold 3: {'eval_loss': 0.28609412908554077, 'eval_accuracy': 0.9544264012572027, 'eval_precision': 0.9541694699532889, 'eval_recall': 0.9544264012572027, 'eval_weighted_f1': 0.9542410945205343, 'eval_runtime': 5.5486, 'eval_samples_per_second': 344.053, 'eval_steps_per_second': 43.074, 'epoch': 9.998603351955307, 'time_elapsed_secs': 2134.239250898361}

Starting fold 4...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
0,0.4823,0.439942,0.838659,0.859375,0.838659,0.843804
1,0.2888,0.299529,0.894185,0.897587,0.894185,0.895314
2,0.2166,0.440011,0.856993,0.887707,0.856993,0.861658
3,0.252,0.295099,0.910948,0.916729,0.910948,0.912326
4,0.0981,0.337965,0.929282,0.928718,0.929282,0.928795
5,0.0291,0.333169,0.936616,0.936126,0.936616,0.936145
6,0.0455,0.338766,0.936092,0.936325,0.936092,0.936192
7,0.0056,0.33548,0.942902,0.943409,0.942902,0.943048
8,0.048,0.343962,0.94814,0.948129,0.94814,0.948116
9,0.01,0.346386,0.946569,0.946646,0.946569,0.946599


Metrics for fold 4: {'eval_loss': 0.34396159648895264, 'eval_accuracy': 0.9481403876375065, 'eval_precision': 0.9481287323752459, 'eval_recall': 0.9481403876375065, 'eval_weighted_f1': 0.948116189583171, 'eval_runtime': 5.189, 'eval_samples_per_second': 367.896, 'eval_steps_per_second': 46.059, 'epoch': 9.998603351955307, 'time_elapsed_secs': 2139.797264099121}

Starting fold 5...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
0,0.4707,0.428293,0.842326,0.859798,0.842326,0.846381
1,0.369,0.301847,0.890519,0.900519,0.890519,0.892707
2,0.2174,0.278917,0.918806,0.920859,0.918806,0.91943
3,0.1351,0.269988,0.935045,0.936326,0.935045,0.935397
4,0.0387,0.242363,0.94395,0.944087,0.94395,0.944011
5,0.0379,0.308663,0.943426,0.944827,0.943426,0.943828
6,0.0038,0.30523,0.944997,0.948063,0.944997,0.94571
7,0.0627,0.287851,0.952855,0.953009,0.952855,0.952921
8,0.0146,0.302958,0.952331,0.952807,0.952331,0.952489
9,0.0029,0.298939,0.953379,0.953657,0.953379,0.953478


Metrics for fold 5: {'eval_loss': 0.2989385426044464, 'eval_accuracy': 0.9533787323205867, 'eval_precision': 0.9536566258372823, 'eval_recall': 0.9533787323205867, 'eval_weighted_f1': 0.9534782040715195, 'eval_runtime': 44.9112, 'eval_samples_per_second': 42.506, 'eval_steps_per_second': 5.322, 'epoch': 9.998603351955307, 'time_elapsed_secs': 3073.0223982334137}

Starting fold 6...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
0,0.5122,0.410772,0.855422,0.867416,0.855422,0.856708
1,0.3291,0.252798,0.911472,0.912136,0.911472,0.911678
2,0.2237,0.282109,0.907281,0.91646,0.907281,0.909251
3,0.1172,0.261542,0.927711,0.929963,0.927711,0.928389
4,0.045,0.281528,0.933997,0.93356,0.933997,0.933582
5,0.0495,0.281731,0.94814,0.948713,0.94814,0.948239
6,0.0196,0.308626,0.946045,0.945798,0.946045,0.945863
7,0.0401,0.316958,0.947617,0.948184,0.947617,0.947819


Metrics for fold 6: {'eval_loss': 0.28173086047172546, 'eval_accuracy': 0.9481403876375065, 'eval_precision': 0.9487130911949786, 'eval_recall': 0.9481403876375065, 'eval_weighted_f1': 0.948239099582891, 'eval_runtime': 5.3194, 'eval_samples_per_second': 358.876, 'eval_steps_per_second': 44.93, 'epoch': 7.998603351955307, 'time_elapsed_secs': 1756.2710304260254}

Starting fold 7...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
1,0.4596,0.494261,0.809748,0.855635,0.809748,0.819266
2,0.3296,0.296103,0.893606,0.897098,0.893606,0.894226
3,0.147,0.237472,0.917191,0.919829,0.917191,0.917986
4,0.1004,0.204788,0.939727,0.939682,0.939727,0.939244
5,0.1292,0.277056,0.942348,0.942165,0.942348,0.942231
6,0.119,0.288825,0.944444,0.944427,0.944444,0.944344
7,0.0485,0.269726,0.949686,0.949407,0.949686,0.94949
8,0.0229,0.287809,0.949686,0.949505,0.949686,0.949561
9,0.0089,0.289261,0.957023,0.956768,0.957023,0.956807


Metrics for fold 7: {'eval_loss': 0.2892606854438782, 'eval_accuracy': 0.9570230607966457, 'eval_precision': 0.9567676045610131, 'eval_recall': 0.9570230607966457, 'eval_weighted_f1': 0.9568069749294131, 'eval_runtime': 68.5354, 'eval_samples_per_second': 27.84, 'eval_steps_per_second': 3.487, 'epoch': 9.977653631284916, 'time_elapsed_secs': 9475.37679886818}

Starting fold 8...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
1,0.4192,0.477342,0.833857,0.853325,0.833857,0.838903
2,0.3593,0.336912,0.877883,0.888041,0.877883,0.879718
3,0.2195,0.411549,0.878407,0.900437,0.878407,0.883356
4,0.093,0.30932,0.918763,0.92144,0.918763,0.919519
5,0.0981,0.38058,0.924528,0.925432,0.924528,0.924902
6,0.0211,0.367486,0.935535,0.934873,0.935535,0.93505
7,0.0807,0.350087,0.939203,0.938713,0.939203,0.938828
8,0.0198,0.393625,0.938155,0.937725,0.938155,0.937772
9,0.0004,0.398001,0.9413,0.941376,0.9413,0.941293


Metrics for fold 8: {'eval_loss': 0.39800071716308594, 'eval_accuracy': 0.9412997903563941, 'eval_precision': 0.941376311202224, 'eval_recall': 0.9412997903563941, 'eval_weighted_f1': 0.9412929896215166, 'eval_runtime': 5.2014, 'eval_samples_per_second': 366.824, 'eval_steps_per_second': 45.949, 'epoch': 9.977653631284916, 'time_elapsed_secs': 2171.687174320221}

Starting fold 9...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
1,0.5302,0.522111,0.800314,0.842746,0.800314,0.807293
2,0.3709,0.318466,0.884696,0.88348,0.884696,0.882716
3,0.188,0.308727,0.897799,0.902023,0.897799,0.899137
4,0.1319,0.328702,0.909329,0.913788,0.909329,0.910422
5,0.07,0.378811,0.918763,0.921733,0.918763,0.919592
6,0.0908,0.330094,0.936583,0.937164,0.936583,0.936747
7,0.1046,0.346972,0.939727,0.939359,0.939727,0.939346
8,0.034,0.363171,0.938155,0.938382,0.938155,0.938251
9,0.0065,0.376176,0.932914,0.933212,0.932914,0.93304


Metrics for fold 9: {'eval_loss': 0.34697163105010986, 'eval_accuracy': 0.939727463312369, 'eval_precision': 0.9393586954905871, 'eval_recall': 0.939727463312369, 'eval_weighted_f1': 0.9393458638706391, 'eval_runtime': 5.9182, 'eval_samples_per_second': 322.393, 'eval_steps_per_second': 40.384, 'epoch': 9.0, 'time_elapsed_secs': 1932.3676807880402}

Starting fold 10...


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Weighted F1
1,0.4509,0.529989,0.811845,0.859058,0.811845,0.822848
2,0.37,0.303827,0.896226,0.895241,0.896226,0.895048
3,0.1886,0.319373,0.906709,0.911749,0.906709,0.907992
4,0.0784,0.313091,0.924004,0.925043,0.924004,0.924295
5,0.1184,0.3162,0.931866,0.934022,0.931866,0.932597
6,0.0359,0.437987,0.925577,0.93111,0.925577,0.927039
7,0.0223,0.322401,0.944969,0.944696,0.944969,0.94475
8,0.0093,0.339707,0.948113,0.94847,0.948113,0.948222
9,0.0039,0.340034,0.949686,0.949665,0.949686,0.949642


Metrics for fold 10: {'eval_loss': 0.3394358158111572, 'eval_accuracy': 0.949685534591195, 'eval_precision': 0.9498381282892528, 'eval_recall': 0.949685534591195, 'eval_weighted_f1': 0.9497263601159663, 'eval_runtime': 5.8361, 'eval_samples_per_second': 326.931, 'eval_steps_per_second': 40.952, 'epoch': 9.977653631284916, 'time_elapsed_secs': 2143.9426476955414}

Some folds did not return 'accuracy' metric.

Final Cross-Validation Report:
Fold 1:
  Hyperparameters:
    num_train_epochs: 10
    per_device_train_batch_size: 8
    learning_rate: 3e-05
    weight_decay: 0.1
  Metrics:
    eval_loss: 0.3462
    eval_accuracy: 0.9476
    eval_precision: 0.9479
    eval_recall: 0.9476
    eval_weighted_f1: 0.9477
    eval_runtime: 6.0227
    eval_samples_per_second: 316.9650
    eval_steps_per_second: 39.6830
    epoch: 9.9986
    time_elapsed_secs: 2156.67 seconds

Fold 2:
  Hyperparameters:
    num_train_epochs: 10
    per_device_train_batch_size: 8
    learning_rate: 3e-05
    weight_decay