In [1]:
import os
import pickle

# Specify the working directory.
# The location can be overridden with the FINETUNE_EMBED_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset or empty the original hard-coded path is used.
# The chdir intentionally runs before the `mod.mod_text` import and before the
# relative ./data paths below are opened — presumably `mod` is a package that
# lives inside this directory (TODO confirm).
os.chdir(os.environ.get("FINETUNE_EMBED_DIR") or '/Users/david/Desktop/FinetuneEmbed')

import numpy as np
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback

# NOTE(review): later cells use names that are not imported explicitly here
# (e.g. TextDataset, CustomTrainer, compute_metrics); they presumably come from
# this wildcard import — consider listing them explicitly.
from mod.mod_text import *

# prepare the input data
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by a trusted source.
with open("./data/long_vs_shortTF/train_data.pkl", "rb") as f:
    train_data = pickle.load(f)
with open("./data/long_vs_shortTF/test_data.pkl", "rb") as f:
    test_data = pickle.load(f)

# Prepare datasets: texts come from the 'desc' entry, targets from 'labels'
train_texts_all, train_labels_all = train_data['desc'], train_data['labels']
test_texts, test_labels = test_data['desc'], test_data['labels']

# Load model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset wrapping the held-out test texts/labels (used for the final test evaluation)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)

# Per-fold bookkeeping: validation AUC of each fold and the directory its model is saved to
val_auc_scores, output_dirs = [], []

# Stratified K-fold splitter; shuffling uses a fixed seed so the folds are reproducible
n_splits = 2
kf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=7,
)

In [3]:
# Cross-validated fine-tuning: a fresh classifier is trained for every stratified fold.
#
# The per-fold accumulators are (re)initialized in this cell so that running the
# cell again does not append a second set of entries to lists left over from a
# previous run, which would distort the mean/std computed from val_auc_scores.
val_auc_scores = []
output_dirs = []

# Loop over each fold
for fold, (train_index, val_index) in enumerate(kf.split(train_texts_all, train_labels_all)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Split data into training and validation for this fold
    # (assumes positional indexing with [i] is valid for these containers — TODO confirm)
    train_texts, val_texts = [train_texts_all[i] for i in train_index], [train_texts_all[i] for i in val_index]
    train_labels, val_labels = [train_labels_all[i] for i in train_index], [train_labels_all[i] for i in val_index]

    # Create PyTorch datasets
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)

    # Define output directory for this fold and remember it for the best-fold lookup
    output_dir = f"./results/LongShortTF/fold_{fold + 1}"
    os.makedirs(output_dir, exist_ok=True)
    output_dirs.append(output_dir)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",  # Evaluate at the end of each epoch
        save_strategy="epoch", # Save checkpoints at the end of each epoch
        load_best_model_at_end=True, # Load the best model at the end of each fold
        save_total_limit=1, # Cap retained checkpoints (the best one is kept alongside the most recent)
        learning_rate=1e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=20,  # Upper bound; early stopping below usually ends training sooner
        max_grad_norm=1.0,
        warmup_ratio=0.1,
        weight_decay=0.01,
        metric_for_best_model="AUC",  # Matches the 'eval_AUC' key read after training
        greater_is_better=True
    )

    # Initialize the model and Trainer for this fold (new model so folds do not share weights)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        eval_metric="AUC",  # project-specific CustomTrainer argument — semantics not visible here
        # Stop when the monitored metric has not improved for 5 consecutive evaluations
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
    )

    # Train the model on this fold, then persist the model held by the trainer
    # (with load_best_model_at_end=True this is the best checkpoint, not the last epoch)
    trainer.train()
    trainer.save_model(output_dir)

    # Evaluate on the validation set and save the best model's AUC
    val_results = trainer.evaluate()
    val_auc = val_results["eval_AUC"]
    print(f"Fold {fold + 1} Validation AUC: {val_auc}")
    val_auc_scores.append(val_auc)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/2


                                                
  5%|▌         | 10/200 [00:09<02:34,  1.23it/s]

{'eval_loss': 0.6026808619499207, 'eval_AUC': 0.5301724137931034, 'eval_runtime': 1.3034, 'eval_samples_per_second': 59.842, 'eval_steps_per_second': 7.672, 'epoch': 1.0}


                                                
 10%|█         | 20/200 [00:12<00:32,  5.47it/s]

{'eval_loss': 0.5705962181091309, 'eval_AUC': 0.6568965517241379, 'eval_runtime': 0.3513, 'eval_samples_per_second': 222.024, 'eval_steps_per_second': 28.465, 'epoch': 2.0}


                                                
 15%|█▌        | 30/200 [00:14<00:25,  6.61it/s]

{'eval_loss': 0.5694783329963684, 'eval_AUC': 0.628448275862069, 'eval_runtime': 0.3622, 'eval_samples_per_second': 215.379, 'eval_steps_per_second': 27.613, 'epoch': 3.0}


                                                
 20%|██        | 40/200 [00:16<00:24,  6.64it/s]

{'eval_loss': 0.5654751658439636, 'eval_AUC': 0.6827586206896551, 'eval_runtime': 0.3319, 'eval_samples_per_second': 234.993, 'eval_steps_per_second': 30.127, 'epoch': 4.0}


                                                
 25%|██▌       | 50/200 [00:18<00:22,  6.81it/s]

{'eval_loss': 0.5622307658195496, 'eval_AUC': 0.596551724137931, 'eval_runtime': 0.3329, 'eval_samples_per_second': 234.275, 'eval_steps_per_second': 30.035, 'epoch': 5.0}


                                                
 30%|███       | 60/200 [00:21<00:20,  6.88it/s]

{'eval_loss': 0.5650736689567566, 'eval_AUC': 0.5499999999999999, 'eval_runtime': 0.3264, 'eval_samples_per_second': 238.967, 'eval_steps_per_second': 30.637, 'epoch': 6.0}


                                                
 35%|███▌      | 70/200 [00:23<00:20,  6.29it/s]

{'eval_loss': 0.6142027378082275, 'eval_AUC': 0.506896551724138, 'eval_runtime': 0.3518, 'eval_samples_per_second': 221.747, 'eval_steps_per_second': 28.429, 'epoch': 7.0}


                                                
 40%|████      | 80/200 [00:25<00:18,  6.56it/s]

{'eval_loss': 0.7038120031356812, 'eval_AUC': 0.5344827586206897, 'eval_runtime': 0.3531, 'eval_samples_per_second': 220.893, 'eval_steps_per_second': 28.32, 'epoch': 8.0}


                                                
 45%|████▌     | 90/200 [00:28<00:17,  6.43it/s]

{'eval_loss': 0.7227447628974915, 'eval_AUC': 0.5767241379310345, 'eval_runtime': 0.3493, 'eval_samples_per_second': 223.3, 'eval_steps_per_second': 28.628, 'epoch': 9.0}


 45%|████▌     | 90/200 [00:28<00:35,  3.13it/s]


{'train_runtime': 28.7666, 'train_samples_per_second': 54.23, 'train_steps_per_second': 6.953, 'train_loss': 0.5181591033935546, 'epoch': 9.0}


100%|██████████| 10/10 [00:00<00:00, 34.14it/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1 Validation AUC: 0.6827586206896551
Fold 2/2


  5%|▌         | 10/200 [00:01<00:25,  7.32it/s]
  5%|▌         | 10/200 [00:01<00:25,  7.32it/s]

{'eval_loss': 0.6239805817604065, 'eval_AUC': 0.4644945697577276, 'eval_runtime': 0.3546, 'eval_samples_per_second': 219.941, 'eval_steps_per_second': 28.198, 'epoch': 1.0}


 10%|█         | 20/200 [00:03<00:27,  6.61it/s]
 10%|█         | 20/200 [00:04<00:27,  6.61it/s]

{'eval_loss': 0.5938037037849426, 'eval_AUC': 0.4945697577276525, 'eval_runtime': 0.3825, 'eval_samples_per_second': 203.928, 'eval_steps_per_second': 26.145, 'epoch': 2.0}


 15%|█▌        | 30/200 [00:06<00:25,  6.58it/s]
 15%|█▌        | 30/200 [00:06<00:25,  6.58it/s]

{'eval_loss': 0.591926634311676, 'eval_AUC': 0.4619883040935673, 'eval_runtime': 0.3471, 'eval_samples_per_second': 224.729, 'eval_steps_per_second': 28.811, 'epoch': 3.0}


 20%|██        | 40/200 [00:08<00:26,  5.97it/s]
 20%|██        | 40/200 [00:08<00:26,  5.97it/s]

{'eval_loss': 0.5851405262947083, 'eval_AUC': 0.5497076023391814, 'eval_runtime': 0.3696, 'eval_samples_per_second': 211.032, 'eval_steps_per_second': 27.055, 'epoch': 4.0}


 25%|██▌       | 50/200 [00:10<00:22,  6.66it/s]
 25%|██▌       | 50/200 [00:11<00:22,  6.66it/s]

{'eval_loss': 0.5866970419883728, 'eval_AUC': 0.5680868838763575, 'eval_runtime': 0.3798, 'eval_samples_per_second': 205.392, 'eval_steps_per_second': 26.332, 'epoch': 5.0}


 30%|███       | 60/200 [00:13<00:21,  6.48it/s]
 30%|███       | 60/200 [00:13<00:21,  6.48it/s]

{'eval_loss': 0.5742367506027222, 'eval_AUC': 0.5914786967418546, 'eval_runtime': 0.366, 'eval_samples_per_second': 213.097, 'eval_steps_per_second': 27.32, 'epoch': 6.0}


 35%|███▌      | 70/200 [00:15<00:19,  6.64it/s]
 35%|███▌      | 70/200 [00:16<00:19,  6.64it/s]

{'eval_loss': 0.5656718015670776, 'eval_AUC': 0.6491228070175439, 'eval_runtime': 0.342, 'eval_samples_per_second': 228.066, 'eval_steps_per_second': 29.239, 'epoch': 7.0}


 40%|████      | 80/200 [00:17<00:17,  6.81it/s]
 40%|████      | 80/200 [00:18<00:17,  6.81it/s]

{'eval_loss': 0.5687446594238281, 'eval_AUC': 0.6649958228905597, 'eval_runtime': 0.34, 'eval_samples_per_second': 229.427, 'eval_steps_per_second': 29.414, 'epoch': 8.0}


 45%|████▌     | 90/200 [00:20<00:16,  6.85it/s]
 45%|████▌     | 90/200 [00:20<00:16,  6.85it/s]

{'eval_loss': 0.6990585327148438, 'eval_AUC': 0.5981620718462823, 'eval_runtime': 0.3509, 'eval_samples_per_second': 222.266, 'eval_steps_per_second': 28.496, 'epoch': 9.0}


 50%|█████     | 100/200 [00:22<00:14,  6.74it/s]
 50%|█████     | 100/200 [00:22<00:14,  6.74it/s]

{'eval_loss': 0.8085477352142334, 'eval_AUC': 0.606516290726817, 'eval_runtime': 0.35, 'eval_samples_per_second': 222.884, 'eval_steps_per_second': 28.575, 'epoch': 10.0}


 55%|█████▌    | 110/200 [00:24<00:13,  6.83it/s]
 55%|█████▌    | 110/200 [00:25<00:13,  6.83it/s]

{'eval_loss': 1.00529944896698, 'eval_AUC': 0.6466165413533834, 'eval_runtime': 0.3489, 'eval_samples_per_second': 223.544, 'eval_steps_per_second': 28.66, 'epoch': 11.0}


 60%|██████    | 120/200 [00:26<00:11,  6.79it/s]
 60%|██████    | 120/200 [00:27<00:11,  6.79it/s]

{'eval_loss': 1.1588016748428345, 'eval_AUC': 0.6090225563909775, 'eval_runtime': 0.3786, 'eval_samples_per_second': 205.996, 'eval_steps_per_second': 26.41, 'epoch': 12.0}


 65%|██████▌   | 130/200 [00:29<00:10,  6.80it/s]
 65%|██████▌   | 130/200 [00:29<00:10,  6.80it/s]

{'eval_loss': 1.279964804649353, 'eval_AUC': 0.6115288220551378, 'eval_runtime': 0.3511, 'eval_samples_per_second': 222.153, 'eval_steps_per_second': 28.481, 'epoch': 13.0}


 65%|██████▌   | 130/200 [00:30<00:16,  4.30it/s]


{'train_runtime': 30.2468, 'train_samples_per_second': 51.576, 'train_steps_per_second': 6.612, 'train_loss': 0.38888611426720254, 'epoch': 13.0}


100%|██████████| 10/10 [00:00<00:00, 35.56it/s]

Fold 2 Validation AUC: 0.6649958228905597





In [4]:
# Summarize cross-validation performance across folds.
# np.std's default (ddof=0) is used, i.e. the population standard deviation.
mean_val_auc = np.asarray(val_auc_scores).mean()
std_val_auc = np.asarray(val_auc_scores).std()

# Report both statistics rounded to four decimals
print(f"Validation AUC: Mean = {mean_val_auc:.4f}, Standard Deviation = {std_val_auc:.4f}")

Validation AUC: Mean = 0.6739, Standard Deviation = 0.0089


In [5]:
# Pick the fold with the highest validation AUC (first one wins on ties)
best_fold_idx = np.asarray(val_auc_scores).argmax()
# Look up where that fold's model was saved
best_model_dir = output_dirs[best_fold_idx]
print(f"Best model found in fold {best_fold_idx + 1} with Validation AUC: {val_auc_scores[best_fold_idx]}")


Best model found in fold 1 with Validation AUC: 0.6827586206896551


In [6]:
# Reload the best fold's saved model and score it on the held-out test set.
#
# NOTE(review): Trainer, torch and roc_auc_score are not imported explicitly in
# this notebook — presumably they arrive through `from mod.mod_text import *`
# (TODO: confirm and import them explicitly).

# This cell builds the full training dataset and the training arguments for a
# final training pass, but it has never actually called trainer.train(): the
# reported test AUC is that of the fold model as saved. The flag below makes
# that choice explicit. Leave it False to reproduce the existing behavior; set
# it to True to continue training on the full training set before scoring.
RUN_FINAL_TRAINING = False

# Reuse the directory recorded during cross-validation rather than rebuilding
# the path by hand, so the two can never drift apart.
best_model_dir = output_dirs[best_fold_idx]
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_dir)

full_train_dataset = TextDataset(train_texts_all, train_labels_all, tokenizer)

# Define training arguments for the final training phase
final_training_args = TrainingArguments(
    output_dir="./LongShortTF/final_model",       # Directory to save the final model
    evaluation_strategy="no",         # No evaluation during training
    save_strategy="no",               # Do not write checkpoints during training
    save_total_limit=1,               # Only relevant if checkpoint saving is enabled
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10000,              # Minimize logging output
    report_to="none"                  # Disable logging to external tools
)

# Initialize the Trainer with the full dataset and final training arguments
trainer = Trainer(
    model=best_model,
    args=final_training_args,
    train_dataset=full_train_dataset
)

# Optional final training pass on all training data (off by default, see above)
if RUN_FINAL_TRAINING:
    trainer.train()

# Evaluate on the test set
test_results = trainer.predict(test_dataset)
# Calculate AUC on the test data: softmax over the two logits, column 1 is used
# as the positive-class score
test_probs = torch.nn.functional.softmax(torch.tensor(test_results.predictions), dim=1)[:, 1].numpy()
test_auc = roc_auc_score(test_results.label_ids, test_probs)
print(f"Test AUC with the best model: {test_auc}")


100%|██████████| 3/3 [00:01<00:00,  2.67it/s]

Test AUC with the best model: 0.2615384615384615





In [9]:
import csv

# File path
output_file = "./res/2024_1119/auc.csv"

# Make sure the destination folder exists: open(..., "w") does not create
# missing parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Save the numbers to a CSV file (newline="" is what the csv module expects
# so that it controls line endings itself)
with open(output_file, mode="w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["mean_validation_auc", "test_auc"])  # Header row
    writer.writerow([f"{mean_val_auc:.4f}", f"{test_auc:.4f}"])     # Data row

print(f"Numbers saved to {output_file}")

Numbers saved to ./res/2024_1119/auc.csv
