In [1]:
import os
# Specify the working directory. The relative "./data" and "./results" paths used
# later in this notebook are resolved against it.
# Set the FINETUNE_EMBED_DIR environment variable to run on another machine; the
# fallback is the original author's local checkout, so existing runs are unchanged.
os.chdir(os.environ.get("FINETUNE_EMBED_DIR", '/Users/david/Desktop/FinetuneEmbed'))
import pickle

import numpy as np
import torch
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    set_seed,
)

from mod.mod_text import *

# Read the pickled train/test splits for the bivalent-vs-no-methylation task.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_data = _read_pickle("./data/MethylationState/bivalent_vs_no_methyl/train_data.pkl")
test_data = _read_pickle("./data/MethylationState/bivalent_vs_no_methyl/test_data.pkl")

# Text descriptions ('desc') and their class labels ('labels') for each split
train_texts_all = train_data['desc']
train_labels_all = train_data['labels']
test_texts = test_data['desc']
test_labels = test_data['labels']

# Pretrained checkpoint to fine-tune, and its matching tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

  Referenced from: <9A4710B9-0DA3-36BB-9129-645F282E64B2> /Users/david/anaconda3/envs/myenv/lib/python3.10/site-packages/torchvision/image.so
  warn(


In [2]:
# Cross-validation setup: stratified folds, shuffled with a fixed seed so the
# split itself is repeatable.
n_splits = 5  # Number of folds
kf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=10,
)

# Per-fold bookkeeping, appended to once per fold
output_dirs = []     # output directory used by each fold
val_auc_scores = []  # validation AUC obtained on each fold

# Held-out test split wrapped as a dataset
test_dataset = TextDataset(test_texts, test_labels, tokenizer)

In [3]:
# Stratified k-fold cross-validation: for every fold, fine-tune a fresh classifier
# on the fold's training split, select the best epoch by validation AUC, and record
# that AUC in `val_auc_scores` (and the fold's directory in `output_dirs`).
for fold, (train_index, val_index) in enumerate(kf.split(train_texts_all, train_labels_all)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Split data into training and validation for this fold
    train_texts, val_texts = [train_texts_all[i] for i in train_index], [train_texts_all[i] for i in val_index]
    train_labels, val_labels = [train_labels_all[i] for i in train_index], [train_labels_all[i] for i in val_index]

    # Create PyTorch datasets
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)

    # Define output directory for this fold
    output_dir = f"./results/BivalentNoMethyl/fold_{fold + 1}"
    os.makedirs(output_dir, exist_ok=True)
    output_dirs.append(output_dir)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch", # Evaluate on the validation split after each epoch
        save_strategy="epoch", # Save the model after each epoch
        load_best_model_at_end=True, # Load the best model at the end of each fold
        save_total_limit=1, # Keep only the best model checkpoint
        learning_rate=1e-4, 
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=20, # Upper bound; early stopping (below) usually ends training sooner
        max_grad_norm=1.0,
        weight_decay=0.01,
        metric_for_best_model="AUC", # "Best" checkpoint = highest validation AUC
        greater_is_better=True
    )

    # Seed every RNG *before* the model is built. from_pretrained randomly initialises
    # the classification head (classifier.weight / classifier.bias are not in the
    # checkpoint), and that happens before the Trainer exists, so without this call
    # the head's starting weights, and hence the fold results, differ from run to run.
    # Reusing training_args.seed keeps a single source of truth for the seed.
    set_seed(training_args.seed)

    # Initialize the model and Trainer for this fold
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # CustomTrainer / compute_metrics are project helpers (not defined in this notebook);
    # `eval_metric` is a CustomTrainer-specific argument. Presumably it names the metric
    # reported as "eval_AUC"; verify against its definition.
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        eval_metric="AUC",
        # Stop once validation AUC has failed to improve for 5 consecutive evaluations
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
    )

    # Train the model on this fold
    trainer.train()

    # Evaluate on the validation set and save the best model's AUC
    # (load_best_model_at_end=True means the trainer now holds the best checkpoint)
    val_results = trainer.evaluate()
    val_auc = val_results["eval_AUC"]
    print(f"Fold {fold + 1} Validation AUC: {val_auc}")
    val_auc_scores.append(val_auc)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/5


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5794541239738464, 'eval_AUC': 0.8823529411764706, 'eval_runtime': 0.4153, 'eval_samples_per_second': 57.795, 'eval_steps_per_second': 7.224, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.4575110673904419, 'eval_AUC': 0.8739495798319328, 'eval_runtime': 0.1044, 'eval_samples_per_second': 229.967, 'eval_steps_per_second': 28.746, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.4493255615234375, 'eval_AUC': 0.8319327731092437, 'eval_runtime': 0.096, 'eval_samples_per_second': 250.029, 'eval_steps_per_second': 31.254, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5459502339363098, 'eval_AUC': 0.8571428571428571, 'eval_runtime': 0.0963, 'eval_samples_per_second': 249.281, 'eval_steps_per_second': 31.16, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.7707942128181458, 'eval_AUC': 0.8655462184873949, 'eval_runtime': 0.0974, 'eval_samples_per_second': 246.399, 'eval_steps_per_second': 30.8, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6160604953765869, 'eval_AUC': 0.8487394957983193, 'eval_runtime': 0.101, 'eval_samples_per_second': 237.603, 'eval_steps_per_second': 29.7, 'epoch': 6.0}
{'train_runtime': 21.5193, 'train_samples_per_second': 89.222, 'train_steps_per_second': 11.153, 'train_loss': 0.3243210580613878, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Fold 1 Validation AUC: 0.8823529411764706
Fold 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5831356048583984, 'eval_AUC': 0.9327731092436974, 'eval_runtime': 0.0967, 'eval_samples_per_second': 248.312, 'eval_steps_per_second': 31.039, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.3824063241481781, 'eval_AUC': 0.9495798319327731, 'eval_runtime': 0.0974, 'eval_samples_per_second': 246.531, 'eval_steps_per_second': 30.816, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.30977120995521545, 'eval_AUC': 1.0, 'eval_runtime': 0.0946, 'eval_samples_per_second': 253.718, 'eval_steps_per_second': 31.715, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.18067996203899384, 'eval_AUC': 1.0, 'eval_runtime': 0.1024, 'eval_samples_per_second': 234.446, 'eval_steps_per_second': 29.306, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.16879792511463165, 'eval_AUC': 1.0, 'eval_runtime': 0.0985, 'eval_samples_per_second': 243.568, 'eval_steps_per_second': 30.446, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.1399870663881302, 'eval_AUC': 0.9831932773109244, 'eval_runtime': 0.1369, 'eval_samples_per_second': 175.301, 'eval_steps_per_second': 21.913, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5634542107582092, 'eval_AUC': 0.9411764705882353, 'eval_runtime': 0.1018, 'eval_samples_per_second': 235.761, 'eval_steps_per_second': 29.47, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6414007544517517, 'eval_AUC': 0.9327731092436975, 'eval_runtime': 0.0962, 'eval_samples_per_second': 249.48, 'eval_steps_per_second': 31.185, 'epoch': 8.0}
{'train_runtime': 19.2136, 'train_samples_per_second': 99.929, 'train_steps_per_second': 12.491, 'train_loss': 0.29132992029190063, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 2 Validation AUC: 1.0
Fold 3/5


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5188202857971191, 'eval_AUC': 0.8907563025210085, 'eval_runtime': 0.0988, 'eval_samples_per_second': 242.881, 'eval_steps_per_second': 30.36, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.4193272888660431, 'eval_AUC': 0.8235294117647058, 'eval_runtime': 0.0979, 'eval_samples_per_second': 245.264, 'eval_steps_per_second': 30.658, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.40653195977211, 'eval_AUC': 0.8403361344537815, 'eval_runtime': 0.098, 'eval_samples_per_second': 244.903, 'eval_steps_per_second': 30.613, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.7280054092407227, 'eval_AUC': 0.8151260504201681, 'eval_runtime': 0.0962, 'eval_samples_per_second': 249.61, 'eval_steps_per_second': 31.201, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6680724024772644, 'eval_AUC': 0.9327731092436975, 'eval_runtime': 0.0979, 'eval_samples_per_second': 245.156, 'eval_steps_per_second': 30.645, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5444197058677673, 'eval_AUC': 0.8991596638655461, 'eval_runtime': 0.1642, 'eval_samples_per_second': 146.16, 'eval_steps_per_second': 18.27, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5517790913581848, 'eval_AUC': 0.9327731092436975, 'eval_runtime': 0.0987, 'eval_samples_per_second': 243.052, 'eval_steps_per_second': 30.382, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6804139614105225, 'eval_AUC': 0.9327731092436975, 'eval_runtime': 0.0996, 'eval_samples_per_second': 241.044, 'eval_steps_per_second': 30.13, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6421211361885071, 'eval_AUC': 0.9243697478991597, 'eval_runtime': 0.0987, 'eval_samples_per_second': 243.166, 'eval_steps_per_second': 30.396, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.8308438658714294, 'eval_AUC': 0.8907563025210083, 'eval_runtime': 0.0994, 'eval_samples_per_second': 241.432, 'eval_steps_per_second': 30.179, 'epoch': 10.0}
{'train_runtime': 24.6786, 'train_samples_per_second': 77.8, 'train_steps_per_second': 9.725, 'train_loss': 0.19348257382710773, 'epoch': 10.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Fold 3 Validation AUC: 0.9327731092436975
Fold 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5534107089042664, 'eval_AUC': 0.7815126050420168, 'eval_runtime': 0.0986, 'eval_samples_per_second': 243.329, 'eval_steps_per_second': 30.416, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5106108784675598, 'eval_AUC': 0.7983193277310924, 'eval_runtime': 0.0982, 'eval_samples_per_second': 244.357, 'eval_steps_per_second': 30.545, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5801253914833069, 'eval_AUC': 0.7815126050420168, 'eval_runtime': 0.0973, 'eval_samples_per_second': 246.721, 'eval_steps_per_second': 30.84, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6718123555183411, 'eval_AUC': 0.7815126050420168, 'eval_runtime': 0.1533, 'eval_samples_per_second': 156.55, 'eval_steps_per_second': 19.569, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0348540544509888, 'eval_AUC': 0.7394957983193278, 'eval_runtime': 0.0973, 'eval_samples_per_second': 246.626, 'eval_steps_per_second': 30.828, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.6767897605895996, 'eval_AUC': 0.7647058823529412, 'eval_runtime': 0.0973, 'eval_samples_per_second': 246.753, 'eval_steps_per_second': 30.844, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5242306590080261, 'eval_AUC': 0.8739495798319328, 'eval_runtime': 0.0979, 'eval_samples_per_second': 245.25, 'eval_steps_per_second': 30.656, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.716492235660553, 'eval_AUC': 0.8823529411764706, 'eval_runtime': 0.0976, 'eval_samples_per_second': 245.872, 'eval_steps_per_second': 30.734, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0146487951278687, 'eval_AUC': 0.7983193277310925, 'eval_runtime': 0.0977, 'eval_samples_per_second': 245.642, 'eval_steps_per_second': 30.705, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.9380818009376526, 'eval_AUC': 0.8067226890756303, 'eval_runtime': 0.0997, 'eval_samples_per_second': 240.797, 'eval_steps_per_second': 30.1, 'epoch': 10.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.9910576343536377, 'eval_AUC': 0.7983193277310925, 'eval_runtime': 0.0977, 'eval_samples_per_second': 245.685, 'eval_steps_per_second': 30.711, 'epoch': 11.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0726784467697144, 'eval_AUC': 0.8067226890756304, 'eval_runtime': 0.1244, 'eval_samples_per_second': 192.918, 'eval_steps_per_second': 24.115, 'epoch': 12.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.1213358640670776, 'eval_AUC': 0.8235294117647058, 'eval_runtime': 0.0984, 'eval_samples_per_second': 243.946, 'eval_steps_per_second': 30.493, 'epoch': 13.0}
{'train_runtime': 31.4578, 'train_samples_per_second': 61.034, 'train_steps_per_second': 7.629, 'train_loss': 0.16818546637510642, 'epoch': 13.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Fold 4 Validation AUC: 0.8823529411764706
Fold 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5051503777503967, 'eval_AUC': 0.907563025210084, 'eval_runtime': 0.1108, 'eval_samples_per_second': 216.585, 'eval_steps_per_second': 27.073, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.4475466310977936, 'eval_AUC': 0.9327731092436974, 'eval_runtime': 0.0982, 'eval_samples_per_second': 244.312, 'eval_steps_per_second': 30.539, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.651891827583313, 'eval_AUC': 0.9159663865546218, 'eval_runtime': 0.0993, 'eval_samples_per_second': 241.806, 'eval_steps_per_second': 30.226, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.7346649765968323, 'eval_AUC': 0.8991596638655462, 'eval_runtime': 0.0986, 'eval_samples_per_second': 243.484, 'eval_steps_per_second': 30.435, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0520241260528564, 'eval_AUC': 0.8739495798319328, 'eval_runtime': 0.0989, 'eval_samples_per_second': 242.753, 'eval_steps_per_second': 30.344, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.8220150470733643, 'eval_AUC': 0.8571428571428571, 'eval_runtime': 0.1128, 'eval_samples_per_second': 212.747, 'eval_steps_per_second': 26.593, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.9682524800300598, 'eval_AUC': 0.865546218487395, 'eval_runtime': 0.0981, 'eval_samples_per_second': 244.586, 'eval_steps_per_second': 30.573, 'epoch': 7.0}
{'train_runtime': 16.8291, 'train_samples_per_second': 114.088, 'train_steps_per_second': 14.261, 'train_loss': 0.22946419034685409, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Fold 5 Validation AUC: 0.9327731092436974


In [4]:
# Summarise cross-validation: mean and standard deviation (NumPy default, ddof=0)
# of the per-fold validation AUC scores
fold_aucs = np.asarray(val_auc_scores)
mean_val_auc = fold_aucs.mean()
std_val_auc = fold_aucs.std()

# Report the summary
summary_line = f"Validation AUC: Mean = {mean_val_auc:.4f}, Standard Deviation = {std_val_auc:.4f}"
print(summary_line)

Validation AUC: Mean = 0.9261, Standard Deviation = 0.0433


In [5]:
# Pick the fold with the highest validation AUC (the first such fold on ties)
best_fold_idx = np.asarray(val_auc_scores).argmax()
# Output directory of that fold; its checkpoints live in sub-directories of it
best_model_dir = output_dirs[best_fold_idx]
best_fold_message = f"Best model found in fold {best_fold_idx + 1} with Validation AUC: {val_auc_scores[best_fold_idx]}"
print(best_fold_message)


Best model found in fold 2 with Validation AUC: 1.0


In [7]:
# Evaluate the best cross-validation model on the held-out test set.
#
# NOTE(review): this cell builds a Trainer with the full training set and training
# hyper-parameters, but it never calls `trainer.train()`; the Trainer is used only
# for `predict`. The test AUC printed below is therefore that of the fold checkpoint
# exactly as saved during cross-validation.
# TODO: confirm whether a final fit on the full training set was intended.

# Best checkpoint of the winning fold. The path is pinned by hand to the run
# recorded in this notebook (fold 2); update it if cross-validation is re-run.
best_model_dir = './results/BivalentNoMethyl/fold_2/checkpoint-36'
if not os.path.isdir(best_model_dir):
    # Fail early with an actionable message instead of letting from_pretrained
    # try to interpret a missing local path as something else.
    raise FileNotFoundError(
        f"Checkpoint directory not found: {best_model_dir!r}. "
        "Run the cross-validation cell first, or point best_model_dir at the "
        "best checkpoint of the winning fold."
    )
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_dir)

# Full training set as a dataset (handed to the Trainer, but unused unless train() is called)
full_train_dataset = TextDataset(train_texts_all, train_labels_all, tokenizer)

# Arguments for the Trainer used below. The optimisation settings only take effect
# if `trainer.train()` is called.
final_training_args = TrainingArguments(
    # Was "./LongShortTF/final_model", apparently left over from another experiment;
    # kept under this task's results directory like the per-fold outputs.
    output_dir="./results/BivalentNoMethyl/final_model",
    evaluation_strategy="no",         # No evaluation during training
    save_strategy="no",               # Do not write checkpoints
    save_total_limit=1,               # Only relevant if checkpoint saving is enabled
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10000,              # Minimize logging output
    report_to="none"                  # Disable logging to external tools
)

# Wrap the loaded model in a plain Trainer so `predict` can be used for batched inference
trainer = Trainer(
    model=best_model,
    args=final_training_args,
    train_dataset=full_train_dataset
)

# Predict on the test set
test_results = trainer.predict(test_dataset)
# Convert the two-class logits to the probability of class 1, then compute AUC on the test data
test_probs = torch.nn.functional.softmax(torch.tensor(test_results.predictions), dim=1)[:, 1].numpy()
test_auc = roc_auc_score(test_results.label_ids, test_probs)
print(f"Test AUC with the best model: {test_auc}")



  0%|          | 0/2 [00:00<?, ?it/s]

Test AUC with the best model: 0.975
