In [1]:
import os
from torch import nn, optim
from sentence_transformers import SentenceTransformer
import pickle
# Specify the working directory
os.chdir('/Users/david/Desktop/FinetuneEmbed')

from mod.mod import *

  from tqdm.autonotebook import tqdm, trange
  Referenced from: <9A4710B9-0DA3-36BB-9129-645F282E64B2> /Users/david/anaconda3/envs/myenv/lib/python3.10/site-packages/torchvision/image.so
  warn(


In [2]:
# Set the device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load a pre-trained SBERT model
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [3]:
# prepare the input data
with open("./data/long_vs_shortTF/train_data.pkl", "rb") as f:
    train_data = pickle.load(f)
with open("./data/long_vs_shortTF/test_data.pkl", "rb") as f:
    test_data = pickle.load(f)


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Define your dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        # Remove the batch dimension added by the tokenizer (squeeze the single dimension)
        encoding = {key: value.squeeze(0) for key, value in encoding.items()}
        encoding["label"] = torch.tensor(label, dtype=torch.long)

        return encoding

# Load model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Choose an appropriate Sentence BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prepare datasets
train_texts, train_labels = train_data['desc'], train_data['labels']   # your training texts and labels
test_texts, test_labels = test_data['desc'], test_data['labels']    # your test texts and labels

train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

class CustomTrainer(Trainer):
    def __init__(self, *args, eval_metric="AUC", **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_metric = eval_metric
        self.reduce_lr_scheduler = None  # Initialize as None

    def train(self, *args, **kwargs):
        # Initialize the optimizer and standard scheduler
        output = super().train(*args, **kwargs)
        
        # Create ReduceLROnPlateau scheduler after the optimizer has been created
        self.reduce_lr_scheduler = ReduceLROnPlateau(self.optimizer, mode="max", factor=0.8, patience=2, verbose=True)
        return output

    def evaluate(self, *args, **kwargs):
        # Evaluate and store the results
        eval_output = super().evaluate(*args, **kwargs)
        
        # Access the chosen evaluation metric and step the scheduler
        metric_value = eval_output[f"eval_{self.eval_metric}"]
        
        # Step the scheduler if it’s initialized
        if self.reduce_lr_scheduler:
            self.reduce_lr_scheduler.step(metric_value)
        
        return eval_output

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    weight_decay=0.01,
    # max_grad_norm=1.0,
    # warmup_ratio=0.1,
    metric_for_best_model="AUC",
    greater_is_better=True
)

# Define the compute_metrics function for AUC
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()  # Get probability of the positive class
    auc = roc_auc_score(labels, probs)
    return {"AUC": auc}

# Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
# )

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    eval_metric="AUC",  # Specify the metric to monitor for learning rate adjustment
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Fine-tune the model
trainer.train()

# Evaluate on the test set
results = trainer.evaluate()
print("Test AUC:", results["eval_AUC"])



  0%|          | 0/480 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.6320387721061707, 'eval_AUC': 0.7216117216117216, 'eval_runtime': 1.8822, 'eval_samples_per_second': 28.158, 'eval_steps_per_second': 3.719, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5964165925979614, 'eval_AUC': 0.7472527472527473, 'eval_runtime': 0.2155, 'eval_samples_per_second': 245.952, 'eval_steps_per_second': 32.484, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5809365510940552, 'eval_AUC': 0.6465201465201466, 'eval_runtime': 0.2222, 'eval_samples_per_second': 238.572, 'eval_steps_per_second': 31.51, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5780684351921082, 'eval_AUC': 0.6666666666666667, 'eval_runtime': 0.2581, 'eval_samples_per_second': 205.344, 'eval_steps_per_second': 27.121, 'epoch': 4.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5768019556999207, 'eval_AUC': 0.5842490842490842, 'eval_runtime': 0.217, 'eval_samples_per_second': 244.227, 'eval_steps_per_second': 32.256, 'epoch': 5.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5748694539070129, 'eval_AUC': 0.5952380952380952, 'eval_runtime': 0.261, 'eval_samples_per_second': 203.071, 'eval_steps_per_second': 26.821, 'epoch': 6.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.5735970139503479, 'eval_AUC': 0.6190476190476191, 'eval_runtime': 0.2552, 'eval_samples_per_second': 207.712, 'eval_steps_per_second': 27.434, 'epoch': 7.0}
{'train_runtime': 29.1725, 'train_samples_per_second': 124.432, 'train_steps_per_second': 16.454, 'train_loss': 0.6104997226170131, 'epoch': 7.0}


  0%|          | 0/7 [00:00<?, ?it/s]

Test AUC: 0.7472527472527473
