In [1]:
import torch
from torch import nn, optim
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

  from tqdm.autonotebook import tqdm, trange
  Referenced from: <9A4710B9-0DA3-36BB-9129-645F282E64B2> /Users/david/anaconda3/envs/myenv/lib/python3.10/site-packages/torchvision/image.so
  warn(


In [2]:
print(torch.__version__)

2.5.1


In [3]:
# Set the device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a pre-trained SBERT model
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [4]:
# Define custom classification model with SBERT embeddings
class SBERTClassifier(nn.Module):
    def __init__(self, sbert_model):
        super(SBERTClassifier, self).__init__()
        self.sbert_model = sbert_model
        self.classifier = nn.Sequential(
            nn.Linear(self.sbert_model.get_sentence_embedding_dimension(), 1),
            nn.Sigmoid()  # Sigmoid to output probabilities between 0 and 1
        )
    
    def forward(self, input_texts):
        # Encode the input texts using SBERT
        embeddings = self.sbert_model.encode(input_texts, convert_to_tensor=True)
        logits = self.classifier(embeddings)
        return logits


In [5]:
# Example Dataset class
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], torch.tensor(self.labels[idx], dtype=torch.float32)

In [6]:
# Define sample data (replace with your actual dataset)
train_texts = ["I love programming", "Python is great", "I dislike AI", "Coding is fun"]
train_labels = [1, 1, 0, 1]  # Positive class
val_texts = ["I hate bugs", "Debugging is frustrating", "I like eating"]
val_labels = [0, 0, 1]  # Negative class
test_texts = ["Learning AI is amazing", "Errors are annoying"]
test_labels = [1, 0]

# Create datasets and data loaders
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)
test_dataset = TextDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)
test_loader = DataLoader(test_dataset, batch_size=2)

# Initialize model, criterion, and optimizer
model = SBERTClassifier(sbert_model).to(device)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=1e-5)


In [None]:
model(train_texts).squeeze().cpu().detach().numpy()

In [None]:
for texts, labels in train_loader:
    print(texts)
    print(labels)

In [19]:
import numpy as np
# Training function with AUC calculation on validation set
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for texts, labels in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            texts = [text for text in texts]
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(texts).squeeze()  # Remove extra dimension
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")
        
        # Validate after each epoch
        auc = evaluate_model(model, val_loader)
        print(f"Validation AUC after Epoch {epoch + 1}: {auc:.4f}")

# Evaluation function for AUC
def evaluate_model(model, data_loader):
    model.eval()
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for texts, labels in data_loader:
            texts = [text for text in texts]
            labels = labels.to(device)
            
            # Get predicted probabilities
            outputs = model(texts).squeeze()  # Remove extra dimension
            all_labels.extend(labels.cpu().detach().numpy())

            outputs = outputs.cpu().detach().numpy()
            if outputs.ndim == 0:
                outputs = outputs.reshape(1)

            print(outputs)
            all_probs.extend(outputs)
    # Calculate AUC
    auc = roc_auc_score(all_labels, all_probs)
    model.train()  # Set model back to training mode
    return auc

In [None]:
# Test function to calculate AUC on the test set
def test_model(model, data_loader):
    model.eval()
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for texts, labels in data_loader:
            texts = [text for text in texts]
            labels = labels.to(device)
            
            outputs = model(texts).squeeze()
            all_labels.extend(labels.cpu().detach().numpy())
            all_probs.extend(outputs.cpu().detach().numpy())
    
    # Calculate AUC
    auc = roc_auc_score(all_labels, all_probs)
    print(f"Test AUC: {auc:.4f}")

In [20]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)

Training Epoch 1: 100%|██████████| 2/2 [00:00<00:00,  6.67it/s]


Epoch [1/10], Loss: 0.7047
[0.4892356 0.4817939]
[0.49916002]
Validation AUC after Epoch 1: 1.0000


Training Epoch 2: 100%|██████████| 2/2 [00:00<00:00, 49.28it/s]


Epoch [2/10], Loss: 0.7047
[0.48924395 0.48181123]
[0.49917406]
Validation AUC after Epoch 2: 1.0000


Training Epoch 3: 100%|██████████| 2/2 [00:00<00:00, 27.10it/s]


Epoch [3/10], Loss: 0.7046
[0.48925275 0.48182866]
[0.49918854]
Validation AUC after Epoch 3: 1.0000


Training Epoch 4: 100%|██████████| 2/2 [00:00<00:00, 53.52it/s]


Epoch [4/10], Loss: 0.7045
[0.48926187 0.4818462 ]
[0.4992033]
Validation AUC after Epoch 4: 1.0000


Training Epoch 5: 100%|██████████| 2/2 [00:00<00:00, 53.21it/s]


Epoch [5/10], Loss: 0.7045
[0.48927328 0.48186508]
[0.49922004]
Validation AUC after Epoch 5: 1.0000


Training Epoch 6: 100%|██████████| 2/2 [00:00<00:00, 28.65it/s]

Epoch [6/10], Loss: 0.7044





[0.48928145 0.48188192]
[0.49923393]
Validation AUC after Epoch 6: 1.0000


Training Epoch 7: 100%|██████████| 2/2 [00:00<00:00, 28.51it/s]

Epoch [7/10], Loss: 0.7043





[0.4892903 0.4818994]
[0.4992484]
Validation AUC after Epoch 7: 1.0000


Training Epoch 8: 100%|██████████| 2/2 [00:00<00:00, 26.97it/s]


Epoch [8/10], Loss: 0.7042
[0.4893012 0.4819178]
[0.49926472]
Validation AUC after Epoch 8: 1.0000


Training Epoch 9: 100%|██████████| 2/2 [00:00<00:00, 51.97it/s]


Epoch [9/10], Loss: 0.7042
[0.4893101 0.4819353]
[0.49927923]
Validation AUC after Epoch 9: 1.0000


Training Epoch 10: 100%|██████████| 2/2 [00:00<00:00, 50.24it/s]


Epoch [10/10], Loss: 0.7041
[0.489321   0.48195368]
[0.49929544]
Validation AUC after Epoch 10: 1.0000


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import roc_auc_score
import numpy as np

# Load the model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # or any other sentence-BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prepare your data
texts = ["description 1", "description 2", "description 3", "description 4"]  # Replace with your data
labels = [0, 1, 0, 1]  # Replace with your binary labels

# Tokenize the texts
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Create a dataset
dataset = Dataset.from_dict({"input_ids": encodings["input_ids"], 
                             "attention_mask": encodings["attention_mask"], 
                             "labels": labels})

# Split into train and eval
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Define the AUC computation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    auc = roc_auc_score(labels, logits[:, 1])  # Use probabilities of the positive class
    return {"AUC": auc}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Match save strategy to evaluation
    load_best_model_at_end=True,  # Track best model based on evaluation
    metric_for_best_model="AUC",  # Track AUC to select the best model
    greater_is_better=True  # AUC is higher when better
)

# Define the Trainer with AUC as the evaluation metric
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

  Referenced from: <9A4710B9-0DA3-36BB-9129-645F282E64B2> /Users/david/anaconda3/envs/myenv/lib/python3.10/site-packages/torchvision/image.so
  warn(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.