# Load Data

In [2]:
import json
import torch
import random
import optuna
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from torch import nn
import numpy as np
from scipy.linalg import inv
from torch.optim import Adam
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve
from scipy.spatial import distance
from torch.utils.tensorboard import SummaryWriter
import uuid

ModuleNotFoundError: No module named 'tensorboard'

# Load BANKING77

In [1]:
# Define the IntentExample class and load_intent_examples function as provided
class IntentExample:
    """One utterance together with its intent label.

    Attributes:
        original_text: the utterance exactly as passed in.
        text: the utterance used downstream; lower-cased when ``do_lower_case`` is truthy.
        label: the intent label, stored unchanged.
    """

    def __init__(self, text, label, do_lower_case):
        self.original_text = text
        # Keep the raw casing unless the caller asked for lower-casing.
        self.text = text.lower() if do_lower_case else text
        self.label = label

def load_intent_examples(file_path, do_lower_case=True):
    """Read one dataset split from ``file_path`` into a list of ``IntentExample``.

    The directory must contain two parallel UTF-8 text files: ``seq.in`` (one
    utterance per line) and ``label`` (one intent label per line). Lines are
    paired positionally and stripped of surrounding whitespace; pairing stops
    at the end of the shorter file.

    Args:
        file_path: directory holding ``seq.in`` and ``label``.
        do_lower_case: forwarded to ``IntentExample``; lower-cases the text when truthy.

    Returns:
        list of ``IntentExample`` in file order.
    """
    text_path = f'{file_path}/seq.in'
    label_path = f'{file_path}/label'
    with open(text_path, 'r', encoding="utf-8") as f_text:
        with open(label_path, 'r', encoding="utf-8") as f_label:
            return [
                IntentExample(raw_text.strip(), raw_label.strip(), do_lower_case)
                for raw_text, raw_label in zip(f_text, f_label)
            ]

# Root of the BANKING77-OOS dataset and the sub-directory that holds each split
base_dir = 'Few-Shot-Intent-Detection/Datasets/BANKING77-OOS'
split_subdirs = {
    'train': 'train',
    'valid': 'valid',
    'test': 'test',
    'oos_val': 'ood-oos/valid',
    'oos_test': 'ood-oos/test',
}
paths = {key: f'{base_dir}/{subdir}' for key, subdir in split_subdirs.items()}

# Load every split into a list of IntentExample objects
datasets = {}
for key, path in paths.items():
    datasets[key] = load_intent_examples(path)


def _texts_and_labels(split):
    """Return the ``(texts, labels)`` lists of one loaded split."""
    examples = datasets[split]
    return [ex.text for ex in examples], [ex.label for ex in examples]


# In-domain splits: sentences plus their intent labels
train_sentences, train_labels = _texts_and_labels('train')
val_sentences, val_labels = _texts_and_labels('valid')
test_sentences, test_labels = _texts_and_labels('test')

# Out-of-scope splits: only the sentences are used
oos_val_sentences = _texts_and_labels('oos_val')[0]
oos_test_sentences = _texts_and_labels('oos_test')[0]

# Unique checkpoint file name for this notebook session
model_name = str(uuid.uuid4()) + "improved_ce_model_bert_BANKING77.pth"


NameError: name 'uuid' is not defined

# Encode Labels

In [5]:
label_encoder = LabelEncoder()
# Fit the label encoder on the TRAINING labels only and transform them to integers.
encoded_train_labels = label_encoder.fit_transform(train_labels)
# Reuse the training mapping for validation. Calling fit_transform here would
# re-fit the encoder on the validation labels, so the integer ids could differ
# from the training ids whenever the two label sets are not identical (and
# label_encoder.classes_ would silently describe the validation set instead).
# transform() raises ValueError on a label that was never seen in training.
encoded_val_labels = label_encoder.transform(val_labels)

# Tokenize our sentences and create Dataloaders

In [6]:
# Hugging Face checkpoint used for both the tokenizer and the encoder.
pretrained_model_name = "bert-base-uncased"
# Module-level tokenizer: used to build the datasets and as the default
# tokenizer of encode_sentences.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# Module-level pretrained encoder (objective() loads its own fresh copy per trial).
transformer_model = AutoModel.from_pretrained(pretrained_model_name)

class TextDataset(Dataset):
    """Map-style dataset of pre-tokenized sentences with their labels.

    All sentences are tokenized once at construction time (truncated to
    ``max_length`` and padded by the tokenizer). Each item is a dict holding
    one tensor per tokenizer output field plus a ``'labels'`` tensor.
    """

    def __init__(self, sentences, labels, tokenizer, max_length=512):
        self.labels = labels
        self.encodings = tokenizer(
            sentences,
            truncation=True,
            padding=True,
            max_length=max_length,
        )

    def __len__(self):
        # One item per label.
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item



In [7]:
# 1. Measure every training sentence in tokens (special tokens included)
tokenized_lengths = []
for sentence in train_sentences:
    token_ids = tokenizer.encode(sentence, add_special_tokens=True)
    tokenized_lengths.append(len(token_ids))
max_length = max(tokenized_lengths)
print(f"Max length for tokenizer: {max_length}")

# 2. Create the datasets, truncating to the longest training sentence
train_dataset = TextDataset(
    train_sentences, encoded_train_labels, tokenizer, max_length
)
val_dataset = TextDataset(
    val_sentences, encoded_val_labels, tokenizer, max_length
)

Max length for tokenizer: 98


# Define functions to encode our sentences

In [7]:
# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Place the pretrained encoder on that device and switch it to inference mode
transformer_model = transformer_model.to(device).eval()
def encode_sentences(model, sentences, tokenizer=tokenizer, batch_size=256):
    """Embed sentences with ``model`` using attention-masked mean pooling.

    Sentences are tokenized and encoded in batches of ``batch_size``. Each
    sentence embedding is the mean of its token states in
    ``last_hidden_state``, counting only real tokens (``attention_mask == 1``).
    Padding positions are excluded so that an embedding does not depend on how
    much padding its batch happened to need.

    The caller is responsible for putting ``model`` in eval mode; this
    function does not change the model's training flag.

    Args:
        model: encoder whose output exposes ``last_hidden_state``; it is moved
            to the module-level ``device``.
        sentences: sliceable sequence of strings.
        tokenizer: callable tokenizer returning a mapping of tensors when
            called with ``return_tensors='pt'``.
        batch_size: number of sentences encoded per forward pass.

    Returns:
        numpy array with one row per sentence. For empty input the array has
        zero rows (and ``model.config.hidden_size`` columns when available).
    """
    model = model.to(device)

    # torch.cat cannot concatenate an empty list, so answer empty input directly.
    if len(sentences) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    sentence_embeddings = []

    # Process sentences in batches
    for start in range(0, len(sentences), batch_size):
        batch_sentences = sentences[start:start + batch_size]
        encoded_input = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Move the batch to the same device as the model
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

        with torch.no_grad():
            token_states = model(**encoded_input).last_hidden_state
            attention_mask = encoded_input.get("attention_mask")
            if attention_mask is None:
                # No mask available: fall back to a plain mean over all positions.
                pooled_output = token_states.mean(dim=1)
            else:
                # Average over real tokens only; the clamp guards against an all-zero mask.
                weights = attention_mask.unsqueeze(-1).to(token_states.dtype)
                pooled_output = (token_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

        # Off-load each batch right away so GPU memory does not grow with the dataset.
        sentence_embeddings.append(pooled_output.cpu())

    # Concatenate all batched embeddings into one array
    return torch.cat(sentence_embeddings, dim=0).numpy()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Define our model

In [9]:
import torch.nn as nn

class TextClassifier(nn.Module):
    """Transformer encoder followed by a single linear intent classifier.

    The sentence embedding is the element-wise maximum of the encoder's
    ``last_hidden_state`` over the sequence dimension.
    """

    def __init__(self, transformer_model, num_labels):
        super().__init__()
        self.transformer = transformer_model
        hidden_size = self.transformer.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return ``(logits, sentence_embedding)`` for a batch of token ids."""
        encoder_out = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Max-pool the token states over the sequence axis (dim 1).
        sentence_embedding, _ = encoder_out.last_hidden_state.max(dim=1)

        # Linear head on top of the pooled embedding
        return self.classifier(sentence_embedding), sentence_embedding


# Initialize everything else we need

In [10]:
# Distinct intent names found in the training labels
unique_intents = list({label for label in train_labels})
# Module-level loss histories (objective() keeps its own per-trial lists)
training_losses, validation_losses = [], []

In [11]:
def euclidean_distance_loss(embeddings):
    """Mean squared distance of each embedding to the mean of the other embeddings.

    For every row ``i`` of the ``(n, k)`` input, the leave-one-out mean of the
    remaining ``n - 1`` rows is computed, and the squared Euclidean distance
    between row ``i`` and that mean is accumulated. The sum is divided by
    ``n * k``.

    The leave-one-out mean is obtained in closed form as
    ``(column_sum - row_i) / (n - 1)``, which replaces the per-sample Python
    loop with a single vectorized expression.

    Args:
        embeddings: 2-D tensor of shape ``(n, k)`` (batch size, embedding dimension).

    Returns:
        Scalar tensor with the loss. When ``n < 2`` there are no "other"
        samples, so the loss is a zero that stays attached to the autograd
        graph. The mean of an empty slice would otherwise be NaN and corrupt
        the weights on ``backward()``.
    """
    n, k = embeddings.size()  # n is the batch size, k is the embedding dimension

    if n < 2:
        # e.g. a final batch holding a single sample
        return embeddings.sum() * 0.0

    # Mean of all rows except the current one, for every row at once
    column_sum = embeddings.sum(dim=0, keepdim=True)
    leave_one_out_mean = (column_sum - embeddings) / (n - 1)

    # Sum of squared distances, averaged over samples and embedding dimensions
    squared_distances = (embeddings - leave_one_out_mean).pow(2).sum()
    return squared_distances / (n * k)

In [9]:
def min_mahalanobis_for_sample(sample, intent_means, cov_inverse):
    """Return the smallest Mahalanobis distance from ``sample`` to any intent mean.

    Args:
        sample: 1-D embedding vector.
        intent_means: mapping whose values are the per-intent mean vectors.
        cov_inverse: inverse covariance matrix passed to ``distance.mahalanobis``.

    Returns:
        The minimum distance over all values of ``intent_means``.
    """
    return min(
        distance.mahalanobis(sample, centroid, cov_inverse)
        for centroid in intent_means.values()
    )

In [None]:
def objective(trial):
    """Optuna objective: fine-tune a BERT classifier and score it on OOD detection.

    For the hyperparameters suggested by ``trial`` (learning rate, batch size
    and the weight of the embedding-spread loss) this function:

    1. trains a fresh ``TextClassifier`` for a fixed number of epochs on
       ``train_dataset`` with
       ``CrossEntropyLoss + ed_loss_importance * euclidean_distance_loss``;
    2. saves the model to ``model_name`` whenever the average validation loss
       improves;
    3. reloads the best checkpoint, embeds the train / validation /
       OOS-validation sentences with ``encode_sentences``, and scores every
       sample by its minimum Mahalanobis distance to the per-intent training
       means (shared covariance estimated on all training embeddings);
    4. returns the average precision of separating OOS (positive class) from
       in-domain validation samples.

    Per-epoch losses are stored on the trial as the user attributes
    ``training_losses`` and ``validation_losses``; validation CE / ED losses
    are also written to TensorBoard.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: AUPR of the OOD detector built from the best checkpoint.

    Raises:
        optuna.TrialPruned: if the validation loss never improved (e.g. it was
            NaN/inf in every epoch), so no checkpoint belongs to this trial.
    """
    # Re-seed every trial so trials differ only in their hyperparameters.
    seed_value = 42
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    # Optuna suggests hyperparameters
    lr = trial.suggest_categorical('lr', [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3])
    num_epochs = 25
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
    # NOTE(review): the 0.05-0.3 range is not a multiple of step=0.02, so Optuna
    # presumably warns and lowers the upper bound - confirm this is intended.
    ed_loss_importance = trial.suggest_float('ed_loss_importance', 0.05, 0.3, step = 0.02)

    training_losses = []
    validation_losses = []
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # Model setup: a fresh pretrained encoder per trial
    transformer_model = AutoModel.from_pretrained(pretrained_model_name)
    model = TextClassifier(transformer_model, len(unique_intents))
    if not torch.cuda.is_available():
        print("cuda not available")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Loss function and optimizer
    loss_function = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=lr)
    best_val_loss = float('inf')
    # Tracks whether THIS trial wrote model_name. The file name is shared by
    # all trials, so a leftover checkpoint from an earlier trial may exist.
    checkpoint_saved = False

    writer = SummaryWriter()
    try:
        for epoch in range(num_epochs):
            # Training Phase
            model.train()  # Set the model to training mode
            total_train_loss = 0
            for batch in train_dataloader:
                input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
                optimizer.zero_grad()  # Zero the gradients
                predictions, embeddings = model(input_ids, attention_mask)  # Forward pass
                ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
                ed_loss = euclidean_distance_loss(embeddings)  # Euclidean distance loss
                total_loss = ce_loss + ed_loss_importance * ed_loss  # Combine the losses

                total_loss.backward()  # Backward pass
                optimizer.step()  # Update weights

                total_train_loss += total_loss.item()

            avg_train_loss = total_train_loss / len(train_dataloader)
            training_losses.append(avg_train_loss)

            # Validation Phase
            model.eval()  # Set the model to evaluation mode
            total_val_loss = 0
            total_ce_loss = 0
            total_ed_loss = 0
            with torch.no_grad():  # Disable gradient calculations
                for batch in val_dataloader:
                    input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
                    predictions, embeddings = model(input_ids, attention_mask)  # Forward pass
                    ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
                    ed_loss = euclidean_distance_loss(embeddings)  # Euclidean distance loss
                    total_loss = ce_loss + ed_loss_importance * ed_loss  # Combine the losses
                    total_val_loss += total_loss.item()
                    total_ce_loss += ce_loss.item()
                    total_ed_loss += ed_loss.item()
            avg_val_loss = total_val_loss / len(val_dataloader)
            avg_ce_loss = total_ce_loss / len(val_dataloader)
            avg_ed_loss = total_ed_loss / len(val_dataloader)
            writer.add_scalar("Validation/Average CE Loss", avg_ce_loss, epoch)
            writer.add_scalar("Validation/Average ED Loss", avg_ed_loss, epoch)

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                # Save the model
                torch.save(model, model_name)
                checkpoint_saved = True
                print(f"Epoch {epoch+1}/{num_epochs}: Lower validation loss found. Model saved.")
            validation_losses.append(avg_val_loss)
    finally:
        # Flush and release the TensorBoard writer even if training raised.
        writer.close()

    trial.set_user_attr("training_losses", training_losses)
    trial.set_user_attr("validation_losses", validation_losses)

    if not checkpoint_saved:
        # A NaN/inf validation loss never compares below +inf. Loading
        # model_name now would score another trial's stale checkpoint (or fail
        # if none exists), so drop this trial instead.
        raise optuna.TrialPruned(
            "Validation loss never improved; no checkpoint was saved for this trial."
        )

    # The checkpoint is a fully pickled module written by this process just
    # above, so it is trusted. weights_only=False is required because
    # PyTorch >= 2.6 defaults to weights_only=True, which refuses to unpickle
    # a whole nn.Module. (The keyword needs PyTorch >= 1.13.)
    fine_model = torch.load(model_name, weights_only=False)
    fine_model.eval()  # Put the model in evaluation mode
    fine_model = fine_model.to(device)
    fine_transformer = fine_model.transformer
    train_embeddings = encode_sentences(fine_transformer, train_sentences)
    val_embeddings = encode_sentences(fine_transformer, val_sentences)
    oos_val_embeddings = encode_sentences(fine_transformer, oos_val_sentences)

    # Mean training embedding of every intent, keyed by the encoded label
    intent_means = {}
    for encoded_label in np.unique(encoded_train_labels):
        # Find indices where the encoded label matches
        indices = np.where(encoded_train_labels == encoded_label)[0]

        # Calculate the mean embedding for the current intent
        intent_embeddings = train_embeddings[indices]
        intent_means[encoded_label] = np.mean(intent_embeddings, axis=0)

    # One covariance matrix over all training embeddings, shared by every intent
    covariance = np.cov(train_embeddings, rowvar=False)
    cov_inverse = inv(covariance)
    val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in val_embeddings]
    oos_val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in oos_val_embeddings]

    # True binary labels: 0 for in-domain and 1 for OOD
    y_true = [0] * len(val_scores) + [1] * len(oos_val_scores)

    # Combine the scores (a larger distance means more likely OOD)
    y_scores = val_scores + oos_val_scores

    # Compute AUPR
    aupr = average_precision_score(y_true, y_scores)

    return aupr

# Open (or resume) the persistent Optuna study and maximize the objective
study = optuna.create_study(
    study_name='improved_ce_loss_BANKING77_search',
    storage='sqlite:///try.db',
    direction='maximize',
    load_if_exists=True,
)
# n_trials is the number of hyperparameter configurations to evaluate
study.optimize(objective, n_trials=150)

# Report the best hyperparameters found
best_params = study.best_params
print("Best parameters:", best_params)

