# Load Data

In [2]:
import json
import torch
import random
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from torch import nn
import numpy as np
from scipy.linalg import inv
from torch.optim import Adam

In [3]:
seed_value=95181
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

# Load CLINC150

In [4]:
# Load the dataset
with open("clinc150_uci/data_full.json", "r") as file:
    data = json.load(file)
# Extracting data
train_data = data['train']
val_data = data['val']
test_data = data['test']

oos_train_data = data['oos_train']
oos_val_data = data['oos_val']
oos_test_data = data['oos_test']

# Get sentences and labels
train_sentences = [item[0] for item in train_data]
train_labels = [item[1] for item in train_data]

val_sentences = [item[0] for item in val_data]
val_labels = [item[1] for item in val_data]
# 
test_sentences = [item[0] for item in test_data]
test_labels = [item[1] for item in test_data]

oos_train_sentences = [item[0] for item in oos_train_data]
oos_val_sentences = [item[0] for item in oos_val_data]
oos_test_sentences = [item[0] for item in oos_test_data]
model_name = "improved_ce_model_bert_CLINC150.pth"

# Load SLURP

In [5]:
# def load_data(file_path):
#     sentences = []
#     scenarios = []
#     with open(file_path, 'r') as file:
#         for line in file:
#             data = json.loads(line)
#             sentence = data.get('sentence', None)
#             scenario = data.get('scenario', None)
#             if sentence is not None and scenario is not None:
#                 sentences.append(sentence)
#                 scenarios.append(scenario)
#     return sentences, scenarios

# # Randomly select one domain to be out of scope
# unique_scenarios = {'alarm', 'audio', 'calendar', 'cooking', 'datetime', 'email', 'general', 'iot', 'lists', 'music', 'news', 'play', 'qa', 'recommendation', 'social', 'takeaway', 'transport', 'weather'}
# oos_scenario = random.choice(list(unique_scenarios))

# # Load data from files
# train_sentences, train_labels = load_data('slurp/dataset/slurp/train.jsonl')
# val_sentences, val_labels = load_data('slurp/dataset/slurp/devel.jsonl')
# test_sentences, test_labels = load_data('slurp/dataset/slurp/test.jsonl')

# # Separate out of scope data
# oos_train_data = [(s, l) for s, l in zip(train_sentences, train_labels) if l == oos_scenario]
# oos_val_data = [(s, l) for s, l in zip(val_sentences, val_labels) if l == oos_scenario]
# oos_test_data = [(s, l) for s, l in zip(test_sentences, test_labels) if l == oos_scenario]

# # Remove out of scope data from original sets
# train_data = [(s, l) for s, l in zip(train_sentences, train_labels) if l != oos_scenario]
# val_data = [(s, l) for s, l in zip(val_sentences, val_labels) if l != oos_scenario]
# test_data = [(s, l) for s, l in zip(test_sentences, test_labels) if l != oos_scenario]

# # Extract sentences and labels
# train_sentences = [item[0] for item in train_data]
# train_labels = [item[1] for item in train_data]

# val_sentences = [item[0] for item in val_data]
# val_labels = [item[1] for item in val_data]

# test_sentences = [item[0] for item in test_data]
# test_labels = [item[1] for item in test_data]

# oos_train_sentences = [item[0] for item in oos_train_data]
# oos_val_sentences = [item[0] for item in oos_val_data]
# oos_test_sentences = [item[0] for item in oos_test_data]
# oos_scenario
# model_name = "improved_ce_model_bert_SLURP.pth"

# Load Banking77

In [6]:
# # Define the IntentExample class and load_intent_examples function as provided
# class IntentExample:
#     def __init__(self, text, label, do_lower_case):
#         self.original_text = text
#         self.text = text
#         self.label = label
#         if do_lower_case:
#             self.text = self.text.lower()

# def load_intent_examples(file_path, do_lower_case=True):
#     examples = []
#     with open(f'{file_path}/seq.in', 'r', encoding="utf-8") as f_text, open(f'{file_path}/label', 'r', encoding="utf-8") as f_label:
#         for text, label in zip(f_text, f_label):
#             e = IntentExample(text.strip(), label.strip(), do_lower_case)
#             examples.append(e)
#     return examples

# # Define paths to the dataset directories
# base_dir = 'Few-Shot-Intent-Detection/Datasets/BANKING77-OOS'
# paths = {
#     'train': f'{base_dir}/train',
#     'valid': f'{base_dir}/valid',
#     'test': f'{base_dir}/test',
#     'oos_val': f'{base_dir}/ood-oos/valid',
#     'oos_test': f'{base_dir}/ood-oos/test'
# }
# datasets = {key: load_intent_examples(path) for key, path in paths.items()}

# # Extract sentences and labels from the loaded datasets
# train_sentences = [e.text for e in datasets['train']]
# train_labels = [e.label for e in datasets['train']]

# val_sentences = [e.text for e in datasets['valid']]
# val_labels = [e.label for e in datasets['valid']]

# test_sentences = [e.text for e in datasets['test']]
# test_labels = [e.label for e in datasets['test']]

# oos_val_sentences = [e.text for e in datasets['oos_val']]
# oos_test_sentences = [e.text for e in datasets['oos_test']]
# model_name = "improved_ce_model_bert_BANKING77.pth"


# SNIPS

In [7]:
# # Define the IntentExample class and load_intent_examples function as provided
# class IntentExample:
#     def __init__(self, text, label, do_lower_case):
#         self.original_text = text
#         self.text = text
#         self.label = label
#         if do_lower_case:
#             self.text = self.text.lower()

# def load_intent_examples(file_path, do_lower_case=True):
#     examples = []
#     with open(f'{file_path}/seq.in', 'r', encoding="utf-8") as f_text, open(f'{file_path}/label', 'r', encoding="utf-8") as f_label:
#         for text, label in zip(f_text, f_label):
#             e = IntentExample(text.strip(), label.strip(), do_lower_case)
#             examples.append(e)
#     return examples

# # Define paths to the dataset directories
# base_dir = 'Few-Shot-Intent-Detection/Datasets/SNIPS'
# paths = {
#     'train': f'{base_dir}/train',
#     'valid': f'{base_dir}/valid',
#     'test': f'{base_dir}/test'
# }
# datasets = {key: load_intent_examples(path) for key, path in paths.items()}

# # Extract sentences and labels from the loaded datasets
# train_sentences = [e.text for e in datasets['train']]
# train_labels = [e.label for e in datasets['train']]

# val_sentences = [e.text for e in datasets['valid']]
# val_labels = [e.label for e in datasets['valid']]

# test_sentences = [e.text for e in datasets['test']]
# test_labels = [e.label for e in datasets['test']]
# unique_scenarios = set(train_labels)
# # oos_scenario = random.choice(list(unique_scenarios))
# oos_scenario = 'AddToPlaylist'
# # Separate out of scope data
# oos_train_data = [(s, l) for s, l in zip(train_sentences, train_labels) if l == oos_scenario]
# oos_val_data = [(s, l) for s, l in zip(val_sentences, val_labels) if l == oos_scenario]
# oos_test_data = [(s, l) for s, l in zip(test_sentences, test_labels) if l == oos_scenario]

# # Remove out of scope data from original sets
# train_data = [(s, l) for s, l in zip(train_sentences, train_labels) if l != oos_scenario]
# val_data = [(s, l) for s, l in zip(val_sentences, val_labels) if l != oos_scenario]
# test_data = [(s, l) for s, l in zip(test_sentences, test_labels) if l != oos_scenario]
# # Extract sentences and labels
# train_sentences = [item[0] for item in train_data]
# train_labels = [item[1] for item in train_data]

# val_sentences = [item[0] for item in val_data]
# val_labels = [item[1] for item in val_data]

# test_sentences = [item[0] for item in test_data]
# test_labels = [item[1] for item in test_data]

# oos_train_sentences = [item[0] for item in oos_train_data]
# oos_val_sentences = [item[0] for item in oos_val_data]
# oos_test_sentences = [item[0] for item in oos_test_data]
# oos_scenario

# model_name = "improved_ce_model_bert_SNIP.pth"

# ROSTD

In [8]:
# from datasets import load_dataset

# dataset = load_dataset("cmaldona/Generalization-MultiClass-CLINC150-ROSTD", "rostd+")

# train_sentences = []
# train_labels = []
# val_sentences = []
# val_labels = []
# test_sentences = []
# test_labels = []
# oos_test_sentences = []

# # Extract training data
# for example in dataset['train']:
#     train_sentences.append(example['data'].lower())
#     train_labels.append(example['labels'])

# # Extract validation data
# for example in dataset['validation']:
#     val_sentences.append(example['data'].lower())
#     val_labels.append(example['labels'])

# # Extract test data and separate ID from OOS
# for example in dataset['test']:
#     if example['generalisation'] == 'ID':
#         test_sentences.append(example['data'].lower())
#         test_labels.append(example['labels'])
#     elif example['generalisation'] == 'near-OOD' or example['generalisation'] == 'far-OOD':# OOS
#         try:
#             oos_test_sentences.append(example['data'].lower())
#         except:
#             continue
            

# model_name = "improved_ce_model_bert_ROSTD.pth"

# Encode Labels

In [9]:
label_encoder = LabelEncoder()
# Fit the label encoder and transform labels to integers
encoded_train_labels = label_encoder.fit_transform(train_labels)
encoded_val_labels = label_encoder.fit_transform(val_labels)

# Tokenize our sentences and create Dataloaders

In [10]:
pretrained_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
transformer_model = AutoModel.from_pretrained(pretrained_model_name)

class TextDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(sentences, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



In [11]:
tokenized_lengths = [len(tokenizer.encode(sentence, add_special_tokens=True)) for sentence in train_sentences]
max_length = max(tokenized_lengths)
print(f"Max length for tokenizer: {max_length}")
# 2. Create the dataset
train_dataset = TextDataset(train_sentences, encoded_train_labels, tokenizer, max_length)
val_dataset = TextDataset(val_sentences, encoded_val_labels, tokenizer, max_length)

Max length for tokenizer: 33


# Define functions to encode our sentences

In [12]:
transformer_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transformer_model = transformer_model.to(device)
def encode_sentences(model, sentences, tokenizer=tokenizer, batch_size=256):
    model = model.to(device)
    sentence_embeddings = []

    # Process sentences in batches
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i+batch_size]
        encoded_input = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        # Move the batch to the same device as the model
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        
        with torch.no_grad():
            model_output = model(**encoded_input)

        pooled_output = model_output.last_hidden_state.mean(dim=1)
        sentence_embeddings.append(pooled_output)

    # Concatenate all batched embeddings and move to CPU in one go
    sentence_embeddings_np = torch.cat(sentence_embeddings, dim=0).cpu().numpy()
    
    return sentence_embeddings_np

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Define our model

In [13]:
import torch.nn as nn

class TextClassifier(nn.Module):
    def __init__(self, transformer_model, num_labels):
        super(TextClassifier, self).__init__()
        self.transformer = transformer_model
        self.classifier = nn.Linear(self.transformer.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        transformer_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        sentence_embedding = transformer_output.last_hidden_state.max(dim=1).values

        # Forward pass through the classifier layer
        logits = self.classifier(sentence_embedding)
        
        return logits, sentence_embedding


# Initiallize everything else we needed

In [95]:
unique_intents = list(set(train_labels)) 
transformer_model = AutoModel.from_pretrained(pretrained_model_name)
transformer_model.to(device)
model = TextClassifier(transformer_model, len(unique_intents))
model.to(device)
optimizer = Adam(model.parameters(), lr=1e-04)
training_losses = []
validation_losses = []
batch_size= 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
loss_function = nn.CrossEntropyLoss()

In [96]:
# def euclidean_distance_loss(embeddings):
#     n, k = embeddings.size()
#     mean_embeddings = embeddings.mean(dim=0)
#     distances = embeddings - mean_embeddings
#     loss = (distances ** 2).sum(dim=1).mean() / k
#     return loss

In [97]:
def euclidean_distance_loss(embeddings):
    n, k = embeddings.size()  # n is the batch size, k is the embedding dimension
    loss = 0.0
    
    # Calculate the mean embedding for each sample, excluding the sample itself
    for i in range(n):
        # Use indexing to exclude the current sample, then calculate the mean of the remaining samples
        indices = [j for j in range(n) if j != i]
        mean_embedding = embeddings[indices].mean(dim=0)
        
        # Calculate the squared Euclidean distance for the current sample
        distance = (embeddings[i] - mean_embedding).pow(2).sum()
        
        # Accumulate the loss
        loss += distance
    
    # Average the loss over all samples and divide by the dimension k
    #loss = loss / (n * k)
    loss = loss / (n * k) / embeddings.var()
    return loss

In [98]:
def euclidean_distance_loss(embeddings, labels):
    unique_labels = torch.unique(labels)
    num_unique_labels = len(unique_labels)
    num_pairs = num_unique_labels * (num_unique_labels - 1) / 2  

    # Calculate the mean embedding for each unique label
    mean_embeddings = torch.zeros((num_unique_labels, embeddings.size(1)))
    for i, label in enumerate(unique_labels):
        mean_embeddings[i] = embeddings[labels == label].mean(dim=0)

    # Calculate the total loss as the sum of pairwise distances between mean embeddings
    total_loss = 0.0
    for i in range(num_unique_labels):
        for j in range(i + 1, num_unique_labels):
            distance = (mean_embeddings[i] - mean_embeddings[j]).pow(2).sum()
            total_loss += distance

    # Normalize the total loss by the square of the number of unique labels
    loss = total_loss / num_pairs
    return loss

In [99]:
def euclidean_distance_loss(embeddings, mean_embedding):
    # Calculate the Euclidean distance of each embedding to the mean embedding
    distances = torch.norm(embeddings - mean_embedding, dim=-1, p = 2)
    
    # Calculate the average distance
    average_distance = torch.mean(distances)
    
    return average_distance


In [100]:
ed_loss_importance = 0.025
num_epochs = 40

# Training Loop!!!

In [101]:
best_val_loss = float('Inf')
for epoch in range(num_epochs):
    # Training Phase
    model.train()  # Set the model to training mode
    total_train_loss = 0
    cumulative_embeddings = None  # Tensor to store the cumulative sum of embeddings
    embedding_count = 0  # Counter for the number of embeddings
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
        optimizer.zero_grad()  # Zero the gradients
        predictions, embeddings = model(input_ids, attention_mask)  # Forward pass

        # Update the cumulative sum and count
        if cumulative_embeddings is None:
            cumulative_embeddings = embeddings.detach().sum(dim=0)
        else:
            cumulative_embeddings += embeddings.detach().sum(dim=0)
        embedding_count += embeddings.size(0)

    # Calculate the mean embedding for the entire epoch
    mean_embedding = cumulative_embeddings / embedding_count

    # Free up memory
    del cumulative_embeddings
    torch.cuda.empty_cache()

    # Calculate and apply the ED loss for each batch
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
        optimizer.zero_grad()  # Zero the gradients
        predictions, embeddings = model(input_ids, attention_mask)  # Forward pass
        ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
        ed_loss = euclidean_distance_loss(embeddings, mean_embedding)  # ED loss
        total_loss = (1 - ed_loss_importance) * ce_loss + ed_loss_importance * ed_loss  # Combine the losses
        print((1 - ed_loss_importance) * ce_loss, ed_loss_importance * ed_loss)
        total_loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        total_train_loss += total_loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_losses.append(avg_train_loss) 

    # Validation Phase
    model.eval()  # Set the model to evaluation mode
    total_val_loss = 0
    cumulative_val_embeddings = None  # Tensor to store the cumulative sum of validation embeddings
    val_embedding_count = 0  # Counter for the number of validation embeddings
    with torch.no_grad():  # Disable gradient calculations
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            predictions, embeddings = model(input_ids, attention_mask)  # Forward pass

            # Update the cumulative sum and count for validation embeddings
            if cumulative_val_embeddings is None:
                cumulative_val_embeddings = embeddings.detach().sum(dim=0)
            else:
                cumulative_val_embeddings += embeddings.detach().sum(dim=0)
            val_embedding_count += embeddings.size(0)

            ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
            total_val_loss += ce_loss.item()

    # Calculate the mean embedding for the entire validation set
    mean_val_embedding = cumulative_val_embeddings / val_embedding_count
    with torch.no_grad():  # Disable gradient calculations
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            predictions, embeddings = model(input_ids, attention_mask)  # Forward pass
            ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
            ed_loss = euclidean_distance_loss(embeddings, mean_val_embedding)  # ED loss
            total_loss = (1 - ed_loss_importance) * ce_loss + ed_loss_importance * ed_loss  # Combine the losses
            total_val_loss += total_loss.item()
    avg_val_loss = total_val_loss / len(val_dataloader)
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Save the model
        torch.save(model, model_name)
        print(f"Epoch {epoch+1}/{num_epochs}: Lower validation loss found. Model saved.")
    validation_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.5e}, Validation Loss: {avg_val_loss:.5e}")

tensor(4.9983, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.1998, device='cuda:0', grad_fn=<MulBackward0>)
tensor(4.9182, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.1922, device='cuda:0', grad_fn=<MulBackward0>)
tensor(4.9011, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.2014, device='cuda:0', grad_fn=<MulBackward0>)
tensor(4.8842, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.2027, device='cuda:0', grad_fn=<MulBackward0>)
tensor(4.9019, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.1963, device='cuda:0', grad_fn=<MulBackward0>)
tensor(4.8519, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.1909, device='cuda:0', grad_fn=<MulBackward0>)
tensor(4.8766, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.1940, device='cuda:0', grad_fn=<MulBackward0>)
tensor(4.8232, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.1911, device='cuda:0', grad_fn=<MulBackward0>)
tensor(4.8539, device='cuda:0', grad_fn=<MulBackward0>) tensor(0.1892, device='cuda:0', grad_fn=<MulBack

In [102]:
# best_val_loss = float('Inf')
# for epoch in range(num_epochs):
#     # Training Phase
#     model.train()  # Set the model to training mode
#     total_train_loss = 0
#     for batch in train_dataloader:
#         input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
#         optimizer.zero_grad()  # Zero the gradients
#         predictions, embeddings = model(input_ids, attention_mask)  # Forward pass
#         ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
#         ed_loss = euclidean_distance_loss(embeddings, labels)  # Euclidean distance loss
#         adj_ce_loss = (1 - ed_loss_importance) * ce_loss
#         adj_ed_loss = ed_loss_importance * ed_loss
#         print(adj_ce_loss, adj_ed_loss)
#         total_loss = adj_ce_loss + adj_ed_loss  # Combine the losses
        
#         total_loss.backward()  # Backward pass
#         optimizer.step()  # Update weights

#         total_train_loss += total_loss.item()
    
#     avg_train_loss = total_train_loss / len(train_dataloader)
#     training_losses.append(avg_train_loss) 

#     # Validation Phase
#     model.eval()  # Set the model to evaluation mode
#     total_val_loss = 0
#     with torch.no_grad():  # Disable gradient calculations
#         for batch in val_dataloader:
#             input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
#             predictions, embeddings = model(input_ids, attention_mask)  # Forward pass
#             ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
#             ed_loss = euclidean_distance_loss(embeddings, labels)  # Euclidean distance loss
#             total_loss = (1 - ed_loss_importance) * ce_loss + ed_loss_importance * ed_loss  # Combine the losses
#             total_val_loss += total_loss.item()
#     avg_val_loss = total_val_loss / len(val_dataloader)
#     if avg_val_loss < best_val_loss:
#         best_val_loss = avg_val_loss
#         # Save the model
#         torch.save(model, model_name)
#         print(f"Epoch {epoch+1}/{num_epochs}: Lower validation loss found. Model saved.")
#     validation_losses.append(avg_val_loss)
#     print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.5e}, Validation Loss: {avg_val_loss:.5e}")

##### Calculate means and covariance matrix

In [103]:
fine_model = torch.load(model_name)
fine_model.eval()  # Put the model in evaluation mode
fine_model = fine_model.to(device)

In [104]:
# get trasformer part of the model
fine_model = fine_model.transformer

In [105]:
train_embeddings = encode_sentences(fine_model, train_sentences)
val_embeddings = encode_sentences(fine_model, val_sentences)
test_embeddings = encode_sentences(fine_model, test_sentences)
oos_val_embeddings = encode_sentences(fine_model, oos_val_sentences)
oos_test_embeddings = encode_sentences(fine_model, oos_test_sentences)

In [106]:
intent_means = {}
for encoded_label in np.unique(encoded_train_labels):
    # Find indices where the encoded label matches
    indices = np.where(encoded_train_labels == encoded_label)[0]
    
    # Calculate the mean embedding for the current intent
    intent_embeddings = train_embeddings[indices]
    intent_mean = np.mean(intent_embeddings, axis=0)
    
    # Use the encoded label as the dictionary key
    intent_means[encoded_label] = intent_mean

In [107]:
covariance = np.cov(train_embeddings, rowvar=False)
cov_inverse = inv(covariance)

In [108]:
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve
from scipy.spatial import distance
from sklearn.metrics import average_precision_score

In [109]:
def min_mahalanobis_for_sample(sample, intent_means, cov_inverse):
    distances = [distance.mahalanobis(sample, mean, cov_inverse) for mean in intent_means.values()]
    return min(distances)

In [110]:
# Compute minimum Mahalanobis distances for samples in test_embeddings and oos_test_embeddings
val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in val_embeddings]
oos_val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in oos_val_embeddings]

# True binary labels: 0 for in-domain and 1 for OOD
y_true = [0] * len(val_scores) + [1] * len(oos_val_scores)

# Combine the scores
y_scores = val_scores + oos_val_scores

# Compute AUPR
aupr = average_precision_score(y_true, y_scores)
aupr

0.6158678406845223

In [111]:
# Compute minimum Mahalanobis distances for samples in test_embeddings and oos_test_embeddings
test_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in test_embeddings]
oos_test_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in oos_test_embeddings]

# True binary labels: 0 for in-domain and 1 for OOD
y_true = [0] * len(test_scores) + [1] * len(oos_test_scores)

# Combine the scores
y_scores = test_scores + oos_test_scores

# Compute AUPR
aupr = average_precision_score(y_true, y_scores)
aupr


0.8810031338275476

In [112]:
auroc = roc_auc_score(y_true, y_scores)
auroc

0.9698946666666666

In [113]:
num_smaller_50 = sum([value<50 for value in test_scores])
num_smaller_50

3520

In [None]:
import matplotlib.pyplot as plt

# Set up the figure and axes
plt.figure(figsize=(10, 6))

# Plot the histograms
plt.hist(test_scores, bins=50, alpha=0.5, label='In-domain')
plt.hist(oos_test_scores, bins=50, alpha=0.5, label='Out-of-domain')

# Add legend, title, and labels
plt.legend(loc='upper right')
plt.title('Distribution of Minimum Mahalanobis Distances')
plt.xlabel('Mahalanobis Distance')
plt.ylabel('Number of Samples')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
from torch.utils.tensorboard import SummaryWriter

In [None]:
import optuna
from transformers import AutoTokenizer, AutoModel

def objective(trial):
    # Optuna suggests hyperparameters
    writer = SummaryWriter()
    seed_value=42
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    lr = 3e-4
    num_epochs = 20
    batch_size = 768
    ed_loss_importance = trial.suggest_float('ed_loss_importance', 0.0001, 0.1)
    #ed_loss_importance = 0.1
    training_losses = []
    validation_losses = []
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    # Model setup
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    transformer_model = AutoModel.from_pretrained(pretrained_model_name)
    model = TextClassifier(transformer_model, len(unique_intents))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Loss function and optimizer
    loss_function = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=lr)
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        # Training Phase
        model.train()  # Set the model to training mode
        total_train_loss = 0
        for batch in train_dataloader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            optimizer.zero_grad()  # Zero the gradients
            predictions, embeddings = model(input_ids, attention_mask)  # Forward pass
            ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
            ed_loss = euclidean_distance_loss(embeddings, labels)  # Euclidean distance loss
            total_loss = (1-ed_loss_importance)*ce_loss + ed_loss_importance * ed_loss  # Combine the losses
            
            total_loss.backward()  # Backward pass
            optimizer.step()  # Update weights
    
            total_train_loss += total_loss.item()
        
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_losses.append(avg_train_loss)
        # Validation Phase
        model.eval()  # Set the model to evaluation mode
        total_val_loss = 0
        total_ce_loss = 0
        total_ed_loss = 0
        with torch.no_grad():  # Disable gradient calculations
            for batch in val_dataloader:
                input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
                predictions, embeddings = model(input_ids, attention_mask)  # Forward pass
                ce_loss = loss_function(predictions, labels)  # Cross-Entropy loss
                ed_loss = euclidean_distance_loss(embeddings, labels)  # Euclidean distance loss
                total_loss = (1-ed_loss_importance)*ce_loss + ed_loss_importance * ed_loss  # Combine the losses
                total_val_loss += total_loss.item()
                total_ce_loss += ce_loss.item()
                total_ed_loss += ed_loss.item()
        avg_val_loss = total_val_loss / len(val_dataloader)
        avg_ce_loss = total_ce_loss / len(val_dataloader)
        avg_ed_loss = total_ed_loss / len(val_dataloader)
        writer.add_scalar("Validation/Average CE Loss", avg_ce_loss, epoch)
        writer.add_scalar("Validation/Average ED Loss", avg_ed_loss, epoch)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Save the model
            torch.save(model, model_name)
            print(f"Epoch {epoch+1}/{num_epochs}: Lower validation loss found. Model saved.")
        validation_losses.append(avg_val_loss)
    trial.set_user_attr("training_losses", training_losses)
    trial.set_user_attr("validation_losses", validation_losses)
    writer.close()
    fine_model = torch.load(model_name)
    fine_model.eval()  # Put the model in evaluation mode
    fine_model = fine_model.to(device)
    fine_transformer = fine_model.transformer
    train_embeddings = encode_sentences(fine_transformer, train_sentences)
    val_embeddings = encode_sentences(fine_transformer, val_sentences)
    oos_val_embeddings = encode_sentences(fine_transformer, oos_val_sentences)

    intent_means = {}
    for encoded_label in np.unique(encoded_train_labels):
        # Find indices where the encoded label matches
        indices = np.where(encoded_train_labels == encoded_label)[0]
        
        # Calculate the mean embedding for the current intent
        intent_embeddings = train_embeddings[indices]
        intent_mean = np.mean(intent_embeddings, axis=0)
        
        # Use the encoded label as the dictionary key
        intent_means[encoded_label] = intent_mean
    covariance = np.cov(train_embeddings, rowvar=False)
    cov_inverse = inv(covariance)
    val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in val_embeddings]
    oos_val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in oos_val_embeddings]

    # True binary labels: 0 for in-domain and 1 for OOD
    y_true = [0] * len(val_scores) + [1] * len(oos_val_scores)

    # Combine the scores
    y_scores = val_scores + oos_val_scores

    # Compute AUPR
    aupr = average_precision_score(y_true, y_scores)
    
    return aupr

# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize',  study_name='improved_ce_loss_average_mean_ed', storage='sqlite:///desperate.db', load_if_exists= True)
study.optimize(objective, n_trials=500)  # n_trials is the number of iterations

# Get the best parameters
best_params = study.best_params
print("Best parameters:", best_params)



In [None]:
# import optuna
# study = optuna.load_study(study_name='improved_ce_loss_CLINC150', storage='sqlite:///desperate.db')

In [None]:
# sorted_trials = sorted(
#     study.trials, 
#     key=lambda trial: min(trial.user_attrs.get('validation_losses', [float('inf')]))
# )
