# Load Data

In [1]:
import json
import torch
import random
import os
import pickle
import collections
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from torch import nn
import numpy as np
from scipy.linalg import inv
from torch.optim import Adam
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import re
from collections import Counter

In [2]:
seed_value=42
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

# MTOP

In [3]:
# def read_mtop_file(file_path):
#     data = []
#     with open(file_path, 'r', encoding='utf-8') as file:
#         for line in file:
#             fields = line.strip().split('\t')
#             if len(fields) < 8:
#                 continue  # Skip any malformed lines
#             record = {
#                 'ID': fields[0],
#                 'Intent': fields[1],
#                 'Utterance': fields[3],
#                 'Domain': fields[4]
#             }
#             data.append(record)
#     return data

# def select_ood_domains(domains, num_ood):
#     return ["timer"]  # Hardcoded OOD domains

# # Configuration parameters
# english_dir = 'mtop/en'

# # Read dataset
# all_data = read_mtop_file(f'{english_dir}/test.txt') + \
#            read_mtop_file(f'{english_dir}/train.txt') + \
#            read_mtop_file(f'{english_dir}/eval.txt')

# domains = set(record['Domain'] for record in all_data)
# ood_domains = select_ood_domains(domains, 1)  # Using 1 as hardcoded number of OOD domains

# # Separate OOD data based on domain
# in_domain_data = [record for record in all_data if record['Domain'] not in ood_domains]
# ood_data = [record for record in all_data if record['Domain'] in ood_domains]

# intent_counts = Counter(record['Intent'] for record in in_domain_data)

# # Filter intents with at least 10 instances
# sufficient_data = [record for record in in_domain_data if intent_counts[record['Intent']] > 10]

# train_val_data, test_data = train_test_split(
#     sufficient_data, test_size=0.2, random_state=seed_value, stratify=[record['Intent'] for record in sufficient_data]
# )
# train_data, val_data = train_test_split(
#     train_val_data, test_size=0.125, random_state=seed_value, stratify=[record['Intent'] for record in train_val_data]
# )

# # Split OOD data between validation and test
# oos_val_data, oos_test_data = train_test_split(ood_data, test_size=0.67, random_state=seed_value)  # No stratification here since it's all OOD

# # Extract sentences and labels
# train_sentences = [record['Utterance'] for record in train_data]
# train_labels = [record['Intent'] for record in train_data]

# val_sentences = [record['Utterance'] for record in val_data]
# val_labels = [record['Intent'] for record in val_data]

# test_sentences = [record['Utterance'] for record in test_data]
# test_labels = [record['Intent'] for record in test_data]

# oos_val_sentences = [record['Utterance'] for record in oos_val_data]
# oos_test_sentences = [record['Utterance'] for record in oos_test_data]

# # Summary of splits
# summary = {
#     "OOD Domains": ood_domains,
#     "Train Set Size": len(train_sentences),
#     "Validation Set Size": len(val_sentences),
#     "Test Set Size": len(test_sentences),
#     "OOS Validation Set Size": len(oos_val_sentences),
#     "OOS Test Set Size": len(oos_test_sentences)
# }




# model_name = "ce_model_bert_mtop.pth"
# summary

{'OOD Domains': ['timer'],
 'Train Set Size': 14465,
 'Validation Set Size': 2067,
 'Test Set Size': 4134,
 'OOS Validation Set Size': 491,
 'OOS Test Set Size': 997}

# Load CLINC150

In [3]:
# # Load the dataset
# with open("clinc150_uci/data_full.json", "r") as file:
#     data = json.load(file)
# # Extracting data
# train_data = data['train']
# val_data = data['val']
# test_data = data['test']

# oos_train_data = data['oos_train']
# oos_val_data = data['oos_val']
# oos_test_data = data['oos_test']

# # Get sentences and labels
# train_sentences = [item[0] for item in train_data]
# train_labels = [item[1] for item in train_data]

# val_sentences = [item[0] for item in val_data]
# val_labels = [item[1] for item in val_data]

# test_sentences = [item[0] for item in test_data]
# test_labels = [item[1] for item in test_data]

# oos_train_sentences = [item[0] for item in oos_train_data]
# oos_val_sentences = [item[0] for item in oos_val_data]
# oos_test_sentences = [item[0] for item in oos_test_data]
# model_name = "ce_model_bert_CLINC150.pth"

# StackOverflow

In [3]:
split_number = 1

base_dir = 'stackoverflow_data'  

# Construct the path to the specific split
split_dir = os.path.join(base_dir, f'split{split_number}')

# Function to load data from a .pkl file
def load_data_from_split(split_dir, file_name):
    with open(os.path.join(split_dir, file_name), 'rb') as file:
        return pickle.load(file)

# Load the datasets
train_sentences = load_data_from_split(split_dir, 'train_sentences.pkl')
train_labels = load_data_from_split(split_dir, 'train_labels.pkl')
val_sentences = load_data_from_split(split_dir, 'val_sentences.pkl')
val_labels = load_data_from_split(split_dir, 'val_labels.pkl')
test_sentences = load_data_from_split(split_dir, 'test_sentences.pkl')
test_labels = load_data_from_split(split_dir, 'test_labels.pkl')
oos_val_sentences = load_data_from_split(split_dir, 'oos_val_sentences.pkl')
oos_test_sentences = load_data_from_split(split_dir, 'oos_test_sentences.pkl')
model_name = "ce_model_bert_Stack.pth"

In [4]:
model_name = f"{seed_value}_{model_name}"

# Encode Labels

In [5]:
label_encoder = LabelEncoder()
# Fit the label encoder and transform labels to integers
encoded_train_labels = label_encoder.fit_transform(train_labels)
encoded_val_labels = label_encoder.fit_transform(val_labels)
encoded_test_labels = label_encoder.fit_transform(test_labels)

# Tokenize our sentences and create Dataloaders

In [6]:
pretrained_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
transformer_model = AutoModel.from_pretrained(pretrained_model_name)
tokenized_lengths = [len(tokenizer.encode(sentence, add_special_tokens=True)) for sentence in train_sentences]
max_length = max(tokenized_lengths)
print(f"Max length for tokenizer: {max_length}")

class TextDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(sentences, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 2. Create the dataset
train_dataset = TextDataset(train_sentences, encoded_train_labels, tokenizer, max_length)
val_dataset = TextDataset(val_sentences, encoded_val_labels, tokenizer, max_length)
test_dataset = TextDataset(test_sentences, encoded_test_labels, tokenizer, max_length)



Max length for tokenizer: 58


# Define functions to encode our sentences

In [7]:
transformer_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transformer_model = transformer_model.to(device)
def encode_sentences(model, sentences, tokenizer=tokenizer, batch_size=256):
    model = model.to(device)
    sentence_embeddings = []

    # Process sentences in batches
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i+batch_size]
        encoded_input = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        # Move the batch to the same device as the model
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        
        with torch.no_grad():
            model_output = model(**encoded_input)

        pooled_output = model_output.last_hidden_state.mean(dim=1)
        sentence_embeddings.append(pooled_output)

    # Concatenate all batched embeddings and move to CPU in one go
    sentence_embeddings_np = torch.cat(sentence_embeddings, dim=0).cpu().numpy()
    
    return sentence_embeddings_np

# Define our model

In [8]:
transformer_model = AutoModel.from_pretrained(pretrained_model_name)
import torch.nn as nn

class TextClassifier(nn.Module):
    def __init__(self, transformer_model, num_labels):
        super(TextClassifier, self).__init__()
        self.transformer = transformer_model
        self.classifier = nn.Linear(self.transformer.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        # Get the output from the transformer model
        transformer_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # Pool the outputs into a single sentence vector
        # You can use `transformer_output.last_hidden_state.mean(dim=1)` for mean pooling
        # or `transformer_output.last_hidden_state.max(dim=1).values` for max pooling
        sentence_embedding = transformer_output.last_hidden_state.max(dim=1).values
        # Forward pass through the classifier layer
        return self.classifier(sentence_embedding)




# Initiallize everything else we needed

In [9]:
unique_intents = list(set(train_labels)) 
transformer_model = AutoModel.from_pretrained(pretrained_model_name)
transformer_model.to(device)
model = TextClassifier(transformer_model, len(unique_intents))
model.to(device)
optimizer = Adam(model.parameters(), lr=5.00E-05)
training_losses = []
validation_losses = []
batch_size= 1024
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
loss_function = nn.CrossEntropyLoss()

In [10]:
num_epochs = 6

# Training Loop!!!

In [11]:
if not os.path.exists(model_name):
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        # Training Phase
        model.train()  # Set the model to training mode
        total_train_loss = 0
        for batch in train_dataloader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            optimizer.zero_grad()  # Zero the gradients
            predictions = model(input_ids, attention_mask)  # Forward pass
            loss = loss_function(predictions, labels) 
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights
    
            total_train_loss += loss.item()
        
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_losses.append(avg_train_loss) 
    
        # Validation Phase
        model.eval()  # Set the model to evaluation mode
        total_val_loss = 0
        with torch.no_grad():  # Disable gradient calculations
            for batch in val_dataloader:
                input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
                predictions = model(input_ids, attention_mask) 
                loss = loss_function(predictions, labels)  # Compute loss
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_dataloader)
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Save the model
            torch.save(model, model_name)
            print(f"Epoch {epoch+1}/{num_epochs}: Lower validation loss found. Model saved.")
        validation_losses.append(avg_val_loss)
        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.5e}, Validation Loss: {avg_val_loss:.5e}")
else:
        print("training skipped")

training skipped


# Calculate means and covariance matrix

In [12]:
fine_model = torch.load(model_name)
fine_model.eval()  # Put the model in evaluation mode
fine_model = fine_model.to(device)

In [13]:
# get trasformer part of the model
fine_model = fine_model.transformer

In [14]:
train_embeddings = encode_sentences(fine_model, train_sentences)
val_embeddings = encode_sentences(fine_model, val_sentences)
test_embeddings = encode_sentences(fine_model, test_sentences)
oos_val_embeddings = encode_sentences(fine_model, oos_val_sentences)
oos_test_embeddings = encode_sentences(fine_model, oos_test_sentences)

In [15]:
intent_means = {}
for encoded_label in np.unique(encoded_train_labels):
    # Find indices where the encoded label matches
    indices = np.where(encoded_train_labels == encoded_label)[0]
    
    # Calculate the mean embedding for the current intent
    intent_embeddings = train_embeddings[indices]
    intent_mean = np.mean(intent_embeddings, axis=0)
    
    # Use the encoded label as the dictionary key
    intent_means[encoded_label] = intent_mean
    

In [90]:
np.savez('cross_entropy_embeddings.npz', 
         intent_means=np.array(list(intent_means.values())), 
         oos_test_embeddings=oos_test_embeddings, 
         intent_labels=np.array(list(intent_means.keys())))

In [16]:
covariance = np.cov(train_embeddings, rowvar=False)
cov_inverse = inv(covariance)

In [17]:
trace = np.trace(covariance)
total_variance = np.sqrt(trace)
total_variance

16.85449078704698

In [92]:
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve
from scipy.spatial import distance
from sklearn.metrics import average_precision_score

In [93]:
def min_mahalanobis_for_sample(sample, intent_means, cov_inverse):
    distances = [distance.mahalanobis(sample, mean, cov_inverse) for mean in intent_means.values()]
    return min(distances)

In [94]:
# Compute minimum Mahalanobis distances for samples in test_embeddings and oos_test_embeddings
val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in val_embeddings]
oos_val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in oos_val_embeddings]

# True binary labels: 0 for in-domain and 1 for OOD
y_true = [0] * len(val_scores) + [1] * len(oos_val_scores)

# Combine the scores
y_scores = val_scores + oos_val_scores

# Compute AUPR
aupr = average_precision_score(y_true, y_scores)
aupr

0.8990882203833147

In [95]:
# Compute minimum Mahalanobis distances for samples in test_embeddings and oos_test_embeddings
test_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in test_embeddings]
oos_test_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in oos_test_embeddings]

# True binary labels: 0 for in-domain and 1 for OOD
y_true = [0] * len(test_scores) + [1] * len(oos_test_scores)

# Combine the scores
y_scores = test_scores + oos_test_scores

# Compute AUPR
aupr = average_precision_score(y_true, y_scores)
aupr


0.8921835362352729

In [None]:
auroc = roc_auc_score(y_true, y_scores)
auroc

In [None]:
def get_predictions(model, data_loader):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    real_labels = []
    with torch.no_grad():  # Disable gradient calculation
        for batch in data_loader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            outputs = model(input_ids, attention_mask) 
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.detach().cpu().numpy())  # Move preds to CPU and convert to numpy
            real_labels.extend(labels.detach().cpu().numpy())  # Move labels to CPU and convert to numpy
    return predictions, real_labels


In [None]:
fine_model = torch.load(model_name)
fine_model = fine_model.to(device)
preds, true_labels = get_predictions(fine_model, test_dataloader)

# Now you can calculate the accuracy
accuracy = accuracy_score(true_labels, preds)
print(f"Classification Accuracy: {accuracy}")

In [None]:
# import matplotlib.pyplot as plt

# # Set up the figure and axes
# plt.figure(figsize=(10, 6))

# # Plot the histograms
# plt.hist(val_scores, bins=50, alpha=0.5, label='In-domain')
# plt.hist(oos_val_scores, bins=50, alpha=0.5, label='Out-of-domain')

# # Add legend, title, and labels 
# plt.legend(loc='upper right')
# plt.title('Distribution of Minimum Mahalanobis Distances')
# plt.xlabel('Mahalanobis Distance')
# plt.ylabel('Number of Samples')
# plt.grid(True, which='both', linestyle='--', linewidth=0.5)
# plt.tight_layout()

# # Show the plot
# plt.show()


In [None]:
# # Set up the figure and axes
# plt.figure(figsize=(10, 6))

# # Plot the histograms
# plt.hist(test_scores,bins=50, alpha=0.5, label='In-domain')
# plt.hist(oos_test_scores, bins=50, alpha=0.5, label='Out-of-domain')

# # Add legend, title, and labels
# plt.legend(loc='upper right')
# plt.title('Distribution of Minimum Mahalanobis Distances')
# plt.xlabel('Mahalanobis Distance')
# plt.ylabel('Number of Samples')
# plt.grid(True, which='both', linestyle='--', linewidth=0.5)
# plt.tight_layout()

# # Show the plot
# plt.show()


In [None]:
# unique_labels = np.unique(test_labels)
# num_unique_labels = len(unique_labels)
 
# ood_label = "ood"
 
# # Combine embeddings
# combined_embeddings = np.vstack((test_embeddings, oos_test_embeddings))
 
# # Create labels for combined data
# combined_labels = np.concatenate((test_labels, np.array([ood_label] * len(oos_test_embeddings))))
 
# # TSNE
# tsne = TSNE(n_components=2, random_state=25)
# embeddings_2d = tsne.fit_transform(combined_embeddings)

In [None]:
# plt.figure(figsize=(10, 8))
# unique_labels = np.unique(combined_labels)
# for label in unique_labels:
#     indices = np.where(combined_labels == label)[0]
#     if label == ood_label:
#         # Specific color for OOD
#         plt.scatter(embeddings_2d[indices, 0], embeddings_2d[indices, 1], label=label, color='black', s=0.5, alpha=0.7)
#     else:
#         plt.scatter(embeddings_2d[indices, 0], embeddings_2d[indices, 1], label=label, color='red', s=0.5, alpha=0.7)
# plt.title('2D t-SNE Plot of Embeddings (Baseline)')
# plt.legend()
# plt.show()

In [None]:
# import optuna
# from transformers import AutoTokenizer, AutoModel

# def objective(trial):
#     # Optuna suggests hyperparameters
#     seed_value=42
#     random.seed(seed_value)
#     np.random.seed(seed_value)
#     torch.manual_seed(seed_value)
#     torch.cuda.manual_seed_all(seed_value)
#     lr = trial.suggest_categorical('lr', [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2])
#     num_epochs = 25
#     batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
#     training_losses = []
#     validation_losses = []
#     train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
#     # Model setup
#     tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
#     transformer_model = AutoModel.from_pretrained(pretrained_model_name)
#     model = TextClassifier(transformer_model, len(unique_intents))
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model = model.to(device)
    

#     # Loss function and optimizer
#     loss_function = nn.CrossEntropyLoss()
#     optimizer = Adam(model.parameters(), lr=lr)

#     best_val_loss = float('inf')
#     for epoch in range(num_epochs):
#         # Training Phase
#         model.train()  # Set the model to training mode
#         total_train_loss = 0
#         for batch in train_dataloader:
#             input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
#             optimizer.zero_grad()  # Zero the gradients
#             predictions = model(input_ids, attention_mask)  # Forward pass
#             loss = loss_function(predictions, labels) 
#             loss.backward()  # Backward pass
#             optimizer.step()  # Update weights
    
#             total_train_loss += loss.item()
        
#         avg_train_loss = total_train_loss / len(train_dataloader)
#         training_losses.append(avg_train_loss) 
    
#         # Validation Phase
#         model.eval()  # Set the model to evaluation mode
#         total_val_loss = 0
#         with torch.no_grad():  # Disable gradient calculations
#             for batch in val_dataloader:
#                 input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
#                 predictions = model(input_ids, attention_mask) 
#                 loss = loss_function(predictions, labels)  # Compute loss
#                 total_val_loss += loss.item()
#         avg_val_loss = total_val_loss / len(val_dataloader)
#         validation_losses.append(avg_val_loss)
#         if avg_val_loss < best_val_loss:
#             best_val_loss = avg_val_loss
#             # Save the model
#             torch.save(model, model_name)
#             print(f"Epoch {epoch+1}/{num_epochs}: Lower validation loss found. Model saved.")
#     return best_val_loss
    
#     # trial.set_user_attr("training_losses", training_losses)
#     # trial.set_user_attr("validation_losses", validation_losses)
#     # model = torch.load(model_name)
#     # model.eval()  # Put the model in evaluation mode
#     # model = fine_model.to(device)
#     # fine_transformer = model.transformer
#     # train_embeddings = encode_sentences(fine_transformer, train_sentences)
#     # val_embeddings = encode_sentences(fine_transformer, val_sentences)
#     # oos_val_embeddings = encode_sentences(fine_transformer, oos_val_sentences)

#     # intent_means = {}
#     # for encoded_label in np.unique(encoded_train_labels):
#     #     # Find indices where the encoded label matches
#     #     indices = np.where(encoded_train_labels == encoded_label)[0]
        
#     #     # Calculate the mean embedding for the current intent
#     #     intent_embeddings = train_embeddings[indices]
#     #     intent_mean = np.mean(intent_embeddings, axis=0)
        
#     #     # Use the encoded label as the dictionary key
#     #     intent_means[encoded_label] = intent_mean
#     # covariance = np.cov(train_embeddings, rowvar=False)
#     # cov_inverse = inv(covariance)
#     # val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in val_embeddings]
#     # oos_val_scores = [min_mahalanobis_for_sample(sample, intent_means, cov_inverse) for sample in oos_val_embeddings]

#     # # True binary labels: 0 for in-domain and 1 for OOD
#     # y_true = [0] * len(val_scores) + [1] * len(oos_val_scores)

#     # # Combine the scores
#     # y_scores = val_scores + oos_val_scores

#     # # Compute AUPR
#     # aupr = average_precision_score(y_true, y_scores)
    
#     # return aupr

# # Create a study object and optimize the objective function
# study = optuna.create_study(direction='minimize',  study_name='ce_loss_CLINC150_min_val', storage='sqlite:///desperate.db', load_if_exists= True)
# study.optimize(objective, n_trials=35)  # n_trials is the number of iterations

# # Get the best parameters
# best_params = study.best_params
# print("Best parameters:", best_params)

