In [None]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-y_nskooh
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-y_nskooh
  Resolved https://github.com/huggingface/transformers to commit 5fa35344755d8d9c29610b57d175efd03776ae9e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.49.0.dev0-py3-none-any.whl size=10474239 sha256=c992ad485234bd82ee1d0b0a855f56a69d4176c9f0b8f181d2ac4379ea033667
  Stored in directory: /tmp/pip-ephem-wheel-cache-m7npdusm/wheels/04/a3/f1/b88775f8e1665827525b19ac7590250f1038d947067beba9fb
Successfully built transformer

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch import nn
from torch.optim import Adam
import json
import os
import pickle
from torch.utils.data import Dataset, DataLoader
# from tqdm import tqdm

In [None]:
# One seed for every random number generator that influences this notebook.
seed_value = 42
np.random.seed(seed_value)
# PyTorch keeps its own generator (weight initialisation, DataLoader shuffling,
# dropout). Seeding NumPy alone leaves it unseeded, so it is seeded explicitly.
torch.manual_seed(seed_value)

In [None]:
# Parse the dataset file once; every split below is a key of the resulting dict.
with open('sample_data/data_full.json', 'r') as file:
  data = json.load(file)

# Regular splits.
train, test, val = data['train'], data['test'], data['val']

# "oos_*" splits (presumably the out-of-scope queries -- confirm against the dataset docs).
oos_train_data, oos_val_data, oos_test_data = (
  data['oos_train'],
  data['oos_val'],
  data['oos_test'],
)

In [None]:
# Every split entry is indexed as entry[0] (sentence) and entry[1] (label).
train_sent = [entry[0] for entry in train]
val_sent = [entry[0] for entry in val]
test_sent = [entry[0] for entry in test]

train_labels = [entry[1] for entry in train]
val_labels = [entry[1] for entry in val]
test_labels = [entry[1] for entry in test]

# For the oos_* splits only the sentences are extracted here.
oos_train_sentences = [entry[0] for entry in oos_train_data]
oos_val_sentences = [entry[0] for entry in oos_val_data]
oos_test_sentences = [entry[0] for entry in oos_test_data]

# Base file name of the checkpoint; the training cell saves the model under it.
model_name = "ae_model_bert_CLINC150.pth"

In [None]:
# Prefix the checkpoint file name with the seed. The guard keeps the cell
# idempotent: re-running it must not stack the prefix ("42_42_...").
seed_prefix = f"{seed_value}_"
if not model_name.startswith(seed_prefix):
  model_name = f"{seed_prefix}{model_name}"

# Encode labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to validation and test. Calling fit_transform on each split
# re-fits the encoder, so the same intent could receive a different id whenever
# the splits do not contain exactly the same label set, and `label_encoder`
# would end up describing the test split rather than the training one.
# `transform` raises ValueError for a label that was not seen during fit.
label_encoder = LabelEncoder()
encoded_train_labels = label_encoder.fit_transform(train_labels)
encoded_val_labels = label_encoder.transform(val_labels)
encoded_test_labels = label_encoder.transform(test_labels)

# Tokenize dataset

In [None]:
!pip install -q python-dotenv

In [None]:
from dotenv import load_dotenv

# python-dotenv: read a local .env file (if one is found) into the process
# environment. The call is the last expression of the cell, so its boolean
# return value is what the cell displays.
# NOTE(review): no environment variable is read in the visible cells -- confirm
# this is still needed.
load_dotenv()

True

In [None]:
# Earlier experiment with a different checkpoint, currently disabled:
# pretrained = 'answerdotai/ModernBERT'

# tokenizer = AutoTokenizer.from_pretrained(pretrained)
# pretrained_model = AutoModelForSequenceClassification.from_pretrained(pretrained)

# Checkpoint id shared by the tokenizer and every model loaded in this notebook.
model_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Sequence-classification wrapper; its classifier head is newly initialised
# (see the warning in this cell's output).
# NOTE(review): the name `model` is rebound to the TextClassifier instance in a
# later cell, so this object is only reachable afterwards via `pretrained_model`.
model = AutoModelForSequenceClassification.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Token count of every training sentence, special tokens included.
tokenized_lengths = [
  len(token_ids)
  for token_ids in (
    tokenizer.encode(text, add_special_tokens=True) for text in train_sent
  )
]
# Longest training sentence; used as the truncation limit for all datasets.
max_length = max(tokenized_lengths)

In [None]:
class TextDataset(Dataset):
  """Map-style dataset pairing tokenized sentences with class-index labels.

  All sentences are tokenized once in the constructor; ``__getitem__`` only
  converts the stored values of one example into tensors.

  Args:
    sentences: Sentences to tokenize, handed to ``tokenizer`` in a single call.
    labels: Indexable collection of class ids, aligned with ``sentences``.
    tokenizer: Callable invoked as
      ``tokenizer(sentences, truncation=True, padding=True, max_length=...)``;
      its result must expose ``.items()`` yielding per-example sequences.
    max_length: Truncation limit forwarded to the tokenizer.
  """

  def __init__(self, sentences, labels, tokenizer, max_length):
    self.encodings = tokenizer(sentences, truncation=True, padding=True, max_length=max_length)
    self.labels = labels

  def __getitem__(self, idx):
    """Return a dict of tensors for example ``idx``: one entry per tokenizer field plus ``'labels'``."""
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    # nn.CrossEntropyLoss expects class-index targets as int64. Forcing the dtype
    # keeps int32 arrays or other integer containers from producing a tensor the
    # loss rejects; int64 labels are unaffected.
    item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
    return item

  def __len__(self):
    """Number of examples, taken from the label collection."""
    return len(self.labels)

# Build one TextDataset per split (train, then val, then test), all sharing the
# tokenizer and the truncation limit derived from the training sentences.
train_df, val_df, test_df = (
  TextDataset(split_sentences, split_labels, tokenizer, max_length)
  for split_sentences, split_labels in (
    (train_sent, encoded_train_labels),
    (val_sent, encoded_val_labels),
    (test_sent, encoded_test_labels),
  )
)

# Define functions to encode sentences

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Switch the loaded model to evaluation mode and move it to `device`. Both calls
# return the module itself, so `pretrained_model` is an alias of `model`.
pretrained_model = model.eval().to(device)

def encode_sentences(model, sentences, tokenizer, batch_size=256, target_device=None):
  """Embed sentences by mean-pooling the model's last hidden state.

  Sentences are processed in batches of ``batch_size`` and each batch is
  tokenized, run through ``model`` without gradient tracking, and averaged over
  the token axis. The per-batch results are concatenated in input order.

  Args:
    model: Module whose output exposes ``last_hidden_state``.
    sentences: Sliceable sequence of sentences.
    tokenizer: Callable invoked with ``return_tensors='pt'``, ``padding=True``,
      ``truncation=True`` and ``max_length=512``; must return a mapping of tensors.
    batch_size: Number of sentences per forward pass.
    target_device: Device to run on. ``None`` (the default) uses the
      notebook-level ``device``, which matches the previous behaviour.

  Returns:
    A NumPy array with one row per input sentence.

  Raises:
    RuntimeError: if ``sentences`` is empty (``torch.cat`` gets nothing to join).
  """
  run_device = device if target_device is None else target_device
  model = model.to(run_device)
  encoded_sentences = []

  for start in range(0, len(sentences), batch_size):
    batch_sentences = sentences[start:start + batch_size]
    encoded_input = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
    encoded_input = {key: val.to(run_device) for key, val in encoded_input.items()}

    with torch.no_grad():
      model_output = model(**encoded_input)

    # Pool and collect inside the loop: previously these two statements sat
    # after the loop, so only the last batch was ever returned.
    # NOTE(review): the mean runs over every position, padding included --
    # confirm whether an attention-mask-weighted mean is wanted.
    pooled = model_output.last_hidden_state.mean(dim=1)
    # Move each batch off the accelerator right away so results do not pile up there.
    encoded_sentences.append(pooled.cpu())

  sent_embed_np = torch.cat(encoded_sentences).numpy()

  return sent_embed_np

# Define the model

In [None]:
class TextClassifier(nn.Module):
  """Transformer encoder with a linear classifier and an autoencoder on its pooled output.

  The pooled sentence embedding feeds two heads: a linear layer producing class
  logits, and a tanh autoencoder (hidden_size -> 512 -> 64 -> 16 -> 64 -> 512 ->
  hidden_size) that tries to reconstruct the embedding.

  Args:
    pretrained_model: Encoder module. It must expose ``config.hidden_size`` and
      return an object with ``last_hidden_state`` when called with
      ``input_ids`` and ``attention_mask``.
    num_labels: Number of output classes of the classifier head.
  """

  def __init__(self, pretrained_model, num_labels):
    super().__init__()
    self.transformer = pretrained_model
    # Width of the encoder output. Read from the config instead of hard-coding
    # 768 so the autoencoder matches whatever encoder is passed in, as the
    # classifier head already does.
    hidden_size = self.transformer.config.hidden_size

    # encoder layers
    self.encoder1 = nn.Linear(hidden_size, 512)
    self.encoder2 = nn.Linear(512, 64)
    self.encoder3 = nn.Linear(64, 16)

    # decoder layers
    self.decoder1 = nn.Linear(16, 64)
    self.decoder2 = nn.Linear(64, 512)
    self.decoder3 = nn.Linear(512, hidden_size)

    self.classifier = nn.Linear(hidden_size, num_labels)

  def forward(self, input_ids, attention_mask):
    """Run encoder, classifier and autoencoder.

    Returns:
      Tuple ``(original_embeddings, reconstructed_embeddings, predictions)``:
      the pooled encoder output, its autoencoder reconstruction, and the class
      logits.
    """
    trans_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
    # Element-wise max over dim 1 of the last hidden state.
    # NOTE(review): padded positions take part in the max -- confirm this is intended.
    original_embeddings = trans_output.last_hidden_state.max(dim=1).values

    predictions = self.classifier(original_embeddings)

    # AE forward pass; the last decoder layer is linear (no tanh) so the
    # reconstruction is not squashed into [-1, 1].
    x = torch.tanh(self.encoder1(original_embeddings))
    x = torch.tanh(self.encoder2(x))
    x = torch.tanh(self.encoder3(x))
    x = torch.tanh(self.decoder1(x))
    x = torch.tanh(self.decoder2(x))
    reconstructed_embeddings = self.decoder3(x)

    return original_embeddings, reconstructed_embeddings, predictions

# Define reconstruction and classification losses

In [None]:
# rec_loss_fn: mean squared error, applied to (embedding, reconstruction) pairs.
# ce_loss_fn: cross-entropy, applied to (class logits, integer labels).
rec_loss_fn, ce_loss_fn = nn.MSELoss(), nn.CrossEntropyLoss()

# Initialise the model, optimizer, and data loaders

In [None]:
from transformers import AutoModel

In [None]:
# One classifier output per distinct training intent.
unique_intents = list(set(train_labels))
# transformer_model = AutoModelForSequenceClassification.from_pretrained(pretrained_model)
# Bare encoder without a task head: TextClassifier reads `last_hidden_state` from it.
transformer_model = AutoModel.from_pretrained(model_id)
transformer_model.to(device)
model = TextClassifier(transformer_model, len(unique_intents))
model = model.to(device)
# Every parameter of `model` is optimised, so the transformer is fine-tuned too.
optimizer = Adam(model.parameters(), lr=5.00E-05)
training_losses = []
validation_losses = []

# Fine-tuning the whole encoder keeps activations of every layer for backprop,
# so memory grows with the batch size. The previous value of 1024 ran out of
# memory on the 14.75 GiB GPU (see the CUDA OutOfMemoryError in the training
# cell's output); 32 is a conventional size for fine-tuning BERT-base.
batch_size = 32
train_dataloader = DataLoader(train_df, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_df, batch_size=batch_size)
test_dataloader = DataLoader(test_df, batch_size=batch_size)

# Weight of the reconstruction loss in the combined loss; cross-entropy gets
# (1 - rec_loss_importance).
rec_loss_importance=0.1
# NOTE(review): `factor` is not referenced in the visible cells -- confirm it is still needed.
factor=1

In [None]:
# Number of passes over the training data performed by the training loop.
num_epochs=6

In [None]:
# Train only when no checkpoint file with this name exists yet; otherwise skip.
if not os.path.exists(model_name):
  best_val_loss = float('inf')
  for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
      input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
      optimizer.zero_grad()
      # The first name was misspelled ("orginal_embeddings"), so the loss below
      # either raised NameError or silently used a stale tensor.
      original_embeddings, reconstructed_embeddings, predictions = model(input_ids, attention_mask)
      rec_loss = rec_loss_fn(original_embeddings, reconstructed_embeddings)
      ce_loss = ce_loss_fn(predictions, labels)

      # Weighted sum of classification and reconstruction objectives.
      loss = (1-rec_loss_importance)*ce_loss + rec_loss_importance*rec_loss
      loss.backward()
      optimizer.step()

      total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_losses.append(avg_train_loss)

    # ---- validation pass (no gradients, eval mode) ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
      for batch in val_dataloader:
        input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
        original_embeddings, reconstructed_embeddings, predictions = model(input_ids, attention_mask)

        rec_loss = rec_loss_fn(original_embeddings, reconstructed_embeddings)
        ce_loss = ce_loss_fn(predictions, labels)

        loss = (1-rec_loss_importance)*ce_loss + rec_loss_importance*rec_loss
        total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    # Record the epoch's validation loss exactly once (it used to be appended
    # twice, which doubled the length of `validation_losses`).
    validation_losses.append(avg_val_loss)
    if avg_val_loss < best_val_loss:
      best_val_loss = avg_val_loss
      # Save the model (the whole module object is written, not just a state_dict).
      torch.save(model, model_name)
      print(f"Epoch {epoch+1}/{num_epochs}: Lower validation loss found. Model saved.")
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.5e}, Validation Loss: {avg_val_loss:.5e}")
else:
  print("training skipped")

OutOfMemoryError: CUDA out of memory. Tried to allocate 396.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 187.06 MiB is free. Process 4249 has 14.56 GiB memory in use. Of the allocated memory 14.32 GiB is allocated by PyTorch, and 119.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)