# STAT 940 Group Project - Group 9

The code is adapted from the PyTorch implementation of the GAN-BERT model (https://github.com/crux82/ganbert-pytorch), refactored to use a different dataset and to support a different language.

### Setup

In [1]:
import torch
import io
import torch.nn.functional as F
import random
import numpy as np
import time
import math
import datetime
import torch.nn as nn
import transformers as tf
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Seed the Python, NumPy and PyTorch random number generators with one fixed value
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed_val)
# Seed every visible CUDA device as well when a GPU is present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)

In [2]:
# Pick the device used for the dataset tensors and the noise vectors:
# the CPU when CUDA is unavailable, otherwise the default CUDA device.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 2 GPU(s) available.
We will use the GPU: Tesla T4


### Parameters


In [3]:
#--------------------------------
#  Transformer parameters
#--------------------------------
max_seq_length = 64   # utterances are truncated/padded to this many tokens
batch_size = 64

#--------------------------------
#  GAN-BERT specific parameters
#--------------------------------
num_hidden_layers_g = 1   # hidden layers in the generator, each as wide as its output
num_hidden_layers_d = 1   # hidden layers in the discriminator, each as wide as its input
noise_size = 100          # size of the generator's input noise vectors
out_dropout_rate = 0.2    # dropout rate passed to both the Generator and the Discriminator

#--------------------------------
#  Optimization parameters
#--------------------------------
learning_rate_discriminator = 5e-5
learning_rate_generator = 5e-5
epsilon = 1e-8            # small constant added inside the log terms of the GAN losses
num_train_epochs = 16
multi_gpu = True          # wrap the transformer in DataParallel when CUDA is available
# Scheduler (constant learning rate with warmup, only used when apply_scheduler is True)
apply_scheduler = False
warmup_proportion = 0.1
# Print a progress line every this many training batches
print_each_n_step = 20

#--------------------------------
#  Adopted pretrained Transformer model
#--------------------------------
model_name = "google-bert/bert-base-chinese"
# Alternative checkpoints, left disabled:
#model_name = "bert-base-uncased"
#model_name = "roberta-base"
#model_name = "albert-base-v2"
#model_name = "xlm-roberta-base"
#model_name = "amazon/bort"

#--------------------------------
#  Dataset parameters
#--------------------------------
label_count = 61          # number of label ids; id 60 is the one add_label_mask writes on masked examples
mask_percentage = 0.6     # fraction of the training examples whose labels are masked

### Load the Pretrained Model

In [4]:
# Load the tokenizer (non-fast implementation) and the encoder weights for the
# checkpoint selected by model_name.
tokenizer = tf.AutoTokenizer.from_pretrained(model_name, use_fast=False)
transformer = tf.AutoModel.from_pretrained(model_name)

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

### Prepare Data

In [5]:
# Dataset columns dropped while tokenizing (passed as remove_columns to Dataset.map).
cols_to_remove=["id","locale","partition","scenario","annot_utt","slot_method","judgments","worker_id"]
# Re-create the `np.object` attribute on the numpy module.
# NOTE(review): presumably a compatibility shim for a dependency that still
# references the alias dropped by recent NumPy releases — confirm which library needs it.
np.object = object

def add_label_mask(batch, mask=1, placeholder_label=60):
    """Attach a per-example ``label_mask`` column to a batch of examples.

    Intended for ``Dataset.map(..., batched=True)``: ``batch`` is a mapping of
    column name -> list of values and is modified in place.

    Args:
        batch: mapping that contains at least an ``"input_ids"`` column.
        mask: value written for every example (1 = label is used by the
            supervised loss, 0 = label is hidden).
        placeholder_label: label id written over ``"label_id"`` for every
            example when ``mask == 0``. Defaults to 60, the value that was
            previously hard-coded.

    Returns:
        The same ``batch`` object, with ``"label_mask"`` added and, when
        ``mask == 0``, ``"label_id"`` replaced by the placeholder.
    """
    n_examples = len(batch["input_ids"])
    batch["label_mask"] = [mask] * n_examples
    if mask == 0:
        # Hidden labels are overwritten so the true intent cannot leak into training.
        batch["label_id"] = [placeholder_label] * n_examples
    return batch

In [6]:
from datasets import load_dataset, concatenate_datasets, Value

def _tokenize_batch(batch):
    """Tokenize the "utt" column of a batch, truncating/padding each example to max_seq_length."""
    return tokenizer(batch["utt"],truncation=True,padding="max_length",max_length=max_seq_length)


def _as_torch_dataset(dataset):
    """Cast label_mask to bool and expose the model columns as torch tensors on `device`."""
    dataset_bool=dataset.cast_column("label_mask",Value(dtype="bool"))
    return dataset_bool.with_format(type="torch", columns=["input_ids", "attention_mask", "label_id", "label_mask"], device=device)


d_train_raw=load_dataset("AmazonScience/massive","zh-CN",split="train")
d_test_raw=load_dataset("AmazonScience/massive","zh-CN",split="test")

sampler=RandomSampler

#------------------------------
#   Load the train dataset
#------------------------------
# Tokenize, then expose the intent column under the name the training loop reads.
d_train_t=d_train_raw.map(_tokenize_batch,batched=True,remove_columns=cols_to_remove)
d_train_t=d_train_t.rename_column("intent","label_id")

# The "test" side of this split (mask_percentage of the rows) becomes the masked part.
d_train_split=d_train_t.train_test_split(test_size=mask_percentage,seed=seed_val)
d_train_nomask=d_train_split["train"].map(add_label_mask,fn_kwargs={"mask":1},batched=True)
d_train_mask=d_train_split["test"].map(add_label_mask,fn_kwargs={"mask":0},batched=True)
d_train=concatenate_datasets([d_train_nomask,d_train_mask])
print(d_train)
d_train=_as_torch_dataset(d_train)

train_dataloader=DataLoader(d_train, sampler = sampler(d_train), batch_size=batch_size)

#------------------------------
#   Load the test dataset
#------------------------------
# Every test example keeps its label (mask=1).
d_test=d_test_raw.map(_tokenize_batch,batched=True,remove_columns=cols_to_remove)
d_test=d_test.map(add_label_mask,fn_kwargs={"mask":1},batched=True)
d_test=d_test.rename_column("intent","label_id")
print(d_test)
d_test=_as_torch_dataset(d_test)

test_dataloader=DataLoader(d_test,batch_size=batch_size)

Downloading builder script:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Downloading and preparing dataset massive/zh-CN to /root/.cache/huggingface/datasets/AmazonScience___massive/zh-CN/1.0.0/71d360eb7d7a18565ff8c10609cebf714fce3cc390e173ba5b02ffd48543cdc1...


Downloading data:   0%|          | 0.00/40.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset massive downloaded and prepared to /root/.cache/huggingface/datasets/AmazonScience___massive/zh-CN/1.0.0/71d360eb7d7a18565ff8c10609cebf714fce3cc390e173ba5b02ffd48543cdc1. Subsequent calls will reuse this data.


  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

Dataset({
    features: ['label_id', 'utt', 'input_ids', 'token_type_ids', 'attention_mask', 'label_mask'],
    num_rows: 11514
})


Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['label_id', 'utt', 'input_ids', 'token_type_ids', 'attention_mask', 'label_mask'],
    num_rows: 2974
})


Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

#### Example instance from the dataset

In [7]:
# Build an un-shuffled loader over the training set and display its first batch
# (the bare last expression is what the notebook renders as the cell output).
dl=DataLoader(d_train,batch_size=batch_size)
next(iter(dl))

{'label_id': tensor([12, 50, 59, 56, 52, 30, 38, 14, 15,  6,  0, 12, 51, 43, 13, 50, 14, 49,
          3, 22, 36, 14, 11, 22, 22, 53, 45,  9, 43,  9, 44, 45, 51, 32,  0, 45,
         53, 25,  9, 13, 13, 49, 42, 21, 22, 17, 12, 53, 27, 26, 50, 49, 44, 13,
         43, 17,  1, 56, 32,  2, 20, 39, 49, 13], device='cuda:0'),
 'input_ids': tensor([[ 101, 6843, 1957,  ...,    0,    0,    0],
         [ 101, 2128, 2961,  ...,    0,    0,    0],
         [ 101, 3926, 1296,  ...,    0,    0,    0],
         ...,
         [ 101, 1282, 1724,  ...,    0,    0,    0],
         [ 101, 4294, 3306,  ...,    0,    0,    0],
         [ 101,  791, 1921,  ...,    0,    0,    0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 'label_mask': tensor([True, True, True, True, True,

In [8]:
def format_time(elapsed):
    """Render a duration given in seconds as a human-readable string.

    The value is rounded to whole seconds and formatted by
    ``datetime.timedelta``, e.g. ``3661.4`` -> ``'1:01:01'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

### GAN
Generator and Discriminator as in https://www.aclweb.org/anthology/2020.acl-main.191/

In [9]:
#------------------------------
#   The Generator as in 
#   https://www.aclweb.org/anthology/2020.acl-main.191/
#   https://github.com/crux82/ganbert
#------------------------------
class Generator(nn.Module):
    """Feed-forward network that maps noise vectors to fake representations.

    The network is ``noise_size -> hidden_sizes[0] -> ... -> output_size``.
    Each hidden Linear layer is followed by LeakyReLU(0.2) and Dropout; the
    final Linear layer has no activation.
    """

    def __init__(self, noise_size=100, output_size=512, hidden_sizes=[512], dropout_rate=0.1):
        super().__init__()
        widths = [noise_size] + hidden_sizes
        stack = []
        # One (Linear, LeakyReLU, Dropout) triple per consecutive pair of widths.
        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(in_dim, out_dim), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)]
        # Final projection to the requested output size.
        stack.append(nn.Linear(widths[-1], output_size))
        self.layers = nn.Sequential(*stack)

    def forward(self, noise):
        """Return the generated representation for a batch of noise vectors."""
        return self.layers(noise)

#------------------------------
#   The Discriminator
#   https://www.aclweb.org/anthology/2020.acl-main.191/
#   https://github.com/crux82/ganbert
#------------------------------
class Discriminator(nn.Module):
    """Feed-forward classifier over ``num_labels`` classes plus one extra "fake" class.

    ``forward`` returns the last hidden representation, the raw logits of size
    ``num_labels + 1`` and their softmax probabilities.
    """

    def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.1):
        super().__init__()
        self.input_dropout = nn.Dropout(p=dropout_rate)
        widths = [input_size] + hidden_sizes
        hidden_stack = []
        # One (Linear, LeakyReLU, Dropout) triple per consecutive pair of widths.
        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            hidden_stack += [nn.Linear(in_dim, out_dim), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)]

        self.layers = nn.Sequential(*hidden_stack)  # hidden stack producing the feature representation
        # The extra output unit (+1) scores whether the sample is fake or real.
        self.logit = nn.Linear(widths[-1], num_labels + 1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_rep):
        """Return ``(last_rep, logits, probs)`` for a batch of input representations."""
        last_rep = self.layers(self.input_dropout(input_rep))
        logits = self.logit(last_rep)
        return last_rep, logits, self.softmax(logits)

#### Instantiate the Discriminator and Generator

In [10]:
# The config file is required to get the dimension of the vector produced by
# the underlying transformer
config = tf.AutoConfig.from_pretrained(model_name)
hidden_size = int(config.hidden_size)
# One entry per hidden layer, each as wide as the transformer representation
hidden_levels_g = [hidden_size] * num_hidden_layers_g
hidden_levels_d = [hidden_size] * num_hidden_layers_d

#-------------------------------------------------
#   Instantiate the Generator and Discriminator
#-------------------------------------------------
generator = Generator(noise_size=noise_size, output_size=hidden_size, hidden_sizes=hidden_levels_g, dropout_rate=out_dropout_rate)
discriminator = Discriminator(input_size=hidden_size, hidden_sizes=hidden_levels_d, num_labels=label_count, dropout_rate=out_dropout_rate)

# Put everything in the GPU if available
if torch.cuda.is_available():
  generator.cuda()
  discriminator.cuda()
  transformer.cuda()
  # Only wrap once: re-running this cell would otherwise nest DataParallel
  # around a transformer that is already wrapped.
  if multi_gpu and not isinstance(transformer, torch.nn.DataParallel):
    transformer = torch.nn.DataParallel(transformer)

### Training

In [11]:
# One dict of metrics is appended here at the end of every epoch.
training_stats = []

# Start of the wall-clock measurement for the whole run.
total_t0 = time.time()

# Parameter groups: the discriminator optimizer also updates the transformer,
# the generator optimizer only updates the generator.
transformer_vars = list(transformer.parameters())
d_vars = transformer_vars + list(discriminator.parameters())
g_vars = list(generator.parameters())

# One AdamW optimizer per parameter group
dis_optimizer = torch.optim.AdamW(d_vars, lr=learning_rate_discriminator)
gen_optimizer = torch.optim.AdamW(g_vars, lr=learning_rate_generator)

#scheduler
# Optional learning-rate schedule: linear warmup over the first
# `warmup_proportion` of the optimisation steps, constant afterwards.
if apply_scheduler:
  # `d_train` is the training dataset built in the data-preparation cell
  # (the previous code referenced an undefined `train_examples`).
  num_train_examples = len(d_train)
  num_train_steps = int(num_train_examples / batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)

  # The schedule factory lives in the transformers package, imported as `tf`.
  scheduler_d = tf.get_constant_schedule_with_warmup(dis_optimizer,
                                           num_warmup_steps = num_warmup_steps)
  scheduler_g = tf.get_constant_schedule_with_warmup(gen_optimizer,
                                           num_warmup_steps = num_warmup_steps)

# Main loop. Each epoch (1) trains the transformer, discriminator and generator
# on train_dataloader, (2) evaluates the transformer + discriminator on
# test_dataloader and (3) appends one dict of metrics to training_stats.
for epoch_i in range(0, num_train_epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_train_epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the accumulated generator / discriminator losses for this epoch.
    tr_g_loss = 0
    tr_d_loss = 0

    # Put the models into training mode (enables dropout).
    transformer.train() 
    generator.train()
    discriminator.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every print_each_n_step batches.
        if step % print_each_n_step == 0 and not step == 0:
            # Elapsed time since the start of the epoch, formatted by format_time.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        b_input_ids = batch["input_ids"]
        b_input_mask = batch["attention_mask"]
        b_labels = batch["label_id"]
        b_label_mask = batch["label_mask"]

        # The last batch of an epoch may be smaller than batch_size.
        real_batch_size = b_input_ids.shape[0]

        # Encode real data in the Transformer
        model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
        # The last element of the encoder output is used as the sentence representation.
        # It has to be 2-D (batch, hidden_size) to be concatenated with gen_rep below.
        # NOTE(review): presumably this is BERT's pooled output — confirm for other checkpoints.
        hidden_states = model_outputs[-1]

        # Generate fake data that should have the same distribution of the ones
        # encoded by the transformer.
        # First, noise drawn uniformly from [0, 1) is fed to the Generator
        noise = torch.zeros(real_batch_size, noise_size, device=device).uniform_(0, 1)
        # Generate fake data
        gen_rep = generator(noise)

        # Generate the output of the Discriminator for real and fake data.
        # First, we put together the output of the transformer and the generator
        # (real examples first, fake examples second)
        disciminator_input = torch.cat([hidden_states, gen_rep], dim=0)
        # Then, we run the discriminator on the joint batch
        features, logits, probs = discriminator(disciminator_input)

        # Finally, we separate the discriminator's output for the real and fake
        # data: chunk 0 holds the real examples, chunk 1 the generated ones
        features_list = torch.split(features, real_batch_size)
        D_real_features = features_list[0]
        D_fake_features = features_list[1]

        logits_list = torch.split(logits, real_batch_size)
        D_real_logits = logits_list[0]
        D_fake_logits = logits_list[1]

        probs_list = torch.split(probs, real_batch_size)
        D_real_probs = probs_list[0]
        D_fake_probs = probs_list[1]

        #---------------------------------
        #  LOSS evaluation
        #---------------------------------
        # Generator's LOSS estimation.
        # The last output column is the extra "fake" class: the generator is rewarded
        # when its samples get a low probability on that column...
        g_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:,-1] + epsilon))
        # ...and when the batch-mean discriminator features of fake samples match the real ones.
        g_feat_reg = torch.mean(torch.pow(torch.mean(D_real_features, dim=0) - torch.mean(D_fake_features, dim=0), 2))
        g_loss = g_loss_d + g_feat_reg

        # Discriminator's LOSS estimation.
        # Supervised part: cross-entropy over the real classes only (the "fake" column is dropped).
        logits = D_real_logits[:,0:-1]
        log_probs = F.log_softmax(logits, dim=-1)
        # The discriminator provides an output for labeled and unlabeled real data
        # so the loss evaluated for unlabeled data is ignored (masked)
        label2one_hot = torch.nn.functional.one_hot(b_labels, label_count)
        per_example_loss = -torch.sum(label2one_hot * log_probs, dim=-1)
        try:
            # Keep only the examples whose label_mask is True.
            per_example_loss = torch.masked_select(per_example_loss, b_label_mask)
        except Exception as e:
            # Print both shapes to help diagnose the failure, then re-raise.
            print(per_example_loss.shape, b_label_mask.shape)
            raise e
        labeled_example_count = per_example_loss.type(torch.float32).numel()

        # It may be the case that a batch does not contain labeled examples,
        # so the "supervised loss" in this case is not evaluated
        if labeled_example_count == 0:
          D_L_Supervised = 0
        else:
          D_L_Supervised = torch.div(torch.sum(per_example_loss.to(device)), labeled_example_count)

        # Unsupervised part: real examples should get a low "fake" probability,
        # generated examples a high one.
        D_L_unsupervised1U = -1 * torch.mean(torch.log(1 - D_real_probs[:, -1] + epsilon))
        D_L_unsupervised2U = -1 * torch.mean(torch.log(D_fake_probs[:, -1] + epsilon))
        d_loss = D_L_Supervised + D_L_unsupervised1U + D_L_unsupervised2U

        #---------------------------------
        #  OPTIMIZATION
        #---------------------------------
        # Avoid gradient accumulation
        gen_optimizer.zero_grad()
        dis_optimizer.zero_grad()

        # Calculate weight updates
        # retain_graph=True keeps the shared graph alive for the second backward pass
        # NOTE(review): both backward passes run before either optimizer step, and both
        # losses depend on the generator as well as on the discriminator/transformer,
        # so each optimizer steps on the accumulated gradients of g_loss AND d_loss.
        # Confirm this coupling is intended.
        g_loss.backward(retain_graph=True)
        d_loss.backward() 

        # Apply modifications
        gen_optimizer.step()
        dis_optimizer.step()

        # A detail log of the individual losses
        #print("{0:.4f}\t{1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}".
        #      format(D_L_Supervised, D_L_unsupervised1U, D_L_unsupervised2U,
        #             g_loss_d, g_feat_reg))

        # Save the losses to print them later
        tr_g_loss += g_loss.item()
        tr_d_loss += d_loss.item()

        # Update the learning rate with the scheduler
        if apply_scheduler:
          scheduler_d.step()
          scheduler_g.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss_g = tr_g_loss / len(train_dataloader)
    avg_train_loss_d = tr_d_loss / len(train_dataloader)             

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss generetor: {0:.3f}".format(avg_train_loss_g))
    print("  Average training loss discriminator: {0:.3f}".format(avg_train_loss_d))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #     TEST ON THE EVALUATION DATASET
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our test set.
    print("")
    print("Running Test...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    transformer.eval() #maybe redundant
    discriminator.eval()
    generator.eval()

    # Tracking variables
    total_test_accuracy = 0

    total_test_loss = 0
    nb_test_steps = 0

    all_preds = []
    all_labels_ids = []

    # Cross-entropy over the real classes; targets equal to -1 would be ignored.
    nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)

    # Evaluate data for one epoch
    for batch in test_dataloader:

        # Unpack this test batch from our dataloader.
        b_input_ids = batch["input_ids"]
        b_input_mask = batch["attention_mask"]
        b_labels = batch["label_id"]

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
            hidden_states = model_outputs[-1]
            _, logits, probs = discriminator(hidden_states)
            ###log_probs = F.log_softmax(probs[:,1:], dim=-1)
            # Drop the extra "fake" column so predictions range over the real classes only.
            filtered_logits = logits[:,0:-1]
            # Accumulate the test loss (kept as a tensor; converted with .item() after the loop).
            total_test_loss += nll_loss(filtered_logits, b_labels)

        # Accumulate the predictions and the input labels.
        # `list += tensor` appends one 0-d tensor per example; they are re-stacked below.
        _, preds = torch.max(filtered_logits, 1)
        all_preds += preds.detach().cpu()
        all_labels_ids += b_labels.detach().cpu()

    # Report the final accuracy for this evaluation run.
    all_preds = torch.stack(all_preds).numpy()
    all_labels_ids = torch.stack(all_labels_ids).numpy()
    test_accuracy = np.sum(all_preds == all_labels_ids) / len(all_preds)
    print("  Accuracy: {0:.3f}".format(test_accuracy))

    # Calculate the average loss over all of the batches.
    avg_test_loss = total_test_loss / len(test_dataloader)
    avg_test_loss = avg_test_loss.item()

    # Measure how long the evaluation run took.
    test_time = format_time(time.time() - t0)

    print("  Test Loss: {0:.3f}".format(avg_test_loss))
    print("  Test took: {:}".format(test_time))

    # Record all statistics from this epoch.
    # Note: the 'Valid.' keys hold the metrics measured on test_dataloader.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss generator': avg_train_loss_g,
            'Training Loss discriminator': avg_train_loss_d,
            'Valid. Loss': avg_test_loss,
            'Valid. Accur.': test_accuracy,
            'Training Time': training_time,
            'Test Time': test_time
        }
    )


Training...


2024-04-05 21:19:26.287420: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-05 21:19:26.287523: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-05 21:19:26.585548: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


  Batch    20  of    180.    Elapsed: 0:00:31.
  Batch    40  of    180.    Elapsed: 0:00:44.
  Batch    60  of    180.    Elapsed: 0:00:57.
  Batch    80  of    180.    Elapsed: 0:01:09.
  Batch   100  of    180.    Elapsed: 0:01:22.
  Batch   120  of    180.    Elapsed: 0:01:35.
  Batch   140  of    180.    Elapsed: 0:01:48.
  Batch   160  of    180.    Elapsed: 0:02:01.

  Average training loss generetor: 0.640
  Average training loss discriminator: 4.066
  Training epcoh took: 0:02:14

Running Test...
  Accuracy: 0.671
  Test Loss: 1.602
  Test took: 0:00:09

Training...
  Batch    20  of    180.    Elapsed: 0:00:13.
  Batch    40  of    180.    Elapsed: 0:00:26.
  Batch    60  of    180.    Elapsed: 0:00:39.
  Batch    80  of    180.    Elapsed: 0:00:52.
  Batch   100  of    180.    Elapsed: 0:01:06.
  Batch   120  of    180.    Elapsed: 0:01:19.
  Batch   140  of    180.    Elapsed: 0:01:32.
  Batch   160  of    180.    Elapsed: 0:01:46.

  Average training loss generetor: 0.744


In [12]:
# Dump the per-epoch metrics collected by the training loop, one dict per line.
for epoch_stats in training_stats:
    print(epoch_stats)

print("\nTraining complete!")

# Wall-clock time since total_t0 was taken at the top of the training cell.
total_elapsed = format_time(time.time() - total_t0)
print("Total training took {:} (h:mm:ss)".format(total_elapsed))

{'epoch': 1, 'Training Loss generator': 0.6404982954263687, 'Training Loss discriminator': 4.066060251659817, 'Valid. Loss': 1.6019396781921387, 'Valid. Accur.': 0.6714862138533961, 'Training Time': '0:02:14', 'Test Time': '0:00:09'}
{'epoch': 2, 'Training Loss generator': 0.7442983657121658, 'Training Loss discriminator': 1.917604097392824, 'Valid. Loss': 0.9605762958526611, 'Valid. Accur.': 0.7881640887693342, 'Training Time': '0:01:59', 'Test Time': '0:00:08'}
{'epoch': 3, 'Training Loss generator': 0.7318831688827938, 'Training Loss discriminator': 1.3910513781838947, 'Valid. Loss': 0.7871029376983643, 'Valid. Accur.': 0.8143913920645595, 'Training Time': '0:01:59', 'Test Time': '0:00:08'}
{'epoch': 4, 'Training Loss generator': 0.7225377973583009, 'Training Loss discriminator': 1.1453532414303886, 'Valid. Loss': 0.7360614538192749, 'Valid. Accur.': 0.8248150638870209, 'Training Time': '0:01:59', 'Test Time': '0:00:08'}
{'epoch': 5, 'Training Loss generator': 0.7172564794619878, 'T