In [1]:
## All the Imports used

from datetime import datetime

from typing import Tuple
import torch
from torch.nn import Module
from torch.nn import CrossEntropyLoss, CosineEmbeddingLoss


import os

# Append Utils to System Path 
import sys
sys.path.append('../utils')

from collections import OrderedDict

# Call Our Model's Utility Function 
from distilbert_utils import *
from distilbert_finetuning import *

from ganbert_models import *
from ganbert_utils import *

import numpy as np

# Call Hugging Face Functions 
from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertForSequenceClassification, AutoConfig, DistilBertConfig

2022-05-08 03:41:35.645004: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.0/lib64
2022-05-08 03:41:35.645046: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Use the fine-tuned DistilBERT checkpoint as the starting point for distillation.

model_path = '../distilbert/finetuned_models/distill_bert_finetuned_sst2_67349_samples_2022-05-03_21-30-41.pt'

# Deserialize onto the CPU so the checkpoint also loads on a machine that lacks the
# device it was saved from; load_state_dict below copies the tensors into the model.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model_dict = torch.load(model_path, map_location='cpu')

model_name = 'distilbert-base-cased'


# Build the architecture from the hub checkpoint, then overwrite its parameters
# with the fine-tuned weights stored under the 'distilbert' key of the saved file.
model = DistilBertForSequenceClassification.from_pretrained(model_name)

model.load_state_dict(model_dict['distilbert'])

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier

<All keys matched successfully>

In [3]:
# Print the module hierarchy of the loaded model.
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [4]:
def visualize_children(
    object,
    level=0,
):
    """
    Print the type of (object), then recurse into whatever its ``children()`` yields.

    Every line is indented by three spaces per depth (level) and prefixed with
    that depth, so nested modules read as an outline.

    Args:
        object: The object to describe. Anything without a callable ``children``
            attribute is printed as a leaf. (The name shadows the builtin; it is
            kept so existing keyword callers keep working.)
        level: Current depth, used only for the indentation and the printed
            prefix. Callers normally leave it at 0.

    Errors raised while iterating the children are propagated instead of being
    silently discarded.
    """
    print(f"{'   ' * level}{level}- {type(object).__name__} - {type(object)}")
    # Only the absence of a children() method marks a leaf. A blanket
    # `except: pass` around the recursion would also hide real failures
    # (and KeyboardInterrupt) and silently truncate the printed tree.
    children = getattr(object, "children", None)
    if not callable(children):
        return
    for child in children():
        visualize_children(child, level + 1)

In [5]:

# Print the nested child-module types of the loaded model, one line per module.
visualize_children(model)

0- DistilBertForSequenceClassification - <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>
   1- DistilBertModel - <class 'transformers.models.distilbert.modeling_distilbert.DistilBertModel'>
      2- Embeddings - <class 'transformers.models.distilbert.modeling_distilbert.Embeddings'>
         3- Embedding - <class 'torch.nn.modules.sparse.Embedding'>
         3- Embedding - <class 'torch.nn.modules.sparse.Embedding'>
         3- LayerNorm - <class 'torch.nn.modules.normalization.LayerNorm'>
         3- Dropout - <class 'torch.nn.modules.dropout.Dropout'>
      2- Transformer - <class 'transformers.models.distilbert.modeling_distilbert.Transformer'>
         3- ModuleList - <class 'torch.nn.modules.container.ModuleList'>
            4- TransformerBlock - <class 'transformers.models.distilbert.modeling_distilbert.TransformerBlock'>
               5- MultiHeadSelfAttention - <class 'transformers.models.distilbert.modeling_distilbert.MultiHea

In [6]:
from transformers import DistilBertModel
from  transformers.models.distilbert.modeling_distilbert import Transformer

def distill_finetune_weights(teacher, student):
    """
    Recursively copy the weights of (teacher) into (student).

    Meant to be called first on a DistilBertForSequenceClassification model; it
    then calls itself on the children of that model:

    - DistilBertForSequenceClassification / DistilBertModel: recurse pairwise
      into the children of teacher and student.
    - Transformer (the encoder): only a subset of the teacher layers is copied.
      Student layer ``i`` receives teacher layer ``i * stride`` where
      ``stride = len(teacher_layers) // len(student_layers)``. For a student
      with half the teacher's layers this is every second layer, starting at 0.
    - Anything else (embeddings, heads, ...): the whole state_dict is copied.

    Raises:
        ValueError: if the student encoder has more layers than the teacher's.
    """

    if isinstance(teacher, DistilBertModel) or type(teacher).__name__.startswith('DistilBertForSequenceClassification'):

        for teacher_part, student_part in zip(teacher.children(), student.children()):
            distill_finetune_weights(teacher_part, student_part)

    # Else if the part is an encoder, copy evenly spaced layers
    elif isinstance(teacher, Transformer):

        # The first child of the encoder is the container holding its layers.
        teacher_encoding_layers = list(next(teacher.children()))
        student_encoding_layers = list(next(student.children()))

        if not student_encoding_layers:
            # Nothing to initialise (and avoids dividing by zero below).
            return

        stride = len(teacher_encoding_layers) // len(student_encoding_layers)
        if stride == 0:
            raise ValueError(
                f"student has {len(student_encoding_layers)} encoder layers but the "
                f"teacher only has {len(teacher_encoding_layers)}"
            )

        for i, student_layer in enumerate(student_encoding_layers):
            student_layer.load_state_dict(teacher_encoding_layers[i * stride].state_dict())
    # Else the part is a head or something else, copy the state_dict
    else:

        student.load_state_dict(teacher.state_dict())

In [7]:
## Function
def distill_distilbert(teacher_model):
    """
    Build a student model with half as many layers as (teacher_model).

    The student is an instance of the same class as the teacher and shares its
    configuration, except that ``n_layers`` is floor-divided by 2. Its weights
    are then initialised from the teacher via ``distill_finetune_weights``.

    Returns the newly created student model.
    """
    # Start from a plain-dict copy of the teacher configuration and shrink the depth.
    student_settings = teacher_model.config.to_dict()
    student_settings['n_layers'] = student_settings['n_layers'] // 2

    # Instantiate a fresh model of the teacher's class from the reduced config.
    student_config = DistilBertConfig.from_dict(student_settings)
    model_class = type(teacher_model)
    student_model = model_class(student_config)

    # Seed the student with the teacher's weights before handing it back.
    distill_finetune_weights(teacher=teacher_model, student=student_model)
    return student_model

In [8]:
# Create the half-depth student initialised from the fine-tuned model loaded above.
student_distil_bert = distill_distilbert(model)

In [9]:
# Print the nested child-module types of the student for comparison with the teacher's tree.
visualize_children(student_distil_bert)

0- DistilBertForSequenceClassification - <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>
   1- DistilBertModel - <class 'transformers.models.distilbert.modeling_distilbert.DistilBertModel'>
      2- Embeddings - <class 'transformers.models.distilbert.modeling_distilbert.Embeddings'>
         3- Embedding - <class 'torch.nn.modules.sparse.Embedding'>
         3- Embedding - <class 'torch.nn.modules.sparse.Embedding'>
         3- LayerNorm - <class 'torch.nn.modules.normalization.LayerNorm'>
         3- Dropout - <class 'torch.nn.modules.dropout.Dropout'>
      2- Transformer - <class 'transformers.models.distilbert.modeling_distilbert.Transformer'>
         3- ModuleList - <class 'torch.nn.modules.container.ModuleList'>
            4- TransformerBlock - <class 'transformers.models.distilbert.modeling_distilbert.TransformerBlock'>
               5- MultiHeadSelfAttention - <class 'transformers.models.distilbert.modeling_distilbert.MultiHea

In [10]:
def get_logits(model, input_ids, attention_mask):
    """
    Call (model) with (input_ids) and (attention_mask), passed positionally in
    that order, and return the 'logits' entry of whatever the call produces.
    """
    prediction = model(input_ids, attention_mask)
    return prediction['logits']

In [11]:
def distillation_loss(teacher_logits, student_logits, labels, temperature=1.0):
    """
    Distillation loss: the mean of three terms.

    1. Classification loss: cross-entropy between the raw (student_logits) and
       the hard (labels).
    2. Teacher-student loss: cross-entropy between the temperature-softened
       teacher distribution and the temperature-softened student distribution.
    3. Cosine loss: ``1 - cos`` between the two softened distributions.

    Args:
        teacher_logits: Teacher logits; the class dimension is dim 1.
        student_logits: Student logits with the same layout as teacher_logits.
        labels: Class-index targets accepted by ``CrossEntropyLoss``.
        temperature: Positive softening temperature, 1.0 by default.

    Returns:
        A scalar tensor holding the averaged loss.

    Raises:
        ValueError: if (temperature) is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    # Temperature-softened distributions
    soft_student_logits = student_logits / temperature
    student_probs = soft_student_logits.softmax(1)
    teacher_probs = (teacher_logits / temperature).softmax(1)

    # Classification loss (problem-specific loss). CrossEntropyLoss applies
    # log-softmax itself, so it must receive logits, not probabilities.
    loss = CrossEntropyLoss()(student_logits, labels)
    # Cross-entropy against the teacher's soft targets, written out explicitly so
    # it does not rely on CrossEntropyLoss accepting probability targets.
    soft_cross_entropy = -(teacher_probs * torch.log_softmax(soft_student_logits, dim=1)).sum(dim=1).mean()
    loss = loss + soft_cross_entropy
    # Cosine loss; a target of 1 asks for the two vectors to be aligned. The
    # target is created on the same device as the inputs.
    aligned = torch.ones(teacher_probs.size()[0], device=teacher_probs.device)
    loss = loss + CosineEmbeddingLoss()(teacher_probs, student_probs, aligned)
    # Average the three terms and return
    return loss / 3

In [12]:
from datasets import load_dataset

# Dataset identifier and configuration handed to load_dataset.
dataset_id = "glue"
dataset_config = "sst2"

dataset = load_dataset(dataset_id, dataset_config)

from transformers import AutoTokenizer

# Tokenizer loaded for the checkpoint name stored in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def process(examples):
    """Tokenize the "sentence" field of (examples), truncating to at most 512 tokens."""
    return tokenizer(examples["sentence"], truncation=True, max_length=512)


# Tokenize every split in batches, then rename the "label" column to "labels".
tokenized_datasets = dataset.map(process, batched=True).rename_column("label", "labels")

# Display the feature schema of the test split.
tokenized_datasets["test"].features

Reusing dataset glue (/home/ecbm4040/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


HBox(children=(FloatProgress(value=0.0, max=68.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'idx': Value(dtype='int32', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': ClassLabel(num_classes=2, names=['negative', 'positive'], names_file=None, id=None),
 'sentence': Value(dtype='string', id=None)}

In [13]:
# Display the splits, column names and row counts of the tokenized dataset.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'labels', 'sentence'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'labels', 'sentence'],
        num_rows: 872
    })
    test: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'labels', 'sentence'],
        num_rows: 1821
    })
})

In [14]:
from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.nn.functional as F

class DistillationTrainingArguments(TrainingArguments):
    """
    TrainingArguments extended with two distillation hyper-parameters.

    ``alpha`` and ``temperature`` are keyword-only; every other positional or
    keyword argument is forwarded unchanged to ``TrainingArguments``.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        # Stored as plain attributes; DistillationTrainer.compute_loss reads them
        # back through self.args.alpha / self.args.temperature.
        self.alpha, self.temperature = alpha, temperature
         
class DistillationTrainer(Trainer):
    """
    Trainer whose loss mixes the student's own loss with a soft-target loss
    computed against a teacher model.

    The teacher is supplied through the keyword-only ``teacher_model`` argument,
    moved to the student's device and put in eval mode. All other arguments are
    forwarded to ``Trainer``. ``self.args`` is expected to expose ``alpha`` and
    ``temperature`` (see DistillationTrainingArguments).
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        # Fail early with a clear message instead of an AttributeError on None.
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # place teacher on same device as student
        self._move_model_to_device(self.teacher, self.model.device)

        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Return ``alpha * student_loss + (1 - alpha) * distillation_loss``.

        Args:
            model: The student model being trained.
            inputs: The batch; it is unpacked as keyword arguments into both the
                student and the teacher. The Trainer prepares the batch (device
                placement included) before calling this method, so it is not
                moved again here.
            return_outputs: When true, return ``(loss, student_outputs)``.
            num_items_in_batch: Accepted for compatibility with Trainer versions
                that pass it to compute_loss; not used.
        """
        # compute student output
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss
        # compute teacher output; the teacher is never updated, so skip autograd
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)

        # assert size
        assert outputs_student.logits.size() == outputs_teacher.logits.size(), f" Student is {outputs_student.logits.size() } , parent is {outputs_teacher.logits.size() }"

        # Soften probabilities and compute distillation loss. KLDivLoss takes
        # log-probabilities as input and probabilities as target; the result is
        # multiplied by temperature**2.
        loss_function = nn.KLDivLoss(reduction="batchmean")
        loss_logits = (loss_function(
            F.log_softmax(outputs_student.logits / self.args.temperature, dim=-1),
            F.softmax(outputs_teacher.logits / self.args.temperature, dim=-1)) * (self.args.temperature ** 2))
        # Return weighted student loss
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

In [15]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from huggingface_hub import HfFolder
 
 # create label2id, id2label dicts for nice outputs for the model
labels = tokenized_datasets["train"].features["labels"].names
num_labels = len(labels)

# Two lookup tables between class names and their positions; the positions are
# stored as strings in both directions.
label2id = {}
id2label = {}
for i, label in enumerate(labels):
    id2label[str(i)] = label
    label2id[label] = str(i)

# define data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# define training args
# NOTE(review): metric_for_best_model is set but load_best_model_at_end is not;
# confirm whether best-checkpoint selection was intended.
training_args = DistillationTrainingArguments(
    output_dir="Distillation_of_GanBert-May7th-18-30",
    num_train_epochs=7,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    fp16=True,
    learning_rate=6e-5,
    seed=33,
    # logging & evaluation strategies
    evaluation_strategy="epoch",
    metric_for_best_model="accuracy",
    # distillation hyper-parameters
    alpha=0.5,
    temperature=4.0,
)

In [16]:
from datasets import load_metric
import numpy as np
 
 # define metrics and metrics function
accuracy_metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """
    Turn an evaluation result into an accuracy score.

    (eval_pred) unpacks into a pair ``(scores, references)``. The predicted
    class is the argmax of the scores along axis 1. Returns a dict with the
    single key "accuracy".
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    result = accuracy_metric.compute(predictions=predicted_ids, references=references)
    return {"accuracy": result["accuracy"]}

In [17]:
from transformers.modeling_outputs import SequenceClassifierOutput

class InferenceGANBert(nn.Module):
    """
    Inference-only wrapper around a saved GAN-BERT checkpoint.

    The checkpoint at (model_path) must contain a 'bert_encoder' and a
    'discriminator' state dict. A 'bert-base-cased' encoder and a
    ``Discriminator`` head are built, loaded from it and put in eval mode.
    ``forward`` runs both without gradient tracking and exposes the result as a
    ``SequenceClassifierOutput`` so it can be used where a sequence-classification
    model output is expected.
    """

    def __init__(self, model_path):
        super().__init__()
        self.device = get_gpu_details()
        # Deserialize onto the CPU so the file loads regardless of the device it
        # was saved from; the modules are moved to the GPU below when available.
        # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
        model_dict = torch.load(model_path, map_location='cpu')
        model_name = 'bert-base-cased'
        self.transformer = AutoModel.from_pretrained(model_name)

        # Drop the first 7 characters of every encoder key.
        # NOTE(review): presumably this strips a 'module.' prefix left by
        # DataParallel at save time -- confirm against the training script.
        transformer_dict = OrderedDict()
        for key, value in model_dict['bert_encoder'].items():
            transformer_dict[key[7:]] = value

        self.transformer.load_state_dict(transformer_dict)
        self.transformer.eval()

        hidden_size_bert = int(AutoConfig.from_pretrained(model_name).hidden_size)

        self.discriminator = Discriminator(input_size=hidden_size_bert, hidden_sizes=[hidden_size_bert],
                                           num_labels=3, dropout_rate=0.1)
        self.discriminator.load_state_dict(model_dict['discriminator'])
        self.discriminator.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        if torch.cuda.is_available():
            self.discriminator.cuda()
            self.transformer.cuda()
            self.transformer = torch.nn.DataParallel(self.transformer)

    def forward(self, attention_mask, input_ids, labels=None):
        """
        Run the encoder and the discriminator on a batch without tracking gradients.

        Args:
            attention_mask: Attention mask tensor for the batch.
            input_ids: Token id tensor for the batch.
            labels: Accepted so a batch dict containing labels can be unpacked
                into this call; not used.

        Returns:
            SequenceClassifierOutput whose ``logits`` are the first two columns
            of the discriminator logits and whose ``hidden_states`` carry the
            discriminator features.
        """
        # Tensor.to returns the moved tensor rather than moving in place, so the
        # result has to be rebound for the move to take effect.
        attention_mask = attention_mask.to(self.device)
        input_ids = input_ids.to(self.device)

        # do the forward pass
        with torch.no_grad():
            model_outputs = self.transformer(input_ids, attention_mask)
            # The last element of the encoder output feeds the discriminator.
            hidden_states = model_outputs[-1]
            features, logits, probs = self.discriminator(hidden_states)
            # The discriminator was built with num_labels=3; only the first two
            # logit columns are kept.
            outputs = SequenceClassifierOutput(logits=logits[:, :2], hidden_states=features)

        return outputs

In [18]:
# Assemble the distillation trainer: the half-depth DistilBERT student learns from
# the GAN-BERT teacher restored from its saved checkpoint.
trainer = DistillationTrainer(
    model=student_distil_bert,
    args=training_args,
    teacher_model=InferenceGANBert('../gan-bert/finetuned_models/gan_bert_finetuned_sst2_67349_samples_0.2_labelratio_2022-05-04_02-26-06.pt'),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB





Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


hidden_sizes[i] and [i+1] is 768 and 768


Using amp fp16 backend


In [19]:
# Set WANDB_DISABLED to the string "false"; the training log below reports that
# Weights & Biases logging is enabled.
# NOTE(review): this runs after the trainer was constructed -- confirm that is the intended order.
os.environ["WANDB_DISABLED"] = "false"

In [20]:
# Run the distillation training loop configured above.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, idx.
***** Running training *****
  Num examples = 67349
  Num Epochs = 7
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 3689
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvijaykalmath[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-05-08 03:41:50.852998: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.0/lib64
2022-05-08 03:41:50.853037: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5502,0.7219,0.875
2,0.2588,0.678684,0.872706
3,0.1936,0.629718,0.879587
4,0.1543,0.609332,0.880734
5,0.1251,0.563365,0.883028
6,0.1083,0.571109,0.887615
7,0.0994,0.563202,0.887615


Saving model checkpoint to Distillation_of_GanBert-May7th-18-30/checkpoint-500
Configuration saved in Distillation_of_GanBert-May7th-18-30/checkpoint-500/config.json
Model weights saved in Distillation_of_GanBert-May7th-18-30/checkpoint-500/pytorch_model.bin
tokenizer config file saved in Distillation_of_GanBert-May7th-18-30/checkpoint-500/tokenizer_config.json
Special tokens file saved in Distillation_of_GanBert-May7th-18-30/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence, idx.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 128
Saving model checkpoint to Distillation_of_GanBert-May7th-18-30/checkpoint-1000
Configuration saved in Distillation_of_GanBert-May7th-18-30/checkpoint-1000/config.json
Model weights saved in Distillation_of_GanBert-May7th-18-30/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in

TrainOutput(global_step=3689, training_loss=0.20677563012470299, metrics={'train_runtime': 319.8844, 'train_samples_per_second': 1473.792, 'train_steps_per_second': 11.532, 'total_flos': 3186342286167516.0, 'train_loss': 0.20677563012470299, 'epoch': 7.0})

In [22]:
# Persist the student's weights under the 'distilbert' key, in a file whose name
# carries the current timestamp.
torch.save(
    {'distilbert': student_distil_bert.state_dict()},
    f"Gan_distilbert_sst2_{datetime.now():%Y-%m-%d_%H-%M-%S%z}.pt",
)


