In [1]:
!pip install transformers[torch]
!pip install datasets

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.27.0-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.0
Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiproce

In [2]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [3]:
from transformers import TrainingArguments

In [4]:
class KnowledgeDistillationTrainingArguments(TrainingArguments):
  """``TrainingArguments`` extended with two knowledge-distillation settings.

  Args:
    *args: Positional arguments forwarded unchanged to ``TrainingArguments``.
    alpha: Stored as ``self.alpha``; defaults to 0.5.
    temperature: Stored as ``self.temperature``; defaults to 2.0.
    **kwargs: Keyword arguments forwarded unchanged to ``TrainingArguments``.
  """

  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    # The parent initializer consumes every standard training option first ...
    super().__init__(*args, **kwargs)
    # ... then the distillation-specific values are attached to the instance.
    self.alpha, self.temperature = alpha, temperature

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

In [6]:
class KnowledgeDistillationTrainer(Trainer):
  """Trainer whose loss blends the student's own loss with a distillation term.

  The loss returned by ``compute_loss`` is
  ``alpha * loss_ce + (1 - alpha) * loss_kd`` where ``loss_ce`` is the loss
  reported by the student model and ``loss_kd`` is the temperature-scaled
  KL divergence between the softened student and teacher distributions.
  ``alpha`` and ``temperature`` are read from ``self.args``.

  Args:
    *args: Positional arguments forwarded to ``Trainer``.
    teacher_model: Model that produces the soft targets. It is only run
      forward; it is never optimized by this trainer.
    **kwargs: Keyword arguments forwarded to ``Trainer``.
  """

  def __init__(self, *args, teacher_model=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model
    if self.teacher_model is not None:
      # The teacher only supplies targets: keep dropout disabled so the
      # soft labels do not vary between forward passes.
      self.teacher_model.eval()

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Compute the combined cross-entropy / distillation loss for one batch.

    Args:
      model: The student model being trained.
      inputs: Keyword inputs for the forward pass; the same dict is fed to
        both the student and the teacher.
      return_outputs: When true, return ``(loss, student_outputs)``.
      num_items_in_batch: Accepted (and ignored) so the override stays
        call-compatible with Trainer versions that pass this argument.

    Returns:
      The scalar loss, or ``(loss, outputs_student)`` if ``return_outputs``.
    """
    # Student forward pass: its own loss and logits.
    outputs_student = model(**inputs)
    loss_ce = outputs_student.loss
    logits_student = outputs_student.logits

    # Teacher forward pass without autograd: no graph is stored and no
    # gradients accumulate on the teacher's parameters.
    with torch.no_grad():
      outputs_teacher = self.teacher_model(**inputs)
    logits_teacher = outputs_teacher.logits

    # Soften both distributions with the temperature and compare them.
    # reduction="batchmean" averages the KL divergence over the batch dimension;
    # the temperature**2 factor rescales the softened term.
    temperature = self.args.temperature
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    loss_kd = temperature ** 2 * loss_fct(
        F.log_softmax(logits_student / temperature, dim=-1),
        F.softmax(logits_teacher / temperature, dim=-1))

    # Weighted sum of the two terms.
    loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
    return (loss, outputs_student) if return_outputs else loss


In [7]:
from datasets import load_dataset

# Tweet sentiment dataset; no configuration name is passed for it.
# It is built by a loader script hosted on the Hub, and `datasets` warns that
# trust_remote_code=True will become mandatory for such datasets, so opt in
# explicitly. NOTE: this executes the dataset's loader script locally — review it first.
dataset = load_dataset("carblacac/twitter-sentiment-analysis", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/149985 [00:00<?, ? examples/s]

Map:   0%|          | 0/61998 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/120 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Generating train split:   0%|          | 0/119988 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29997 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/61998 [00:00<?, ? examples/s]

In [8]:
# Peek at one training example: a dict holding the tweet under "text" and an
# integer sentiment value under "feeling".
sample = dataset["train"][5]
print(sample)

{'text': '@TashaWilson like questions she asks me the date etc..i say that i have been to birmingham lol its weird o well  u ok?', 'feeling': 1}


In [9]:
from transformers import AutoTokenizer

In [10]:
student_checkpoint = "prajjwal1/bert-tiny"
student_tokenizer = AutoTokenizer.from_pretrained(student_checkpoint)

config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [11]:
from torch.nn.utils.rnn import pad_sequence

def custom_collate_fn(batch):
    """Collate a list of per-example dicts into one batch dict.

    Each element of ``batch`` must provide tensors under "input_ids",
    "attention_mask" and "label". The two sequence fields are padded
    (batch-first, using ``pad_sequence``'s default padding value) to the
    longest sequence in the batch; the labels are stacked. Note that the
    returned label key is the singular "label".
    """
    sequence_fields = ("input_ids", "attention_mask")

    # Gather every field column-wise before doing any tensor work.
    columns = {
        field: [example[field] for example in batch]
        for field in sequence_fields + ("label",)
    }

    # Pad the variable-length fields up to the longest sequence in this batch.
    collated = {
        field: pad_sequence(columns[field], batch_first=True)
        for field in sequence_fields
    }
    collated["label"] = torch.stack(columns["label"])
    return collated

In [12]:
# Training dataset with on-the-fly synonym augmentation (nlpaug)
from nlpaug.augmenter.word import SynonymAug
from torch.utils.data import DataLoader,Dataset

class TwitterSentimentDataset_aug(Dataset):
    """Map-style dataset that optionally augments and then tokenizes each tweet.

    Args:
        data: Indexable collection whose items expose "text" and "feeling".
        tokenizer: Callable tokenizer returning "input_ids" and
            "attention_mask" tensors with a leading batch dimension.
        max_length: Truncation length passed to the tokenizer.
        augmenter: Optional object with an ``augment(text)`` method. When
            ``None`` the raw text is tokenized as is.
    """

    def __init__(self, data, tokenizer, max_length=128, augmenter=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augmenter = augmenter

    def __len__(self):
        return len(self.data)

    def _augment(self, text):
        """Return a single augmented string, or ``text`` if augmentation yields nothing."""
        augmented = self.augmenter.augment(text)
        # Some augmenters (e.g. recent nlpaug releases) return a list of
        # strings rather than a string; keep the first candidate.
        if isinstance(augmented, (list, tuple)):
            augmented = augmented[0] if augmented else text
        # An empty result would give the tokenizer nothing to encode.
        return augmented if augmented else text

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for example ``idx``."""
        item = self.data[idx]

        # Apply data augmentation if available
        text = item["text"]
        if self.augmenter is not None:
            text = self._augment(text)

        # A single sequence needs no padding here; padding to a common
        # length is left to whatever batches the examples.
        inputs = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        label = torch.tensor(item["feeling"], dtype=torch.long)
        return {
            # squeeze(0) drops only the batch dimension, so a one-token
            # sequence stays 1-D instead of collapsing to a scalar.
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": label
        }

# Synonym-replacement augmenter from nlpaug, created with its default settings
augmenter = SynonymAug()

# Wrap the raw training split so that each example is augmented and tokenized when it is fetched
train_dataset_aug = TwitterSentimentDataset_aug(dataset["train"], student_tokenizer, max_length=128,augmenter=augmenter)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [13]:
def tokenize_text(batch, max_length=128):
  """Tokenize the "text" column of a batch with the student tokenizer.

  Args:
    batch: Mapping with a "text" entry (a batch of strings when used with
      ``dataset.map(..., batched=True)``).
    max_length: Length to truncate to. Without an explicit value the tokenizer
      reports that it has no predefined maximum length and does not truncate,
      so one is always passed. The default matches the 128 used for the
      augmented training dataset.

  Returns:
    The tokenizer output for the batch.
  """
  return student_tokenizer(batch["text"], truncation=True, max_length=max_length)

In [14]:
# Tokenize every split and drop the raw "text" column, which is no longer
# needed, then rename the "feeling" column to "labels" so the trainer can
# pick it up automatically.
clinc_tokenized = (
    dataset
    .map(tokenize_text, batched=True, remove_columns=["text"])
    .rename_column("feeling", "labels")
)


Map:   0%|          | 0/119988 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/29997 [00:00<?, ? examples/s]

Map:   0%|          | 0/61998 [00:00<?, ? examples/s]

In [15]:
import numpy as np
from datasets import load_metric
accuracy_score = load_metric("accuracy")

def compute_metrics(pred):
  """Return the classification accuracy for an evaluation pass.

  The accuracy is computed directly with numpy, so this function does not
  depend on a metric object obtained through the deprecated
  ``datasets.load_metric``.

  Args:
    pred: A ``(predictions, labels)`` pair where ``predictions`` holds one
      row of class scores per example and ``labels`` the reference class ids.

  Returns:
    ``{"accuracy": <fraction of examples whose argmax matches the label>}``.
  """
  scores, labels = pred
  predicted_classes = np.argmax(scores, axis=1)
  return {"accuracy": float(np.mean(predicted_classes == np.asarray(labels)))}

  accuracy_score = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

# Let's define the training arguments for the distillation trainer

In [16]:
batch_size = 48
finetuned_student_ckpt = "tinybert-base-uncased-finetuned-twitter-student"

In [17]:
!pip install accelerate>=0.20.1

In [18]:
# Training configuration for the student: one epoch, evaluation at the end of each epoch.
# NOTE(review): alpha=1 makes KnowledgeDistillationTrainer.compute_loss return only the
# student's own loss, because the distillation term is weighted by (1 - alpha) = 0.
# temperature is not passed, so it keeps its default of 2.0.
student_training_args = KnowledgeDistillationTrainingArguments(
    output_dir=finetuned_student_ckpt, evaluation_strategy = "epoch",
    num_train_epochs=1, learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, alpha=1, weight_decay=0.01)

In [19]:
from transformers import pipeline

# Build a text-classification pipeline for the student checkpoint; in the cells
# shown here it is used only to read the label mappings from its model config.
bert_ckpt = "prajjwal1/bert-tiny"
pipe = pipeline("text-classification", model=bert_ckpt)

# id <-> label-name mappings, passed to the student config later on
id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from transformers import AutoConfig
# Student configuration: two output classes, with the label mappings taken from id2label / label2id
student_config = (AutoConfig
                  .from_pretrained(student_checkpoint, num_labels=2,
                                    id2label=id2label, label2id=label2id))

In [21]:
import torch
from transformers import AutoModelForSequenceClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def student_init():
  """Create a fresh student classifier from ``student_checkpoint`` and move it to ``device``."""
  student = AutoModelForSequenceClassification.from_pretrained(
      student_checkpoint, config=student_config)
  return student.to(device)

In [22]:
teacher_checkpoint = "bert-base-uncased"

In [23]:
# Load the teacher with a two-class classification head and move it to the training device.
# NOTE(review): the load message for this cell reports classifier.weight / classifier.bias as
# newly initialized, and no cell shown here fine-tunes the teacher before it is used — confirm
# that an untrained teacher head is intended.
teacher_model = (AutoModelForSequenceClassification
                     .from_pretrained(teacher_checkpoint, num_labels=2)
                     .to(device))

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Train the student on the augmented training set; evaluation uses the tokenized
# validation split and reports accuracy through compute_metrics.
tinybert_trainer = KnowledgeDistillationTrainer(model_init=student_init,
        teacher_model=teacher_model, args=student_training_args,
        train_dataset=train_dataset_aug, eval_dataset=clinc_tokenized['validation'],
        compute_metrics=compute_metrics, tokenizer=student_tokenizer)
tinybert_trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5707,0.536449,0.735207


TrainOutput(global_step=2500, training_loss=0.6030327026367187, metrics={'train_runtime': 1257.8535, 'train_samples_per_second': 95.391, 'train_steps_per_second': 1.988, 'total_flos': 16191503716320.0, 'train_loss': 0.6030327026367187, 'epoch': 1.0})

In [25]:
print(tinybert_trainer.teacher_model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [26]:
def save_teacher_model(output_dir="teacher_model"):
  """Save the teacher model with ``save_pretrained``.

  Args:
    output_dir: Target directory; defaults to "teacher_model" (relative to
      the current working directory).
  """
  teacher_model.save_pretrained(output_dir)
def save_student_model(output_dir='student_model'):
  """Save the trained student through the trainer's ``save_model``.

  Args:
    output_dir: Target directory; defaults to 'student_model' (relative to
      the current working directory).
  """
  tinybert_trainer.save_model(output_dir)


In [27]:
save_teacher_model()
save_student_model()

## Let's compare the teacher and student models

In [28]:
from transformers import AutoConfig, AutoModelForSequenceClassification
import os

def compute_parameters(model_path):
  """Load a sequence-classification model from ``model_path`` and return its ``num_parameters()``."""
  return AutoModelForSequenceClassification.from_pretrained(model_path).num_parameters()

In [29]:
# Load from the same relative directory that save_teacher_model() writes to,
# instead of an absolute path that is only valid when the working directory is /content.
teacher_model_parameters = compute_parameters(model_path="teacher_model")
print("Teacher Model: ", teacher_model_parameters)

Teacher Model:  109483778


In [30]:
# Load from the same relative directory that save_student_model() writes to,
# instead of an absolute path that is only valid when the working directory is /content.
student_model_parameters = compute_parameters(model_path="student_model")
print("Student Model: ", student_model_parameters)

Student Model:  4386178


In [31]:
tinybert_trainer.evaluate()

{'eval_loss': 0.5364490747451782,
 'eval_accuracy': 0.7352068540187352,
 'eval_runtime': 90.1477,
 'eval_samples_per_second': 332.754,
 'eval_steps_per_second': 6.933,
 'epoch': 1.0}

In [39]:
# Relative reduction in parameter count from teacher to student, printed as a percentage
decrease = (teacher_model_parameters-student_model_parameters)/teacher_model_parameters
print(decrease*100)

95.993764482625


In [33]:
!ls /content/student_model -al --block-size=MB

total 19MB
drwxr-xr-x 2 root root  1MB Feb  9 19:14 .
drwxr-xr-x 1 root root  1MB Feb  9 19:14 ..
-rw-r--r-- 1 root root  1MB Feb  9 19:14 config.json
-rw-r--r-- 1 root root 18MB Feb  9 19:14 model.safetensors
-rw-r--r-- 1 root root  1MB Feb  9 19:14 special_tokens_map.json
-rw-r--r-- 1 root root  1MB Feb  9 19:14 tokenizer_config.json
-rw-r--r-- 1 root root  1MB Feb  9 19:14 tokenizer.json
-rw-r--r-- 1 root root  1MB Feb  9 19:14 training_args.bin
-rw-r--r-- 1 root root  1MB Feb  9 19:14 vocab.txt


In [34]:
!ls /content/teacher_model -al --block-size=MB

total 438MB
drwxr-xr-x 2 root root   1MB Feb  9 19:13 .
drwxr-xr-x 1 root root   1MB Feb  9 19:14 ..
-rw-r--r-- 1 root root   1MB Feb  9 19:13 config.json
-rw-r--r-- 1 root root 438MB Feb  9 19:14 model.safetensors
