In [1]:
dataset_ckpt ='ag_news'
teacher_model_ckpt ='odunola/bert-base-uncased-ag-news-finetuned-2' #our already finetuned teacher model
student_model_ckpt ='google/bert_uncased_L-12_H-256_A-4'

In [4]:
!pip install datasets
!pip install transformers
!pip install huggingface_hub
!huggingface-cli login

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [5]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
from transformers import AutoModelForSequenceClassification
from torch import nn
from torch import optim
from torch.nn import functional as F
from transformers import AutoTokenizer
from tqdm import tqdm
from time import perf_counter
import pandas as pd

In [6]:
tokenizer = AutoTokenizer.from_pretrained(student_model_ckpt)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [7]:
data = load_dataset(dataset_ckpt)
train_test = data['train'].train_test_split(test_size = 0.2)
valid_data = train_test['test']
train_data = train_test['train']
test_data = data['test']

def get_num_rows(dataset):
  return dataset.num_rows

print(f'Train set has {get_num_rows(train_data)} texts')
print(f'Valid set has {get_num_rows(valid_data)} texts')
print(f'Test set has {get_num_rows(test_data)} texts')

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Train set has 96000 texts
Valid set has 24000 texts
Test set has 7600 texts


In [8]:
#now we would utilise pytorch's Dataset andDataloader classes to create our dataset

class MyData(Dataset):
  def __init__(self, data):
    targets = data['label']
    texts = data['text']

    tokens = tokenizer(texts, return_tensors = 'pt', truncation = True, padding = 'max_length', max_length = 150)
    self.input_ids = tokens['input_ids']
    self.attention_mask = tokens['attention_mask']
    self.targets = torch.tensor(targets)
    self.length = len(texts)
  def __len__(self):
    return self.length
  def __getitem__(self, index):
    return self.input_ids[index], self.attention_mask[index], self.targets[index]


train_data = MyData(train_data)
valid_data = MyData(valid_data)
test_data = MyData(test_data)

# now we build our loaders
batch_size = 64
train_loader = DataLoader(train_data,batch_size = batch_size)
valid_loader = DataLoader(valid_data, batch_size = batch_size)
test_loader = DataLoader(test_data, batch_size = batch_size)

In [9]:
#first we install define our device and download our teacher model from huggingface
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_ckpt).to(device)


# we define a function to help us compute accuracy as we train, we would also define another function to measure time ellapsed
def accuracy_score(batch, model):
  with torch.no_grad():
    outputs = model(
        batch[0].to(device),
        batch[1].to(device)
    )
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim = 1)
    class_predictions = torch.argmax(probabilities, dim = 1)
    acc = torch.mean((class_predictions == batch[2].to(device)).to(torch.float)).data.item()
    return acc

#now let us test!

accuracy = 0.0
time_taken = 0.0
count = 0
for batch in tqdm(test_loader):
  start_time = perf_counter()
  score = accuracy_score(batch, teacher_model)
  end_time = perf_counter()
  accuracy += score
  time_taken += end_time - start_time

print('\n\n')
print(f"number of samples in each batch is {len(batch[0])}")
print(f'number of batch is {len(test_loader)}')
print(f"accuracy is {accuracy / len(test_loader):.2f}")
print(f'time taken per batch is {time_taken / len(test_loader):.6f}')

config.json:   0%|          | 0.00/944 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 119/119 [00:53<00:00,  2.22it/s]




number of samples in each batch is 48
number of batch is 119
accuracy is 0.94
time taken per batch is 0.446002





In [18]:
student_model = AutoModelForSequenceClassification.from_pretrained(student_model_ckpt, num_labels = 4).to(device)
student_model.dropout = nn.Dropout(0.3) #Increase dropout to improve generalization.
epochs = 5#we train for5epochs
learning_rate = 2e-5
entropy_loss = nn.CrossEntropyLoss() #cross entropy loss
temperature = 2.0
alpha = 0.5
criterion = nn.KLDivLoss(reduction = 'batchmean') #KL Divergence Loss
optimizer = optim.Adam(student_model.parameters(), lr = learning_rate)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-12_H-256_A-4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def get_parameter_count(model):
  num_params = sum(p.numel() for p in model.parameters())
  return num_params

print(f'teacher model has {(get_parameter_count(teacher_model)/1000000):.2f} parameters')
print(f'student model has {(get_parameter_count(student_model)/1000000):.2f} parameters')

teacher model has 109.49 parameters
student model has 17.49 parameters


In [14]:
# epochs = 4
# learning_rate = 2e-5
# entropy_loss = nn.CrossEntropyLoss()
# temperature = 2.0
# alpha = 0.5 #test this
# criterion = nn.KLDivLoss(reduction = 'batchmean')
# optimizer = optim.AdamW(student_model.parameters(), lr = learning_rate)

In [20]:
import pandas as pd

# Lists to store training and validation metrics
training_loss_list = []
training_kd_loss_list = []
training_accuracy_list = []
valid_loss_list = []
valid_accuracy_list = []

#starting loop
for epoch in tqdm(range(epochs), total=epochs):
    student_model.train()
    train_loss = 0.0
    train_kd_loss = 0.0
    train_accuracy = 0.0
    valid_loss = 0.0
    valid_accuracy = 0.0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        target_tensors = batch[2].to(device)

        # Student model predictions
        student_logits = student_model(input_ids=input_ids, attention_mask=attention_mask).logits
        ce_loss = entropy_loss(student_logits, target_tensors).data.item()

        # We extract teacher model logits
        with torch.no_grad():
            teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
            teacher_logits = teacher_outputs.logits

        # Knowledge distillation loss (KD divergence)
        kd_loss = temperature ** 2 * criterion(
            F.log_softmax(student_logits / temperature, dim=-1),
            F.softmax(teacher_logits / temperature, dim=-1)
        )

        # Combined loss
        loss = alpha * ce_loss + (1. - alpha) * kd_loss
        loss.backward()
        optimizer.step()

        # Update training metrics
        train_kd_loss += kd_loss.data.item()
        train_loss += loss
        accuracy = accuracy_score(batch, student_model)
        train_accuracy += accuracy

    student_model.eval()
    for batch in valid_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        target_tensors = batch[2].to(device)

        # Validation loss
        output = student_model(input_ids=input_ids, attention_mask=attention_mask)
        val_loss = entropy_loss(output.logits, target_tensors)
        valid_loss += val_loss.data.item()

        # Update validation accuracy
        accuracy = accuracy_score(batch, student_model)
        valid_accuracy += accuracy

    # Calculate average metrics
    train_accuracy /= len(train_loader)
    valid_accuracy /= len(valid_loader)
    train_loss /= len(train_loader)
    train_kd_loss /= len(train_loader)
    valid_loss /= len(valid_loader)

    # Append metrics to lists
    training_kd_loss_list.append(train_kd_loss)
    training_loss_list.append(train_loss.cpu().detach().numpy())
    training_accuracy_list.append(train_accuracy)
    valid_loss_list.append(valid_loss)
    valid_accuracy_list.append(valid_accuracy)

    # Print and store metrics
    print(f"""
    After epoch {epoch + 1}:
    Training loss (entropy): {train_loss}
    Kullback-Leibler (KL) divergence loss: {train_kd_loss}
    Validation loss (entropy): {valid_loss}
    Training accuracy: {train_accuracy}
    Validation accuracy: {valid_accuracy}
    """)

# Create a DataFrame to store the metrics
metrics = pd.DataFrame({
    'training_loss': training_loss_list,
    'training_kd_loss': training_kd_loss_list,
    'training_accuracy': training_accuracy_list,
    'valid_loss': valid_loss_list,
    'valid_accuracy': valid_accuracy_list
})

 20%|██        | 1/5 [20:50<1:23:23, 1250.99s/it]


    After epoch 1:
    Training loss (entropy): 0.7482079267501831
    Kullback-Leibler (KL) divergence loss: 1.1360062624911467
    Validation loss (entropy): 0.26385501480599244
    Training accuracy: 0.8992708333333334
    Validation accuracy: 0.9247083333333334
    


 40%|████      | 2/5 [41:44<1:02:37, 1252.66s/it]


    After epoch 2:
    Training loss (entropy): 0.35760584473609924
    Kullback-Leibler (KL) divergence loss: 0.47823194254934787
    Validation loss (entropy): 0.23686846408744652
    Training accuracy: 0.9371354166666667
    Validation accuracy: 0.9344166666666667
    


 60%|██████    | 3/5 [1:02:38<41:45, 1252.98s/it]


    After epoch 3:
    Training loss (entropy): 0.27531835436820984
    Kullback-Leibler (KL) divergence loss: 0.3567752848118544
    Validation loss (entropy): 0.23795962620029848
    Training accuracy: 0.9488541666666667
    Validation accuracy: 0.93725
    


 80%|████████  | 4/5 [1:23:33<20:53, 1253.73s/it]


    After epoch 4:
    Training loss (entropy): 0.22677084803581238
    Kullback-Leibler (KL) divergence loss: 0.28709228939563036
    Validation loss (entropy): 0.24179285213599602
    Training accuracy: 0.9573541666666666
    Validation accuracy: 0.9411666666666667
    


100%|██████████| 5/5 [1:44:27<00:00, 1253.42s/it]


    After epoch 5:
    Training loss (entropy): 0.19181770086288452
    Kullback-Leibler (KL) divergence loss: 0.23655021492143471
    Validation loss (entropy): 0.2472120392775784
    Training accuracy: 0.9629895833333333
    Validation accuracy: 0.9407083333333334
    





In [None]:
student_model.push_to_hub('odunola/google-distilled-ag-news')
tokenizer.push_to_hub('odunola/google-distilled-ag-news')

In [21]:
accuracy_teacher = 0.0
time_taken_teacher = 0.0

accuracy_student = 0.0
time_taken_student = 0.0
count = 0
for batch in tqdm(test_loader):
  start_time = perf_counter()
  score = accuracy_score(batch, teacher_model)
  end_time = perf_counter()
  accuracy_teacher += score
  time_taken_teacher += end_time - start_time

  start_time = perf_counter()
  score = accuracy_score(batch, student_model)
  end_time = perf_counter()
  accuracy_student += score
  time_taken_student += end_time - start_time


print('\n\n')
print(f"number of samples in each batch is {len(batch[0])}")
print(f'total number of batches is {len(test_loader)}')
print(f"teacher accuracy is {accuracy_teacher / len(test_loader):.2f}")
print(f'time taken per batch for teacher is {time_taken_teacher / len(test_loader):.6f}')
print('\n\n\n')
print(f"student accuracy is {accuracy_student / len(test_loader):.2f}")
print(f'time taken per batch for student is {time_taken_student / len(test_loader):.6f}')

100%|██████████| 119/119 [01:09<00:00,  1.71it/s]




number of samples in each batch is 48
total number of batches is 119
teacher accuracy is 0.94
time taken per batch for teacher is 0.506722




student accuracy is 0.94
time taken per batch for student is 0.073800





In [22]:
!ls

sample_data


In [25]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [23]:
student_model.save_pretrained("model")

In [26]:
stud_model = AutoModelForSequenceClassification.from_pretrained("model")
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

In [27]:
teach_model = AutoModelForSequenceClassification.from_pretrained('odunola/bert-base-uncased-ag-news-finetuned-2')
#stud_model = AutoModelForSequenceClassification.from_pretrained('odunola/distillbert-distilled-ag-news')
device = 'cpu'

accuracy_teacher = 0.0
time_taken_teacher = 0.0
teacher_model = teach_model.to('cpu')
student_model = stud_model.to('cpu')
accuracy_student = 0.0
time_taken_student = 0.0
count = 1
for batch in tqdm(test_loader):
  start_time = perf_counter()
  score = accuracy_score(batch, teach_model)
  end_time = perf_counter()
  accuracy_teacher += score
  time_taken_teacher += end_time - start_time

  start_time = perf_counter()
  score = accuracy_score(batch, stud_model)
  end_time = perf_counter()
  accuracy_student += score
  time_taken_student += end_time - start_time
  if count == 4:
    break
  count += 1


print('\n\n')
print(f"number of samples in each batch is {len(batch[0])}")
print(f'total number of batches is {len(test_loader)}')
print(f"teacher accuracy is {accuracy_teacher / 4:.2f}")
print(f'time taken per batch for teacher is {time_taken_teacher / 4:.6f}')
print('\n\n\n')
print(f"student accuracy is {accuracy_student / 4:.2f}")
print(f'time taken per batch for student is {time_taken_student / 4:.6f}')

  3%|▎         | 3/119 [01:20<51:54, 26.85s/it]




number of samples in each batch is 64
total number of batches is 119
teacher accuracy is 0.94
time taken per batch for teacher is 17.054173




student accuracy is 0.94
time taken per batch for student is 3.078758





In [None]:
!ls