In [9]:
!pip install datasets



In [10]:
from datasets import load_dataset

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertForSequenceClassification, DistilBertConfig

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support

In [12]:
data=load_dataset("shawhin/phishing-site-classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
})

In [14]:
data["train"][9]

{'text': 'lquuqkf.org/information.cgi', 'labels': 1}

In [15]:
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
#load teacher
checkpoint="shawhin/bert-phishing-classifier_teacher"

tokenizer=AutoTokenizer.from_pretrained(checkpoint)
model=AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)

In [17]:
#load student
my_config=DistilBertConfig(n_heads=8, n_layers=4)

In [18]:
my_config

DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 8,
  "n_layers": 4,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.44.2",
  "vocab_size": 30522
}

In [19]:
student_model=DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",config=my_config).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def tokenizer_fn(batch):
  return tokenizer(batch["text"], padding="max_length", truncation=True)

tokenized_data=data.map(tokenizer_fn, batched=True)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [21]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
})

In [22]:
tokenized_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [23]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
})

In [24]:
# evaluation function
def evaluate_model(model, data_loader,device):
    model.eval() #set model to eval mode
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

    return accuracy, precision, recall, f1

In [25]:
#distillation loss function
def distillation_loss(student_logits, teacher_logits, labels, temperature):
    student_probs = F.softmax(student_logits / temperature, dim=1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=1)

    distillation_loss = F.kl_div(student_probs, teacher_probs, reduction='batchmean')*(temperature**2)

    hard_loss=nn.CrossEntropyLoss()(student_logits, labels)

    loss= alpha*distillation_loss + (1-alpha)*hard_loss

    return loss

In [26]:
#hyper parameters
batch_size=32
lr=1e-4
num_epochs=3
alpha=0.5
temperature=2

optimizer=optim.AdamW(student_model.parameters(), lr=lr)

data_loader=DataLoader(tokenized_data["train"], batch_size=batch_size, shuffle=True)
test_loader=DataLoader(tokenized_data["test"], batch_size=batch_size, shuffle=False)

In [23]:
#train model
student_model.train()

for epoch in range(num_epochs):
  for batch in data_loader:
    input_ids=batch["input_ids"].to(device)
    attention_mask=batch["attention_mask"].to(device)
    labels=batch["labels"].to(device)

    with torch.no_grad():
      teacher_outputs=model(input_ids, attention_mask=attention_mask)
      teacher_logits=teacher_outputs.logits

    student_outputs=student_model(input_ids, attention_mask=attention_mask)
    student_logits=student_outputs.logits

    loss=distillation_loss(student_logits, teacher_logits, labels, temperature)


    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  #evaluate model
  teacher_accuracy, teacher_precision, teacher_recall, teacher_f1=evaluate_model(model, test_loader, device)
  print(f"Epoch {epoch+1}/{num_epochs} - Teacher Accuracy: {teacher_accuracy:.4f}, Teacher Precision: {teacher_precision:.4f}, Teacher Recall: {teacher_recall:.4f}, Teacher F1: {teacher_f1:.4f}")

  student_accuracy, student_precision, student_recall, student_f1=evaluate_model(student_model, test_loader, device)
  print(f"Epoch {epoch+1}/{num_epochs} - Student Accuracy: {student_accuracy:.4f}, Student Precision: {student_precision:.4f}, Student Recall: {student_recall:.4f}, Student F1: {student_f1:.4f}")

  print("\n")

  student_model.train()


Epoch 1/10 - Teacher Accuracy: 0.8667, Teacher Precision: 0.8967, Teacher Recall: 0.8341, Teacher F1: 0.8643
Epoch 1/10 - Student Accuracy: 0.9000, Student Precision: 0.8680, Student Recall: 0.9476, Student F1: 0.9061


Epoch 2/10 - Teacher Accuracy: 0.8667, Teacher Precision: 0.8967, Teacher Recall: 0.8341, Teacher F1: 0.8643
Epoch 2/10 - Student Accuracy: 0.9133, Student Precision: 0.9167, Student Recall: 0.9127, Student F1: 0.9147


Epoch 3/10 - Teacher Accuracy: 0.8667, Teacher Precision: 0.8967, Teacher Recall: 0.8341, Teacher F1: 0.8643
Epoch 3/10 - Student Accuracy: 0.9222, Student Precision: 0.9575, Student Recall: 0.8865, Student F1: 0.9206


Epoch 4/10 - Teacher Accuracy: 0.8667, Teacher Precision: 0.8967, Teacher Recall: 0.8341, Teacher F1: 0.8643
Epoch 4/10 - Student Accuracy: 0.9222, Student Precision: 0.9254, Student Recall: 0.9214, Student F1: 0.9234




KeyboardInterrupt: 

In [24]:
student_model.save_pretrained("student_model")

In [32]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Using cached bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Using cached bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [27]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
# Load the quantized model
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_nf4 = AutoModelForSequenceClassification.from_pretrained("student_model", quantization_config=nf4_config, device_map=device)

qaccuracy,qprecision,qrecall,qf1=evaluate_model(model_nf4, test_loader, device)
print("Quantized Accuracy: ", qaccuracy)
print("Quantized Precision: ", qprecision)
print("Quantized Recall: ", qrecall)
print("Quantized F1: ", qf1)

Quantized Accuracy:  0.92
Quantized Precision:  0.9251101321585903
Quantized Recall:  0.9170305676855895
Quantized F1:  0.9210526315789473
