In [None]:
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

In [None]:
def freeze_parameters(model, unfreeze_layers=None):
    """Freeze the parameters of ``model.base_model`` except selected ones.

    Every parameter yielded by ``model.base_model.named_parameters()`` has its
    ``requires_grad`` flag set to ``True`` when at least one entry of
    ``unfreeze_layers`` occurs as a substring of the parameter name, and to
    ``False`` otherwise. Parameters that are not part of ``model.base_model``
    are not touched by this function.

    Args:
        model: Object exposing ``base_model.named_parameters()``.
        unfreeze_layers: Name fragments of parameters to keep trainable. May be
            ``None`` (freeze everything), a single string (treated as one
            fragment, not as a sequence of characters), or any iterable of
            strings.

    Returns:
        The same ``model`` object, modified in place.
    """
    if unfreeze_layers is None:
        patterns = ()
    elif isinstance(unfreeze_layers, str):
        # A bare string would otherwise be iterated character by character and
        # match almost every parameter name.
        patterns = (unfreeze_layers,)
    else:
        # Materialise once so one-shot iterables are still valid for every
        # parameter, not only the first one.
        patterns = tuple(unfreeze_layers)

    for name, param in model.base_model.named_parameters():
        param.requires_grad = any(pattern in name for pattern in patterns)

    return model

In [None]:
def preprocess_text(data, tokenizer):
    """Tokenize the ``"text"`` entry of ``data`` with truncation enabled.

    Args:
        data: Mapping that holds the raw text under the key ``"text"``.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.

    Returns:
        Whatever ``tokenizer`` returns for that call.
    """
    texts = data["text"]
    return tokenizer(texts, truncation=True)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC from raw classification logits.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as a 2-D
            array whose column 1 is treated as the positive class; ``labels``
            holds the reference class ids.

    Returns:
        dict with keys ``"accuracy"`` and ``"auc"``, each rounded to three
        decimals.
    """
    accuracy = evaluate.load("accuracy")
    auc_score = evaluate.load("roc_auc")
    logits, labels = eval_pred
    logits = np.asarray(logits)

    # Softmax with the row maximum subtracted first: mathematically identical,
    # but avoids overflow (inf / nan probabilities) on large logits.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / exp_shifted.sum(-1, keepdims=True)
    pos_probs = probs[:, 1]

    # The roc_auc metric takes its scores through `prediction_scores`.
    auc = np.round(auc_score.compute(prediction_scores=pos_probs, references=labels)["roc_auc"], 3)

    # The accuracy metric takes hard class predictions through `predictions`.
    pred = np.argmax(logits, axis=1)
    acc = np.round(accuracy.compute(predictions=pred, references=labels)["accuracy"], 3)

    metrics = {"accuracy": acc, "auc": auc}

    return metrics

In [None]:
# Use the GPU when PyTorch reports CUDA as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Identifiers handed to the loaders below: the pretrained checkpoint for the
# tokenizer/model, and the dataset used for fine-tuning.
model_path = "google-bert/bert-base-uncased"
dataset_path = "shawhin/phishing-site-classification"

In [None]:
# Load the dataset; later cells index the result by split name
# ("train", "test", "validation").
dataset_dict = load_dataset(path=dataset_path)

In [None]:
# Class id -> human-readable label, plus the inverse mapping derived from it.
id2label = {
    0: "Safe",
    1: "Not Safe",
}
label2id = {label: class_id for class_id, label in id2label.items()}

In [None]:
# Tokenizer and two-class sequence classifier built from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
# Move the model to the device selected earlier in the notebook.
model = model.to(device)

In [None]:
# Freeze the base model, keeping only parameters whose name contains "pooler" trainable.
model = freeze_parameters(
    model,
    unfreeze_layers=["pooler"],
)

In [None]:
# List every parameter of the full model that is still trainable.
print("Model parameters after freezing:")
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name}: {param.requires_grad}")

In [None]:
# Tokenize the dataset in batches via preprocess_text, then create the
# collator from the same tokenizer.
tokenized_dataset = dataset_dict.map(
    lambda batch: preprocess_text(batch, tokenizer),
    batched=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Training hyperparameters passed to TrainingArguments.
n_epochs = 10
batch_size = 8
lr = 2e-4

In [None]:
# Training configuration: evaluation, logging and checkpointing are all set to
# the "epoch" strategy, and load_best_model_at_end is enabled.
trainer_args = TrainingArguments(
    output_dir="bert-distillation-teacher",
    num_train_epochs=n_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Train on the "train" split; the "test" split is the evaluation set used
# during training.
trainer = Trainer(
    model=model,
    args=trainer_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above; the call's return value
# is left as the cell's last expression.
trainer.train()

In [None]:
# Score the trained model on the "validation" split, which is not passed to
# the Trainer as either train or eval data.
predictions = trainer.predict(tokenized_dataset["validation"])
logits, labels = predictions.predictions, predictions.label_ids
metrics = compute_metrics((logits, labels))
print(f"Metrics: {metrics}")

In [None]:
# Upload the trained model. The string is the commit message (the first
# parameter of Trainer.push_to_hub), not the repository name.
trainer.push_to_hub(commit_message="bert-distillation-teacher")