In [None]:
# pip install datasets evaluate transformers huggingface_hub

In [None]:
# pip install kagglehub

In [17]:
from datasets import DatasetDict, Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

# **Load Data**

In [20]:
# fetch the phishing-site dataset (train / validation / test splits)
dataset_dict = load_dataset(path="shawhin/phishing-site-classification")

README.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/98.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/450 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/450 [00:00<?, ? examples/s]

In [21]:
# show the available splits together with their columns and row counts
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
})


# **Load Pre-trained Model**

In [22]:
# checkpoint that serves as the starting point for fine-tuning
model_path = "google-bert/bert-base-uncased"

# tokenizer belonging to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)

# class index -> readable name, and the inverse mapping derived from it
id2label = {0: "not-phishing", 1: "phishing"}
label2id = {label: idx for idx, label in id2label.items()}

# sequence classifier with one output per entry in id2label (binary here)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Set Trainable Parameters (Transfer Learning)**

In [24]:
# single pass over the base model: parameters whose name contains "pooler"
# stay trainable, every other base-model parameter is frozen
# (parameters outside model.base_model are not touched by this loop)
for name, param in model.base_model.named_parameters():
  param.requires_grad = "pooler" in name


# **Data Pre-processing**

In [25]:
# tokenization step applied to the dataset
def preprocess_function(item):
  """Tokenize the "text" field of `item` with truncation enabled."""
  texts = item["text"]
  encoded = tokenizer(texts, truncation=True)
  return encoded

# tokenize every split, handing the function batches of rows rather than single rows
tokenized_data = dataset_dict.map(function=preprocess_function, batched=True)


Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [26]:
# display the splits to inspect the columns present after tokenization
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
})

In [27]:
# create a data collator that pads the shorter texts in a batch to the length of the longest one
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# **Define Evaluation Metric**

In [28]:
# load the metric objects used by compute_metrics below:
# classification accuracy and area under the ROC curve
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC from model outputs.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds the
            raw (unbounded) class logits with one row per example and one
            column per class; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"Accuracy": ..., "AUC": ...}``, each rounded to 3 decimals.
    """
    # unpack raw logits and reference labels
    predictions, labels = eval_pred

    # Numerically stable softmax: subtracting the row-wise maximum does not
    # change the resulting probabilities, but it keeps np.exp from overflowing
    # to inf (and the division from producing NaN) when logits are large.
    shifted_logits = predictions - np.max(predictions, axis=-1, keepdims=True)
    exp_logits = np.exp(shifted_logits)
    probabilities = exp_logits / exp_logits.sum(-1, keepdims=True)

    # ROC AUC is scored on the probability of the positive class (column 1)
    positive_class_probs = probabilities[:, 1]
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'], 3)

    # accuracy is scored on the most probable class per example
    predicted_classes = np.argmax(predictions, axis=1)
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'], 3)

    return {"Accuracy": acc, "AUC": auc}


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

# **Training Parameters**

In [38]:
# optimisation settings
lr = 2e-4
batch_size = 8
num_epochs = 10

training_args = TrainingArguments(
    # output directory for this run
    output_dir="bert-phishing-classifier_teacher",
    # optimisation schedule
    num_train_epochs=num_epochs,
    learning_rate=lr,
    # same batch size for training and evaluation
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # reload the best checkpoint once training finishes
    load_best_model_at_end=True,
    # do not report to any external experiment tracker
    report_to="none",
)


# **Fine-tune Model**

In [39]:
# Build the trainer from the model, arguments, data and metric function defined above.
# NOTE(review): the "test" split is used for the per-epoch evaluation here, so
# (with load_best_model_at_end=True in training_args) it also drives which
# checkpoint is restored at the end — confirm this split choice is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    # `tokenizer=` is deprecated on Trainer and emits a FutureWarning;
    # `processing_class=` is the replacement keyword for passing the tokenizer
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# run fine-tuning; metrics are reported after every epoch
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.4979,0.381855,0.811,0.915
2,0.4038,0.351955,0.844,0.935
3,0.3689,0.321997,0.856,0.937
4,0.3453,0.413897,0.831,0.944
5,0.3511,0.309032,0.869,0.945
6,0.3513,0.290449,0.864,0.95
7,0.3222,0.30918,0.858,0.947
8,0.307,0.292962,0.867,0.949
9,0.3201,0.283617,0.876,0.951
10,0.3103,0.295131,0.869,0.951


TrainOutput(global_step=2630, training_loss=0.35778053776846186, metrics={'train_runtime': 4090.6457, 'train_samples_per_second': 5.134, 'train_steps_per_second': 0.643, 'total_flos': 706603239165360.0, 'train_loss': 0.35778053776846186, 'epoch': 10.0})

In [40]:
# run the trained model over the "validation" split
predictions = trainer.predict(tokenized_data["validation"])

# pull the model outputs and the reference labels out of the prediction result
logits, labels = predictions.predictions, predictions.label_ids

# score them with the same compute_metrics function passed to the Trainer
metrics = compute_metrics((logits, labels))
print(metrics)

{'Accuracy': 0.884, 'AUC': 0.946}
