In [32]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [33]:
from datasets import DatasetDict, Dataset

In [34]:
!pip install evaluate



In [35]:
import numpy as np
import evaluate
from transformers import DataCollatorWithPadding
from datasets import load_dataset

Loading the dataset

In [36]:
dataset_dict= load_dataset("shawhin/phishing-site-classification")

Loading the Pretrained model

In [37]:
# define pretrained model path
model_path= "prajjwal1/bert-tiny"

tokenizer= AutoTokenizer.from_pretrained(model_path)



In [38]:
# load model with binary classification head
id2label= {0:"Safe", 1:"Not Safe"}
label2id= {"Safe":0, "Not Safe":1}

model= AutoModelForSequenceClassification.from_pretrained(model_path,num_labels=2, id2label=id2label,label2id=label2id)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# So now all 110 million parameters of the bert modela n aslo the parameters of the classification head are initialized and ready to be trained
# Setting trainable parameters by freezing all base model parameters basically the Encoder layers are frozen
for name, param in model.base_model.named_parameters():
  param.requires_grad=False

# unfreezing the base model last four layers parameters which are the polling layers
for name,param in model.base_model.named_parameters():
  if "pooling" in name:
    param.raquires_grad= True



Data Preprocessing

In [40]:
def preprocess_function(examples):
  # return tokenized text with truncation flag so that none of the url must be too long crossing the 512 context size of bert
  return tokenizer(examples['text'], truncation= True)

# preprocess all datasets
tokenized_data= dataset_dict.map(preprocess_function, batched= True)



Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [41]:
# we have to take care that every example in the batch is of same rectangular matrix size
data_collator= DataCollatorWithPadding(tokenizer= tokenizer)

In [42]:
# evaluation matrix , these are the matrix that will be printed during the training process
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred

    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)

    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]

    # compute auc
    auc = np.round(
        auc_score.compute(
            prediction_scores=positive_class_probs,
            references=labels
        )['roc_auc'], 3
    )

    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)

    # compute accuracy
    acc = np.round(
        accuracy.compute(
            predictions=predicted_classes,
            references=labels
        )['accuracy'], 3
    )

    return {"Accuracy": acc, "AUC": auc}


Training the model

In [43]:
# hyperparameters
learn_rate= 2e-5
batch_size= 32
num_epochs= 3

training_args = TrainingArguments(
    "bert-phishing-classifier_teacher",  # passed as positional argument
    learning_rate=learn_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)


Finetune the model

In [44]:
trainer = Trainer (
model=model, args=training_args,
                    train_dataset=tokenized_data["train"],
                    eval_dataset=tokenized_data["test"] ,
                    tokenizer=tokenizer,
                    data_collator=data_collator,
                    compute_metrics=compute_metrics,
)


  trainer = Trainer (
