# Teacher Model Training

Code authored by: Shaw Talebi

[Video](https://youtu.be/4QHg8Ix8WWQ) <br>
[Blog](https://medium.com/towards-data-science/fine-tuning-bert-for-text-classification-a01f89b179fc) <br>
Based on example [here](https://huggingface.co/docs/transformers/en/tasks/sequence_classification)

### imports

In [1]:
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

### load data

In [2]:
dataset_dict = load_dataset("shawhin/phishing-site-classification")

In [3]:
dataset_dict['train']['text']

["http://bazurashop.com/idex.html?sfm_from_iframe=1',300,350",
 'hollywoodland.org/?p=29',
 'tunnekylmyysmiddletonii.02leds.com/me4xcdste0.php\\n',
 'usa-people-search.com/Find-Carla-Brown-IA.aspx',
 'inspire-consultants.com.my/487ygfh',
 'taiwanteastore.com/',
 'citizendia.org/Morocco_national_football_team',
 'osscamp.pl/poeosias/xskkswee/oeidppda/doeiidas/',
 'www.luckybell.com/index/',
 'lquuqkf.org/information.cgi',
 'freitaspedrasdecorativas.com.br/website/wp-admin/images/',
 'absoluteastronomy.com/topics/TV5MONDE',
 'readprint.com/online-books/a',
 'findtofind.info/wp-content/themes/twentyfifteen/inc/index.html',
 'schoolspthakarpura.com/components/com_ag_google_analytics2/itordernote.html',
 'flickeralley.com/fat_soviet_01.html',
 'hpfirefighter.com/',
 'school.mech.uwa.edu.au/~jamest/shearmagic/',
 'pakistanstoday.com/',
 'aquatixbottle.com/MCskGX',
 'en.wikipedia.org/wiki/Elimination_Chamber',
 'supaprice.co.uk/p/result.jsp?ga=uk9&q=signed+stoke+city+shirt',
 'flickr.com/phot

### Train Teacher Model

In [4]:
# Load model directly
model_path = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_path)

id2label = {0: "Safe", 1: "Not Safe"}
label2id = {"Safe": 0, "Not Safe": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=2, 
                                                           id2label=id2label, 
                                                           label2id=label2id,)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Freeze base model

In [5]:
# freeze base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

# unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [6]:
# print layers
for name, param in model.named_parameters():
   print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

#### Preprocess text

In [7]:
# define text preprocessing
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
# tokenize all datasetse
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [13]:
tokenized_data['train'][[0, 1]]

{'text': ["http://bazurashop.com/idex.html?sfm_from_iframe=1',300,350",
  'hollywoodland.org/?p=29'],
 'labels': [1, 0],
 'input_ids': [[101,
   8299,
   1024,
   1013,
   1013,
   8670,
   9759,
   8180,
   18471,
   1012,
   4012,
   1013,
   8909,
   10288,
   1012,
   16129,
   1029,
   16420,
   2213,
   1035,
   2013,
   1035,
   2065,
   6444,
   2063,
   1027,
   1015,
   1005,
   1010,
   3998,
   1010,
   8698,
   102],
  [101, 5365, 3122, 1012, 8917, 1013, 1029, 1052, 1027, 2756, 102]],
 'token_type_ids': [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 'attention_mask': [[1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1],
  [1, 1, 1, 1, 1, 1, 1, 1

In [9]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Evaluation

In [10]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

#### Train model

In [11]:
# hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 10

training_args = TrainingArguments(
    output_dir="bert-phishing-classifier_teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/2630 [00:00<?, ?it/s]

{'loss': 0.4944, 'grad_norm': 1.3726258277893066, 'learning_rate': 0.00018, 'epoch': 1.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.37888339161872864, 'eval_Accuracy': 0.816, 'eval_AUC': 0.915, 'eval_runtime': 10.3024, 'eval_samples_per_second': 43.679, 'eval_steps_per_second': 5.533, 'epoch': 1.0}
{'loss': 0.4032, 'grad_norm': 6.719116687774658, 'learning_rate': 0.00016, 'epoch': 2.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.35716497898101807, 'eval_Accuracy': 0.842, 'eval_AUC': 0.934, 'eval_runtime': 10.2741, 'eval_samples_per_second': 43.8, 'eval_steps_per_second': 5.548, 'epoch': 2.0}
{'loss': 0.3689, 'grad_norm': 0.6542453765869141, 'learning_rate': 0.00014, 'epoch': 3.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.3247514069080353, 'eval_Accuracy': 0.853, 'eval_AUC': 0.936, 'eval_runtime': 10.28, 'eval_samples_per_second': 43.775, 'eval_steps_per_second': 5.545, 'epoch': 3.0}
{'loss': 0.3465, 'grad_norm': 2.0636978149414062, 'learning_rate': 0.00012, 'epoch': 4.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.4174882769584656, 'eval_Accuracy': 0.831, 'eval_AUC': 0.943, 'eval_runtime': 10.0894, 'eval_samples_per_second': 44.601, 'eval_steps_per_second': 5.649, 'epoch': 4.0}
{'loss': 0.3522, 'grad_norm': 3.3976035118103027, 'learning_rate': 0.0001, 'epoch': 5.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.3094785511493683, 'eval_Accuracy': 0.869, 'eval_AUC': 0.944, 'eval_runtime': 9.954, 'eval_samples_per_second': 45.208, 'eval_steps_per_second': 5.726, 'epoch': 5.0}
{'loss': 0.3524, 'grad_norm': 2.2095787525177, 'learning_rate': 8e-05, 'epoch': 6.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.29271748661994934, 'eval_Accuracy': 0.862, 'eval_AUC': 0.949, 'eval_runtime': 10.5312, 'eval_samples_per_second': 42.73, 'eval_steps_per_second': 5.413, 'epoch': 6.0}
{'loss': 0.3208, 'grad_norm': 2.0597949028015137, 'learning_rate': 6e-05, 'epoch': 7.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.30850574374198914, 'eval_Accuracy': 0.858, 'eval_AUC': 0.946, 'eval_runtime': 10.4793, 'eval_samples_per_second': 42.942, 'eval_steps_per_second': 5.439, 'epoch': 7.0}
{'loss': 0.3105, 'grad_norm': 3.3963489532470703, 'learning_rate': 4e-05, 'epoch': 8.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.295242041349411, 'eval_Accuracy': 0.864, 'eval_AUC': 0.949, 'eval_runtime': 10.9419, 'eval_samples_per_second': 41.126, 'eval_steps_per_second': 5.209, 'epoch': 8.0}
{'loss': 0.3232, 'grad_norm': 0.2550579011440277, 'learning_rate': 2e-05, 'epoch': 9.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.286236971616745, 'eval_Accuracy': 0.876, 'eval_AUC': 0.949, 'eval_runtime': 10.9368, 'eval_samples_per_second': 41.145, 'eval_steps_per_second': 5.212, 'epoch': 9.0}
{'loss': 0.3099, 'grad_norm': 3.82411789894104, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.29782503843307495, 'eval_Accuracy': 0.867, 'eval_AUC': 0.95, 'eval_runtime': 10.7565, 'eval_samples_per_second': 41.835, 'eval_steps_per_second': 5.299, 'epoch': 10.0}
{'train_runtime': 594.4514, 'train_samples_per_second': 35.327, 'train_steps_per_second': 4.424, 'train_loss': 0.3582098928241222, 'epoch': 10.0}


TrainOutput(global_step=2630, training_loss=0.3582098928241222, metrics={'train_runtime': 594.4514, 'train_samples_per_second': 35.327, 'train_steps_per_second': 4.424, 'total_flos': 706603239165360.0, 'train_loss': 0.3582098928241222, 'epoch': 10.0})

### Apply Model to Validation Dataset

In [13]:
# apply model to validation dataset
predictions = trainer.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

  0%|          | 0/57 [00:00<?, ?it/s]

{'Accuracy': 0.882, 'AUC': 0.947}


### Push to hub