# Teacher Model Training

Code authored by: Shaw Talebi

[Video](https://youtu.be/4QHg8Ix8WWQ) <br>
[Blog](https://medium.com/towards-data-science/fine-tuning-bert-for-text-classification-a01f89b179fc) <br>
Based on example [here](https://huggingface.co/docs/transformers/en/tasks/sequence_classification)

### imports

In [16]:
from datasets import Dataset, load_dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch

### load data

In [17]:
data = pd.read_csv("data.csv")  # Replace with your data file
# Split data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label']) # 80/20 split

# Convert to Hugging Face Datasets format
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
dataset_dict = {"train": train_dataset, "test": test_dataset} # Create a dictionary as expected by the original code

dataset_dict

{'train': Dataset({
     features: ['text', 'label', '__index_level_0__'],
     num_rows: 159
 }),
 'test': Dataset({
     features: ['text', 'label', '__index_level_0__'],
     num_rows: 40
 })}

### Train Teacher Model

In [18]:
# Load model directly
model_path = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_path)

id2label = {0: "complete", 1: "incomplete"}
label2id = {"complete": 0, "incomplete": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=2, 
                                                           id2label=id2label, 
                                                           label2id=label2id,)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Freeze base model

In [19]:
# print layers
for name, param in model.named_parameters():
   print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight True
bert.encoder.layer.0.attention.self.query.bias True
bert.encoder.layer.0.attention.self.key.weight True
bert.encoder.layer.0.attention.self.key.bias True
bert.encoder.layer.0.attention.self.value.weight True
bert.encoder.layer.0.attention.self.value.bias True
bert.encoder.layer.0.attention.output.dense.weight True
bert.encoder.layer.0.attention.output.dense.bias True
bert.encoder.layer.0.attention.output.LayerNorm.weight True
bert.encoder.layer.0.attention.output.LayerNorm.bias True
bert.encoder.layer.0.intermediate.dense.weight True
bert.encoder.layer.0.intermediate.dense.bias True
bert.encoder.layer.0.output.dense.weight True
bert.encoder.layer.0.output.dense.bias True
bert.encoder.layer.0.output.LayerNorm.weight True


In [20]:
# freeze base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

# unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [21]:
# print layers
for name, param in model.named_parameters():
   print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

#### Preprocess text

In [30]:
# define text preprocessing
def preprocess_function(examples):
    tokenizer(examples["text"], truncation=True, padding="max_length", max_length = 256)

In [37]:
train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True)
test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True)

In [39]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Evaluation

In [40]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

#### Train model

In [41]:
# hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 10

training_args = TrainingArguments(
    output_dir="bert-completeness-classifier_teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [42]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data, 
    eval_dataset=tokenized_test_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [43]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch

# 1. Load your data (assuming a CSV file with "text" and "label" columns)
data = pd.read_csv("data.csv")  # Replace with your data file

# 2. Split data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label']) # 80/20 split

# 3. Load model directly
model_path = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 4. Define Tokenizer and Process Data
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length = 256)

# 5. Tokenize Datasets to have labels and tokenized items
train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True)
test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True)
# 5. Convert to PyTorch Dataset
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

    
# Convert labels to numerical values
le = LabelEncoder()
y_train = le.fit_transform(train_data['label'])
y_test = le.transform(test_data['label']) #Use fit from train, so we do not add categories into our encoder
#We are now also using our MyDataset class for this function
train_dataset = MyDataset(train_encodings, y_train)
test_dataset = MyDataset(test_encodings, y_test)


# 6. Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy = "epoch",
    load_best_model_at_end=True,
)

# 7. Model building
# Load model directly
model_path = "google-bert/bert-base-uncased"

id2label = {0: "complete", 1: "incomplete"}
label2id = {"complete": 0, "incomplete": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=2, 
                                                           id2label=id2label, 
                                                           label2id=label2id,)

# 8. Trainer
from transformers import DataCollatorWithPadding
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer), #this is required for tokenizers or else the error happens
)

# 9. Train
trainer.train()
trainer.save_model()
print("Training complete")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.7007,0.690729
2,0.6939,0.674482
3,0.6674,0.653065
4,0.6234,0.623082
5,0.5738,0.578776
6,0.5031,0.507907
7,0.4299,0.428199
8,0.3412,0.353584
9,0.2571,0.282227
10,0.185,0.271104


Training complete


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# 1. Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('./results') # same folder that the pretrained values are
model = BertForSequenceClassification.from_pretrained('./results') # same folder that the pretrained values are

# 2. Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) #Move model to the active device.
model.eval() #put model in evaluation mode
# 3. Define the is_complete function
def is_complete(text, tokenizer, model, device): #now passes arguments
    """
    Classifies if the given text is complete or incomplete using a pre-trained DistilBERT model.
    """
    model.eval()  # Set the model to evaluation mode
    model.to(device) #move model to device

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device) # Move inputs to device
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probabilities).item()  # Use item() to get the Python number
    return predicted_class
# 4. Example usage

text_to_test = "hello my name is neil joseph and i am a"  #@param {type:"string"}
# Run the check
prediction = is_complete(text_to_test, tokenizer, model, device) #now we pass the training and test objects in here, we also pass the text string

# Convert numerical values to what we defined before (label2id = {"complete": 0, "incomplete": 1}
labels= {0: "complete", 1: "incomplete"}
predicted_label=labels[prediction] #Here we get the label from the dictionary with key values

# 5. Output the result.
print(f"The sentence '{text_to_test}' is predicted as: {predicted_label}")

The sentence 'hello my name is neil joseph and i am a,' is predicted as: incomplete


### Apply Model to Validation Dataset

In [None]:
# apply model to validation dataset
predictions = trainer.predict(tokenized_test_data)

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

  0%|          | 0/57 [00:00<?, ?it/s]

{'Accuracy': 0.889, 'AUC': 0.946}


### Push to hub

In [34]:
# push model to hub
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shawhin/bert-phishing-classifier_teacher/commit/6e4110db0febcd143d945e86d8e0ec8a08204d4c', commit_message='End of training', commit_description='', oid='6e4110db0febcd143d945e86d8e0ec8a08204d4c', pr_url=None, pr_revision=None, pr_num=None)