In [8]:
import zipfile
import os.path as osp
from transformers import logging, get_linear_schedule_with_warmup
from hparam_tuning import *
from data_utils import *
import torch
import copy
import os
from torch.optim import AdamW
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DataCollatorWithPadding
from utils import *
import pandas as pd
from IPython.display import display

In [9]:
# Checkpoint location handed to load_hf_classifier. The results cells pass the
# same "." path to predict_on_dataset for the "full model" row.
# NOTE(review): presumably the fine-tuned checkpoint sits in the notebook's
# working directory - confirm.
model_name = '.'

# Columns kept for training; every other column is removed from each split below.
keep_cols = ["input_ids", "token_type_ids", "attention_mask", "label"]

# The CSV directory defaults to the original location but can be redirected with
# the PROJECT2_DATA_DIR environment variable, so the notebook is not tied to one
# user's home directory.
DATA_DIR = os.environ.get("PROJECT2_DATA_DIR", "/home/dsi/coheny78/project2")
train_dataset_path = osp.join(DATA_DIR, "train.csv")
val_dataset_path = osp.join(DATA_DIR, "validation.csv")

model, tokenizer = load_hf_classifier(model_name, n_classes=6, training=False)
dataset = load_csv_to_dataset(train_dataset_path, val_dataset_path)

# preprocessed_dataset keeps all columns and is what the results cells feed to
# predict_on_dataset; processed_dataset is the trimmed copy used for training.
preprocessed_dataset = preprocess(dataset, tokenizer)

# NOTE(review): DatasetDict is not imported explicitly in this notebook;
# presumably it arrives through one of the star imports - confirm.
processed_dataset = DatasetDict({
        split: ds.remove_columns([c for c in ds.column_names if c not in keep_cols])
        for split, ds in preprocessed_dataset.items()})
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/15938 [00:00<?, ? examples/s]

Map:   0%|          | 0/1996 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15938 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1996 [00:00<?, ? examples/s]

Map:   0%|          | 0/15938 [00:00<?, ? examples/s]

Map:   0%|          | 0/1996 [00:00<?, ? examples/s]

Map:   0%|          | 0/15938 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/15938 [00:00<?, ? examples/s]

Map:   0%|          | 0/12750 [00:00<?, ? examples/s]

Map:   0%|          | 0/3188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1996 [00:00<?, ? examples/s]

## Compression Methods

### Single step

#### Pruning

In [3]:
# Single-step pruning: reload the teacher, build a student with
# mode="pruning" / num_layers=6, then train it with loss_name="ce".
teacher, tokenizer = load_hf_classifier(model_name, n_classes=6, training=False)
student = initialize_student(teacher, mode="pruning", num_layers=6)

# Only the first element returned by count_parameters is reported.
print("Teacher params:", count_parameters(teacher)[0])
print("Student params:", count_parameters(student)[0])

# Class weights are computed from the training split only.
class_weights = calc_class_weights(processed_dataset['train'])

trainer = CompressionTrainer(
    teacher=teacher,
    student=student,
    loss_name="ce",
    weights=class_weights,
    dataset=processed_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    path="./single_step/pruning",
)
# (sic) spelling kept: the recorded run shows the call succeeds under this name.
trainer.inialize_training()
trainer.train_model()

Initializing Student by pruning the provided Teacher...
Teacher params: 109486854
Student params: 66959622
Validation Accuracy: 0.9351
Validation Accuracy: 0.9410
Validation Accuracy: 0.9479
Validation Accuracy: 0.9495
Validation Accuracy: 0.9467
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9495
EarlyStopping counter: 2 out of 3
Validation Accuracy: 0.9464
EarlyStopping counter: 3 out of 3
Early stopping triggered. Training finished.


#### Distillation (Student Teacher)

In [4]:
# Single-step student-teacher distillation: reload the teacher, build a student
# with mode="st" / num_layers=6, then train it with loss_name="kd".
# The recorded output reports that in this mode the student is initialised from
# google/electra-base-discriminator with a randomly initialised classifier.
teacher, tokenizer = load_hf_classifier(model_name, n_classes=6, training=False)
student = initialize_student(teacher, mode="st", num_layers=6)

# Only the first element returned by count_parameters is reported.
print("Teacher params:", count_parameters(teacher)[0])
print("Student params:", count_parameters(student)[0])

# Class weights are computed from the training split only.
class_weights = calc_class_weights(processed_dataset['train'])

trainer = CompressionTrainer(
    teacher=teacher,
    student=student,
    loss_name="kd",
    weights=class_weights,
    dataset=processed_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    path="./single_step/ST",
)
# (sic) spelling kept: the recorded run shows the call succeeds under this name.
trainer.inialize_training()
trainer.train_model()

Initializing Student from google/electra-base-discriminator (ST mode)...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Classifier initialized randomly for ST mode.
Teacher params: 109486854
Student params: 66959622
Validation Accuracy: 0.8821
Validation Accuracy: 0.9235
Validation Accuracy: 0.9231
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9307
Validation Accuracy: 0.9291
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9279
EarlyStopping counter: 2 out of 3
Validation Accuracy: 0.9285
EarlyStopping counter: 3 out of 3
Early stopping triggered. Training finished.


#### Hybrid

In [5]:
# Single-step hybrid: same pruned student as the pruning cell
# (mode="pruning", num_layers=6), but trained with loss_name="hybrid".
teacher, tokenizer = load_hf_classifier(model_name, n_classes=6, training=False)
student = initialize_student(teacher, mode="pruning", num_layers=6)

# Only the first element returned by count_parameters is reported.
print("Teacher params:", count_parameters(teacher)[0])
print("Student params:", count_parameters(student)[0])

# Class weights are computed from the training split only.
class_weights = calc_class_weights(processed_dataset['train'])

trainer = CompressionTrainer(
    teacher=teacher,
    student=student,
    loss_name="hybrid",
    weights=class_weights,
    dataset=processed_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    path="./single_step/hybrid",
)
# (sic) spelling kept: the recorded run shows the call succeeds under this name.
trainer.inialize_training()
trainer.train_model()

Initializing Student by pruning the provided Teacher...
Teacher params: 109486854
Student params: 66959622
Validation Accuracy: 0.9382
Validation Accuracy: 0.9460
Validation Accuracy: 0.9495
Validation Accuracy: 0.9492
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9501
Validation Accuracy: 0.9489
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9498
EarlyStopping counter: 2 out of 3
Validation Accuracy: 0.9489
EarlyStopping counter: 3 out of 3
Early stopping triggered. Training finished.


#### Results

In [5]:
# Evaluate each single-step student and the uncompressed model ("."), then
# tabulate metrics and per-sample timing with one row per model.
#
# NOTE(review): the recorded log names each cache entry by the last path
# component only ("pruning", "ST", "hybrid", "."), and the iterative results
# table further down shows a "hybrid" row that is digit-for-digit identical to
# the one recorded here. predict_on_dataset may be keying its cache on the
# folder basename, so "./single_step/hybrid" and "./iterative/hybrid" could be
# sharing one cache entry - verify before trusting either "hybrid" row.
pruning_results = predict_on_dataset("./single_step/pruning", preprocessed_dataset)
st_results = predict_on_dataset("./single_step/ST", preprocessed_dataset)
hybrid_results = predict_on_dataset("./single_step/hybrid", preprocessed_dataset)
origin_results = predict_on_dataset(".", preprocessed_dataset)

# Row label -> result dict; insertion order fixes the row order of the table.
results_by_model = {
    "full model": origin_results,
    "pruning": pruning_results,
    "st": st_results,
    "hybrid": hybrid_results,
}

# Every row has the same shape: the metrics dict plus the two timing fields.
# The std column label was previously misspelled "sample rum std".
df = pd.DataFrame.from_dict(
    {
        name: {
            **res["metrics"],
            "sample run time": res["time_per_sample_mean"],
            "sample run std": res["time_per_sample_std"],
        }
        for name, res in results_by_model.items()
    },
    orient="index",
)

print("\n")
display(df)

Loading cached results for pruning on test split
Loading cached results for ST on test split
Loading cached results for hybrid on test split
Loading cached results for . on test split




Unnamed: 0,accuracy,recall,precision,f1,sample run time,sample rum std
full model,0.937375,0.93443,0.897463,0.913964,0.000146,1.9e-05
pruning,0.92986,0.90259,0.907001,0.904235,8.3e-05,1.3e-05
st,0.925852,0.917133,0.882891,0.898265,7.9e-05,1.1e-05
hybrid,0.930361,0.919154,0.8894,0.9026,8.1e-05,1.2e-05


### Iterative Approach

In [3]:
# Layer indices handed to iterative_with_trainer: every second index, counting
# down from 11 to 1. The recorded runs drop one of them per iteration.
layers_to_drop = list(range(11, 0, -2))  # -> [11, 9, 7, 5, 3, 1]

#### Pruning

In [4]:
# Iterative pruning: start from a freshly loaded teacher and run
# iterative_with_trainer with the "ce" loss over layers_to_drop.
# NOTE(review): the hybrid run below uses the same save_dir, and both recorded
# outputs report the intermediate folder "iterative/not_final_step" - presumably
# the two runs overwrite each other's intermediate checkpoints; confirm that
# this is intended.
first_teacher, tokenizer = load_hf_classifier(model_name, n_classes=6, training=False)
final_student_folder = iterative_with_trainer(
    first_teacher,
    processed_dataset,
    tokenizer,
    layers_to_drop,
    "ce",
    data_collator,
    save_dir="iterative",
)



--- Iteration 0/6 ---
Dropping layer 11 for this student
Teacher params: 109486854
Student params: 102398982
Validation Accuracy: 0.9511
Validation Accuracy: 0.9539
Validation Accuracy: 0.9658
Validation Accuracy: 0.9570
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9674
Validation Accuracy: 0.9655
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9661
EarlyStopping counter: 2 out of 3
Validation Accuracy: 0.9639
EarlyStopping counter: 3 out of 3
Early stopping triggered. Training finished.

Iterative distillation finished!
Final student is saved in: iterative/not_final_step

--- Iteration 1/6 ---
Dropping layer 9 for this student
Teacher params: 102398982
Student params: 95311110
Validation Accuracy: 0.9664
Validation Accuracy: 0.9658
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9689
Validation Accuracy: 0.9683
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9677
EarlyStopping counter: 2 out of 3
Validation Accuracy: 0.9683
EarlyStopping counter

#### Hybrid

In [6]:
# Iterative hybrid: start from a freshly loaded teacher and run
# iterative_with_trainer with the "hybrid" loss over layers_to_drop.
# NOTE(review): this rebinds final_student_folder from the pruning run above,
# and shares save_dir="iterative" with it - confirm that the two runs do not
# clobber each other's checkpoints.
first_teacher, tokenizer = load_hf_classifier(model_name, n_classes=6, training=False)
final_student_folder = iterative_with_trainer(
    first_teacher,
    processed_dataset,
    tokenizer,
    layers_to_drop,
    "hybrid",
    data_collator,
    save_dir="iterative",
)



--- Iteration 0/6 ---
Dropping layer 11 for this student
Teacher params: 109486854
Student params: 102398982
Validation Accuracy: 0.9570
Validation Accuracy: 0.9558
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9533
EarlyStopping counter: 2 out of 3
Validation Accuracy: 0.9580
Validation Accuracy: 0.9577
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9570
EarlyStopping counter: 2 out of 3
Validation Accuracy: 0.9592
Validation Accuracy: 0.9561
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9580
EarlyStopping counter: 2 out of 3
Validation Accuracy: 0.9580
EarlyStopping counter: 3 out of 3

Iterative distillation finished!
Final student is saved in: iterative/not_final_step

--- Iteration 1/6 ---
Dropping layer 9 for this student
Teacher params: 102398982
Student params: 95311110
Validation Accuracy: 0.9536
Validation Accuracy: 0.9627
Validation Accuracy: 0.9630
Validation Accuracy: 0.9605
EarlyStopping counter: 1 out of 3
Validation Accuracy: 0.9592
Earl

#### Results

In [18]:
# Evaluate the iterative students and the uncompressed model ("."), then
# tabulate metrics and per-sample timing with one row per model.
#
# NOTE(review): the recorded log names each cache entry by the last path
# component only ("ce", "hybrid", "."), and the "hybrid" row recorded below is
# digit-for-digit identical (timings included) to the single-step "hybrid" row.
# predict_on_dataset may be keying its cache on the folder basename, in which
# case "./iterative/hybrid" is served the cached "./single_step/hybrid" result
# (or vice versa) - verify before trusting this row.
pruning_results = predict_on_dataset("./iterative/ce", preprocessed_dataset)
hybrid_results = predict_on_dataset("./iterative/hybrid", preprocessed_dataset)
origin_results = predict_on_dataset(".", preprocessed_dataset)

# Row label -> result dict; insertion order fixes the row order of the table.
results_by_model = {
    "full model": origin_results,
    "pruning": pruning_results,
    "hybrid": hybrid_results,
}

# Every row has the same shape: the metrics dict plus the two timing fields.
# The std column label was previously misspelled "sample rum std".
df = pd.DataFrame.from_dict(
    {
        name: {
            **res["metrics"],
            "sample run time": res["time_per_sample_mean"],
            "sample run std": res["time_per_sample_std"],
        }
        for name, res in results_by_model.items()
    },
    orient="index",
)
print("\n")
display(df)

Loading cached results for ce on test split
Loading cached results for hybrid on test split
Loading cached results for . on test split




Unnamed: 0,accuracy,recall,precision,f1,sample run time,sample rum std
full model,0.937375,0.93443,0.897463,0.913964,0.000146,1.9e-05
pruning,0.925351,0.905029,0.894945,0.899815,8e-05,1.7e-05
hybrid,0.930361,0.919154,0.8894,0.9026,8.1e-05,1.2e-05
