In [11]:
import torch
import transformers
#from transformers import pipeline
from transformers import DistilBertModel, DistilBertForMaskedLM, DistilBertTokenizer, IBertForSequenceClassification, AutoTokenizer
#from optimum.quanto import freeze, quantize, qint8
import datasets
from transformers import TrainingArguments
import numpy as np
import evaluate

In [12]:
model_id = "kssteven/ibert-roberta-base"
#model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained("kssteven/ibert-roberta-base")

In [13]:
dataset = datasets.load_dataset("imdb")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [14]:
def preprocess(data):
    tokens = tokenizer(data["text"], truncation=True, padding = 'max_length',  max_length=512)
    tokens["label"] = data["label"]
    return tokens

In [15]:
tokens = dataset.map(preprocess, batched = True)

Map: 100%|██████████| 25000/25000 [00:07<00:00, 3327.30 examples/s]
Map: 100%|██████████| 50000/50000 [00:15<00:00, 3150.17 examples/s]


In [16]:
labels = tokens['train'].features['label'].names
num_labels = len(labels)
label2id, id2label = {}, {}

for idx, lbl in enumerate(labels):
    label2id[lbl] = idx
    id2label[idx] = lbl

In [17]:
small_train_dataset = tokens["train"].shuffle(seed=11).select(range(2000))
small_eval_dataset = tokens["train"].shuffle(seed=11).select(range(2000))

In [27]:
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification, DistilBertConfig, DataCollatorWithPadding

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels = 2,
    #id2label = id2label,
    #label2id = label2id,
)

print(model)

Some weights of IBertForSequenceClassification were not initialized from the model checkpoint at kssteven/ibert-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'ibert.embeddings.LayerNorm.activation.act_scaling_factor', 'ibert.embeddings.LayerNorm.activation.x_max', 'ibert.embeddings.LayerNorm.activation.x_min', 'ibert.embeddings.LayerNorm.shift', 'ibert.embeddings.embeddings_act1.act_scaling_factor', 'ibert.embeddings.embeddings_act1.x_max', 'ibert.embeddings.embeddings_act1.x_min', 'ibert.embeddings.embeddings_act2.act_scaling_factor', 'ibert.embeddings.embeddings_act2.x_max', 'ibert.embeddings.embeddings_act2.x_min', 'ibert.embeddings.output_activation.act_scaling_factor', 'ibert.embeddings.output_activation.x_max', 'ibert.embeddings.output_activation.x_min', 'ibert.embeddings.position_embeddings.weight_integer', 'ibert.embeddings.position_embeddings.weight_scaling_factor', 'ibert.

IBertForSequenceClassification(
  (ibert): IBertModel(
    (embeddings): IBertEmbeddings(
      (word_embeddings): QuantEmbedding()
      (token_type_embeddings): QuantEmbedding()
      (position_embeddings): QuantEmbedding()
      (embeddings_act1): QuantAct(activation_bit=16, quant_mode: False, Act_min: -0.00, Act_max: 0.00)
      (embeddings_act2): QuantAct(activation_bit=16, quant_mode: False, Act_min: -0.00, Act_max: 0.00)
      (LayerNorm): IntLayerNorm(
        (activation): QuantAct(activation_bit=32, quant_mode: False, Act_min: -0.00, Act_max: 0.00)
      )
      (output_activation): QuantAct(activation_bit=8, quant_mode: False, Act_min: -0.00, Act_max: 0.00)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): IBertEncoder(
      (layer): ModuleList(
        (0-11): 12 x IBertLayer(
          (attention): IBertAttention(
            (self): IBertSelfAttention(
              (query): (QuantLinear() weight_bit=8, quant_mode=False)
              (key): (QuantLinea

In [29]:
print(model.ibert.encoder.layer[0].attention.self.query.state_dict())

OrderedDict({'weight': tensor([[ 0.0729, -0.0029, -0.0902,  ...,  0.1033,  0.0900, -0.1030],
        [-0.0516,  0.2061,  0.0739,  ...,  0.0657,  0.0634,  0.1282],
        [ 0.0878,  0.0698, -0.0515,  ..., -0.0426, -0.0081,  0.1100],
        ...,
        [-0.1871,  0.0172, -0.0315,  ..., -0.0503,  0.1024, -0.1165],
        [-0.2532,  0.0439,  0.0638,  ...,  0.0701, -0.1045,  0.0118],
        [-0.0516, -0.0859,  0.1027,  ..., -0.1895,  0.0033, -0.0541]]), 'bias': tensor([ 2.3572e-01,  4.6570e-02, -3.7012e-01,  4.8804e-01,  7.1484e-01,
        -2.8931e-01, -9.8755e-02,  2.5928e-01, -4.6722e-02,  3.9771e-01,
         5.5127e-01,  5.8008e-01,  1.5674e-01, -1.7273e-01,  5.7129e-01,
        -2.9053e-01, -2.1350e-01,  9.8450e-02,  2.4033e-02, -9.2983e-04,
        -5.0195e-01,  1.0547e-01, -4.2847e-02, -1.1133e-01, -6.2988e-01,
         1.7786e-01,  1.2720e-01, -3.9453e-01, -5.3345e-02,  3.4106e-01,
         6.4270e-02, -3.5278e-01, -2.7515e-01, -3.8574e-01,  9.6008e-02,
        -1.8677e-01,  6

In [24]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def preprocess_logits_for_metrics(logits, labels):
    """
    Preprocess the logits to ensure they are in the correct format for metric computation.
    This function will be called during the evaluation process.
    """
    if isinstance(logits, tuple):  
        logits = logits[0]  # get logit tensors

    pred_ids = torch.argmax(logits, dim=-1)
    
    return pred_ids, labels
    
def compute_metrics(eval_pred):
    
    predictions, labels = eval_pred

    return accuracy.compute(predictions=predictions[0], references=labels)


In [32]:
from transformers import TrainingArguments, Trainer

EPOCHS = 1
BATCH_SIZE = 16
LEARNING_RATE = 0.00005

training_args = TrainingArguments(
    output_dir = './imdb_tune_distilbert',
    num_train_epochs = EPOCHS,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    learning_rate = LEARNING_RATE,
    logging_dir = './logs',
    load_best_model_at_end= True,
    metric_for_best_model="accuracy",
    eval_strategy="epoch",
    eval_steps = 500,
    save_strategy="epoch",
    save_total_limit=2,
    report_to=['tensorboard'],
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=small_train_dataset,         
    eval_dataset=small_eval_dataset.shuffle(seed=72).select(range(600)),
    compute_metrics = compute_metrics,
    preprocess_logits_for_metrics = preprocess_logits_for_metrics,
    tokenizer = tokenizer,
    data_collator = data_collator,
)


  trainer = Trainer(


In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.182836,0.9415


TrainOutput(global_step=125, training_loss=0.39196124267578125, metrics={'train_runtime': 2017.9639, 'train_samples_per_second': 0.991, 'train_steps_per_second': 0.062, 'total_flos': 264934797312000.0, 'train_loss': 0.39196124267578125, 'epoch': 1.0})

In [None]:
untrained = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels = num_labels,
    id2label = id2label,
    label2id = label2id,
)

training_args_untrained = TrainingArguments(
    output_dir="./results",  
    per_device_eval_batch_size=2, 
    use_cpu=True,  
    no_cuda = True,
    logging_dir="./logs",  
    logging_steps=10,
)

trainer_untrained = Trainer(
    model=untrained,
    args=training_args_untrained,
    eval_dataset=small_eval_dataset.select(range(600)),
    data_collator = data_collator,
    preprocess_logits_for_metrics = preprocess_logits_for_metrics,
    compute_metrics = compute_metrics,
)

print("accuracy no training: ", trainer_untrained.evaluate())


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy after quantization: 0.48833333333333334


In [33]:
#quantize trained imdb model
from optimum.quanto import freeze, quantize, qint8



print("accuracy trained after quant of weights to int8 ", trainer.evaluate())


accuracy trained after quant of weights to int8  {'eval_loss': 0.1721130609512329, 'eval_model_preparation_time': 0.0022, 'eval_accuracy': 0.94, 'eval_runtime': 127.8146, 'eval_samples_per_second': 4.694, 'eval_steps_per_second': 0.297}


In [None]:
quantize(model, weights=qint8, activations=None)
freeze(model)
print("accuracy trained after quant of weights to int8 ", trainer.evaluate())

accuracy trained after quant of weights to int8  {'eval_loss': 0.17312778532505035, 'eval_model_preparation_time': 0.0022, 'eval_accuracy': 0.94, 'eval_runtime': 155.7452, 'eval_samples_per_second': 3.852, 'eval_steps_per_second': 0.244}


In [35]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): QLinear(in_features=768, out_features=768, bias=True)
            (k_lin): QLinear(in_features=768, out_features=768, bias=True)
            (v_lin): QLinear(in_features=768, out_features=768, bias=True)
            (out_lin): QLinear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=Fal

quantize(model)
trainer.train()
freeze(model)
trainer.evaluate()