### Second Time

In [None]:
# Set Up

# clean env
! yes y | pip uninstall transformers datasets

# reinstall
! pip install transformers
! pip install datasets
! pip install torch==2.0.1 torchvision==0.15.2
! pip install accelerate

# import libs
import os
import sys
import time
import numpy as np
import torch
import torchvision
import transformers
import datasets
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# GLUE task to run; the same name selects both the dataset and its metric.
task = 'mnli'
dataset = load_dataset("glue", task)
metric = load_metric("glue", task)

# Before feeding text to the model it has to be preprocessed (tokenized);
# the Transformers tokenizer that matches the checkpoint takes care of that.
batch_size = 16  # intended per-device batch size
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Column names holding the two sentences of each example, keyed by task.
task_to_keys = {"mnli": ("premise", "hypothesis")}
sentence1_key, sentence2_key = task_to_keys[task]

def preprocess_function(samples):
    """Tokenize the sentence pairs found in `samples`.

    Reads the columns named by the module-level `sentence1_key` and
    `sentence2_key`, and passes them as a pair to the module-level `tokenizer`
    with truncation enabled. Returns whatever the tokenizer returns.
    """
    first_sentences = samples[sentence1_key]
    second_sentences = samples[sentence2_key]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# A single map() call preprocesses the train, validation and test data.
# batched=True passes preprocess_function a batch of rows per call;
# load_from_cache_file=True allows a previously cached result to be reused.
encoded_dataset = dataset.map(preprocess_function, batched=True, load_from_cache_file=True)

In [None]:
# MNLI has 3 labels, so the classification head needs 3 outputs.
num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# Metric used to pick the best checkpoint at the end of training.
metric_name = 'accuracy'
# Last path component of the checkpoint id; doubles as the output directory.
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    output_dir=model_name,
    # Evaluate and save on the same schedule so that
    # load_best_model_at_end can match checkpoints to evaluation results.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    # Reuse the notebook-level `batch_size` instead of repeating the literal 16.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into metric values.

    `eval_pred` unpacks into (predictions, labels). The predicted class of each
    row is the arg-max along axis 1, and the result is handed to the
    module-level `metric` together with the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Name of the evaluation split for the current task; tasks without a
# dedicated entry fall back to the plain "validation" split.
validation_key = {
    "mnli-mm": "validation_mismatched",
    "mnli": "validation_matched",
}.get(task, "validation")

# Wire the model, training arguments, data splits, tokenizer and metric
# callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model using the settings collected in `args`.
trainer.train()

In [None]:
# Baseline: evaluate the fine-tuned model before quantization
# (uses the eval_dataset and compute_metrics given to the Trainer).
trainer.evaluate()

In [None]:
# Prepare for quantization:
# torch.quantization.quantize_dynamic does not support a CUDA environment,
# so the model is moved to the CPU first.
device = torch.device('cpu')
model.to(device)

In [None]:
# Dynamic quantization: only the torch.nn.Linear layers are targeted and
# they are converted to the qint8 dtype. The result is bound to a separate
# name so that `model` can still be used for comparison below.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

In [None]:
# Print the serialized size of both models
def print_size_of_model(model):
    """Print and return the serialized size of `model`'s state dict.

    The state dict is saved with ``torch.save`` into a private temporary
    directory, so nothing in the working directory is overwritten and the
    file is removed even when saving or measuring fails.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).

    Returns:
        The size of the saved file in megabytes (bytes / 1e6).
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Keep the original file name so the archive layout is unchanged.
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        size_mb = os.path.getsize(checkpoint_path) / 1e6

    print('Size (MB):', size_mb)
    return size_mb

print_size_of_model(model)            # model before quantization
print_size_of_model(quantized_model)  # dynamically quantized model

In [None]:
# Define an evaluation function
def evaluate(model, encoded_dataset, mnli_dataset, test_dataset):
    """Run `model` over `encoded_dataset` one row at a time and score it.

    Each row's premise/hypothesis pair is tokenized with the module-level
    `tokenizer` (truncating over-long pairs, as the training preprocessing
    does) and the arg-max of the returned logits is taken as the prediction.

    Args:
        model: called as ``model(**inputs)``; must return an object with a
            ``logits`` tensor and provide ``eval()``.
        encoded_dataset: sized iterable of rows with 'premise', 'hypothesis'
            and 'idx' entries.
        mnli_dataset: indexed with each row's 'idx' to fetch the reference
            'label' (assumes 'idx' is the row position — TODO confirm).
        test_dataset: when falsy, a row counts as matched if the prediction
            equals its label. When truthy, a row counts as matched if the
            prediction is class 2, and a message is printed for any label
            other than -1.

    Returns:
        matched / len(encoded_dataset), or 0.0 for an empty dataset.
    """
    matched = 0
    N = len(encoded_dataset)
    print(f'Total number is: {N}')
    if N == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    # Label encoding: entailment => 0, neutral => 1, contradiction => 2

    # Inference only: switch off dropout and skip autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        for i, example in enumerate(encoded_dataset):
            label = mnli_dataset[example['idx']]['label']

            # Encode the pair and predict its label.
            encode_input = tokenizer(
                example['premise'],
                example['hypothesis'],
                truncation=True,
                return_tensors='pt',
            )
            output = model(**encode_input)
            # Tensor.cpu() copies the logits to host memory before numpy sees them.
            pred = np.argmax(output.logits.detach().cpu().numpy(), axis=1)[0]

            if test_dataset:
                # NOTE(review): the GLUE test splits appear to ship with withheld
                # labels stored as -1, so this is not a true accuracy. It is the
                # share of rows predicted as class 2, kept for backward
                # compatibility.
                if pred == 2:
                    matched += 1
                if label != -1:
                    print('exception in test dataset')
            elif pred == label:
                matched += 1

            processed = i + 1
            if processed % 500 == 0:
                # Running accuracy over the rows seen so far.
                print(f'Step at: {processed // 500}, accu: {matched / processed}, '
                      f'matched {matched} out of {processed}')

    return matched / N

In [None]:
def time_model_evaluation(model, encoded_dataset, mnli_dataset, test_dataset):
    """Run `evaluate`, report how long it took, and return its result.

    All four arguments are forwarded unchanged to `evaluate`. The elapsed
    wall-clock time and the resulting accuracy are printed.

    Returns:
        The value returned by `evaluate`, so that callers writing
        ``acc = time_model_evaluation(...)`` receive the accuracy instead of None.
    """
    # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
    eval_start_time = time.perf_counter()
    acc = evaluate(model, encoded_dataset, mnli_dataset, test_dataset)
    eval_duration_time = time.perf_counter() - eval_start_time

    print("\nEND INFO:")
    print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time))
    print(f'Evaluate end accuracy is {acc}')
    return acc

In [None]:
# Un-quantized model on the validation_matched split (predictions are compared with the labels).
acc = time_model_evaluation(model, encoded_dataset['validation_matched'], dataset["validation_matched"], test_dataset=False)

In [None]:
# Quantized model on the same validation_matched split, for a like-for-like comparison.
acc = time_model_evaluation(quantized_model, encoded_dataset['validation_matched'], dataset["validation_matched"], test_dataset=False)

In [None]:
# Un-quantized model on the test_matched split; test_dataset=True selects evaluate's test-split mode.
acc = time_model_evaluation(model, encoded_dataset['test_matched'], dataset["test_matched"], test_dataset=True)

In [None]:
# Quantized model on the test_matched split, again in evaluate's test-split mode.
acc = time_model_evaluation(quantized_model, encoded_dataset['test_matched'], dataset["test_matched"], test_dataset=True)