In [1]:
!pip install transformers
!pip install datasets evaluate accelerate optuna optimum
!pip install optimum[onnxruntime]@git+https://github.com/huggingface/optimum.git
!pip install onnxruntime-gpu

Collecting optimum[onnxruntime]@ git+https://github.com/huggingface/optimum.git
  Cloning https://github.com/huggingface/optimum.git to /tmp/pip-install-6dr3xra7/optimum_45e94c4185a04e00a276da626b3c3b2e
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/optimum.git /tmp/pip-install-6dr3xra7/optimum_45e94c4185a04e00a276da626b3c3b2e
  Resolved https://github.com/huggingface/optimum.git to commit 915e182ca2c3c202a0ac25860ea8400250817a25
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
import os
import time
import shutil
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
from datasets import Features, Value, ClassLabel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoConfig
from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification
from optimum.onnxruntime.configuration import AutoQuantizationConfig

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Path of the helpdesk CSV used by the cells below; it is relative to the
# notebook's working directory.
filepath = "./content/helpdest_dataset_pruned.csv"

In [22]:
# Load the CSV and display its first rows as a quick sanity check of the
# `text` / `label` columns.
df2 = pd.read_csv(filepath,encoding='utf-8')
df2.head()

Unnamed: 0,text,label
0,The customer described problems with blurry im...,2
1,The customer was having issues with login cred...,2
2,The customer complained that the product they ...,2


In [23]:
def finetune(filepath):
  """Fine-tune DistilBERT on a labelled CSV and export a quantized ONNX model.

  Pipeline:
    1. Read the CSV at ``filepath`` and keep its ``text`` and ``label`` columns.
    2. Make a stratified train/validation split (9% validation, ``random_state=42``).
    3. Round-trip both splits through temporary CSV files so ``load_dataset`` can
       apply the ``ClassLabel`` schema (Negative / Neutral / Positive).
    4. Fine-tune ``distilbert-base-uncased`` with ``Trainer`` for 6 epochs and save
       the result to ``fine_tuned_model``.
    5. Export that model to ONNX and apply dynamic AVX512-VNNI quantization,
       writing the quantized model to the ``model`` directory.

  Args:
    filepath: Path of a UTF-8 CSV file with a ``text`` column and a ``label``
      column. Labels are presumably the integers 0-2 matching the class order
      above -- TODO confirm against the dataset.

  Returns:
    The value returned by ``ORTQuantizer.quantize`` (the location of the
    quantized model). Earlier versions of this function returned ``None``.

  Raises:
    KeyError: If the CSV has no ``text`` or ``label`` column.

  Side effects:
    Writes ``train_help.csv`` / ``val_help.csv`` to the working directory and
    removes them again even when training or export fails. Trainer checkpoints
    go to ``Venkatesh4342/distilbert-helpdesk-sentiment``. ``fine_tuned_model``
    is deleted only after quantization succeeded, so an expensive training run
    is not thrown away when the export step fails.
  """
  class_names = ['Negative','Neutral','Positive']
  model_ckpt = "distilbert-base-uncased"
  train_csv = "train_help.csv"
  val_csv = "val_help.csv"
  finetuned_dir = "fine_tuned_model"

  df = pd.read_csv(filepath, encoding='utf-8')
  # Keep only the columns declared in `Features` below; any extra column (for
  # example a saved index) would not match the schema given to `load_dataset`.
  df = df[["text", "label"]]
  train, validation = train_test_split(df, test_size=0.09, random_state=42, stratify=df["label"])

  try:
    train.to_csv(train_csv, index=False)
    validation.to_csv(val_csv, index=False)

    ft = Features({'text': Value(dtype='string', id=None),
                   'label': ClassLabel(num_classes=len(class_names), names=class_names)})
    dataset = load_dataset('csv', data_files={'train': train_csv, 'validation': val_csv}, features=ft)

    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    def tokenize(batch):
      """Tokenize a batch of texts, padding to the longest item in the batch."""
      return tokenizer(batch["text"], padding=True, truncation=True)

    # batch_size=None hands each split to `tokenize` as a single batch, so every
    # example in a split is padded to the same length.
    helpdesk_encoded = dataset.map(tokenize, batched=True, batch_size=None)
    helpdesk_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the label-aware config first and hand it to the model, instead of
    # loading the model with one config and overwriting `model.config` afterwards.
    config = AutoConfig.from_pretrained(
      model_ckpt,
      num_labels=len(class_names),
      id2label={i: label for i, label in enumerate(class_names)},
      label2id={label: i for i, label in enumerate(class_names)})
    model = (AutoModelForSequenceClassification
             .from_pretrained(model_ckpt, config=config)
             .to(device))

    def compute_metrics(pred):
      """Return accuracy and weighted F1 for a Trainer prediction output."""
      labels = pred.label_ids
      preds = pred.predictions.argmax(-1)
      f1 = f1_score(labels, preds, average="weighted")
      acc = accuracy_score(labels, preds)
      return {"accuracy": acc, "f1": f1}

    batch_size = 2  # evaluation batch size; the training batch size stays at the Trainer default
    model_name = "Venkatesh4342/distilbert-helpdesk-sentiment"
    # NOTE(review): `evaluation_strategy` is the spelling used by the transformers
    # release this notebook was run with; newer releases call it `eval_strategy`.
    training_args = TrainingArguments(
      output_dir=model_name,
      num_train_epochs=6,
      learning_rate=2e-5,
      evaluation_strategy='steps',
      eval_steps=100,
      per_device_eval_batch_size=batch_size,
      weight_decay=0.01,
      save_steps=100,
      save_total_limit=3,
      load_best_model_at_end=True,
      # Log once per epoch. The previous `len(train) // batch_size` step count
      # became 0 for a train split smaller than `batch_size` (rejected by
      # TrainingArguments) and did not match the actual training batch size.
      logging_strategy='epoch',
      gradient_checkpointing=True,
      push_to_hub=False)

    trainer = Trainer(model=model,
                      args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=helpdesk_encoded["train"],
                      eval_dataset=helpdesk_encoded["validation"],
                      tokenizer=tokenizer)

    trainer.train()
    trainer.save_model(finetuned_dir)

    # Export the fine-tuned PyTorch model to ONNX, then quantize it dynamically.
    onnx_model = ORTModelForSequenceClassification.from_pretrained(finetuned_dir, export=True)
    quantizer = ORTQuantizer.from_pretrained(onnx_model)
    dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
    model_quantized_path = quantizer.quantize(
      save_dir="model",
      quantization_config=dqconfig,
    )
  finally:
    # The CSV files are scratch data: remove them whether or not the run succeeded.
    for csv_path in (train_csv, val_csv):
      if os.path.exists(csv_path):
        os.remove(csv_path)

  # NOTE(review): the pause is kept from the original code; presumably it lets
  # file handles on the exported model be released before deletion -- TODO confirm.
  time.sleep(15)
  shutil.rmtree(finetuned_dir)
  return model_quantized_path

In [24]:
# Run the fine-tuning and quantization pipeline on the CSV configured above.
finetune(filepath)

Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 12087.33it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 2337.96it/s]
Generating train split: 2 examples [00:00, 656.75 examples/s]
Generating validation split: 1 examples [00:00, 505.16 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 602.33 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 281.40 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 36/678 [02:44<48:58,  4.58s/it]
  0%|          | 0/6 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text fol

{'loss': 1.1165, 'learning_rate': 1.6666666666666667e-05, 'epoch': 1.0}
{'loss': 0.9776, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.0}
{'loss': 0.9601, 'learning_rate': 1e-05, 'epoch': 3.0}


100%|██████████| 6/6 [00:00<00:00, 12.85it/s]


{'loss': 0.8805, 'learning_rate': 6.666666666666667e-06, 'epoch': 4.0}
{'loss': 0.8784, 'learning_rate': 3.3333333333333333e-06, 'epoch': 5.0}
{'loss': 0.8709, 'learning_rate': 0.0, 'epoch': 6.0}
{'train_runtime': 0.4683, 'train_samples_per_second': 25.624, 'train_steps_per_second': 12.812, 'train_loss': 0.9473178287347158, 'epoch': 6.0}


Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Using framework PyTorch: 2.0.1+cu117
  mask, torch.tensor(torch.finfo(scores.dtype).min)


verbose: False, log level: Level.ERROR



Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: model (external data format: False)
Configuration saved in model/ort_config.json
