# TSL Tutorial

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from onnxruntime import InferenceSession

import duckdb

  from .autonotebook import tqdm as notebook_tqdm


### Load IMDB data

In [2]:
# Read the training split of the `imdb` table into a pandas DataFrame.
# The connection is closed in a `finally` block so that a failing query does
# not leave the database file open in the running kernel.
con = duckdb.connect("../test.db")
try:
    imdb = con.sql("SELECT * FROM imdb WHERE stage == 'train'").df()
finally:
    con.close()

# Preview the first rows as a quick sanity check of the loaded data.
imdb.head()

Unnamed: 0,text,label,stage
0,I rented I AM CURIOUS-YELLOW from my video sto...,0.0,train
1,"""I Am Curious: Yellow"" is a risible and preten...",0.0,train
2,If only to avoid making this type of film in t...,0.0,train
3,This film was probably inspired by Godard's Ma...,0.0,train
4,"Oh, brother...after hearing about this ridicul...",0.0,train


In [3]:
# Split the frame into review text and target label.
# Columns are selected by name rather than by position: the frame comes from a
# `SELECT *`, so positional indexing would silently pick the wrong column if
# the table's column order changed or an extra leading column appeared.
imdb_data = imdb["text"]
imdb_label = imdb["label"]

### Load the pretrained model

In [5]:
# Directory holding the fine-tuned DistilBERT checkpoint; the model weights
# and the tokenizer are both loaded from this location.
model_path = "/homes/ukumaras/scratch/Models/distilbert-base-uncased-finetuned-sst-2-english"

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Tokenize one short sentence into PyTorch tensors ("pt"); this sample is the
# example input handed to the ONNX exporter further down.
dummy_model_input = tokenizer(
    "This is a sample",
    return_tensors="pt",
)

In [6]:
# Display the tokenizer output for the sample sentence.
dummy_model_input

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 7099,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

### Compile to ONNX

In [6]:
# File name the exported ONNX graph is written to (passed as `f=` to torch.onnx.export).
onnx_model_name = 'distilbert_text_classification.onnx'

In [None]:

# Export the PyTorch model to an ONNX file using the sample input.
torch.onnx.export(
    model,
    # Positional inputs are listed explicitly so their order is guaranteed to
    # match `input_names` instead of depending on the tokenizer's dict order.
    (dummy_model_input["input_ids"], dummy_model_input["attention_mask"]),
    f=onnx_model_name,
    input_names=['input_ids', 'attention_mask'],
    output_names=['logits'],
    dynamic_axes={
        # Both inputs may vary in batch size and in tokenized sequence length.
        'input_ids': {0: 'batch_size', 1: 'sequence'},
        'attention_mask': {0: 'batch_size', 1: 'sequence'},
        # The classification head emits one row of class scores per example,
        # so only the batch axis is dynamic; axis 1 is not a sequence axis.
        'logits': {0: 'batch_size'},
    },
    do_constant_folding=True,
    opset_version=13,
)

### Model Inference

In [10]:
# Directory containing the ONNX export (`model.onnx`) together with the
# tokenizer files that belong to it.
onnx_model_path = "/homes/ukumaras/scratch/Models/distilbert-base-uncased-finetuned-sst-2-english-onnx"

# Open an ONNX Runtime session on the exported graph, then load the tokenizer
# from the same directory (this rebinds the `tokenizer` name).
session = InferenceSession(f"{onnx_model_path}/model.onnx")
tokenizer = AutoTokenizer.from_pretrained(onnx_model_path)

In [13]:

# Tokenize a sample sentence into NumPy arrays ("np") so the result can be
# fed to the ONNX Runtime session, then display it.
inputs = tokenizer(
    "Using DistilBERT with ONNX Runtime!",
    return_tensors="np",
)
inputs

{'input_ids': array([[  101,  2478,  4487, 16643, 23373,  2007,  2006, 26807,  2448,
         7292,   999,   102]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:

# Run the ONNX graph on the tokenized sample, requesting only the "logits"
# output; the tokenizer result is converted to a plain dict for the feed.
outputs = session.run(
    output_names=["logits"],
    input_feed=dict(inputs.items()),
)

In [12]:
# Display the list of arrays returned by session.run.
outputs

[array([[ 2.3462074, -1.995204 ]], dtype=float32)]

In [1]:
from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig

# Build an ONNX export configuration from a default DistilBERT configuration
# and print the names of the outputs it declares.
config = DistilBertConfig()
onnx_config = DistilBertOnnxConfig(config)
declared_outputs = list(onnx_config.outputs)
print(declared_outputs)

  from .autonotebook import tqdm as notebook_tqdm


['last_hidden_state']


In [4]:
# Display the inputs declared by the ONNX config.
onnx_config.inputs

OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
             ('attention_mask', {0: 'batch', 1: 'sequence'})])