## Question generation model compression using ONNX

In [3]:
# Install the packages used for export and quantization: onnx, onnxruntime,
# transformers (pinned to 4.28.1) and fastt5.
!pip install --quiet onnx
!pip install --quiet onnxruntime
!pip install --quiet transformers==4.28.1
!pip install --quiet fastt5

In [4]:
# Extra packages: sentencepiece, gradio (used for the demo UI further down) and
# ipython-autotime. Loading the autotime extension produces the "time: ..."
# line shown under each executed cell.
!pip install --quiet sentencepiece
!pip install --quiet gradio
!pip install --quiet ipython-autotime
%load_ext autotime

time: 422 µs (started: 2024-10-11 19:09:02 +00:00)


In [4]:
# Mount Google Drive at /content/drive; the model paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
time: 24.4 s (started: 2024-10-11 17:41:17 +00:00)


In [5]:
from transformers import T5ForConditionalGeneration,T5Tokenizer
import gradio as gr

from fastT5 import export_and_get_onnx_model, generate_onnx_representation, quantize, get_onnx_model,get_onnx_runtime_sessions,OnnxT5
from transformers import T5Config,AutoTokenizer
from pathlib import Path
import os

from onnxruntime.quantization import quantize_dynamic



time: 297 ms (started: 2024-10-11 19:09:02 +00:00)


In [9]:
# The T5 checkpoint takes roughly 900 MB on disk.
question_tokenizer = T5Tokenizer.from_pretrained(
    '/content/drive/MyDrive/Models/Q-tips/model'
)
question_model = T5ForConditionalGeneration.from_pretrained(
    '/content/drive/MyDrive/Models/Q-tips/model'
)

time: 19.2 s (started: 2024-10-11 17:44:04 +00:00)


In [11]:
def get_question(sentence, answer, mdl, tknizer):
    """Generate one question about ``sentence`` whose answer is ``answer``.

    The prompt ``"context: <sentence> answer: <answer>"`` is tokenized, passed to
    ``mdl.generate`` with beam search, and the first returned sequence is decoded.
    Every ``"question:"`` marker is removed from the decoded text and the
    surrounding whitespace is stripped.

    Args:
        sentence: Context text the question should be about.
        answer: Answer/keyword the generated question should target.
        mdl: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``.
        tknizer: Object exposing ``encode_plus(...)`` and ``decode(...)``.

    Returns:
        The generated question as a string.
    """
    text = "context: {} answer: {}".format(sentence, answer)
    max_len = 256
    # `padding=False` is the current spelling of the deprecated
    # `pad_to_max_length=False`; the prompt is truncated to `max_len` tokens.
    encoding = tknizer.encode_plus(
        text,
        max_length=max_len,
        padding=False,
        truncation=True,
        return_tensors="pt",
    )
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    outs = mdl.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        early_stopping=True,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        max_length=300,
    )

    # Only one sequence is requested, so decode just the first result.
    decoded = tknizer.decode(outs[0], skip_special_tokens=True)
    return decoded.replace("question:", "").strip()


# Alternative sample, kept for reference:
# context = "Samrat loves to watch football during his free time"
# answer = "football"

# Sample input: request a question whose answer is "Donald Trump".
answer = "Donald Trump"
context = (
    "Donald Trump is an American media personality and businessman "
    "who served as the 45th president of the United States."
)

ques = get_question(context, answer, question_model, question_tokenizer)
print("question: ", ques)



question:  Who is the 45th president of the United States?
time: 4.96 s (started: 2024-10-11 17:45:17 +00:00)


### Creating a UI with Gradio

In [46]:
# Widgets for the demo: two free-text inputs and one labelled output box.
# Note that `context` and `answer` are rebound here; they held the sample
# strings in the previous cell.
context = gr.Textbox(
    placeholder="Enter paragraph/context here...",
    lines=5,
)
answer = gr.Textbox(
    placeholder="Enter answer/keyword here...",
    lines=3,
)
question = gr.Textbox(label="Question")

def generate_question(context, answer):
    """Gradio callback: build a question for ``context`` that targets ``answer``.

    Delegates to ``get_question`` using the module-level ``question_model`` and
    ``question_tokenizer``.
    """
    generated = get_question(context, answer, question_model, question_tokenizer)
    return generated

# Wire the callback and the widgets into a gradio.Interface, then start the app.
iface = gr.Interface(fn=generate_question, inputs=[context, answer], outputs=question)

iface.launch(debug=False)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5fcdc813bb59bdab32.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




time: 2.69 s (started: 2024-10-11 18:13:50 +00:00)


### Convert the T5 PyTorch model to ONNX format and quantize it using the FastT5 library

In [63]:
def quantize(model_name):
    """Dynamically quantize an ONNX model file and return the new file's path.

    The output is written next to the input as ``<name>-quantized.onnx``, where
    ``<name>`` is the input path with its final extension removed.

    NOTE: this definition shadows the ``quantize`` helper imported from fastT5
    in the import cell.

    Args:
        model_name: Path of the source ONNX model, as ``str`` or ``pathlib.Path``.

    Returns:
        The output model path as a ``str``.
    """
    # Work on the string form so Path objects are accepted as well.
    model_name_str = str(model_name)
    # Strip the actual extension instead of blindly cutting five characters, so
    # a path that does not end in ".onnx" keeps its full stem.
    stem, _ext = os.path.splitext(model_name_str)
    output_model_name = f"{stem}-quantized.onnx"
    quantize_dynamic(
        model_input=model_name_str,
        model_output=output_model_name,
        per_channel=True,
    )
    return output_model_name


time: 1.03 ms (started: 2024-10-11 18:38:35 +00:00)


In [64]:
trained_model_path = '/content/drive/MyDrive/Models/Q-tips/model'

# Step 1: export the T5 checkpoint to ONNX files.
onnx_model_paths = generate_onnx_representation(trained_model_path)

# Step 2 (recommended): quantize every exported file for faster inference and a
# smaller footprint on disk.
quant_model_paths = list(map(quantize, onnx_model_paths))

# Config and tokenizer of the same checkpoint, loaded so they can be saved
# alongside the exported model.
config = T5Config.from_pretrained(trained_model_path)
tokenizer_onnx = AutoTokenizer.from_pretrained(trained_model_path)

Exporting to onnx... |################################| 3/3


time: 4min 56s (started: 2024-10-11 18:38:37 +00:00)


In [65]:
# Save the tokenizer and the config into the local 'model/' directory.
# NOTE(review): the original comment said "models folder", and the cleanup cell
# below operates on /content/models, but these calls write to 'model/' —
# confirm which directory is intended.
tokenizer_onnx.save_pretrained('model/')
config.save_pretrained('model/')

time: 32.6 ms (started: 2024-10-11 18:44:03 +00:00)


**Remove non-quantized onnx files**

In [68]:
# Delete the files in /content/models matching *decoder.onnx and *encoder.onnx
# (the non-quantized exports), then report the directory's remaining size.
!rm -f -r /content/models/*decoder.onnx
!rm -f -r /content/models/*encoder.onnx
!du -sh /content/models

408M	/content/models
time: 518 ms (started: 2024-10-11 18:46:13 +00:00)


In [69]:
# Copy the /content/model/ directory to Google Drive.
# NOTE(review): this copies /content/model/, whereas the quantized ONNX files
# were sized under /content/models above — confirm the intended source directory.
!cp -r /content/model/ '/content/drive/MyDrive/Models/Q-tips/onnx_model'

time: 7.14 s (started: 2024-10-11 18:48:16 +00:00)


### ONNX Interface

In [1]:
# Inference environment: transformers 4.6.1 with sentencepiece, and fastt5 0.0.4
# installed without its dependencies. autotime is loaded for per-cell timings.
!pip install transformers==4.6.1 sentencepiece
!pip install fastt5==0.0.4 --no-dependencies
!pip install --quiet ipython-autotime
%load_ext autotime

[31mERROR: Could not find a version that satisfies the requirement onnxruntime==1.7.0 (from versions: 1.12.0, 1.12.1, 1.13.1, 1.14.0, 1.14.1, 1.15.0, 1.15.1, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.17.0, 1.17.1, 1.17.3, 1.18.0, 1.18.1, 1.19.0, 1.19.2)[0m[31m
[0m[31mERROR: No matching distribution found for onnxruntime==1.7.0[0m[31m
time: 396 µs (started: 2024-10-11 19:29:36 +00:00)


In [6]:
from fastT5 import get_onnx_model,get_onnx_runtime_sessions,OnnxT5
from transformers import T5Tokenizer
from pathlib import Path
import os

time: 698 µs (started: 2024-10-11 19:32:40 +00:00)


In [7]:
trained_model_path = "/content/drive/MyDrive/Models/Q-tips/onnx_model/model"

# The file names below are prefixed with the last component of the model directory.
pretrained_model_name = Path(trained_model_path).stem

# Build the three quantized-file paths in the order encoder, decoder, init-decoder.
encoder_path, decoder_path, init_decoder_path = model_paths = tuple(
    os.path.join(trained_model_path, file_name)
    for file_name in (
        f"{pretrained_model_name}-encoder-quantized.onnx",
        f"{pretrained_model_name}-decoder-quantized.onnx",
        f"{pretrained_model_name}-init-decoder-quantized.onnx",
    )
)

# Create the runtime sessions and wrap them in an OnnxT5 model.
model_sessions = get_onnx_runtime_sessions(model_paths)
model = OnnxT5(trained_model_path, model_sessions)

tokenizer = T5Tokenizer.from_pretrained(trained_model_path)

time: 9.25 s (started: 2024-10-11 19:32:46 +00:00)


In [10]:
def get_question(sentence, answer, mdl, tknizer):
    """Generate one question about ``sentence`` whose answer is ``answer``.

    The prompt ``"context: <sentence> answer: <answer>"`` is printed, tokenized,
    and passed to ``mdl.generate`` with beam search. The first returned sequence
    is decoded, every ``"question:"`` marker is removed, and the surrounding
    whitespace is stripped.

    Args:
        sentence: Context text the question should be about.
        answer: Answer/keyword the generated question should target.
        mdl: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``.
        tknizer: Object exposing ``encode_plus(...)`` and ``decode(...)``.

    Returns:
        The generated question as a string.
    """
    text = "context: {} answer: {}".format(sentence, answer)
    print(text)
    max_len = 256
    # `padding=False` is the current spelling of the deprecated
    # `pad_to_max_length=False`; the prompt is truncated to `max_len` tokens.
    encoding = tknizer.encode_plus(
        text,
        max_length=max_len,
        padding=False,
        truncation=True,
        return_tensors="pt",
    )
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    outs = mdl.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        early_stopping=True,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        max_length=300,
    )

    # Only one sequence is requested, so decode just the first result.
    decoded = tknizer.decode(outs[0], skip_special_tokens=True)
    return decoded.replace("question:", "").strip()


# Alternative sample, kept for reference:
# context = "Ramsri loves to watch cricket during his free time"
# answer = "cricket"

# Same sample as in the PyTorch section, now run against the ONNX model.
answer = "Donald Trump"
context = (
    "Donald Trump is an American media personality and businessman "
    "who served as the 45th president of the United States."
)

ques = get_question(context, answer, model, tokenizer)
print("question: ", ques)


time: 1 ms (started: 2024-10-11 19:34:15 +00:00)


In [11]:
# Second sample: request a question whose answer is "ballistic".
answer = "ballistic"
context = (
    "Iraq launched a barrage of 180 ballistic missiles aimed at Israel, "
    "marking a significant increase in military hostilities and raising "
    "fears of a broader conflict in the region."
)

ques = get_question(context, answer, model, tokenizer)
print("question: ", ques)


context: Iraq launched a barrage of 180 ballistic missiles aimed at Israel, marking a significant increase in military hostilities and raising fears of a broader conflict in the region. answer: ballistic


AttributeError: 'OnnxT5' object has no attribute 'run'

time: 135 ms (started: 2024-10-11 19:34:18 +00:00)


In [15]:
import gradio as gr

# Widgets for the ONNX demo: two free-text inputs and one labelled output box.
# `context` and `answer` are rebound here from the sample strings used above.
context = gr.Textbox(
    placeholder="Enter paragraph/context here...",
    lines=5,
)
answer = gr.Textbox(
    placeholder="Enter answer/keyword here...",
    lines=3,
)
question = gr.Textbox(label="Question")

def generate_question(context, answer):
    """Gradio callback: build a question for ``context`` that targets ``answer``.

    Delegates to ``get_question`` using the module-level ``model`` and
    ``tokenizer``.
    """
    generated = get_question(context, answer, model, tokenizer)
    return generated

# Wire the callback and the widgets into a gradio.Interface, then start the app.
iface = gr.Interface(fn=generate_question, inputs=[context, answer], outputs=question)

iface.launch(debug=False)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3d76be2908b190f645.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




time: 2.74 s (started: 2024-10-11 19:36:47 +00:00)
