In [19]:
class BaseModel:
    """Minimal preprocess / run / postprocess wrapper around one inference session.

    ``preprocess`` and ``postprocess`` are identity hooks; subclasses in this
    notebook replace ``run`` and ``postprocess`` with model-specific logic.
    """

    def __init__(self, model_name=''):
        """Create the session from ``model_name`` and keep it on ``self.model``.

        ``ort`` must already be in scope when this runs (presumably the
        ``onnxruntime`` module -- it is not imported in this cell).
        """
        session = ort.InferenceSession(model_name)
        self.model = session

    def preprocess(self, input):
        """Return ``input`` unchanged."""
        return input

    def run(self, inputs):
        """Run the session and return whatever it returns.

        ``inputs`` is converted with ``dict(...)`` and used as the feed; only
        the graph output named ``"output"`` is requested.
        """
        session = self.model
        return session.run(input_feed=dict(inputs), output_names=["output"])

    def postprocess(self, output):
        """Return ``output`` unchanged."""
        return output

# Речь в текст

In [20]:
from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTModelForAudioClassification, ORTModelForSequenceClassification, ORTModelForSpeechSeq2Seq
from transformers import AutoConfig, Wav2Vec2Processor, AutoTokenizer, PretrainedConfig, WhisperProcessor
import time
import librosa
import numpy as np
import os

class SpeechtoText(BaseModel):
    """Speech-to-text wrapper around an ONNX-exported Whisper model loaded via Optimum.

    The encoder, decoder and decoder-with-past ONNX graphs are read from
    ``model_path`` and combined into one ``ORTModelForSpeechSeq2Seq``.
    ``BaseModel.__init__`` is not called: it would open a single
    ``ort.InferenceSession``, whereas this class builds three sessions.
    """

    def __init__(self, model_path, model_name, model_name_preprocess, postprocess=None,
                 language="russian", task="transcribe"):
        """Load the processor, the model config and the three ONNX sessions.

        Args:
            model_path: directory containing ``encoder_model.onnx``,
                ``decoder_model.onnx`` and ``decoder_with_past_model.onnx``.
            model_name: identifier passed to ``PretrainedConfig.from_pretrained``
                to obtain the model config.
            model_name_preprocess: identifier passed to
                ``WhisperProcessor.from_pretrained``.
            postprocess: optional object stored as ``self.post_process``; it is
                not used anywhere in this class.
            language: language for the forced decoder prompt (default keeps
                the previously hard-coded ``"russian"``).
            task: task for the forced decoder prompt (default keeps the
                previously hard-coded ``"transcribe"``).
        """
        self.processor = WhisperProcessor.from_pretrained(model_name_preprocess)
        # NOTE(review): the generic PretrainedConfig triggers the
        # "model of type whisper to instantiate a model of type ." warning seen
        # in the cell output; a Whisper-specific config class may be preferable.
        model_config = PretrainedConfig.from_pretrained(model_name)
        sessions = ORTModelForSpeechSeq2Seq.load_model(
            os.path.join(model_path, 'encoder_model.onnx'),
            os.path.join(model_path, 'decoder_model.onnx'),
            os.path.join(model_path, 'decoder_with_past_model.onnx'))
        # Argument order follows the original call: encoder session, decoder
        # session, config, model directory, decoder-with-past session.
        self.model = ORTModelForSpeechSeq2Seq(sessions[0], sessions[1], model_config, model_path, sessions[2])
        self.forced_decoder_ids = self.processor.get_decoder_prompt_ids(language=language, task=task)
        self.post_process = postprocess
        # Take the rate from the processor instead of hard-coding 16000, the
        # same way SpeechtoEmotion does; callers should load audio at this rate.
        self.sampling_rate = self.processor.feature_extractor.sampling_rate

    def postprocess(self, output):
        """Decode generated token ids to text, dropping special tokens.

        Returns whatever ``processor.batch_decode`` returns (a list of strings
        in the captured cell output).
        """
        return self.processor.batch_decode(output, skip_special_tokens=True)

    def run(self, speech):
        """Transcribe one waveform.

        Args:
            speech: audio samples, expected at ``self.sampling_rate``.

        Returns:
            The decoded transcription produced by ``postprocess``.
        """
        feature = self.processor(speech, sampling_rate=self.sampling_rate, return_tensors="pt")
        token_ids = self.model.generate(feature['input_features'], forced_decoder_ids=self.forced_decoder_ids)
        return self.postprocess(token_ids)

# Demo: transcribe one sample clip with the ONNX Whisper wrapper.
# `sampling_rate` assigned here is also read by later cells of the notebook.
sampling_rate =  16000
path_a ='01_happiness_anger a_020.wav'
# Load the clip at the requested sampling rate.
speech, sr = librosa.load(path_a, sr=sampling_rate)
# NOTE(review): the config is taken from "openai/whisper-base" while the ONNX
# directory and the processor are named whisper-tiny -- confirm this is intended.
model_text = SpeechtoText('whisper-tiny_onnx', "openai/whisper-base", "openai/whisper-tiny")
# Last expression of the cell, so the transcription is displayed as cell output.
model_text.run(speech)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type whisper to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


[' Слушай, я потратил обретьом кучу денег для того, чтобы притоситься в эту дару. Это что вообще такое? Посмотри на официантов, они все в черных каких-то рубашках с кислыми минами. Даже никто из них до сих пор не подошел к нам.']

# Речь в эмоции

In [21]:

class SpeechtoEmotion(BaseModel):
    """Audio emotion classifier: a Wav2Vec2 processor plus an ONNX classification model.

    ``run`` returns one entry of ``self.emotions``, chosen by the largest logit.
    """

    def __init__(self, model_name, model_name_preprocess, model_postprocess = None):
        """Load the processor and the ONNX model.

        Args:
            model_name: passed to ``ORTModelForAudioClassification.from_pretrained``.
            model_name_preprocess: passed to ``Wav2Vec2Processor.from_pretrained``.
            model_postprocess: stored as ``self.post_process``; not used by this class.
        """
        # Label order is assumed to match the model's logit order -- TODO confirm.
        self.emotions = ['neutral', 'positive', 'angry', 'sad', 'other']
        self.post_process = model_postprocess
        self.processor = Wav2Vec2Processor.from_pretrained(model_name_preprocess)
        self.sampling_rate = self.processor.feature_extractor.sampling_rate
        self.model = ORTModelForAudioClassification.from_pretrained(model_name)

    def postprocess(self, outputs):
        """Map the first row of ``outputs.logits`` to its highest-scoring label."""
        first_row = outputs.logits[0]
        best_index = np.argmax(first_row)
        return self.emotions[best_index]

    def run(self, speech):
        """Classify one waveform and return its emotion label."""
        encoded = self.processor(speech, sampling_rate=self.sampling_rate, return_tensors="pt", padding=True)
        # Only the waveform tensor is passed to the model.
        prediction = self.model(encoded['input_values'])
        return self.postprocess(prediction)


# Demo: classify the emotion of one sample clip directly from audio.
path_a ='01_happiness_anger a_020.wav'
# `sampling_rate` is not assigned in this cell; it must already exist in the kernel.
speech, sr = librosa.load(path_a, sr=sampling_rate)
# First argument: local ONNX model directory; second: processor identifier.
model_em = SpeechtoEmotion("wav2vec2-xls-r-300m-emotion-ru_onnx", "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru")
# Last expression of the cell, so the predicted label is displayed.
model_em.run(speech)

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


'angry'

# Текст в эмоции

In [15]:
class TexttoEmotion(BaseModel):
    """Text emotion classifier: a tokenizer plus an ONNX sequence-classification model.

    ``run`` returns one English label from ``self.labels``. ``self.labels_ru``
    holds a same-length list of Russian names and is not read by this class.
    """

    def __init__(self, model_name, model_name_or_path):
        """Load the tokenizer and the ONNX model.

        Args:
            model_name: passed to ``ORTModelForSequenceClassification.from_pretrained``.
            model_name_or_path: passed to ``AutoTokenizer.from_pretrained``.
        """
        # Label order is assumed to match the model's logit order -- TODO confirm.
        self.labels_ru = ['нейтрально', 'радость', 'грусть', 'гнев', 'интерес', 'удивление', 'отвращение', 'страх', 'вина', 'стыд']
        self.labels = ['neutral', 'joy', 'sadness', 'anger', 'enthusiasm', 'surprise', 'disgust', 'fear', 'guilt', 'shame']
        self.processor = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = ORTModelForSequenceClassification.from_pretrained(model_name)

    def postprocess(self, outputs):
        """Return the label whose logit is largest in the first row of ``outputs.logits``."""
        scores = outputs.logits[0]
        return self.labels[np.argmax(scores)]

    def run(self, text):
        """Tokenize ``text`` (truncated to 512 tokens) and return its emotion label."""
        encoded = self.processor(text, max_length=512, truncation=True, return_tensors='pt')
        prediction = self.model(**encoded)
        return self.postprocess(prediction)


# Demo: classify the emotion of a short Russian sentence.
# First argument: local ONNX model directory; second: tokenizer identifier.
model_textem = TexttoEmotion("rubert-tiny2-russian-emotion-detection_onnx", "Djacon/rubert-tiny2-russian-emotion-detection")
# Last expression of the cell, so the predicted label is displayed.
model_textem.run("ой беда, какая беда")

'sadness'

In [22]:
# End-to-end run: audio -> transcript -> text emotion, plus audio -> speech emotion.
path_a ='01_happiness_anger a_020.wav'
# The timer starts before audio loading and model construction, so the
# reported time includes both, not just inference.
t1 = time.time()
# `sampling_rate` is not assigned in this cell; it must already exist in the kernel.
speech, sr = librosa.load(path_a, sr=sampling_rate)
model_text = SpeechtoText('whisper-tiny_onnx', "openai/whisper-base", "openai/whisper-tiny")
rez_text = model_text.run(speech)

model_textem = TexttoEmotion("rubert-tiny2-russian-emotion-detection_onnx", "Djacon/rubert-tiny2-russian-emotion-detection")
# The transcript returned by SpeechtoText.run is fed to the text classifier as-is.
rez_em_text = model_textem.run(rez_text)

model_em = SpeechtoEmotion("wav2vec2-xls-r-300m-emotion-ru_onnx", "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru")
rez_em_speech = model_em.run(speech)
print('Time: ',time.time() - t1)
print('Result text: ',rez_text)
# Prints the text-based label first, then the audio-based label.
print('Result emotion: ', rez_em_text, rez_em_speech)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type whisper to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Time:  7.873037576675415
Result text:  [' Слушай, я потратил обретьом кучу денег для того, чтобы притоситься в эту дару. Это что вообще такое? Посмотри на официантов, они все в черных каких-то рубашках с кислыми минами. Даже никто из них до сих пор не подошел к нам.']
Result emotion:  enthusiasm angry


# Конвертация моделей из Hugging Face

# МУСОР

In [40]:
from optimum.exporters.tasks import TasksManager
# List the task names TasksManager reports for the "distilbert" model type with the "onnx" exporter.
# NOTE(review): the captured output warns that omitting `library_name` is deprecated.
distilbert_tasks = list(TasksManager.get_supported_tasks_for_model_type("distilbert", "onnx").keys())

print(distilbert_tasks)

Not passing the argument `library_name` to `get_supported_tasks_for_model_type` is deprecated and the support will be removed in a future version of Optimum. Please specify a `library_name`. Defaulting to `"transformers`.


['feature-extraction', 'fill-mask', 'text-classification', 'multiple-choice', 'token-classification', 'question-answering']


In [50]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTModelForAudioClassification

# Smoke test of a local ONNX question-answering export:
# tokenize a question/context pair and run the model on it.
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")
inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
outputs = model(**inputs)
# Bare expression so the notebook displays the raw model output.
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-4.7652, -1.0452, -7.0409, -4.6864, -4.0277, -6.2021, -4.9473,  2.6287,
          7.6111, -1.2488, -2.0551, -0.9350,  4.9758, -0.7707,  2.1493, -2.0703,
         -4.3232, -4.9472]]), end_logits=tensor([[ 0.4382, -1.6502, -6.3654, -6.0661, -4.1482, -3.5779, -0.0774, -3.6168,
         -1.8750, -2.8910,  6.2582,  0.5425, -3.7699,  3.8232, -1.5073,  6.2311,
          3.3604, -0.0772]]), hidden_states=None, attentions=None)

In [58]:
# Display `features`. NOTE(review): relies on kernel state -- the only top-level
# assignment to `features` is in a cell that appears further down in this notebook.
features

{'input_values': tensor([[ 0.0044,  0.0131,  0.0104,  ..., -0.0023,  0.0031,  0.0031]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}

In [60]:
# Display the waveform currently held in `speech` (assigned by an earlier-run cell).
speech

array([ 0.00018311,  0.00057983,  0.00045776, ..., -0.00012207,
        0.00012207,  0.00012207], dtype=float32)

### ONNX model exported with: `optimum-cli export onnx --model KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru wav2vec2-xls-r-300m-emotion-ru_onnx`

## from 

work

In [4]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTModelForAudioClassification
from transformers import AutoConfig, Wav2Vec2Processor
import librosa

# Reference run of the ONNX speech-emotion model outside the wrapper classes.
path_a ='../dataset/RESD_csv/train/01_happiness_anger/01_happiness_anger a_020.wav'
model = ORTModelForAudioClassification.from_pretrained("wav2vec2-xls-r-300m-emotion-ru_onnx")
# inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
model_name_or_path = "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru"
# config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
# Load the clip at the sampling rate declared by the processor's feature extractor.
sampling_rate = processor.feature_extractor.sampling_rate
speech, sr = librosa.load(path_a, sr=sampling_rate)
features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
# Only `input_values` is passed to the model; any other entry of `features` is not used.
outputs = model(features['input_values'])
# Bare expression so the notebook displays the raw classifier output.
outputs

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0182,  0.0492,  0.0886, -0.0229,  0.0305]]), hidden_states=None, attentions=None)

In [14]:
# Display the processor's repr (feature-extractor settings and tokenizer details).
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "processor_class": "Wav2Vec2ProcessorWithLM",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru', vocab_size=40, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	1: AddedToken("<s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	2: AddedToken("</s>", rstrip=True, lstrip=True, single_word=False, normalized=False, spe

In [11]:
from onnxruntime.quantization.quantize import quantize
from transformers import Wav2Vec2Processor
import torch

def convert_to_onnx(model_id_or_path, onnx_model_name, path_a = ''):
    """Export an audio-classification checkpoint to an ONNX file.

    The previous version handed the processor's feature extractor (which is
    not a ``torch.nn.Module``) and a NumPy waveform to ``torch.onnx.export``.
    This version loads the actual model and traces it with a tensor example
    built from the audio file.

    Args:
        model_id_or_path: identifier or path passed to ``from_pretrained`` for
            both the processor and the model.
        onnx_model_name: destination path of the exported ONNX file.
        path_a: path to an audio file used as the example input for tracing.

    Raises:
        ValueError: if ``path_a`` is empty (there is no usable default).
    """
    # Fail early instead of downloading the checkpoint and then crashing in librosa.
    if not path_a:
        raise ValueError("path_a must point to an audio file used as the example input")

    # Function-scope import so this cell does not depend on another cell's imports.
    from transformers import AutoModelForAudioClassification

    print(f"Converting {model_id_or_path} to onnx")
    feature_extractor = Wav2Vec2Processor.from_pretrained(model_id_or_path).feature_extractor
    sampling_rate = feature_extractor.sampling_rate
    speech, _ = librosa.load(path_a, sr=sampling_rate)

    # torchscript=True makes the model return plain tuples, which tracing needs.
    # NOTE(review): assumes the checkpoint loads with the stock
    # AutoModelForAudioClassification class -- confirm for other model ids.
    model = AutoModelForAudioClassification.from_pretrained(model_id_or_path, torchscript=True)
    model.eval()

    # Tensor example input; only the waveform tensor is fed to the model.
    example = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt").input_values

    with torch.no_grad():
        torch.onnx.export(model,                        # model being run
                          (example,),                   # model input as a tuple of tensors
                          onnx_model_name,              # where to save the model
                          export_params=True,           # store the trained weights inside the file
                          opset_version=11,             # the ONNX opset to export to
                          do_constant_folding=True,     # fold constants for optimization
                          input_names=['input'],        # the model's input names
                          output_names=['output'],      # the model's output names
                          # Audio length varies on the input only; the output holds
                          # per-class logits, so it is dynamic on the batch axis alone.
                          dynamic_axes={'input': {0: 'batch', 1: 'audio_len'},
                                        'output': {0: 'batch'}})

In [None]:
# Driver for convert_to_onnx: name the output file after the last path component of the model id.
model_name_or_path = "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru"
onnx_model_name = model_name_or_path.split("/")[-1] + ".onnx"
print(onnx_model_name)
# NOTE(review): this cell has no execution count or captured output in the saved notebook.
convert_to_onnx(model_name_or_path, onnx_model_name, path_a ='../dataset/RESD_csv/train/01_happiness_anger/01_happiness_anger a_020.wav')

In [44]:
# Load the checkpoint with export=True; the captured log below shows it being exported with PyTorch.
model = ORTModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad", export=True)

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

Framework not specified. Using pt to export the model.


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.1.1+cpu
  mask, torch.tensor(torch.finfo(scores.dtype).min)


In [None]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTModelForSequenceClassification

# Run the local ONNX text-emotion export on one sentence, without the wrapper class.
tokenizer = AutoTokenizer.from_pretrained("Djacon/rubert-tiny2-russian-emotion-detection")
model = ORTModelForSequenceClassification.from_pretrained("rubert-tiny2-russian-emotion-detection_onnx")
inputs = tokenizer("ой беда, какая беда", max_length=512, truncation=True, return_tensors='pt')
outputs = model(**inputs)
# Bare expression so the notebook displays the raw classifier output.
outputs

In [28]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# PyTorch (non-ONNX) run of the text-emotion model on the same sentence.
# LABELS / LABELS_RU are defined here but not used in this cell.
LABELS = ['neutral', 'joy', 'sadness', 'anger', 'enthusiasm', 'surprise', 'disgust', 'fear', 'guilt', 'shame']
LABELS_RU = ['нейтрально', 'радость', 'грусть', 'гнев', 'интерес', 'удивление', 'отвращение', 'страх', 'вина', 'стыд']

tokenizer = AutoTokenizer.from_pretrained("Djacon/rubert-tiny2-russian-emotion-detection")
model = AutoModelForSequenceClassification.from_pretrained("Djacon/rubert-tiny2-russian-emotion-detection")
# NOTE(review): the name `input` shadows the Python builtin for the rest of the kernel session.
input = tokenizer("ой беда, какая беда", max_length=512, truncation=True, return_tensors='pt')
# Last expression of the cell, so the raw classifier output is displayed.
model(**input)



SequenceClassifierOutput(loss=None, logits=tensor([[-2.1391, -2.8492,  1.7666,  0.2717, -3.0771, -2.4219, -0.5103, -1.8227,
         -1.0535, -1.0158]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [48]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Audio, load_dataset

# PyTorch (non-ONNX) Whisper transcription of the waveform already held in `speech`.
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")

# load streaming dataset and read first audio sample
# ds = load_dataset("common_voice", "fr", split="test", streaming=True)
# NOTE(review): `ds` and `input_speech` below are assigned but never read in this cell;
# `speech` itself comes from an earlier-run cell.
ds = speech
sampling_rate=16_000

input_speech = speech
input_features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt").input_features

# generate token ids
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
# decode token ids to text
# NOTE(review): this first decode result is overwritten by the next assignment.
transcription = processor.batch_decode(predicted_ids)

# Decode again, this time dropping special tokens; this is the value that gets printed.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[' Слушай, я потратил обретьом кучу денег для того, чтобы притоситься в эту дару. Это что вообще такое? Посмотри на официантов, они все в черных каких-то рубашках с кислыми минами. Даже никто из них до сих пор не подошел к нам.']


In [52]:
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
from datasets import Audio, load_dataset
import os


In [57]:
from transformers import PretrainedConfig
# Manual assembly of the ONNX Whisper model (the same steps SpeechtoText.__init__ performs).
# NOTE(review): the config is loaded from 'openai/whisper-base' while the ONNX
# directory is named whisper-tiny -- confirm this is intended.
model_name = 'openai/whisper-base'
model_path = 'whisper-tiny_onnx'
model_config = PretrainedConfig.from_pretrained(model_name)
# `predictions` and `references` are assigned but never read in this cell.
predictions = []
references = []
sessions = ORTModelForSpeechSeq2Seq.load_model(
            os.path.join(model_path, 'encoder_model.onnx'),
            os.path.join(model_path, 'decoder_model.onnx'),
            os.path.join(model_path, 'decoder_with_past_model.onnx'))
model = ORTModelForSpeechSeq2Seq(sessions[0], sessions[1], model_config, model_path, sessions[2])
# Not the last expression of the cell, so this bare name displays nothing.
model

# `processor`, `speech`, `sampling_rate` and `forced_decoder_ids` are not assigned
# in this cell; they must already exist in the kernel.
feature = processor(speech, sampling_rate=sampling_rate, return_tensors="pt")
# Last expression of the cell: the generated token ids are displayed as the output.
model.generate(feature['input_features'], forced_decoder_ids=forced_decoder_ids)


You are using a model of type whisper to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


tensor([[50258, 50263, 50359, 50363,  2933, 43689,    11,  2552,  6364, 11157,
          2338,  3348,   481,  9108,  1253,   981,  4187,   585, 40957,  5561,
         11283,    11,  7887,  1285,  1635,  1885, 12306,   740, 18763,  1070,
          2222,   585,    13,  6684,  2143, 14345, 18292,    30, 18689, 44443,
          1470, 31950, 30321,  1416,  8642,    11,  7515,  4640,   740, 12360,
          5783, 44178,    12,   860, 27371,  6835, 18366,   776,   981, 47105,
         24670, 19073,  5150,    13, 42900, 31666,  3943, 14319,  5865,   776,
          4165, 11948,  1725,  4095,  6824,  1414,   981, 11401,    13, 50257]])