In [None]:
# Some standard imports
import sys
sys.path.append("../../..")

import numpy as np

from torch import nn
import torch.utils.model_zoo as model_zoo
import torch.onnx

from funasr import AutoModel

# Identifier handed to AutoModel. Later cells reuse the same string as a
# relative directory prefix (example wav, model.pt).
MODEL_ID = "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

# Only the ASR model is requested. Optional companion models are left disabled;
# to enable one, pass the matching keyword to AutoModel, e.g.:
#   vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
#   punc_model="iic/punc_ct-transformer_cn-en-common-vocab471067-large"
#   spk_model="iic/speech_campplus_sv_zh-cn_16k-common"
model = AutoModel(model=MODEL_ID)

In [None]:
# Pull the individual sub-models off the AutoModel wrapper for inspection.
bimodel, CTTransformer, FsmnVADStreaming = (
    model.model,
    model.punc_model,
    model.vad_model,
)

# Author's notes on what the streaming VAD model (FsmnVADStreaming) consumes:
#   input: data_in (list of audio files)
#   batch = {
#       "feats": speech,                             # [batch_size, speech_length, feature_dimension]
#       "waveform": cache["frontend"]["waveforms"],  # [batch_size, sequence_length]
#       "is_final": kwargs["is_final"],              # True
#       "cache": cache,
#       "is_streaming_input": is_streaming_input,
#   }


In [None]:
# Run recognition on the example clip located under the model directory path.
# Alternative input that was tried previously:
#   "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav"
generate_options = {
    "input": "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav",
    "batch_size_s": 300,
    "batch_size_threshold_s": 60,
}
res = model.generate(**generate_options)
print(res)

In [None]:
# BiCifParaformer is not referenced by name below; the import is kept as in the
# original cell.
from funasr.models.bicif_paraformer.model import BiCifParaformer
from funasr.train_utils.load_pretrained_model import load_pretrained_model
from funasr.register import tables
from funasr.utils.misc import deep_update

# Rebuild the ASR network by hand from the configuration that AutoModel
# resolved, then load the checkpoint weights into the fresh instance.
kwargs = model.kwargs
model_class = tables.model_classes.get(kwargs["model"])
if model_class is None:
    # Fail with a readable message instead of "'NoneType' object is not callable".
    raise KeyError(f'{kwargs["model"]} is not registered in funasr.register.tables')

# "model_conf" is applied first and the top-level kwargs second
# (presumably overriding on key conflicts -- confirm against deep_update).
model_conf = {}
deep_update(model_conf, kwargs.get("model_conf", {}))
deep_update(model_conf, kwargs)

# The vocabulary size must match the checkpoint. It was previously hard-coded
# to 512 although the checkpoint name says "vocab8404".
# NOTE(review): with ignore_init_mismatch=True below, shape mismatches are
# presumably skipped rather than raised, which would leave vocabulary-sized
# layers randomly initialised -- confirm against load_pretrained_model.
# Assumes AutoModel stored the tokenizer's token list under "token_list".
token_list = kwargs.get("token_list")
if token_list is None:
    raise ValueError(
        "model.kwargs has no 'token_list'; cannot determine vocab_size for the rebuilt model"
    )
vocab_size = len(token_list)
bimodel_new = model_class(**model_conf, vocab_size=vocab_size)

load_pretrained_model(
    model=bimodel_new,
    path="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pt",
    ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
    oss_bucket=kwargs.get("oss_bucket", None),
    scope_map=kwargs.get("scope_map", []),
    excludes=kwargs.get("excludes", None),
)

In [None]:
# Export the rebuilt model to ONNX by tracing it with a random dummy input.
sequence_length = 218
# Dummy input of shape [1, sequence_length, 560] (one utterance, 560 values per frame).
speech = torch.randn([1, sequence_length, 560])
# Same values and dtype as the legacy torch.Tensor([...]) constructor, but explicit.
# NOTE(review): lengths are float32 here and the ONNX Runtime cell feeds float32
# too; an integer dtype is the usual choice for lengths -- change both cells
# together if the model expects integers.
speech_lengths = torch.tensor([sequence_length], dtype=torch.float32)
bimodel = bimodel_new.to("cpu")
inputs = {"speech": speech, "speech_lengths": speech_lengths}
torch.onnx.export(
    bimodel,
    # torch.onnx.export documents `args` as a tuple (or tensor); a dict that is
    # the last element of the tuple is interpreted as named arguments.
    (inputs,),
    "BiCifParaformer.onnx.pb",
    export_params=True,
    input_names=["speech", "speech_lengths"],
    output_names=["results"],
    # Only the time axis of `speech` is dynamic; the batch axis stays fixed at 1.
    dynamic_axes={"speech": {1: "sequence_length"}},
)

In [None]:
import onnxruntime


def to_numpy(tensor):
    """Return ``tensor`` as a NumPy array on the CPU, detaching it first if it tracks gradients."""
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()


# Probe the exported graph with a 128-frame random input.
x = torch.randn([1, 128, 560])
ort_session = onnxruntime.InferenceSession("BiCifParaformer.onnx.pb")

# Bind the graph inputs by position: index 0 gets the features, index 1 the lengths.
graph_inputs = ort_session.get_inputs()
ort_inputs = {
    graph_inputs[0].name: to_numpy(x),
    graph_inputs[1].name: to_numpy(torch.Tensor([128])),
}

# Passing None as the output list returns every graph output.
ort_outs = ort_session.run(None, ort_inputs)

print(ort_outs)
# TODO: compare ONNX Runtime and PyTorch results once a reference `torch_out` is computed:
# np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)
