In [1]:
import torch
import sys
new_path = 'third_part/soxan'
sys.path.append(new_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.models import Wav2Vec2ForSpeechClassification
pretrain_model_path = 'soxan_checkpoints'

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = AutoConfig.from_pretrained(pretrain_model_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrain_model_path)
sampling_rate = feature_extractor.sampling_rate

# for wav2vec
model = Wav2Vec2ForSpeechClassification.from_pretrained(pretrain_model_path).to(device)



def speech_file_to_array_fn(wave, sampling_rate):
    # speech_array, _sampling_rate = torchaudio.load(path)
    speech_array = wave
    resampler = torchaudio.transforms.Resample(16800, sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech


def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    print(speech.shape, inputs['input_values'].shape, inputs.keys())
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        features = model.extract_feature(**inputs)
        logits = model(**inputs).logits

    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in
               enumerate(scores)]
    return outputs, features


path = "data/Data/genres_original/blues/blues.00000.wav"
audio_input_16khz = torch.randn(1, 1024*16)
outputs, features = predict(audio_input_16khz, sampling_rate)

(15604,) torch.Size([1, 15604]) dict_keys(['input_values'])
