In [26]:
from transformers import pipeline
import torch

# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Audio-classification pipeline used for wake-word detection.
classifier = pipeline(
    "audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
    device=device,
)

In [27]:
# Look up the label for class index 27 of the wake-word classifier.
classifier.model.config.id2label[27]

'marvin'

In [28]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live


def launch_fn(
    wake_word="marvin",
    prob_threshold=0.5,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
):
    """Listen on the microphone until the wake word is detected.

    Microphone audio is streamed through the module-level ``classifier``
    pipeline and only the top-ranked label of each prediction is inspected.

    Args:
        wake_word: Label to wait for; must be a key of
            ``classifier.model.config.label2id``.
        prob_threshold: Score the wake-word prediction must exceed.
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live``.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live``.
        debug: When true, print the top prediction for every chunk.

    Returns:
        ``True`` as soon as the top prediction is ``wake_word`` with a score
        above ``prob_threshold``; ``False`` if the prediction stream ends
        without a detection.

    Raises:
        ValueError: If ``wake_word`` is not one of the classifier's labels.
    """
    if wake_word not in classifier.model.config.label2id:
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
        )

    sampling_rate = classifier.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for wake word...")
    try:
        for prediction in classifier(mic):
            top_prediction = prediction[0]
            if debug:
                print(top_prediction)
            if (
                top_prediction["label"] == wake_word
                and top_prediction["score"] > prob_threshold
            ):
                return True
    finally:
        # Close the microphone generator now rather than leaving it to garbage
        # collection, so a follow-up capture does not compete with a stale one.
        mic.close()

    # The stream ended without the wake word being heard.
    return False

In [29]:
# Wait for the default wake word, printing the top prediction for each chunk.
launch_fn(debug=True)

Listening for wake word...
Found microphone Microphone (High Definition Audio Device)


  waveform = torch.from_numpy(waveform).unsqueeze(0)


{'score': 0.05077214539051056, 'label': 'two'}
{'score': 0.05789390951395035, 'label': 'two'}
{'score': 0.07772436738014221, 'label': 'up'}
{'score': 0.11682263016700745, 'label': 'off'}
{'score': 0.12027334421873093, 'label': 'off'}
{'score': 0.11932394653558731, 'label': 'off'}
{'score': 0.1193239763379097, 'label': 'off'}
{'score': 0.1193239837884903, 'label': 'off'}
{'score': 0.09795408695936203, 'label': 'off'}
{'score': 0.26684102416038513, 'label': 'three'}
{'score': 0.18778195977210999, 'label': 'seven'}
{'score': 0.18778195977210999, 'label': 'seven'}
{'score': 0.18778195977210999, 'label': 'seven'}
{'score': 0.18778161704540253, 'label': 'seven'}
{'score': 0.9975855350494385, 'label': 'marvin'}


True

In [22]:
import numpy as np

# Audio-classification pipeline used to classify the spoken request.
intent_class_pipe = pipeline(
    "audio-classification",
    model="anton-l/xtreme_s_xlsr_minds14",
    device=device,
)

# Listens in 2-second chunks (at most 5). If a chunk is detected as silence, the loop stops early; all recorded chunks are then concatenated into a single audio array.

def listen(chunk_length_s=2.0, stream_chunk_s=2.0, max_chunks=5, silence_threshold=0.7):
    """Record a spoken request from the microphone and classify it.

    Chunks are collected until one is reported as silence by ``is_silence`` or
    ``max_chunks`` chunks have been recorded. The collected chunks are then
    concatenated and classified with the module-level ``intent_class_pipe``.

    Args:
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live``.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live``.
        max_chunks: Upper bound on the number of chunks to record.
        silence_threshold: ``threshold`` passed to ``is_silence``.

    Returns:
        The top-ranked prediction dict (``"score"`` and ``"label"``), which is
        also printed.

    Raises:
        ValueError: If ``max_chunks`` is smaller than 1.
        RuntimeError: If the microphone stream yields no audio at all.
    """
    if max_chunks < 1:
        raise ValueError("max_chunks must be at least 1.")

    sampling_rate = intent_class_pipe.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
        # The library's default stride (chunk_length_s / 6 per side, per its
        # documentation) makes consecutive chunks overlap; request no stride so
        # the concatenated audio does not repeat samples at chunk boundaries.
        stride_length_s=0,
    )
    audio_buffer = []

    print("Listening")
    try:
        for audio_chunk in mic:
            audio_buffer.append(audio_chunk["raw"])

            if is_silence(audio_chunk["raw"], threshold=silence_threshold):
                print("Silence detected, processing audio.")
                break

            if len(audio_buffer) >= max_chunks:
                break
    finally:
        # Close the microphone generator now instead of waiting for garbage
        # collection.
        mic.close()

    if not audio_buffer:
        raise RuntimeError("No audio was captured from the microphone.")

    combined_audio = np.concatenate(audio_buffer)
    prediction = intent_class_pipe(combined_audio)[0]
    print(prediction)
    return prediction
    
def is_silence(audio_chunk, threshold, reference_rms=0.02):
    """Decide whether an audio chunk is silent, based on its signal energy.

    The intent classifier's label set (see the ``id2label`` listing in this
    notebook) contains no "silence" class, so asking that model for a
    "silence" label can never succeed. The chunk's RMS amplitude is used
    instead and turned into a quietness score in ``[0, 1]``.

    Args:
        audio_chunk: One-dimensional array of audio samples.
            Assumes float samples scaled to [-1, 1] -- TODO confirm if the
            microphone capture format is changed.
        threshold: Quietness score (0 = at or above ``reference_rms``,
            1 = digital silence) that must be exceeded for the chunk to count
            as silent.
        reference_rms: RMS amplitude treated as "clearly not silent". This is
            a heuristic default; tune it for the microphone in use.

    Returns:
        ``True`` if the chunk is empty or its quietness score exceeds
        ``threshold``; ``False`` otherwise.
    """
    samples = np.asarray(audio_chunk, dtype=np.float64)
    if samples.size == 0:
        # Nothing was recorded, so there is no signal to keep listening to.
        return True

    rms = float(np.sqrt(np.mean(np.square(samples))))
    quietness = 1.0 - min(rms / reference_rms, 1.0)
    return quietness > threshold

Some weights of the model checkpoint at anton-l/xtreme_s_xlsr_minds14 were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at anton-l/xtreme_s_xlsr_minds14 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embe

In [23]:
# Record from the microphone and print the predicted intent.
listen()

Listening
Found microphone Microphone (High Definition Audio Device)
{'score': 0.3211755156517029, 'label': 'address'}


In [25]:
# Show the id-to-label mapping of the intent classifier.
intent_class_pipe.model.config.id2label

{0: 'abroad',
 1: 'address',
 2: 'app_error',
 3: 'atm_limit',
 4: 'balance',
 5: 'business_loan',
 6: 'card_issues',
 7: 'cash_deposit',
 8: 'direct_debit',
 9: 'freeze',
 10: 'high_value_payment',
 11: 'joint_account',
 12: 'latest_transactions',
 13: 'pay_bill'}

In [32]:
# Full flow: wait for the wake word first, then record and classify the request.
launch_fn(debug=True)
listen()

Listening for wake word...
Found microphone Microphone (High Definition Audio Device)
{'score': 0.052111607044935226, 'label': 'two'}
{'score': 0.0628180131316185, 'label': 'two'}
{'score': 0.19197994470596313, 'label': 'five'}
{'score': 0.9999701976776123, 'label': 'marvin'}
Listening
Found microphone Microphone (High Definition Audio Device)
{'score': 0.9997138381004333, 'label': 'pay_bill'}
