### How to use MatchboxNet to train a model that classifies audio keywords ('awɔ', 'ayi', 'foyi').

#### Installation

In [None]:
!pip install git+https://github.com/Panga-az/matchboxnet.git

#### Import modules

In [None]:
from matchboxnet.config import MatchboxNetConfig
from matchboxnet.model import MatchboxNetForAudioClassification
from matchboxnet.dataset import MatchboxNetDataset
from matchboxnet.feature_extraction import MatchboxNetFeatureExtractor
from datasets import load_dataset

#### Load the dataset from the Hugging Face Hub

In [None]:
# Load the dataset by its Hub repo id; the next cell reads its "train",
# "validation" and "test" splits.
ds = load_dataset('Panga-Azazia/Bambara-Keyword-Spotting-Aug')

#### Split dataset

In [None]:
# Pull out the three splits used in the rest of the notebook in a single assignment.
ds_train, ds_eval, ds_test = ds["train"], ds["validation"], ds["test"]

#### Build labels, num_classes, id2label and label2id

In [None]:
# Class names read from the training split's "label" feature; their order is
# what the label<->id mappings in the next cell are derived from.
labels = ds_train.features["label"].names 

print(f" labels: {labels}")

In [None]:
# Number of target classes, one per label name.
num_classes = len(labels)

# Forward mapping: label name -> integer id, following the order of `labels`.
label2id = dict(zip(labels, range(num_classes)))
# Reverse mapping, obtained by inverting the forward one.
id2label = {idx: name for name, idx in label2id.items()}

#### Create the model config

In [None]:
# Start from the default configuration and override only the label-related
# fields: the class count and the two label<->id mappings built above.
config = MatchboxNetConfig(
    num_classes=num_classes,
    id2label = id2label,
    label2id =label2id,
)

In [None]:
# Feature extractor created with its default settings (no arguments are passed).
fe = MatchboxNetFeatureExtractor()

In [None]:
# Save the model config and the feature extractor side by side in the
# "matchboxnet" directory, which is also the Trainer's output_dir further down.
config.save_pretrained("matchboxnet")
fe.save_pretrained("matchboxnet")

#### Prepare datasets for training

In [None]:
# Wrap each raw split in a MatchboxNetDataset that shares the model config.
# Only the training split is built with augment=True; validation and test
# are built with augment=False.
train_ds = MatchboxNetDataset(ds_train, config = config, augment=True)
eval_ds = MatchboxNetDataset(ds_eval, config = config, augment=False)
test_ds = MatchboxNetDataset(ds_test, config = config, augment=False)

#### Create the model

In [None]:
import torch

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Build a freshly initialised classifier from the config (no pretrained weights
# are loaded here) and move it to the device selected above.
model = MatchboxNetForAudioClassification(config=config)
model.to(device)

#### Define the compute_metrics function


In [None]:
import evaluate

# Loaded once at cell level so every evaluation pass reuses the same metric object.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The logits
            must support ``argmax(axis=1)`` (assumed to be an array of shape
            (n_examples, n_classes) — TODO confirm).

    Returns:
        The result of ``accuracy_metric.compute`` for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    # The highest-scoring class along axis 1 is taken as the prediction.
    predicted_ids = scores.argmax(axis=1)
    return accuracy_metric.compute(references=references, predictions=predicted_ids)

#### Configure and run training

In [None]:
# --- Hub destination -------------------------------------------------------
hub_id = "Panga-Azazia/matchboxnet3x2x64-google-speech_commands-1" #change to yours

# --- Optimisation ----------------------------------------------------------
# batch_size is used as the per-device batch size for both training and
# evaluation; with 16 accumulation steps one optimizer update covers
# 4096 * 16 samples per device.
epochs = 1000
batch_size = 4096
gradient_accumulation_steps = 16

# --- Evaluation / logging / checkpoint cadence (all counted in steps) -------
eval_steps = 100
logging_steps = 100
save_steps = 100
save_total_limit = 3

# --- Data loading ----------------------------------------------------------
dataloader_num_workers = 12

##### Log in to the Hugging Face Hub with your access token

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hub; required because the
# training arguments below set push_to_hub=True.
notebook_login()

In [None]:
import torch
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    # Checkpoints go to the same directory the config and feature extractor
    # were saved into earlier.
    output_dir="./matchboxnet",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Evaluate, log and checkpoint on a step schedule. save_steps and eval_steps
    # are kept aligned because load_best_model_at_end needs a checkpoint at
    # each evaluation point.
    eval_strategy="steps",
    eval_steps=eval_steps,
    logging_dir="./matchboxnet/logs",
    logging_steps=logging_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    # Upload to the Hub only once, at the end of training.
    push_to_hub=True,
    hub_model_id=hub_id,
    hub_strategy="end",
    report_to=["tensorboard"],
    # Model selection (and early stopping) track the "accuracy" value returned
    # by compute_metrics; higher is better.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # fp16 mixed precision targets GPU training. The notebook explicitly
    # supports a CPU fallback, where an unconditional fp16=True is rejected or
    # unsupported, so enable it only when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    seed=0,
    # The wrapped datasets deliver model-ready items; keep every column.
    remove_unused_columns=False,
    do_predict=True,
    do_train=True,
    dataloader_num_workers=dataloader_num_workers,
    # NOTE(review): Trainer does not act on this field by itself. To really
    # resume, pass resume_from_checkpoint to trainer.train(). Kept as-is so the
    # stored arguments are unchanged.
    resume_from_checkpoint=True,
)

In [None]:
# Assemble the Trainer: arguments first, then the model, its data, the metric
# function and the callbacks.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 3, measured on the metric selected by
    # metric_for_best_model in the training arguments.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Run the training loop with the Trainer configured in the previous cell.
trainer.train()

##### Predict on your test dataset

In [None]:
# Run the trained model on the test split, which was not used during training
# or for the step-wise evaluations.
trainer.predict(test_ds)

##### Push your model

In [None]:
# Upload the model to the Hub repository named by hub_id (set via hub_model_id).
trainer.push_to_hub()

### How to use the model with real examples

In [None]:
import glob

In [None]:
# Collect the entries of the three example folders, one list per folder.
# glob is called without recursive=True, so "**" matches like a plain "*":
# only the direct children of each folder are returned.
oui, non, rien = map(glob.glob, ("./oui/**", "./non/**", "./rien/**"))


In [None]:
# Concatenate the three path lists, in oui, non, rien order.
# NOTE(review): this name rebinds the built-in all() for the rest of the
# session. The cells below read the list under this name, so renaming it
# requires changing those cells at the same time.
all = oui  + non + rien

In [None]:
# Display the collected paths as a quick visual check before running inference.
all

In [None]:
from matchboxnet.model import MatchboxNetForAudioClassification
from matchboxnet.feature_extraction import MatchboxNetFeatureExtractor

# Load the published model and its feature extractor from the same Hub
# repository, so the preprocessing comes from the repo that holds the weights.
model = MatchboxNetForAudioClassification.from_pretrained("Panga-Azazia/matchboxnet3x2x64-bambara-a-c")
feature_extractor = MatchboxNetFeatureExtractor.from_pretrained("Panga-Azazia/matchboxnet3x2x64-bambara-a-c")

In [None]:
!pip install sounddevice

In [None]:
# Turn the whole list of file paths into one batch of model inputs.
batch = feature_extractor(
    all,
    # Ask for PyTorch tensors so the result can be fed to the model directly.
    return_tensors="pt"
)


In [None]:
# Stash the source paths on the batch so the playback loop can pair every
# prediction with its file. NOTE(review): this entry is bookkeeping, not a
# model input.
batch["paths"] = all 

In [None]:
import torch
import sounddevice as sd
import torchaudio
import time

# Forward only the feature extractor's outputs. "paths" was added to `batch`
# purely for bookkeeping and is not a tensor, so it is kept out of the model
# call instead of being passed along as an unexpected keyword argument.
model_inputs = {key: value for key, value in batch.items() if key != "paths"}

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**model_inputs)

# Highest-scoring class id per example.
preds = outputs.logits.argmax(-1)
# Normalise the id2label keys to int (they may be strings after loading the
# config) so they can be indexed with the predicted integer ids.
model.config.id2label = {int(k): v for k, v in model.config.id2label.items()}
labels = [model.config.id2label[class_id] for class_id in preds.tolist()]

for path, label in zip(batch["paths"], labels):
    print(f"{path} → {label}")
    waveform, sr = torchaudio.load(path)
    # torchaudio yields (channels, frames) while sounddevice reads 2-D input as
    # (frames, channels). Transposing plays mono and multi-channel files
    # correctly, whereas squeeze() only happened to work for mono.
    sd.play(waveform.t().contiguous().numpy(), samplerate=sr)
    sd.wait()  # block until this clip has finished playing
    time.sleep(0.5)  # short pause between clips