# Audio Classification

In [8]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification
import torch

In [2]:
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")



In [3]:
import librosa
audio_path = 'example.mp3'
y, sr = librosa.load(audio_path, sr=None)

### Sampling Rate Issues

Most ML models are trained on 16 kHz sampling rate, we will run into issues if we try to force our own sampling rate:

In [1]:
# ERROR!
# result = feature_extractor(y,sampling_rate=sr)

In [4]:
result = feature_extractor(y,return_tensors="pt")

It is strongly recommended to pass the `sampling_rate` argument to `ASTFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


In [5]:
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [6]:
prediction_logits = model(result['input_values']).logits

In [9]:
predicted_class_ids = torch.argmax(prediction_logits, dim=-1).item()

In [10]:
predicted_label = model.config.id2label[predicted_class_ids]

In [11]:
predicted_label

'Music'

### Pipeline for Audio Classification
Using a pipeline as high-level helper

In [3]:
from transformers import pipeline

In [5]:
pipe = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")

Device set to use cuda:0


In [6]:
pipe.model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=T

In [8]:
predictions = pipe("example.mp3")

In [9]:
type(predictions)

list

In [13]:
top_5 = predictions[:5]

In [14]:
top_5

[{'score': 0.4848681092262268, 'label': 'Music'},
 {'score': 0.19131147861480713, 'label': 'Violin, fiddle'},
 {'score': 0.08519703149795532, 'label': 'Musical instrument'},
 {'score': 0.046924445778131485, 'label': 'Bowed string instrument'},
 {'score': 0.04536091908812523, 'label': 'Orchestra'}]

In [33]:
for result in top_5:
    print(f"{result['label']:<25} {float(result['score']*100):>6.2f} %")

Music                      48.49 %
Violin, fiddle             19.13 %
Musical instrument          8.52 %
Bowed string instrument     4.69 %
Orchestra                   4.54 %
