In [None]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-q8hj2vp0
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-q8hj2vp0
  Resolved https://github.com/huggingface/transformers to commit b46bd8b9d2ac991c0c04674957ebc0a65fb3f42b
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [None]:
!pip install datasets




# Load the MINDS-14 Dataset
In this step, we load the MINDS-14 dataset with the Hugging Face `datasets` library. MINDS-14 contains spoken language understanding (SLU) recordings in several languages and dialects; here we load the training split of the English (Australia) subset (`en-AU`).

In [None]:
from datasets import load_dataset

# Training split of the "en-AU" configuration of PolyAI/minds14.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-AU",
    split="train",
)

# Access Audio Data from the MINDS-14 Dataset
In this step, we will extract the audio data from the MINDS-14 dataset and access a sample from the training split. This will help us understand the structure of the data and how to work with individual audio samples.

In [None]:
# Select the "audio" column of the loaded split.
# NOTE(review): presumably this touches every clip in the split; if only one
# sample is needed, minds[0]["audio"] may be cheaper — confirm.
train_dataset = minds["audio"]

# First entry of that column, i.e. the audio of sample 0.
audio_input = train_dataset[0]


# Set Up an Audio Classification Pipeline
In this step, we will set up an audio classification pipeline using the transformers library from Hugging Face. This pipeline will use a pre-trained model specifically designed for audio classification tasks. We'll configure the pipeline to use the xtreme_s_xlsr_300m_minds14 model.

In [None]:
from transformers import pipeline

# Build the audio-classification pipeline from the
# anton-l/xtreme_s_xlsr_300m_minds14 checkpoint.
classifier = pipeline(
    task="audio-classification",
    model="anton-l/xtreme_s_xlsr_300m_minds14",
)

Some weights of the model checkpoint at anton-l/xtreme_s_xlsr_300m_minds14 were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at anton-l/xtreme_s_xlsr_300m_minds14 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos

In [None]:
# Classify the audio of the first sample; as the cell's last expression, the
# returned list of predictions is displayed.
first_clip = minds[0]["audio"]
classifier(first_clip)

[{'score': 0.9611983895301819, 'label': 'pay_bill'},
 {'score': 0.0296021718531847, 'label': 'freeze'},
 {'score': 0.0035503290127962828, 'label': 'card_issues'},
 {'score': 0.002132321475073695, 'label': 'abroad'},
 {'score': 0.000882967549841851, 'label': 'high_value_payment'}]

In [None]:
# Classify the first sample again and print the first returned prediction:
# its label on one line, its score on the next.
prediction = classifier(minds[0]["audio"])
print(prediction[0]["label"], prediction[0]["score"], sep="\n")

pay_bill
0.9611983895301819


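# Load the Speech Commands Dataset
Next, we switch to a different dataset and model. In this step, we stream the validation split of the Speech Commands dataset (version `v0.02`) and take its first sample.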


In [None]:
# Stream the validation split of Speech Commands v0.02 instead of downloading
# it up front. The dataset repository ships custom loading code;
# trust_remote_code=True accepts it explicitly so the cell does not stop at an
# interactive [y/N] prompt when the notebook is run top to bottom.
# NOTE(review): this executes code from the dataset repository — inspect it at
# https://hf.co/datasets/speech_commands before trusting it.
speech_commands = load_dataset(
    "speech_commands",
    "v0.02",
    split="validation",
    streaming=True,
    trust_remote_code=True,
)

# Take only the first example from the stream.
sample = next(iter(speech_commands))

Downloading builder script:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

The repository for speech_commands contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/speech_commands.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


# Classify Audio
In this step, we will set up an audio classification pipeline using a different pre-trained model and make a prediction on a sample from our dataset. We will use the MIT/ast-finetuned-speech-commands-v2 model for this task.

In [None]:
# Rebind `classifier` to a pipeline built from the
# MIT/ast-finetuned-speech-commands-v2 checkpoint.
classifier = pipeline(
    task="audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
)

# Classify a copy of the sample's audio entry and print the label of the first
# returned prediction.
prediction = classifier(sample["audio"].copy())
print(prediction[0]["label"])

backward


In [None]:
from IPython.display import Audio

# Inline player for the sample's waveform, played back at the sample's own
# sampling rate.
Audio(
    data=sample["audio"]["array"],
    rate=sample["audio"]["sampling_rate"],
)

# Convert Stereo Audio to Mono
In this step, we will define a function to convert a stereo audio file to a mono audio file and save the result. We will use the librosa library to load the audio and soundfile to save the converted audio.

In [None]:
import librosa
import soundfile as sf

def stereo_to_mono(input_file, output_file):
  """Converts an audio file to mono and saves it as a new file.

  The file is loaded at its native sampling rate (``sr=None``) and written
  back out at that same rate. Down-mixing is delegated to ``librosa.load``
  via ``mono=True``; an input that is already mono goes through the same path.

  Args:
    input_file: Path to the input audio file (stereo or mono).
    output_file: Path to the output mono audio file.
  """

  # Load and down-mix in one step. mono=True is librosa's default, so the
  # loaded signal is always single-channel; it is spelled out here because the
  # whole purpose of this function depends on it. A manual
  # `(audio[:, 0] + audio[:, 1]) / 2` branch is therefore never needed, and it
  # would index the wrong axis anyway: librosa returns multi-channel audio as
  # (channels, samples).
  audio_mono, sample_rate = librosa.load(input_file, sr=None, mono=True)

  # Save the mono audio file
  sf.write(output_file, audio_mono, sample_rate)

if __name__ == "__main__":
  input_file = "/content/happy.wav"
  # Absolute path, so the file lands where the later cells read it from
  # ("/content/mono_audio.wav") regardless of the current working directory.
  output_file = "/content/mono_audio.wav"
  stereo_to_mono(input_file, output_file)


# Display the Mono Audio File
In this step, we will load and display the mono audio file using IPython's audio display capabilities. This allows us to listen to the audio directly within the notebook.

In [None]:
import IPython.display as ipd

# Path of the audio file to play back (replace with your audio file path)
audio_file = "/content/mono_audio.wav"

# Show an inline audio player for that file
ipd.Audio(audio_file)


# Resample and Classify Audio
In this step, we will load an audio file, resample it to 16 kHz if its sampling rate differs, and then classify it using the pre-trained classifier. Resampling ensures that the audio matches the sampling rate the model is assumed to expect.

In [None]:
import soundfile as sf

# Read the samples and the sampling rate of the mono file
audio_data, sample_rate = sf.read("/content/mono_audio.wav")

# The model is assumed to expect 16 kHz input: keep the samples untouched when
# the file already has that rate, otherwise resample them to it
audio_data = (
    audio_data
    if sample_rate == 16000
    else librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
)

# Run the classifier and print the label of the first returned prediction
prediction = classifier(audio_data)
print(prediction[0]["label"])


happy
