## Audio Processing using Python

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import librosa

In [None]:
# Input clip for this section; a relative path, so it is resolved against
# the notebook's working directory.
audio_path = "audio.mp3"

In [None]:
# Decode the file into a sample array `y` and its sampling rate `sr`.
# sr=None asks librosa to keep the file's native rate instead of resampling.
y, sr = librosa.load(audio_path, sr=None)

In [None]:
# Echo the sample array returned by librosa.load.
y

In [None]:
# Peek at the first 100 samples.
y[:100]

In [None]:
# Dimensions of the loaded sample array.
y.shape

In [None]:
# Sampling rate reported by librosa.load.
sr

In [None]:
# Time-domain view of the clip: amplitude against sample index.
fig, ax = plt.subplots(figsize=(14, 5), dpi=150)
ax.plot(y)
ax.set_xlabel("Time - Samples")
ax.set_ylabel("Amplitude");  # trailing ';' hides the returned Text repr

In [None]:
# Clip duration in seconds: sample count divided by samples per second
# (assumes `y` is a 1-D mono signal, so len(y) is the number of samples).
len(y) / sr

In [None]:
from IPython.display import Audio

In [None]:
# Inline player for the in-memory samples, played back at their sampling rate.
Audio(data=y, rate=sr)

In [None]:
# DFT - Discrete Fourier Transform
# Taper the whole clip with a Hann window of the same length, then take the
# one-sided FFT of the (real-valued) windowed signal.
n_samples = len(y)
window = np.hanning(n_samples)
windowed_input = np.multiply(y, window)
dft = np.fft.rfft(windowed_input)

In [None]:
# Echo the complex spectrum returned by np.fft.rfft.
dft

In [None]:
# `dft` is complex. Passing it straight to plt.plot() keeps only the real
# part (NumPy emits a ComplexWarning), so draw both components explicitly.
plt.plot(dft.real, label="Real part")
plt.plot(dft.imag, label="Imaginary part")
plt.legend()
plt.title("Discrete Fourier Transform");

In [None]:
# Magnitude of each complex DFT bin.
amplitude = np.abs(dft)

In [None]:
# Magnitude spectrum on a linear scale, plotted against bin index.
plt.plot(amplitude);

In [None]:
# Convert magnitudes to decibels. ref=np.max measures every bin relative to
# the largest one, so the peak maps to 0 dB and everything else is negative.
amplitude_db = librosa.amplitude_to_db(amplitude, ref=np.max)

In [None]:
# Centre frequency (Hz) of each bin. n_fft=len(y) matches the single
# full-length rfft taken earlier, so this array lines up one-to-one with `dft`.
frequency = librosa.fft_frequencies(sr=sr, n_fft=len(y))

In [None]:
# Magnitude spectrum in dB against frequency, with a logarithmic x-axis.
fig, ax = plt.subplots(figsize=(15, 4), dpi=150)
ax.plot(frequency, amplitude_db)
ax.set_xlabel("Freq Hz")
ax.set_ylabel("Amp dB")
ax.set_xscale("log");  # logarithm

In [None]:
# Short-time Fourier transform of the clip using librosa's default
# frame and hop settings.
D = librosa.stft(y)
# Echo the resulting complex array.
D

In [None]:
# STFT magnitude in decibels, measured relative to its largest value.
D_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

In [None]:
# Explicit submodule import: older librosa releases do not expose
# `librosa.display` as an attribute after a bare `import librosa`.
import librosa.display

# Spectrogram of the STFT in dB: time on x, log-spaced frequency on y.
plt.figure(figsize=(14, 5), dpi=150)
librosa.display.specshow(D_db, sr=sr, x_axis="time", y_axis="log")
plt.colorbar(format="%+2.0f dB");  # ';' keeps the Colorbar repr out of the output

In [None]:
# Explicit submodule import: older librosa releases do not expose
# `librosa.display` as an attribute after a bare `import librosa`.
import librosa.display

# Mel-scaled power spectrogram: 128 mel bands covering up to 8 kHz.
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
# Power -> decibels, relative to the strongest bin.
S_dB = librosa.power_to_db(S, ref=np.max)

plt.figure(figsize=(14, 5), dpi=150)
# The rows of S_dB are mel bands, not linear FFT bins, so the y-axis must be
# declared as "mel" (with the same fmax used above) for the frequency ticks
# to be correct; "log" would label the bands as if they were FFT bins.
librosa.display.specshow(S_dB, sr=sr, x_axis="time", y_axis="mel", fmax=8000)
plt.colorbar(format="%+2.0f dB");  # ';' keeps the Colorbar repr out of the output

## Import required libraries - Audio Classification

In [None]:
import torch
import transformers
import torchaudio

# Report the installed versions, one per line, for reproducibility.
print(
    torch.__version__,
    transformers.__version__,
    torchaudio.__version__,
    sep="\n",
)

## Audio Classification

In [None]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification

In [None]:
import librosa

In [None]:
# Same clip as the first section; repeated here, presumably so this section
# can be run on its own.
audio_path = "audio.mp3"

In [None]:
# Decode the clip at its native sampling rate (sr=None disables resampling).
y, sr = librosa.load(audio_path, sr=None)

In [None]:
# Echo the loaded sample array.
y

In [None]:
# Sampling rate of the loaded clip.
sr

In [None]:
from IPython.display import Audio

In [None]:
# Inline player for the file on disk.
# NOTE(review): a path string is passed through `data=`; `filename=audio_path`
# would state the intent explicitly — confirm IPython resolves it as a path.
Audio(data=audio_path)

In [None]:
# Load the preprocessing configuration published with this AST checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [None]:
# `y` was loaded at the file's native rate, but the feature extractor is
# configured for one specific rate (`feature_extractor.sampling_rate`).
# Resample to that rate when they differ, and pass the rate explicitly so the
# extractor can validate it instead of silently assuming it is correct.
target_sr = feature_extractor.sampling_rate
if sr == target_sr:
    y_resampled = y
else:
    y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

# return_tensors="pt" yields PyTorch tensors ready to feed to the model.
result = feature_extractor(y_resampled, sampling_rate=target_sr, return_tensors="pt")

In [None]:
# Inspect the tensor the feature extractor produced for the model.
result["input_values"]

In [None]:
# Load the AST audio-classification model from the same checkpoint name that
# the feature extractor was loaded from.
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [None]:
# Echo the model to inspect its structure.
model

In [None]:
# The model input once more, shown right before running inference.
result["input_values"]

In [None]:
# Inference only: disable autograd so no computation graph is built and the
# logits come back without gradient tracking.
with torch.no_grad():
    prediction_logits = model(result["input_values"]).logits

In [None]:
# Raw (unnormalized) class scores returned by the model.
prediction_logits

In [None]:
# Index of the highest-scoring class along the last axis. .item() converts a
# one-element tensor to a Python int, so this assumes a single clip per batch.
predicted_class_ids = torch.argmax(prediction_logits, dim=-1).item()

In [None]:
# Echo the winning class index.
predicted_class_ids

In [None]:
# Mapping from class index to label, taken from the model configuration.
model.config.id2label

In [None]:
# Label for the predicted class index.
model.config.id2label[predicted_class_ids]

## Converting Audio to Text

In [None]:
from transformers import pipeline

In [None]:
# Build a speech-recognition pipeline.
# NOTE(review): no model is named, so transformers falls back to its default
# checkpoint for the task — consider pinning one for reproducible results.
pipe = pipeline("automatic-speech-recognition")

In [None]:
# Echo the pipeline object.
pipe

In [None]:
# Transcribe the clip; the pipeline is given the file path directly.
pipe("audio.mp3")

## Converting Text to Audio

In [None]:
from transformers import pipeline
# Rebinds `pipe`: from here on it is the text-to-speech pipeline, replacing
# the speech-recognition pipeline created earlier in the notebook.
# NOTE(review): no model is named, so the task's default checkpoint is used.
pipe = pipeline("text-to-speech")

In [None]:
# Sentence to synthesize.
text = "I like coding in python"

In [None]:
# Echo the input sentence.
text

In [None]:
# Run text-to-speech; the cells below read the "audio" and "sampling_rate"
# entries of the result.
output = pipe(text)

In [None]:
# Echo the full pipeline result.
output

In [None]:
# Waveform samples of the synthesized speech.
output["audio"]

In [None]:
# Dimensions of the synthesized waveform array.
output["audio"].shape

In [None]:
# Plot the synthesized waveform. squeeze() removes length-1 axes, presumably
# so that a single-channel result is drawn as one 1-D curve.
plt.plot(output["audio"].squeeze());

In [None]:
from IPython.display import Audio

In [None]:
# Inline player for the synthesized speech at the rate the pipeline reported.
Audio(data=output["audio"], rate=output["sampling_rate"])

In [None]:
from pydub import AudioSegment

In [None]:
# Local import so this cell does not depend on an earlier cell's alias.
import numpy as np

# AudioSegment's raw-data constructor reads the bytes as signed-integer PCM of
# `sample_width` bytes per sample. Handing it the bytes of a floating-point
# waveform (with the float's itemsize as the width) is therefore decoded as
# garbage and exports as noise. Convert float samples to 16-bit PCM first.
samples = np.asarray(output["audio"]).squeeze()
if np.issubdtype(samples.dtype, np.floating):
    # Assumes float audio is normalized to [-1.0, 1.0]; clip so that any
    # overshoot saturates instead of wrapping around on the integer cast.
    int16_peak = np.iinfo(np.int16).max
    samples = (np.clip(samples, -1.0, 1.0) * int16_peak).astype(np.int16)

audio_seg = AudioSegment(samples.tobytes(),
                         frame_rate=output["sampling_rate"],
                         sample_width=samples.dtype.itemsize,
                         channels=1)

In [None]:
# Encode the segment as MP3 and write it to the working directory.
# NOTE(review): pydub delegates MP3 encoding to an external tool (ffmpeg) —
# confirm it is installed in the runtime environment.
audio_seg.export("my_audio_saved.mp3", format="mp3")