In [1]:
import librosa as lb
import pandas as pd
import numpy as np
import os
from scipy.fftpack import dct, fft
from scipy.signal import periodogram
from librosa.feature import mfcc

In [2]:
# CSV file describing the training clips; it is read with pd.read_csv
TRAIN_CSV = "Train.csv"
# Root folder under which the per-species folders and the feature CSV files are written
OUTPUT = "Features"
# Number of values requested from each feature extractor for every clip
LENGTH = 2000

In [3]:
# Load the train csv file and preview its first rows
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an index
# that was saved into the file -- confirm whether it should be read with index_col=0
train_csv = pd.read_csv(TRAIN_CSV)
train_csv.head()

Unnamed: 0,ID,common_name
0,MBMG2C,Ring-necked Dove
1,K8LJSB,Ring-necked Dove
2,OGD9L6,Ring-necked Dove
3,581PCQ,Ring-necked Dove
4,P91M1F,Ring-necked Dove


In [4]:
# Create one folder per bird species under the OUTPUT folder. The features extracted
# from each audio file are saved under the folder of the corresponding species.
for bird in train_csv['common_name'].unique():
    # exist_ok=True keeps the cell safe to re-run and avoids the separate
    # "does it exist?" check, which could race with another process.
    os.makedirs(OUTPUT + rf"/{bird}", exist_ok=True)

In [5]:
# Features to extract
# MFCC
def MFCCs(sig, sr, n_mfcc):
    """Return the leading ``n_mfcc`` values of the flattened MFCC matrix of ``sig``.

    Parameters
    ----------
    sig : audio samples, forwarded to ``mfcc`` as ``y``.
    sr : sampling rate of ``sig``.
    n_mfcc : number of coefficients requested from ``mfcc``; also the maximum
        number of values returned.

    Returns
    -------
    Flattened array holding at most ``n_mfcc`` values.
    """
    coefficient_matrix = mfcc(y=sig, sr=sr, n_mfcc=n_mfcc)
    # NOTE(review): the matrix is flattened before it is truncated, so the kept
    # values are simply the first entries in flattening order, not necessarily
    # one value per coefficient -- confirm this is the intended feature.
    flattened = coefficient_matrix.flatten()
    return flattened[:n_mfcc]

# Power Spectral Density
def PSD(sig, sr, length):
    """Return the first ``length`` periodogram power-density values of ``sig``.

    Parameters
    ----------
    sig : signal samples.
    sr : sampling rate, passed to ``periodogram`` as ``fs``.
    length : maximum number of values to keep.

    Returns
    -------
    Flattened array with at most ``length`` density values; the frequency
    axis returned by ``periodogram`` is discarded.
    """
    _, density = periodogram(sig, fs=sr, scaling='density')
    flat_density = density.flatten()
    return flat_density[:length]

# Discrete Cosine Transform
def DCT(sig, length):
    """Return the first ``length`` DCT coefficients of ``sig``.

    Parameters
    ----------
    sig : signal samples, transformed with ``dct`` using its default settings.
    length : maximum number of coefficients to keep.

    Returns
    -------
    Flattened array with at most ``length`` coefficients.
    """
    coefficients = dct(sig).flatten()
    return coefficients[:length]

# Fast Fourier Transform
def FFT(sig, length):
    """Return the magnitudes of the first ``length`` FFT bins of ``sig``.

    Parameters
    ----------
    sig : signal samples, transformed with ``fft``.
    length : maximum number of bins to keep.

    Returns
    -------
    list
        At most ``length`` magnitudes (``sqrt(real**2 + imag**2)`` of each bin).
    """
    spectrum = fft(sig).flatten()
    # Truncate first and let NumPy compute the magnitudes in one vectorised
    # call, instead of looping in Python over every bin of the full spectrum
    # only to discard most of the results afterwards.
    magnitudes = np.abs(spectrum[:length])
    # A list is returned to keep the original return type.
    return list(magnitudes)

In [6]:
# One (initially empty) column per extracted feature, in the order used for the saved CSV files
features = {feature_name: [] for feature_name in ('mfcc', 'psd', 'dct', 'fft')}

In [None]:
# Extract the 4 features of every training clip and save them as one CSV file under the folder of the clip's species
for i, row in train_csv.iterrows():
    # Load audio file
    audio_file, sr = lb.load(rf"Train/{row['ID']}.mp3")
    # Each helper returns at most LENGTH values; the shared `features` dict is overwritten for every clip
    features['mfcc'] = MFCCs(audio_file, sr, LENGTH)
    features['psd'] = PSD(audio_file, sr, LENGTH)
    features['dct'] = DCT(audio_file, LENGTH)
    features['fft'] = FFT(audio_file, LENGTH)
    
    # NOTE(review): building the DataFrame presumably requires the four vectors to have
    # the same length, so a very short clip may fail here -- confirm
    feature_df = pd.DataFrame(features)
    # One CSV per clip, written to <OUTPUT>/<common_name>/<ID>.csv without the index column
    feature_df.to_csv(OUTPUT + rf"/{row['common_name']}/{row['ID']}.csv", index=False)

In [None]:
# Check that the total number of feature CSV files saved under OUTPUT (across all
# species folders) matches the number of rows in the train csv
counter = 0
for _, _, files in os.walk(OUTPUT):
    counter += sum(1 for name in files if name.endswith('.csv'))
if counter == len(train_csv):
    print('OK')
else:
    # Report the failure explicitly; a silent cell is easy to mistake for success.
    print(f"Mismatch: found {counter} feature files, expected {len(train_csv)}")