In [None]:
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pickle
import joblib
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers
import tensorflow as tf

In [None]:
# Switch the working directory to the competition's input folder so that the
# relative paths used in this notebook ('train_tp.csv', 'train/...') resolve.
# NOTE(review): this is a process-wide side effect; every later relative path depends on it.
os.chdir('/kaggle/input/rfcx-species-audio-detection')
# Load the annotation table; later cells read its recording_id, t_min and t_max columns.
df = pd.read_csv('train_tp.csv')

In [None]:
# Quick look at the first five annotation rows (rich display as the cell's last expression).
df.head(n=5)

In [None]:
# Use the librosa package to load and display an audio file like this:

sample_num = 3  # pick a file (row of df) to display
# get the filename of the recording for that row
filename = df.recording_id[sample_num] + '.flac'
# beginning and end time of the annotated signal within the recording
tstart = df.t_min[sample_num]
tend = df.t_max[sample_num]
# load the file; with no explicit sr argument librosa resamples to its default rate
y, sr = librosa.load('train/' + filename)
# librosa.display.waveplot was removed in librosa 0.10 in favour of waveshow,
# so use whichever one the installed version provides. The time axis is the
# default for both functions, so it does not need to be requested explicitly.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plot_waveform(y, sr=sr, color='cyan')

In [None]:
# The audio features have to be arranged so that they look like an image.
# Use a single channel for a grayscale-style image (one feature) or three channels for a color-style image (to represent multiple features).
# Scale and pad the audio features so that every “channel” is the same size.


#This code was adapted from Nicolas Gervais on https://stackoverflow.com/questions/59241216/padding-numpy-arrays-to-a-specific-size on 1/10/2021
def padding(array, xx, yy):
    """
    Zero-pad a 2-D array, roughly centred, up to a target height and width.

    The padding is split between both sides of each axis; when the amount is
    odd, the extra row/column goes to the bottom/right. An axis that is already
    at least as large as its target is left untouched (the array is never
    cropped), so the result can be larger than (xx, yy).

    :param array: 2-D numpy array
    :param xx: desired height (number of rows)
    :param yy: desired width (number of columns)
    :return: padded array
    """
    h = array.shape[0]
    w = array.shape[1]

    # Rows added above / below; clamped at 0 so oversized inputs are not an error.
    a = max((xx - h) // 2, 0)
    aa = max(0, xx - a - h)

    # Columns added left / right, clamped the same way.
    b = max(0, (yy - w) // 2)
    bb = max(yy - b - w, 0)

    return np.pad(array, pad_width=((a, aa), (b, bb)), mode='constant')

In [None]:
# Report the final shapes of the train and test feature arrays.
# NOTE(review): X_train / X_test are not defined in the cells visible above — confirm
# they are created before this cell when running on a fresh kernel.
print(*(split.shape for split in (X_train, X_test)))

In [None]:
def generate_features(y_cut, max_size=1000):
    """
    Build a 3-channel, image-like feature array from one audio clip.

    Channel 0 stacks the row-normalised spectral bandwidth, spectral centroid
    and chroma rows repeatedly until it is 128 rows high, channel 1 is the
    magnitude STFT and channel 2 holds 128 MFCCs. Every channel is zero-padded
    to ``max_size`` columns with ``padding``.

    Reads the notebook globals ``sr`` (sample rate of the clip) and
    ``n_fft`` / ``hop_length`` (MFCC analysis settings).
    NOTE(review): ``n_fft`` and ``hop_length`` are not defined in the cells
    shown with this function — confirm they are set before it is called.

    :param y_cut: audio time series of the clip
    :param max_size: number of columns (frames) each channel is padded to
    :return: array of shape (128, max_size, 3)
    :raises ValueError: if any feature has more than ``max_size`` frames
    """
    n_rows = 128  # common height of all three channels

    # n_fft=255 gives 1 + 255 // 2 = 128 frequency bins, i.e. exactly n_rows.
    stft = padding(np.abs(librosa.stft(y_cut, n_fft=255, hop_length=512)), n_rows, max_size)
    # y is passed by keyword because librosa >= 0.10 made it keyword-only;
    # sr is passed so the MFCCs use the same sample rate as the other features.
    MFCCs = padding(
        librosa.feature.mfcc(y=y_cut, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mfcc=n_rows),
        n_rows, max_size)
    spec_centroid = librosa.feature.spectral_centroid(y=y_cut, sr=sr)
    chroma_stft = librosa.feature.chroma_stft(y=y_cut, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y_cut, sr=sr)

    # Normalise and pad each summary feature once; the rows are reused below.
    bw_row = padding(normalize(spec_bw), 1, max_size)
    centroid_row = padding(normalize(spec_centroid), 1, max_size)
    chroma_rows = padding(normalize(chroma_stft), 12, max_size)

    # 2 leading rows + 9 repeats of (1 + 1 + 12) rows = 128 rows, matching the
    # height of the STFT and MFCC channels.
    repeated_group = [bw_row, centroid_row, chroma_rows] * 9
    summary = np.vstack([bw_row, centroid_row] + repeated_group)

    # padding() never crops, so a clip with too many frames would produce
    # mismatched channels; fail with a clear message instead.
    expected_shape = (n_rows, max_size)
    for label, channel in (('summary', summary), ('stft', stft), ('mfcc', MFCCs)):
        if channel.shape != expected_shape:
            raise ValueError(
                '%s channel has shape %s, expected %s; the clip is too long for max_size=%d'
                % (label, channel.shape, expected_shape, max_size))

    return np.dstack((summary, stft, MFCCs))