In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import librosa
from librosa import display
import IPython.display as ipd
# Root folder of the audio data; the X_train / y_train / X_valid / y_valid
# folder names are appended to it when the dataset is loaded.
# NOTE(review): absolute Google Drive path, so this presumably needs Drive to be
# mounted in Colab before the loading cells run - confirm.
audio_dir='/content/drive/MyDrive/Colab Notebooks/Audio/'

# Creating Dataset

In [None]:
def get_filename(dir_name):
  """Return the names of the regular ``.wav`` files directly inside ``dir_name``.

  Only the immediate children are inspected (no recursion) and the ``.wav``
  suffix match is case-sensitive. The names are returned sorted:
  ``os.scandir`` yields entries in arbitrary order, and a reproducible order
  matters to callers that pair files from parallel folders by position.

  Args:
    dir_name: Path of the directory to scan.

  Returns:
    Sorted list of file names (names only, not full paths).

  Raises:
    OSError: If ``dir_name`` cannot be scanned (e.g. it does not exist).
  """
  # The context manager closes the scandir iterator even if the scan is interrupted.
  with os.scandir(dir_name) as entries:
    return sorted(
        entry.name
        for entry in entries
        if entry.name.endswith(".wav") and entry.is_file()
    )

def get_subfolders(dir_name):
  """Return the names of the sub-directories directly inside ``dir_name``.

  Each entry is tested as ``dir_name/<entry>``, so the result does not depend
  on the current working directory. The names are returned sorted because
  ``os.listdir`` gives no ordering guarantee.

  Args:
    dir_name: Path of the directory whose immediate children are listed.

  Returns:
    Sorted list of sub-directory names (names only, not full paths).

  Raises:
    OSError: If ``dir_name`` cannot be listed (e.g. it does not exist).
  """
  return sorted(
      item
      for item in os.listdir(dir_name)
      # Join with dir_name: a bare name would be resolved against the cwd instead.
      if os.path.isdir(os.path.join(dir_name, item))
  )

def Audio_to_tensor(dir_name):
  """Load every ``.wav`` file under ``dir_name`` and stack their MFCCs into a tensor.

  Each sub-folder of ``dir_name`` is visited in the order returned by
  ``get_subfolders``; every ``.wav`` file reported by ``get_filename`` is
  loaded with ``librosa.load`` (default settings) and reduced to 40 MFCC
  coefficients. Progress is printed folder by folder.

  The working directory is switched to ``dir_name`` while the folders are
  listed and is restored before returning, so the call leaves the process
  cwd as it found it.

  Args:
    dir_name: Directory whose sub-folders contain the ``.wav`` files.

  Returns:
    A ``tf.float32`` tensor with one entry per audio file, in load order;
    expected to be (n_files, 40, n_frames) per librosa's mfcc documentation.

  Raises:
    OSError: If ``dir_name`` or one of its sub-folders cannot be read.
    ValueError: If the MFCC matrices do not all share one shape (clips of
      different lengths) and therefore cannot be stacked.
  """
  previous_cwd=os.getcwd()
  # Absolute path, so the joins below stay valid after the chdir.
  root=os.path.abspath(dir_name)
  # Sub-folders are listed from inside root in case get_subfolders resolves entries against the cwd.
  os.chdir(root)
  try:
    print(dir_name)
    X=[]
    folder_list=get_subfolders(root)
    for avance,folder in enumerate(folder_list,start=1):
      print("Treating folder",avance,"out of",len(folder_list))
      folder_path=os.path.join(root,folder)
      for file in get_filename(folder_path):
        S,sr=librosa.load(os.path.join(folder_path,file))
        # Keyword arguments: recent librosa releases reject a positional audio array here.
        sf=librosa.feature.mfcc(y=S,sr=sr,n_mfcc=40)
        X.append(sf)
  finally:
    os.chdir(previous_cwd)
  # Fail with a readable message instead of an opaque stacking error.
  shapes=sorted({mfcc.shape for mfcc in X})
  if len(shapes)>1:
    raise ValueError(
        "Cannot stack MFCCs of different shapes {}: the audio clips under {!r} "
        "must all have the same length".format(shapes,dir_name))
  tensor=tf.convert_to_tensor(np.array(X),dtype=tf.float32)
  print("Done !")
  return tensor

In [None]:
# Load the four dataset folders found under audio_dir; Audio_to_tensor turns each
# one into a tensor of MFCCs. The generator is consumed left to right, so the
# folders are processed in the order they are listed here.
X, y, X_valid, y_valid = (
    Audio_to_tensor(audio_dir + split_name)
    for split_name in ('X_train', 'y_train', 'X_valid', 'y_valid')
)

In [None]:
# Shape of a single sample (everything after the leading sample axis);
# the first layer of the network is built from it.
input_shape = X.shape[1:]
print(f"Input shape: {input_shape}")

# CNN

## Configuration of our CNN

In [None]:
from tensorflow.keras import callbacks
# Early-stopping callback handed to model.fit in the training cell.
early_stopping = callbacks.EarlyStopping(
    restore_best_weights=True,  # keep the best weights seen rather than the last ones
    min_delta=0.01,             # minimum amount of change to count as an improvement
    patience=5,                 # how many epochs to wait before stopping
)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
# The whole network is one 1-D convolution: 9 filters, kernel size 3, ReLU,
# with "same" padding, applied to inputs of shape input_shape.
model = keras.Sequential()
model.add(
    layers.Conv1D(
        filters=9,
        kernel_size=3,
        padding="same",
        activation="relu",
        input_shape=input_shape,
    )
)
# Disabled pooling layer, kept for reference:
#tf.keras.layers.MaxPooling1D(pool_size=2, padding='valid'),

# Adam optimiser with mean absolute error as the training loss.
model.compile(optimizer="Adam", loss="mae")
model.summary()

## Training of our model

In [None]:
# Fit on the training tensors and validate on the held-out pair; the
# early-stopping callback can end the run before the epoch limit is reached.
history = model.fit(
    x=X,
    y=y,
    batch_size=1160,
    epochs=1500,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
)

# Plot training and validation loss for every recorded epoch.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot()
best_val_loss = history_df['val_loss'].min()
print(f"Minimum Validation Loss: {best_val_loss:0.4f}")

## Results

After training our model, we made predictions on the same dataset that was used to train it. This is obviously not a proper evaluation, but it is a quick and dirty way to check whether the model has learned anything at all.
Our previous models were not even capable of that.
Of course, we will also make predictions on a separate, dedicated dataset to validate our findings.

In [None]:
# Predict on X, the very inputs the model was fitted on: a sanity check that the
# network learned something, not a measure of how well it generalises.
y_predict=model.predict(X)

### Visualizing our predictions

In [None]:
# Heat map of the model output for sample 180, with a colour bar for the scale.
predicted_sample = y_predict[180]
fig, ax = plt.subplots()
mesh = librosa.display.specshow(predicted_sample, x_axis='time', ax=ax)
fig.colorbar(mesh, ax=ax)
ax.set(title='MFCC')


In [None]:
# Heat map of the training target for sample 180, for comparison with the
# model's prediction for the same sample.
# The targets live in `y` (loaded from the y_train folder); this notebook never
# defines a `y_train` variable, so using that name fails on a fresh kernel.
# np.asarray turns the tensor slice into a plain NumPy array for librosa.
fig, ax = plt.subplots()
img = librosa.display.specshow(np.asarray(y[180]), x_axis='time', ax=ax)
fig.colorbar(img, ax=ax)
ax.set(title='MFCC')


### Listening to the predictions


In [None]:
# Rebuild audio from the target MFCCs of sample 181 so it can be compared by ear
# with the model's output.
# The targets live in `y` (loaded from the y_train folder); this notebook never
# defines a `y_train` variable. np.asarray converts the tensor slice for librosa.
# Playback rate is 22050 Hz: the clips were read with librosa.load's default
# sampling rate (22050 Hz) and mfcc_to_audio synthesises at that same default,
# so playing the result at 44100 Hz would double its speed and pitch.
ipd.Audio(librosa.feature.inverse.mfcc_to_audio(np.asarray(y[181]), n_mels=128, dct_type=2, norm='ortho', ref=1.0, lifter=0),rate=22050)

In [None]:
# Rebuild audio from the model's predicted MFCCs for sample 181.
# Playback rate is 22050 Hz: the training clips were read with librosa.load's
# default sampling rate (22050 Hz) and mfcc_to_audio synthesises at that same
# default, so playing the result at 44100 Hz would double its speed and pitch.
ipd.Audio(librosa.feature.inverse.mfcc_to_audio(y_predict[181], n_mels=128, dct_type=2, norm='ortho', ref=1.0, lifter=0),rate=22050)

In [None]:
#Setting up early stopping:

# Probably to be used later...
#from tensorflow.keras.callbacks import EarlyStopping

#early_stopping = EarlyStopping(
#    min_delta=0.001, # minimum amount of change to count as an improvement
#    patience=20, # how many epochs to wait before stopping
#    restore_best_weights=True,
#)