<a href="https://colab.research.google.com/github/Solrak97/clasificador_de_sentimientos/blob/main/Notebooks/Prueba_de_concepto_RAVDESS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [71]:
%%capture
#! pip install kaggle wavio pydub keras-metrics tensorflow
#! rm -rf *
#! mkdir ~/.kaggle
#! curl https://raw.githubusercontent.com/Solrak97/clasificador_de_sentimientos/main/kaggle.json > kaggle.json
#! cp kaggle.json ~/.kaggle/
#! chmod 600 ~/.kaggle/kaggle.json
#! kaggle datasets download uwrfkaggler/ravdess-emotional-speech-audio
#! unzip ravdess-emotional-speech-audio.zip

import soundfile
import numpy as np
import librosa
import glob
import os
import pickle
import tensorflow as tf
from pydub import AudioSegment
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import keras
from keras import Sequential
from keras.layers import Dropout, MaxPool1D, Flatten, Dense, ReLU, Input, BatchNormalization, Softmax
from keras.layers.convolutional import Conv1D
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

## Extraccion de caracteristicas

In [2]:
def extract_feature(file_name, **kwargs):
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz") 

    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
      
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=84).T, axis=0)
            result = np.hstack((result, mfccs))
           
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
           
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate, n_mels = 84).T,axis=0)
            result = np.hstack((result, mel))
           
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, contrast))
           
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
            result = np.hstack((result, tonnetz))
           
    return result
  

## Enums de emociones dentro del dataset RAVDESS

In [31]:
# we allow only these emotions ( feel free to tune this on your need )
AVAILABLE_EMOTIONS = {
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fearful",
    "disgust",
    "surprised"
}

int_2_emotion = {
    '01': "neutral",
    '02': "calm",
    '03': "happy",
    '04': "sad",
    '05': "angry",
    '06': "fearful",
    '07': "disgust",
    '08': "surprised"
}

## Función para carga de datos

In [52]:
def load_data():
    X, y = [], []

    for file in glob.glob("Actor_*/*.wav"):
      file_name=os.path.basename(file)
      
      # El audio viene en estereo para algunas partes
      # Así que se pasa a mono
      sound = AudioSegment.from_wav(file)
      sound = sound.set_channels(1)
      sound.export(file, format="wav")
  
      name_split = file_name.split("-")
      emotion = int_2_emotion[name_split[2]]

      # Limitación de emociones
      if emotion not in AVAILABLE_EMOTIONS:
        continue
      
      # Extracción de caracteristicas
      features = extract_feature(file, mfcc=True, 
                                 chroma=True, mel=True, 
                                 contrast=True, tonnetz=True)
      
      
      X.append(features)
      y.append(emotion)

    return (np.matrix(X), np.array(y))

## Carga de datos

In [53]:
X, y = load_data()

(1440,)

# Replicación de la topología base

In [54]:
def build_baseline():
  model = Sequential()

  # input layer
  model.add(Input(shape=(193, 1)))
  
  # Primer Convolutional layer
  model.add(Conv1D(strides=1, filters=255, kernel_size=5))
  model.add(BatchNormalization())
  model.add(ReLU())

  # Segund Convolutional layer
  model.add(Conv1D(strides=1, filters=128, kernel_size=5))
  model.add(ReLU())
  model.add(Dropout(rate=0.1))
  model.add(BatchNormalization())

  # Capa de Maxpooling
  model.add(MaxPool1D(pool_size=8))

  # 3 capas convolucionales intermedias
  model.add(Conv1D(strides=1, filters=128, kernel_size=5))
  model.add(ReLU())

  model.add(Conv1D(strides=1, filters=128, kernel_size=5))
  model.add(ReLU())

  model.add(Conv1D(strides=1, filters=128, kernel_size=5))
  model.add(BatchNormalization())
  model.add(ReLU())
  model.add(Dropout(rate=0.2))

  # Capa convolucional final
  model.add(Conv1D(strides=1, filters=128, kernel_size=5))
  model.add(Flatten())
  model.add(Dropout(rate=0.2))

  # Capa densa, tiene la misma cantidad de neuronas que de clases a predecir
  model.add(Dense(units=8))
  model.add(BatchNormalization())
  model.add(Softmax())  
  return model

## Pruebas del modelo base

In [74]:
from keras import metrics
encoder = LabelEncoder()
_y = encoder.fit_transform(y)
base = build_baseline()
#base.summary()

opt = tf.keras.optimizers.RMSprop(learning_rate=1e-4)
base.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=[
        'MeanSquaredError',
        'acc'
    ])

X_train, X_test, y_train, y_test = train_test_split(X, _y)

# Model fit
base.fit(X_train, y_train)

# Model predict
y_pred = base.predict(X_test)

print(y_pred)

# Plot de matriz
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=encoder.classes_)


disp.plot(cmap=plt.cm.Blues)
plt.show()

[[0.11019858 0.1568628  0.1276804  ... 0.12155099 0.11766429 0.12809113]
 [0.10859814 0.15587899 0.1284609  ... 0.12178741 0.12184825 0.1245279 ]
 [0.2503324  0.11212965 0.06125806 ... 0.08243667 0.14324087 0.06585706]
 ...
 [0.10864582 0.15511167 0.12994458 ... 0.12037726 0.1204299  0.12588984]
 [0.11146557 0.15669954 0.12769644 ... 0.12116181 0.11711467 0.12746684]
 [0.11227853 0.1524566  0.12946859 ... 0.12031659 0.11632673 0.12710305]]


ValueError: ignored