<a href="https://colab.research.google.com/github/Solrak97/clasificador_de_sentimientos/blob/main/second_model_speech_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Búsqueda de entradas para el modelo base propuesto por Dias Issa et al.
El modelo base cuenta con una capa de entrada de 193 nodos, por lo que cada muestra debe tener exactamente 193 variables, obtenidas a partir de las siguientes características ya conocidas:

* Chromagram
* Contrast
* Mel-frequency cepstral coefficients (MFCC)
* Mel spectrogram
* Tonnetz

### Descarga de los datos desde Kaggle hasta el notebook.

In [None]:
%%capture
# Install the Kaggle CLI and the audio helper packages.
! pip install kaggle wavio pydub keras-metrics
# NOTE(review): this deletes everything in the current working directory;
# it is only safe in a throwaway environment such as a fresh Colab runtime.
! rm -rf *
# Fetch the Kaggle API token and copy it into ~/.kaggle with owner-only
# permissions.
# NOTE(review): the token is downloaded from a public GitHub URL, so the
# credential is readable by anyone - consider a private secret instead.
! mkdir ~/.kaggle
! curl https://raw.githubusercontent.com/Solrak97/clasificador_de_sentimientos/main/kaggle.json > kaggle.json
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
# Download the RAVDESS emotional speech dataset and unpack it in place.
! kaggle datasets download uwrfkaggler/ravdess-emotional-speech-audio
! unzip ravdess-emotional-speech-audio.zip

import soundfile
import numpy as np
import librosa
import glob
import os
import pickle
from pydub import AudioSegment
import seaborn as sns
from matplotlib import pyplot as plt

### Algoritmo de extracción de datos

In [None]:
def extract_feature(file_name, **kwargs):
    """Build a 1-D feature vector for one audio file.

    Every enabled feature is averaged over its time frames and the resulting
    means are concatenated in a fixed order: MFCC, chroma, mel spectrogram,
    spectral contrast, tonnetz.

    Args:
        file_name: Path of an audio file readable by ``soundfile``.
            NOTE(review): the samples are handed to librosa unchanged, so
            the file is presumably expected to be mono - confirm with callers.
        **kwargs: Feature switches; a truthy value enables the feature.
            Recognised keys: ``mfcc``, ``chroma``, ``mel``, ``contrast`` and
            ``tonnetz``. The optional ``n_mfcc`` key sets the number of MFCC
            coefficients (default 128, the value that used to be hard-coded).
            NOTE(review): with librosa defaults the other features presumably
            add 12 + 128 + 7 + 6 values, so ``n_mfcc=40`` would be needed for
            a 193-value vector - verify before relying on it.

    Returns:
        numpy.ndarray: the concatenated feature means (float64); empty when
        no feature is enabled.
    """
    want_mfcc = kwargs.get("mfcc")
    want_chroma = kwargs.get("chroma")
    want_mel = kwargs.get("mel")
    want_contrast = kwargs.get("contrast")
    want_tonnetz = kwargs.get("tonnetz")
    n_mfcc = kwargs.get("n_mfcc", 128)

    # Only the read needs the file handle; release it before the heavy work.
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # The magnitude spectrogram is shared by chroma and spectral contrast.
    stft = None
    if want_chroma or want_contrast:
        stft = np.abs(librosa.stft(y=X))

    # Seed with an empty float64 array so the stacked result keeps the same
    # dtype as before and an all-disabled call still returns an empty array.
    parts = [np.array([])]

    if want_mfcc:
        mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=n_mfcc)
        parts.append(np.mean(mfccs.T, axis=0))

    if want_chroma:
        chroma_frames = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
        parts.append(np.mean(chroma_frames.T, axis=0))

    if want_mel:
        # The audio must be passed as ``y=``: recent librosa releases reject
        # positional audio arguments for this function.
        mel_frames = librosa.feature.melspectrogram(y=X, sr=sample_rate)
        parts.append(np.mean(mel_frames.T, axis=0))

    if want_contrast:
        contrast_frames = librosa.feature.spectral_contrast(S=stft, sr=sample_rate)
        parts.append(np.mean(contrast_frames.T, axis=0))

    if want_tonnetz:
        harmonic = librosa.effects.harmonic(y=X)
        tonnetz_frames = librosa.feature.tonnetz(y=harmonic, sr=sample_rate)
        parts.append(np.mean(tonnetz_frames.T, axis=0))

    return np.hstack(parts)
  

In [None]:
# Emotion labels that are kept when loading the dataset; remove entries from
# this set to restrict the problem to fewer classes.
AVAILABLE_EMOTIONS = {
    "neutral", "calm", "happy", "sad",
    "angry", "fearful", "disgust", "surprised",
}

'''
Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
Vocal channel (01 = speech, 02 = song).
Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
Repetition (01 = 1st repetition, 02 = 2nd repetition).
Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).
'''

# Each table maps one two-digit, dash-separated file-name field to a label.
# Codes are consecutive and start at "01", so they are derived from position.

# Field 1: recording modality.
INT_2_MODALITY = dict(zip(
    ('01', '02', '03'),
    ('full-AV', 'video-only', 'audio-only'),
))

# Field 2: vocal channel.
INT_2_VOCAL = dict(zip(('01', '02'), ('speech', 'song')))

# Field 3: emotion label.
INT_2_EMOTION = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

# Field 4: emotional intensity.
INT_2_INTENSITY = dict(zip(('01', '02'), ('normal', 'strong')))

# Field 5: spoken statement (the double quotes are part of the label).
INT_2_STATEMENT = dict(zip(
    ('01', '02'),
    ('"Kids are talking by the door"', '"Dogs are sitting by the door"'),
))

# Field 6: repetition number.
INT_2_REPETITION = dict(zip(('01', '02'), ('1st repetition', '2nd repetition')))

In [None]:
import pandas as pd


def load_data():
    """Load every ``Actor_*/*.wav`` clip into a metadata/feature table.

    Each file is rewritten in place as mono, its dash-separated file name is
    decoded through the ``INT_2_*`` tables, and ``extract_feature`` is called
    with all five features enabled. Clips whose emotion is not in
    ``AVAILABLE_EMOTIONS`` are left out of the table.

    Returns:
        pandas.DataFrame: one row per kept clip with the columns
        ``Modality``, ``Vocal Channel``, ``Emotion``, ``Intensity``,
        ``Statement``, ``Repetition``, ``Actor_ID`` and ``Features``.
        ``Actor_ID`` is the bare actor field (e.g. ``"12"``), without the
        ``.wav`` extension.

    Raises:
        KeyError: if a file-name field has a code missing from its table.
        IndexError: if a file name has fewer than seven dash-separated fields.
    """
    table = {
        column: []
        for column in (
            'Modality',
            'Vocal Channel',
            'Emotion',
            'Intensity',
            'Statement',
            'Repetition',
            'Actor_ID',
            'Features',
        )
    }

    # Sorted so the row order does not depend on filesystem listing order.
    for file in sorted(glob.glob("Actor_*/*.wav")):
        # Some clips are stereo: down-mix to mono and overwrite the file so
        # that the feature extractor reads a single channel.
        sound = AudioSegment.from_wav(file).set_channels(1)
        sound.export(file, format="wav")

        # Strip the extension before splitting; otherwise the last field
        # (the actor id) would come out as e.g. "12.wav".
        stem = os.path.splitext(os.path.basename(file))[0]
        name_split = stem.split("-")
        emotion = INT_2_EMOTION[name_split[2]]

        # Keep only the emotions selected for this experiment.
        if emotion not in AVAILABLE_EMOTIONS:
            continue

        # Metadata is decoded first so a malformed name fails before the
        # expensive feature extraction runs.
        record = {
            'Modality': INT_2_MODALITY[name_split[0]],
            'Vocal Channel': INT_2_VOCAL[name_split[1]],
            'Emotion': emotion,
            'Intensity': INT_2_INTENSITY[name_split[3]],
            'Statement': INT_2_STATEMENT[name_split[4]],
            'Repetition': INT_2_REPETITION[name_split[5]],
            'Actor_ID': name_split[6],
            'Features': extract_feature(
                file,
                mfcc=True,
                chroma=True,
                mel=True,
                contrast=True,
                tonnetz=True,
            ),
        }
        for column, value in record.items():
            table[column].append(value)

    return pd.DataFrame(table)

In [None]:
# Build the metadata/feature table for every clip found under Actor_*/.
data = load_data()

## Plan para el modelo 02:
1. Separar el training (80%) del test (20%)
1. Agrandar el sample
  1. Modificar con distintas velocidades
    1. factores de velocidad: { 0.81×, 1.00×, 1.23× }
    1. leer un audio, modificar su velocidad y reproducirlo
    1. de todo el dataset tomar un 80% para entrenamiento y un 20% para prueba
    1. cada audio en el dataset de entrenamiento debe contar con una versión en cada una de las velocidades
    1. se pasa de tener 425 audios a tener 1275
  1. Agregar ruido para duplicar el sample
    1. a cada uno de los audios en las distintas velocidades se le agrega un ruido de 25%
    1. se pasa de tener 1275 audios a tener 2550
1. Replicar los pasos del grafo de la figura 4 en la construcción del modelo
1. Construir la matriz de confusión

In [None]:
from pydub import AudioSegment


def cambio_velocidad(sound, speed=1.0):
    """Return a re-timed copy of ``sound``.

    The raw sample data is re-wrapped with its frame rate multiplied by
    ``speed`` (truncated to an int), and the result is then converted back
    to the original frame rate with ``set_frame_rate``.
    NOTE(review): this presumably shifts pitch together with duration -
    confirm against the pydub documentation.

    Args:
        sound: Object exposing ``raw_data``, ``frame_rate``, ``_spawn`` and
            whose spawned copy exposes ``set_frame_rate`` (a pydub
            ``AudioSegment`` in this notebook).
        speed: Multiplier applied to the frame rate; 1.0 keeps it unchanged.

    Returns:
        Whatever ``set_frame_rate`` returns for the re-wrapped copy.
    """
    original_rate = sound.frame_rate
    scaled_rate = int(original_rate * speed)

    retimed = sound._spawn(sound.raw_data, overrides={"frame_rate": scaled_rate})
    return retimed.set_frame_rate(original_rate)


#slow_sound = cambio_velocidad(my_sound, 0.81)
#fast_sound = cambio_velocidad(my_sound, 1.23)

In [None]:
# Augmented clip list. Every file contributes three consecutive entries:
# the original clip, a 0.81x-speed copy and a 1.23x-speed copy.
sound_array = []

# Sorted so that positions in sound_array are reproducible between runs;
# glob.glob alone returns paths in arbitrary filesystem order.
for file in sorted(glob.glob("Actor_*/*.wav")):
    # Some clips are stereo: down-mix to mono and overwrite the file on disk.
    sound = AudioSegment.from_wav(file).set_channels(1)
    sound.export(file, format="wav")

    slow_sound = cambio_velocidad(sound, 0.81)
    fast_sound = cambio_velocidad(sound, 1.23)

    sound_array.extend([sound, slow_sound, fast_sound])

print(len(sound_array))

1440


In [None]:
# Pick one entry of the augmented list; the trailing bare expression makes
# the notebook display it.
# NOTE(review): entries are appended per file as (original, 0.81x, 1.23x),
# so index 34 = 3 * 11 + 1 is already a 0.81x copy - confirm whether an
# original-speed clip was intended here.
my_sound = sound_array[34]
my_sound

In [None]:
# Re-time the selected clip with a 0.81 speed factor and display the result.
slow_sound = cambio_velocidad(my_sound, 0.81)
slow_sound

In [None]:
# Re-time the selected clip with a 1.23 speed factor and display the result.
fast_sound = cambio_velocidad(my_sound, 1.23)
fast_sound