<a href="https://colab.research.google.com/github/Pavun-KumarCH/Librosa-Speech-Emotion-Recognition/blob/main/Speech_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Speech Emotion Recognition Using Machine Learning

In [None]:
# Tensor Flow libraries
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout, Dense, BatchNormalization
from keras.callbacks import ReduceLROnPlateau
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras import regularizers

In [None]:
# import all required libraries
import pandas as pd # Data Manuplualtion
import numpy as np # Arrays Calculation
import glob #  file directories
import soundfile # soundfile format
import os
import sys

## Import librosa for audio analysis

In [None]:
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.io.wavfile
import plotly.express as px
from sklearn import metrics

## Data Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [None]:
import IPython.display as ipd

## Warning filters


In [None]:
import warnings

# Silence every warning unless the interpreter was started with explicit -W options.
if len(sys.warnoptions) == 0:
  warnings.simplefilter("ignore")
# Deprecation warnings are silenced unconditionally.
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)


## Load the RAVDESS Dataset

In [None]:
# Root folder of the RAVDESS speech dataset (one sub-folder per actor).
# Set the RAVDESS_DIR environment variable to point at a different copy of the data;
# the original local path is kept as the default.
# os.path.join(..., '') guarantees a trailing separator because later cells build
# paths with plain string concatenation (RAV + folder).
RAV = os.path.join(
  os.environ.get('RAVDESS_DIR', '/Users/pavankumar/Downloads/archive/audio_speech_actors_01-24/'),
  '',
)

# Keep only sub-directories. This drops stray files such as '.DS_Store' without
# failing when they are absent, and sorting makes the sample order reproducible.
list_dir = sorted(
  entry for entry in os.listdir(RAV)
  if os.path.isdir(os.path.join(RAV, entry))
)

## Segregate the data according to the relevant sections

In [None]:
# Parse every audio file name into emotion code, speaker gender and full path.
# File names are dash-separated numeric fields: field 3 (index 2) is read as the
# emotion code and field 7 (index 6) as the actor id.
emotion = []
gender = []
path = []
for i in list_dir:
  for f in sorted(os.listdir(RAV + i)):
    part = f.split('.')[0].split('-')
    # Skip anything that is not a 7-field numeric .wav name (e.g. '.DS_Store'),
    # which would otherwise crash on part[2] / int().
    if not f.lower().endswith('.wav') or len(part) != 7 or not all(p.isdigit() for p in part):
      continue
    emotion.append(int(part[2]))
    # Even actor ids are labelled 'Female', odd ids 'Male'.
    gender.append('Female' if int(part[6]) % 2 == 0 else 'Male')
    path.append(RAV + i + '/' + f)

In [None]:
# Emotion code -> name. Codes 1 and 2 are both mapped to 'Neutral', giving 7 classes.
# The spelling of each name is kept exactly because these strings become the labels.
emotion_mapping = {1 : "Neutral",
                   2 : 'Neutral',
                   3 : 'Happy',
                   4 : 'Sad',
                   5 : 'Anger',
                   6 : 'Fear',
                   7 : 'Disgust',
                   8 : 'Suprise'}

# Build the metadata table in one pass instead of concat + in-place drop.
emotion_names = pd.DataFrame(emotion, columns = ['Emotion'])['Emotion'].replace(emotion_mapping)
gender_names = pd.DataFrame(gender, columns = ['Gender'])['Gender']

data = pd.DataFrame({'Emotion': emotion_names, 'Path': path})
# Combined class label, e.g. 'Male_Happy'.
data['Labels'] = gender_names + '_' + emotion_names

data['Labels'].value_counts()

In [None]:
# Preview the first rows of the metadata table (Emotion, Path, Labels).
data.head()

In [None]:

# Summary statistics for the metadata columns.
data.describe()

# Data Visualization

In [None]:
import plotly.express as px
import plotly.io as pio

# Render plotly figures with the Colab renderer.
pio.renderers.default = 'colab'

# One histogram per grouping column; both are coloured by emotion.
for column, title in (('Emotion', 'Emotion Count'), ('Labels', 'Label Count')):
  px_fig = px.histogram(data, x = column, color = 'Emotion', marginal = 'box', title = title)
  px_fig.update_layout(bargap = 0.2)
  px_fig.show()

In [None]:
def create_waveplot(meta, sr, e):
  """Draw the waveform of an audio signal.

  meta: audio samples passed to librosa.display.waveshow.
  sr:   sampling rate of `meta`.
  e:    emotion name, used only in the figure title.
  """
  fig, ax = plt.subplots(figsize = (10, 3))
  ax.set_title("Waveplot for audio with {} emotion".format(e), size = 15)
  librosa.display.waveshow(meta, sr = sr, ax = ax)
  plt.show()



In [None]:
def create_spectrogram(meta, sr, e):
  """Draw a dB-scaled STFT spectrogram of an audio signal.

  meta: audio samples.
  sr:   sampling rate of `meta`.
  e:    emotion name, used only in the figure title.
  """
  # STFT magnitude converted to decibels.
  magnitude = np.abs(librosa.stft(meta))
  Xdb = librosa.amplitude_to_db(magnitude)

  fig, ax = plt.subplots(figsize = (12, 3))
  ax.set_title('Spectrogram for audio with {} emotion'.format(e), size = 15)
  img = librosa.display.specshow(Xdb, sr = sr, x_axis = 'time', y_axis = 'hz', ax = ax)
  # Tie the colorbar to this image and label its unit.
  fig.colorbar(img, ax = ax, format = '%+2.0f dB')
  # Show explicitly, as create_waveplot does, so the figure is not left pending.
  plt.show()

In [None]:
# Pick the second clip of the chosen emotion and load it.
Emotion = 'Fear'
matching_paths = data.loc[data.Emotion == Emotion, 'Path']
path = matching_paths.to_numpy()[1]
meta, sampling_rate = librosa.load(path)



In [None]:
# Waveform and spectrogram of the selected clip, followed by an audio player.
create_waveplot(meta, sampling_rate, Emotion)
create_spectrogram(meta, sampling_rate, Emotion)
# Last expression of the cell, so the player widget is rendered as the cell output.
ipd.Audio(path)

# Data Augmentation

In [None]:
def noise(meta):
  """Return a copy of `meta` with Gaussian noise added.

  The noise amplitude is 0.035 * U(0, 1) * max(meta), drawn afresh on each call
  from numpy's global random state. The input array is not modified.
  """
  amplitude = 0.035 * np.random.uniform() * np.amax(meta)
  jitter = np.random.normal(size = meta.shape[0])
  return meta + amplitude * jitter

def stretch(meta, rate = 0.8):
  """Time-stretch `meta` via librosa.effects.time_stretch using `rate` (default 0.8)."""
  return librosa.effects.time_stretch(meta, rate = rate)

def shift(meta):
  """Circularly shift `meta` by a random number of samples.

  The offset is drawn uniformly from (-5, 5) and scaled by 1000, so the signal is
  rolled by up to roughly 5000 samples in either direction. Samples pushed past
  one end wrap around to the other (np.roll).
  """
  # low must be -5: with low == high == 5 the "random" shift was always exactly 5000.
  shift_range = int(np.random.uniform(low = -5, high = 5) * 1000)
  return np.roll(meta, shift_range)

def pitch(meta, sampling_rate, pitch_factor = 0.7):
  """Pitch-shift `meta` via librosa.effects.pitch_shift; `pitch_factor` is passed as n_steps."""
  return librosa.effects.pitch_shift(meta, sr = sampling_rate, n_steps = pitch_factor)

# Take one example clip (the second row of the table) to try each augmentation on.
path = data.Path.to_numpy()[1]
meta, sample_rate = librosa.load(path)


## Simple Audio

In [None]:
# Waveform and playback of the unmodified example clip.
fig, ax = plt.subplots(figsize = (14, 4))
librosa.display.waveshow(y = meta, sr = sample_rate, ax = ax)
ipd.Audio(path)

Noise Injection :

In [None]:
# Example clip with random noise injected.
x = noise(meta)
fig, ax = plt.subplots(figsize = (14, 4))
librosa.display.waveshow(y = x, sr = sample_rate, ax = ax)
ipd.Audio(x, rate = sample_rate)

Stretching

In [None]:
# Example clip after time stretching with the default rate.
x = stretch(meta)
fig, ax = plt.subplots(figsize = (14, 4))
librosa.display.waveshow(y = x, sr = sample_rate, ax = ax)
ipd.Audio(x, rate = sample_rate)

Shifting

In [None]:
# Example clip after a circular shift.
x = shift(meta)
fig, ax = plt.subplots(figsize = (14, 4))
librosa.display.waveshow(y = x, sr = sample_rate, ax = ax)
ipd.Audio(x, rate = sample_rate)

Pitch

In [None]:
# Example clip after pitch shifting with the default pitch factor.
x = pitch(meta, sample_rate)
fig, ax = plt.subplots(figsize = (14, 4))
librosa.display.waveshow(y = x, sr = sample_rate, ax = ax)
ipd.Audio(x, rate = sample_rate)

# Feature Extraction

In [None]:
def extract_features(meat, sr = None):
  """Return one flat feature vector for an audio signal.

  Each feature is averaged over time frames and the results are concatenated in
  this order: zero-crossing rate, chroma (from the STFT magnitude), MFCC,
  RMS energy, mel spectrogram.

  meat: audio samples.
  sr:   sampling rate of `meat`. When omitted, the notebook-level `sample_rate`
        variable is used, which is what the function previously always did.
  """
  if sr is None:
    sr = sample_rate

  def frame_mean(feature):
    # Average a librosa feature matrix over its frames.
    return np.mean(feature.T, axis = 0)

  # Every feature must come from the argument `meat`. Chroma and RMS previously read
  # the global `meta`, so those columns were identical for every clip.
  zcr = frame_mean(librosa.feature.zero_crossing_rate(y = meat))

  stft = np.abs(librosa.stft(meat))
  chroma_stft = frame_mean(librosa.feature.chroma_stft(S = stft, sr = sr))

  mfcc = frame_mean(librosa.feature.mfcc(y = meat, sr = sr))

  rms = frame_mean(librosa.feature.rms(y = meat))

  mel = frame_mean(librosa.feature.melspectrogram(y = meat, sr = sr))

  # Cast to float so the dtype matches the previous accumulate-into-np.array([]) result.
  return np.hstack((zcr, chroma_stft, mfcc, rms, mel)).astype(float)

def get_feature(path):
  """Load one audio file and return a 3-row feature matrix.

  Rows, in order: features of the original clip, of the clip with noise added,
  and of the clip after time stretching followed by pitch shifting.
  Only 2.5 s of audio starting 0.6 s into the file are loaded.
  """
  # Tuple-unpack with a comma. The original `meta ; sample_rate = ...` bound the whole
  # (samples, rate) tuple to sample_rate and silently used the global `meta`.
  meta, sample_rate = librosa.load(path, duration = 2.5, offset = 0.6)

  # Original clip.
  res1 = extract_features(meta)

  # Clip with noise (was `noise(meat)`, an undefined name).
  res2 = extract_features(noise(meta))

  # Clip with stretching and pitching.
  stretched = stretch(meta)
  res3 = extract_features(pitch(stretched, sample_rate))

  return np.vstack((res1, res2, res3))

# Data Preparation

In [None]:
# Build the feature rows (X) and their emotion labels (Y).
# Every audio file contributes one row per feature vector returned by get_feature.
X, Y = [], []
# Loop names differ from the notebook-level `path` / `emotion` so those are not clobbered.
for audio_path, label in zip(data.Path, data.Emotion):
  # The function is defined as get_feature; the previous get_features call raised NameError.
  rows = get_feature(audio_path)
  X.extend(rows)
  Y.extend([label] * len(rows))


In [None]:
# Sanity check: number of feature rows, number of labels, and shape of the path column.
len(X), len(Y), data.Path.shape

In [None]:
# One row per feature vector, with the emotion label appended as the last column,
# then saved to disk.
Feature = pd.DataFrame(X).assign(Labels = Y)
Feature.to_csv('features.csv', index = False)

In [None]:
# Preview the feature table and its summary statistics.
display(Feature.head())
display(Feature.describe())

Separate Input and Output Variables

In [None]:
# Inputs: every column except the last one. Targets: the 'Labels' column.
X = Feature.iloc[:, :-1].to_numpy()
Y = Feature['Labels'].to_numpy()

In [None]:
# One-hot encode the labels; the encoder is kept to decode predictions later.
encoder = OneHotEncoder()
label_column = np.asarray(Y).reshape(-1, 1)
Y = encoder.fit_transform(label_column).toarray()


Splitting the data into Train and Test

In [None]:
# Shuffled train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 0, shuffle = True)

# Shapes of the four resulting arrays.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))


In [None]:
# Standardise the features: statistics are fitted on the training split only
# and then applied to the test split.
scalar = StandardScaler()
x_train = scalar.fit_transform(x_train)
x_test = scalar.transform(x_test)

# Shapes of the four arrays after scaling.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

# Modelling

In [None]:
# 1-D CNN classifier: four Conv1D + MaxPooling1D blocks with decreasing filter
# counts, followed by a small dense head.
model = tf.keras.Sequential([

       # Block 1
       Conv1D(256, kernel_size = 5, strides = 1, padding = 'same', activation = 'relu', input_shape = (x_train.shape[1], 1)),
       MaxPooling1D(pool_size = 5, strides = 2, padding = 'same'),

       # Block 2
       Conv1D(128, kernel_size = 5, strides = 1, padding = 'same', activation = 'relu'),
       MaxPooling1D(pool_size = 5, strides = 2, padding = 'same'),

       # Block 3
       Conv1D(64, kernel_size = 5, strides = 1, padding = 'same', activation = 'relu'),
       MaxPooling1D(pool_size = 5, strides = 2, padding = 'same'),
       Dropout(0.2),

       # Block 4
       Conv1D(32, kernel_size = 5, strides = 1, padding = 'same', activation = 'relu'),
       MaxPooling1D(pool_size = 5, strides = 2, padding = 'same'),

       Flatten(),
       Dense(units= 16, activation = 'relu'),
       Dropout(0.3),
       # One output unit per one-hot class. Derived from the labels instead of a
       # hard-coded 7 so the model stays valid if the label set changes.
       Dense(units = y_train.shape[1], activation = 'softmax')
])

model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])


model.summary()

# Training

In [None]:
# Multiply the learning rate by 0.4 when the training loss has not improved for 4 epochs.
rlrp = ReduceLROnPlateau(
  monitor = 'loss',
  factor = 0.4,
  patience = 4,
  min_lr = 0.0000001,
  verbose = 0,
)

# The test split is also used as validation data during training.
history = model.fit(
  x_train,
  y_train,
  validation_data = (x_test, y_test),
  epochs = 51,
  batch_size = 64,
  callbacks = [rlrp],
)

# Evaluation

In [None]:
print("\n Accuracy of our model on Test data :", model.evaluate(x_test, y_test)[1]*100,'%')

# Use whichever darkgrid style name this matplotlib version provides; if neither
# exists the current style is kept instead of raising.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
  if style_name in plt.style.available:
    plt.style.use(style_name)
    break
plt.rcParams.update({'font.size': 12})

train_acc = history.history['accuracy']
train_loss = history.history['loss']
test_acc = history.history['val_accuracy']
test_loss = history.history['val_loss']

# Take the x-axis from the recorded history rather than repeating the epoch count,
# so the plot stays correct if training length changes.
epochs = list(range(len(train_loss)))

fig, ax = plt.subplots(1,2)
fig.set_size_inches(20,6)

# Left panel: loss curves.
ax[0].plot(epochs, train_loss, label = 'Training Loss', marker = 'o', linewidth = 2)
ax[0].plot(epochs, test_loss, label = 'Testing Loss', marker = '.', linewidth = 2)
ax[0].set_title('Training & Testing Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

# Right panel: accuracy curves.
ax[1].plot(epochs, train_acc, label = 'Training Accuracy', marker = 'o', linewidth = 2)
ax[1].plot(epochs, test_acc, label = 'Testing Accuracy', marker = '.', linewidth = 2)
ax[1].set_title('Training & Testing Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")

# Prediction


In [None]:
# Softmax outputs of the model for every test sample.
pred_test = model.predict(x_test)
# Map the model outputs back to label values with the fitted one-hot encoder.
y_pred = encoder.inverse_transform(pred_test)
# NOTE: y_test is overwritten with its decoded labels here. Re-running this cell, or
# calling model.evaluate(x_test, y_test) afterwards, would no longer see one-hot targets.
y_test = encoder.inverse_transform(y_test)

In [None]:
# Predicted and actual labels side by side.
final_data = pd.DataFrame({
  'Predicted Labels': y_pred.flatten(),
  'Actual Labels': y_test.flatten(),
})

display(final_data.head())

## Confusion Matrix

In [None]:
# encoder.categories_ is a list holding one array per encoded column; the class
# names are its first (only) entry.
class_names = encoder.categories_[0]

# Pin the row/column order to the encoder's classes so the tick labels always match,
# even if some class is missing from the test split.
cm = confusion_matrix(np.ravel(y_test), np.ravel(y_pred), labels = class_names)
plt.figure(figsize = (12, 10))
cm = pd.DataFrame(cm, index = class_names, columns = class_names)
sns.heatmap(cm, linecolor='white', cmap='Purples', linewidths=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

In [None]:
# NOTE(review): this cell repeats the confusion-matrix cell directly above it.
# encoder.categories_ is a list holding one array per encoded column; the class
# names are its first (only) entry.
class_names = encoder.categories_[0]

# Pin the row/column order to the encoder's classes so the tick labels always match,
# even if some class is missing from the test split.
cm = confusion_matrix(np.ravel(y_test), np.ravel(y_pred), labels = class_names)
plt.figure(figsize = (12, 10))
cm = pd.DataFrame(cm, index = class_names, columns = class_names)
sns.heatmap(cm, linecolor='white', cmap = 'Purples', linewidths = 1, annot = True, fmt = '')
plt.title('Confusion Matrix', size = 20)
plt.xlabel('Predicted Labels', size = 14)
plt.ylabel('Actual Labels', size = 14)
plt.show()

In [None]:
# Text classification report comparing the decoded test labels with the predictions.
print(classification_report(y_test, y_pred))