In [1]:
import os
import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank
import librosa as lr
import pickle
import pandas as pd
from scipy import signal
import noisereduce as nr
get_ipython().magic('matplotlib inline')
import soundfile
from tensorflow.keras.layers import MaxPool2D, Flatten, LSTM
from keras.layers.convolutional import Conv2D
from keras.layers import Dropout, Dense, TimeDistributed
from keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Root folder of the audio dataset (an absolute path on the author's machine).
path = r"C:\Users\abhin\archive"
# Top-level entries of the dataset folder; not referenced again in the visible cells.
listdir = os.listdir(path)
def getListOfFiles(dirName):
    """Recursively collect the paths of all files below ``dirName``.

    Entries are visited in the order returned by ``os.listdir``; whenever an
    entry is a directory, the files found beneath it are inserted at that
    position in the result.

    Args:
        dirName: Directory to scan.

    Returns:
        list: One path per non-directory entry, each built by joining the
        containing directory with the entry name.
    """
    collected = []
    for name in os.listdir(dirName):
        candidate = os.path.join(dirName, name)
        if not os.path.isdir(candidate):
            collected.append(candidate)
            continue
        # Descend into the sub-directory and keep everything found there.
        collected.extend(getListOfFiles(candidate))
    return collected

# Collect every file below the dataset folder; the trailing expression shows
# how many were found as the cell output.
dirName = r"C:\Users\abhin\archive"
listOfFiles = getListOfFiles(dirName)
len(listOfFiles)

1436

In [3]:
def envelope(y , rate, threshold):
    """Build a per-sample keep/discard mask from a smoothed amplitude envelope.

    The absolute signal is averaged over a centred rolling window of
    ``int(rate/10)`` samples, and a sample is kept when that average is
    strictly greater than ``threshold``.

    Args:
        y: One-dimensional sequence of samples.
        rate: Value used to size the rolling window (``int(rate/10)`` samples).
        threshold: Envelope level a sample's neighbourhood must exceed to be kept.

    Returns:
        list: One ``True``/``False`` entry per sample of ``y``.
    """
    magnitude = pd.Series(y).apply(np.abs)
    smoothed = magnitude.rolling(window=int(rate/10), min_periods=1, center=True).mean()
    return [bool(level > threshold) for level in smoothed]

In [4]:
import glob,pickle
# Strip near-silent samples from every recording and save the trimmed audio.
# The waveform is held in ``audio`` rather than ``signal`` so that the
# ``scipy.signal`` module imported at the top of the notebook is not shadowed.
os.makedirs(r"C:\Users\abhin\Clean Speech", exist_ok=True)  # wavfile.write does not create folders
for file in tqdm(glob.glob(r"C:\Users\abhin\archive\\**\\*.wav")):
    file_name = os.path.basename(file)
    audio, rate = lr.load(file, sr=16000)  # sr=16000 requests a 16 kHz sampling rate
    # Keep only the samples whose smoothed amplitude exceeds the threshold.
    mask = envelope(audio, rate, 0.0005)
    wavfile.write(filename= r"C:\Users\abhin\Clean Speech\\"+str(file_name), rate=rate,data=audio[mask])

100%|██████████████████████████████████████████████████████████████████████████████| 1436/1436 [08:03<00:00,  2.97it/s]


In [5]:
def extract_feature(file_name, mfcc, chroma, mel):
    """Compute a 1-D feature vector for one audio file.

    Each enabled feature family is averaged over time and appended in the
    order MFCC -> chroma -> mel spectrogram. A disabled family contributes
    nothing to the vector.

    Relies on ``soundfile`` and ``librosa`` being available under exactly
    those module-level names at call time.

    Args:
        file_name: Path of the audio file, opened with ``soundfile.SoundFile``.
        mfcc: When truthy, append the time-averaged MFCCs (``n_mfcc=40``).
        chroma: When truthy, append the time-averaged chroma features computed
            from the STFT magnitude.
        mel: When truthy, append the time-averaged mel spectrogram.

    Returns:
        numpy.ndarray: Concatenation of the requested features; empty when
        every flag is falsy.
    """
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # Collect the pieces under their own guards so that a disabled family is
    # skipped entirely instead of leaking its boolean flag into the vector.
    parts = []
    if mfcc:
        parts.append(np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0))
    if chroma:
        stft = np.abs(librosa.stft(X))
        parts.append(np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0))
    if mel:
        # The audio is passed by keyword: recent librosa releases no longer
        # accept it positionally for melspectrogram.
        parts.append(np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0))
    # The leading empty array preserves the original float64 result and the
    # empty-vector case.
    return np.hstack([np.array([])] + parts)

In [6]:
"""def extract_feature_images(file_name):
    data, sampling_rate = librosa.load(file_name)
    x = librosa.display.waveplot(data, sr=sampling_rate)
    return x"""

'def extract_feature_images(file_name):\n    data, sampling_rate = librosa.load(file_name)\n    x = librosa.display.waveplot(data, sr=sampling_rate)\n    return x'

In [7]:
# Lookup from the two-digit emotion code to its label. load_data() reads the
# code from the third dash-separated field of each file name.
emotions = dict([
    ('01', 'neutral'),
    ('02', 'calm'),
    ('03', 'happy'),
    ('04', 'sad'),
    ('05', 'angry'),
    ('06', 'fearful'),
    ('07', 'disgust'),
    ('08', 'surprised'),
])

# Emotions kept for training; recordings labelled with any other emotion are skipped.
observed_emotions = ['calm', 'happy', 'sad', 'fearful']

In [11]:
from glob import glob
import os
import glob
def load_data(test_size1=0.27, data_glob=r"C:\Users\abhin\archive\\**\\*.wav"):
    """Extract features for the observed emotions and split them into train/test sets.

    The emotion of a recording is looked up in ``emotions`` using the third
    dash-separated field of its file name. Recordings whose emotion is not in
    ``observed_emotions`` are skipped.

    Args:
        test_size1: Passed to ``train_test_split`` as ``test_size``.
        data_glob: Glob pattern selecting the audio files to load. Defaults to
            the dataset location the notebook was written against.

    Returns:
        The result of ``train_test_split`` (called with ``random_state=9``)
        applied to the feature matrix and a list of ``[emotion, file_name]``
        pairs.

    Raises:
        IndexError: If a file name has fewer than three dash-separated fields.
        KeyError: If the emotion code of a file is not a key of ``emotions``.
    """
    x, y = [], []
    for file in glob.glob(data_glob):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        x.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        # The file name is kept next to the label so that predictions can be
        # traced back to individual recordings.
        y.append([emotion, file_name])
    return train_test_split(np.array(x), y, test_size=test_size1, random_state=9)

In [12]:
import librosa
import numpy as np
import librosa.display
# Build the train/test split, holding out a third of the samples for testing.
x_train, x_test, y_trai, y_tes = load_data(test_size1=0.33)
print(np.shape(x_train),np.shape(x_test), np.shape(y_trai),np.shape(y_tes))

# Every label entry is an [emotion, file_name] pair. Transposing the pairs
# gives one row of emotions and one row of file names per split.
y_train_map = np.array(y_trai).T
y_train, train_filename = y_train_map[0], y_train_map[1]
y_test_map = np.array(y_tes).T
y_test, test_filename = y_test_map[0], y_test_map[1]

print(np.shape(y_train),np.shape(y_test))
# List the recordings that ended up in the test split, one per line.
print(*test_filename,sep="\n")

(511, 180) (253, 180) (511, 2) (253, 2)
(511,) (253,)
03-01-03-01-01-01-15.wav
03-01-02-02-02-01-09.wav
03-01-03-02-01-02-21.wav
03-01-04-01-01-02-05.wav
03-01-04-01-02-01-21.wav
03-01-04-02-02-01-20.wav
03-01-03-02-02-02-19.wav
03-01-03-02-02-01-03.wav
03-01-02-02-01-02-11.wav
03-01-06-02-02-01-19.wav
03-01-04-01-01-01-06.wav
03-01-02-02-02-02-21.wav
03-01-02-01-01-01-15.wav
03-01-03-02-01-02-10.wav
03-01-06-02-02-01-14.wav
03-01-03-01-01-01-14.wav
03-01-03-01-01-01-02.wav
03-01-04-01-02-01-24.wav
03-01-04-01-01-01-11.wav
03-01-06-01-02-02-03.wav
03-01-02-01-01-02-05.wav
03-01-06-02-02-02-01.wav
03-01-06-02-02-01-08.wav
03-01-02-01-02-01-03.wav
03-01-03-02-01-01-06.wav
03-01-03-01-02-02-04.wav
03-01-06-02-01-02-09.wav
03-01-06-02-02-01-10.wav
03-01-04-01-01-02-16.wav
03-01-02-01-02-02-09.wav
03-01-03-01-02-02-10.wav
03-01-02-01-01-01-23.wav
03-01-03-02-02-02-23.wav
03-01-04-01-02-02-09.wav
03-01-03-01-02-01-15.wav
03-01-04-02-02-02-19.wav
03-01-04-02-01-01-19.wav
03-01-04-02-01-02-22.

In [None]:
#print(images)

In [13]:
# Show the first feature vector of each split as a quick sanity check.
print((x_train[0], x_test[0]))
# Report the number of features extracted per sample.
print(f'Features extracted: {x_train.shape[1]}')

(array([-7.03027832e+02,  5.72944145e+01,  5.33608627e+00,  1.11139946e+01,
        1.66350055e+00,  7.87825108e+00, -9.22317314e+00, -3.41159439e+00,
       -1.16426344e+01, -2.94879866e+00, -4.47133493e+00, -1.20431423e-01,
       -6.49245739e+00, -6.70930576e+00, -2.14721346e+00, -1.12317348e+00,
       -5.54660797e+00, -1.86105657e+00, -3.69644666e+00, -2.79150295e+00,
       -3.40290904e+00, -3.03830242e+00, -4.13836527e+00, -4.69398260e+00,
       -3.43905377e+00, -3.53944445e+00, -4.62198448e+00, -3.00573158e+00,
       -3.57722330e+00, -4.28554630e+00, -3.57737017e+00, -1.45512438e+00,
       -2.66516614e+00,  6.48885742e-02, -2.40921304e-01,  1.22853625e+00,
        4.30144489e-01,  2.63056755e-01,  4.00095284e-01,  1.90424725e-01,
        6.05980396e-01,  5.46665370e-01,  5.23380876e-01,  5.23688197e-01,
        5.74538410e-01,  5.95160365e-01,  6.07343972e-01,  6.03799462e-01,
        5.98900914e-01,  6.06616855e-01,  6.23565376e-01,  6.59761727e-01,
        4.28079784e-06, 

In [None]:
#model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [14]:
from sklearn.preprocessing import LabelEncoder

def dummyEncode(df):
    """Integer-encode every ``category``/``object`` column of ``df`` in place.

    Each selected column is replaced by the output of a freshly fitted
    ``LabelEncoder``. Encoding is best-effort: a column that cannot be encoded
    is reported on stdout and left unchanged.

    Args:
        df: DataFrame whose categorical/object columns should be encoded.
            It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    columnsToEncode = list(df.select_dtypes(include=['category','object']))
    for feature in columnsToEncode:
        try:
            df[feature] = LabelEncoder().fit_transform(df[feature])
        except Exception as error:
            # Column labels are not always strings (a frame built from a plain
            # array has the integer label 0), so the label is formatted rather
            # than concatenated with ``+``.
            print(f'Error encoding {feature}: {error}')
    return df
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)
print(y_train)
print('\n\n\n\n')
# Fit one encoder on the training labels and reuse it for the test labels, so
# that a given emotion maps to the same integer in both splits. Encoding each
# split with its own fit would silently shift the codes whenever one split
# lacks a class. transform() raises ValueError if the test split contains a
# label that never occurs in training.
label_encoder = LabelEncoder().fit(y_train[0])
y_train[0] = label_encoder.transform(y_train[0])
y_test[0] = label_encoder.transform(y_test[0])
print(y_train)

           0
0        sad
1        sad
2    fearful
3      happy
4        sad
..       ...
506  fearful
507      sad
508     calm
509  fearful
510     calm

[511 rows x 1 columns]





     0
0    3
1    3
2    1
3    2
4    3
..  ..
506  1
507  3
508  0
509  1
510  0

[511 rows x 1 columns]


In [15]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels. The class count is fixed explicitly so
# that the train and test matrices always have the same number of columns.
# Left to its default, to_categorical infers the width from the largest label
# present in each input separately.
num_classes = len(observed_emotions)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

In [16]:
# Disabled reshaping experiments, kept for reference:
#y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
#y_test = np.asarray(y_test).astype('float32').reshape((-1,1))
#x_test = np.expand_dims(x_test, axis=2)
# Inspect the one-hot label matrix and the shapes of both label sets.
print(y_train)
print(y_train.shape)
print(y_test.shape)

[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
(511, 4)
(253, 4)


In [None]:
#x_train = np.expand_dims(x_train, axis=2) 

In [17]:
# Show the shape of the training feature matrix.
print(x_train.shape)

(511, 180)


In [18]:
from tensorflow.keras import layers
from tensorflow.keras import models

#X = np.expand_dims(X, axis=2)
# Start from an empty Sequential model; its layers are added in the next cell.
model = models.Sequential()

In [19]:
"""model.add(layers.Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(layers.MaxPooling1D(pool_size=2))"""
# Fully connected classifier: six ReLU layers of decreasing width followed by
# a 4-way softmax output. Every hidden layer except the 120-unit one is
# followed by Dropout(0.3).
for width, with_dropout in (
    (1000, True),
    (600, True),
    (200, True),
    (120, False),
    (64, True),
    (32, True),
):
    model.add(layers.Dense(width, activation='relu'))
    if with_dropout:
        model.add(layers.Dropout(0.3))
model.add(layers.Dense(4,activation = 'softmax'))

In [20]:
from tensorflow.keras.callbacks import EarlyStopping
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])
# Early stopping watches validation accuracy. 'accuracy' is the only compiled
# metric, so a 'val_auc' value is never logged and could not be monitored.
es = EarlyStopping(monitor='val_accuracy', patience=5, min_delta=0.001, mode='max')
# The callback has to be handed to fit(); otherwise all 2000 epochs run.
# NOTE(review): the test split is also used as the validation set here, so the
# stopping point is chosen on test data. Consider a separate validation split.
history = model.fit(x_train,y_train,epochs = 2000,batch_size = 512,validation_data = (x_test,y_test),callbacks = [es])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E