# Loading

In [None]:
import pickle
from tensorflow.keras.saving import load_model
import soundfile
import librosa
import pandas as pd
import numpy as np
import os

In [None]:
!gdown 1s82xC_XU5rE45kd_h-Lb1-ScspRuSyfY
!unzip models.zip

Downloading...
From: https://drive.google.com/uc?id=1s82xC_XU5rE45kd_h-Lb1-ScspRuSyfY
To: /content/models.zip
  0% 0.00/841k [00:00<?, ?B/s]100% 841k/841k [00:00<00:00, 114MB/s]
Archive:  models.zip
replace models/ann.h5? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace models/encoder.pickle? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace models/scaler.pickle? [y]es, [n]o, [A]ll, [N]one, [r]ename: no


In [None]:
# Restore the trained network and the fitted encoder/scaler extracted into models/.
model = load_model("models/ann.h5")
# SECURITY: pickle.load can execute arbitrary code while unpickling. These files
# come from a downloaded archive, so only run this cell if that archive is trusted.
with open("models/encoder.pickle", "rb") as f:
    encoder = pickle.load(f)
with open("models/scaler.pickle", "rb") as f:
    scaler = pickle.load(f)

In [None]:
def mfcc(testpath):
    """Build a fixed-length MFCC summary vector for one audio file.

    The file is read with soundfile, MFCCs are computed with librosa's default
    settings, two delta arrays are derived from them, and the three arrays are
    stacked row-wise. Each row of the stack is then reduced along axis 1 to its
    mean, max, min and standard deviation.

    Parameters
    ----------
    testpath
        Path of the audio file; passed straight to ``soundfile.read``.

    Returns
    -------
    pandas.Series
        The statistic vectors concatenated in the order mean, max, min, std
        (240 values for the demo clip in this notebook).
    """
    samples, sample_rate = soundfile.read(testpath, dtype="float32")
    # soundfile returns multi-channel audio as (frames, channels), while librosa
    # treats the last axis as time. Average the channels so such files yield the
    # same feature layout as mono ones; mono input is left untouched.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    coeffs = librosa.feature.mfcc(y=samples, sr=sample_rate)
    # NOTE(review): `delta` is not a librosa.feature.delta parameter; it is
    # forwarded to scipy.signal.savgol_filter as the sample spacing. Both calls
    # therefore compute a first-order delta and del2 is just del1 / 2 -- a true
    # second-order delta would be `order=2`. Deliberately left unchanged: the
    # scaler/model loaded by this notebook were presumably fitted on exactly
    # these features -- TODO confirm and retrain before correcting.
    del1 = librosa.feature.delta(coeffs, delta=1)
    del2 = librosa.feature.delta(coeffs, delta=2)
    stacked = np.vstack((coeffs, del1, del2))
    summary = (
        stacked.mean(axis=1),
        stacked.max(axis=1),
        stacked.min(axis=1),
        stacked.std(axis=1),
    )
    return pd.Series(np.hstack(summary))

In [None]:
def predictEmotion(filepath:str):
    """Predict the label for a single audio file.

    Runs the notebook's pipeline end to end: mfcc() feature extraction,
    scaler.transform, model.predict, then encoder.inverse_transform.

    Parameters
    ----------
    filepath : str
        Path of the audio file handed to mfcc().

    Returns
    -------
    Whatever ``encoder.inverse_transform`` returns for the one-sample batch.
    """
    # The scaler is given a one-element batch: a list holding this file's features.
    scaled_batch = scaler.transform([mfcc(filepath)])
    raw_output = model.predict(scaled_batch)
    return encoder.inverse_transform(raw_output)

# App

In [None]:
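# NOTE: disabled interactive loop -- it prompts for a file path with input() until "0"
# is entered and prints the predicted emotion. Kept commented out for reference.
#supported_formats = ["wav"]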

#while(True):
#    print("EMOTION PREDICTION")
#    testfile = input("Enter file name [0 to exit]: ")
#    if testfile == "0":
#        break
#    if not os.path.exists(testfile):
#        print("Please enter path to a real file!\n\n\n")
#        continue
#    if sum([testfile.endswith(f) for f in supported_formats]) < 1:
#        print(f"Please enter path to an audio file! {supported_formats}\n\n\n")
#        continue
#    pred = predictEmotion(testfile)
#    print(f"Predicted emotion: {pred[0]}\n\n\n")

**DEMO**

In [None]:
import wave

In [None]:
# Audio clip used by the demo cells below (absolute path under Colab's /content directory).
# NOTE(review): no cell in this notebook creates this file, so it presumably has to be
# uploaded manually before running -- confirm.
tester_path = '/content/klee.wav'

In [None]:
# Confirm the clip opens as a WAV file and show its header parameters.
# The context manager closes the file handle; a bare wave.open() call left it open.
with wave.open(tester_path, 'r') as wav_file:
    wav_params = wav_file.getparams()
wav_params

<wave.Wave_read at 0x7f36acfcab00>

In [None]:
# Render an inline audio player for the demo clip.
from IPython.display import Audio, display
demo_player = Audio(tester_path)
display(demo_player)


In [None]:
# Extract the MFCC summary features for the demo clip.
# Despite its name, `testfile` holds the feature Series returned by mfcc(), not a file.
testfile = mfcc(tester_path)
testfile

0     -418.591980
1       95.818634
2      -22.850357
3       11.824527
4      -13.418823
          ...    
235      0.629527
236      0.650654
237      0.675660
238      0.851285
239      0.837959
Length: 240, dtype: float32

In [None]:
# Wrap the feature Series in a DataFrame: a single column with one row per feature.
tester = pd.DataFrame(testfile)
tester

Unnamed: 0,0
0,-418.591980
1,95.818634
2,-22.850357
3,11.824527
4,-13.418823
...,...
235,0.629527
236,0.650654
237,0.675660
238,0.851285


In [None]:
# Transpose so the clip becomes one row with one column per feature,
# the 2-D layout that is passed to scaler.transform in the next cell.
tester_audio = tester.transpose()
tester_audio

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,230,231,232,233,234,235,236,237,238,239
0,-418.59198,95.818634,-22.850357,11.824527,-13.418823,14.411099,-1.775394,3.679089,-14.961114,2.15923,...,0.835354,0.950542,0.939077,0.768695,0.565494,0.629527,0.650654,0.67566,0.851285,0.837959


In [None]:
# Apply the scaler loaded from models/scaler.pickle to the single-row feature frame.
scaled_tester = scaler.transform(tester_audio)
scaled_tester

array([[ 1.567613  ,  3.196689  , -1.2434533 ,  0.31499913, -0.8993186 ,
         2.7287648 ,  1.4203584 ,  2.3004348 , -2.346066  ,  0.9672646 ,
         1.4821504 ,  0.5087619 ,  1.6101441 , -0.3855991 ,  2.3386412 ,
         1.2423142 ,  0.47841138,  1.5827669 ,  1.1830512 ,  0.16538125,
         1.2080245 ,  0.21379294, -0.07041633,  1.3871989 , -1.0456984 ,
         0.4488326 ,  0.13096146,  0.06704152, -0.7939819 ,  0.23645507,
         0.70943886,  0.07673223, -0.41187838, -0.49436402, -0.4414784 ,
        -1.283751  , -1.0830961 ,  0.3457078 , -1.513852  , -1.3011223 ,
         1.2080245 ,  0.21379294, -0.07041633,  1.3871989 , -1.0456984 ,
         0.4488326 ,  0.13096146,  0.06704152, -0.7939819 ,  0.23645507,
         0.70943886,  0.07673223, -0.41187838, -0.49436402, -0.4414784 ,
        -1.283751  , -1.0830961 ,  0.3457078 , -1.513852  , -1.3011223 ,
         0.7865514 , -0.37321666, -3.0825398 , -1.9621127 , -1.0776067 ,
         1.4084865 ,  1.5218452 ,  8.235504  , -1.7

In [None]:
# Run the network on the scaled features, then map its output back to a label
# with the encoder.
demo_output = model.predict(scaled_tester)
encoder.inverse_transform(demo_output)



array(['fearful'], dtype='<U9')