<a href="https://colab.research.google.com/github/ValeryNikiforov/Speech-Emotion-Recognition/blob/master/SER_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import scipy.io.wavfile as wavfile
from scipy.fftpack import dct
!pip install librosa==0.7.2
import librosa
import torch
import torch.nn as nn




In [None]:
# Mount Google Drive; the checkpoint and audio paths used further down all start with 'drive/My Drive/'.
# Colab-specific: this relies on the google.colab package provided by the Colab runtime.
# NOTE(review): those later paths are relative, so they presumably assume the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
# Class index -> emotion label; predict() uses it to turn the network's argmax into text.
# NOTE(review): the index order presumably mirrors the label encoding used at training time — confirm before reordering.
emotions = {
  0: 'angry',
  1: 'disgust',
  2: 'fear',
  3: 'happy',
  4: 'neutral',
  5: 'surprise',
  6: 'sad',
}

In [None]:
def mfcc(X_raw_data, apply_window=False):
  """Compute liftered MFCC-style features for one audio clip.

  The clip is cut into 25 ms frames taken every 10 ms (200 / 80 samples at
  the 8 kHz rate assumed here). Each frame's 512-point power spectrum is
  passed through 26 triangular mel filters, the log filter-bank energies are
  DCT-transformed (type 2, orthonormal) and sinusoidally liftered. Column 0
  of the result is then overwritten with the log of the frame's total
  spectral power.

  The number of frames is derived from the clip length instead of being
  fixed at 297, so clips of any length are accepted; a 23879-sample clip
  still yields the same (297, 26) array as before.

  Args:
    X_raw_data: 1-D sequence of audio samples. The sample rate is not passed
      in; every constant below assumes 8000 Hz.
    apply_window: if True, multiply every frame by a Hamming window before
      the FFT. Defaults to False because the original code applied the
      window to a zero-filled buffer that was overwritten afterwards, so the
      window never influenced the features.
      NOTE(review): the saved checkpoints were presumably trained on these
      unwindowed features — confirm against the training code before
      enabling this.

  Returns:
    Float array of shape (num_frames, 26).
  """
  sample_rate = 8000
  frame_size = 0.025   # seconds per frame
  frame_stride = 0.01  # seconds between frame starts
  NFFT = 512
  nfilt = 26
  num_ceps = 26
  cep_lifter = 22

  frame_length = int(round(frame_size * sample_rate))
  frame_step = int(round(frame_stride * sample_rate))

  # Zero-pad the tail so the last frame is complete. A clip shorter than one
  # frame still produces a single, padded frame.
  signal_length = len(X_raw_data)
  num_frames = max(1, 1 + int(np.ceil(float(signal_length - frame_length) / frame_step)))
  pad_signal_length = (num_frames - 1) * frame_step + frame_length
  pad_signal = np.append(X_raw_data, np.zeros(pad_signal_length - signal_length))

  # Row r of `frames` holds samples [r * frame_step, r * frame_step + frame_length).
  frame_starts = frame_step * np.arange(num_frames)
  within_frame = np.arange(frame_length)
  frames = pad_signal[frame_starts[:, None] + within_frame[None, :]]
  if apply_window:
    frames = frames * np.hamming(frame_length)

  # Power spectrum per frame: (num_frames, NFFT // 2 + 1).
  pow_frames = (1.0 / NFFT) * np.square(np.absolute(np.fft.rfft(frames, NFFT)))

  # Triangular filters spaced evenly on the mel scale between 0 Hz and Nyquist.
  high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700.)
  mel_points = np.linspace(0, high_freq_mel, nfilt + 2)
  hz_points = 700 * (10**(mel_points / 2595.0) - 1)
  bins = np.floor((NFFT + 1) * hz_points / sample_rate)

  fbanks = np.zeros((nfilt, NFFT // 2 + 1))
  for m in range(nfilt):
    left, center, right = int(bins[m]), int(bins[m + 1]), int(bins[m + 2])
    for k in range(left, center):  # rising edge
      fbanks[m, k] = (k - bins[m]) / (bins[m + 1] - bins[m])
    for k in range(center, right):  # falling edge
      fbanks[m, k] = (bins[m + 2] - k) / (bins[m + 2] - bins[m + 1])

  # Exact zeros are replaced by machine epsilon so the logarithms stay finite.
  eps = np.finfo(float).eps
  filter_banks = np.dot(pow_frames, fbanks.T)
  filter_banks = np.where(filter_banks == 0, eps, filter_banks)
  energy = np.sum(pow_frames, 1)
  energy = np.where(energy == 0, eps, energy)

  lift = 1 + (cep_lifter / 2.) * np.sin(np.pi * np.arange(num_ceps) / cep_lifter)
  coeffs = dct(np.log(filter_banks), type=2, axis=1, norm='ortho')[:, :num_ceps]
  coeffs = lift * coeffs
  # The first coefficient is replaced by the log frame energy.
  coeffs[:, 0] = np.log(energy)
  return coeffs

In [None]:
def melspec(X):
  """Return a log-scaled mel spectrogram of an audio clip.

  Args:
    X: either a 1-D array of samples (what open_prepare() passes in) or a
      two-element ``(sample_rate, samples)`` pair such as the return value
      of ``scipy.io.wavfile.read``; for a pair only the samples are used.

  Returns:
    2-D float array: ``20 * log10`` of the librosa mel spectrogram, with
    exact zeros replaced by machine epsilon before the logarithm.
  """
  # The original body always took X[1]. For the plain sample array handed
  # over by open_prepare() that selects a single scalar sample, so only
  # unwrap a real (rate, samples) pair.
  is_rate_samples_pair = (
    isinstance(X, (tuple, list)) and len(X) == 2 and np.ndim(X[1]) > 0
  )
  samples = np.array(X[1] if is_rate_samples_pair else X, dtype=float)
  # sr, n_fft, hop_length and n_mels are left at librosa's defaults; the
  # signal is passed by keyword so the call does not depend on argument order.
  spec = librosa.feature.melspectrogram(y=samples, win_length=1024)
  spec = np.where(spec == 0, np.finfo(float).eps, spec)
  return 20 * np.log10(spec)

In [None]:
class NeuralNet(nn.Module):
    """Conv1d -> bidirectional LSTM -> Transformer encoder layer -> linear emotion classifier.

    forward() takes a float tensor shaped (batch, 128, time) and returns
    (batch, 7) unnormalised class scores.

    NOTE(review): predict() in this notebook calls the object returned by
    torch.load() directly, so the .pth files appear to hold whole pickled
    modules. Unpickling looks this class up by name and restores its
    attributes, so keep the class name and the layer attribute names stable.
    """

    def __init__(self):
        super(NeuralNet, self).__init__()

        self.input_spec_size = 128
        self.cnn_filter_size = 128
        self.num_layers_lstm = 1
        self.num_heads_self_attn = 4
        self.hidden_size_lstm = 60
        self.num_emo_classes = 7

        self.conv_1 = nn.Conv1d(self.input_spec_size, self.cnn_filter_size, 3, 1)
        self.max_pooling_1 = nn.MaxPool1d(5)
        # A single BatchNorm1d instance; forward() applies it after both convolutions.
        self.bn = nn.BatchNorm1d(self.cnn_filter_size)
        self.conv_2 = nn.Conv1d(self.cnn_filter_size, self.cnn_filter_size, 3, 1)
        # Constructed but not applied in forward().
        self.max_pooling_2 = nn.MaxPool1d(3)

        # nn.LSTM only applies dropout between stacked layers. Asking for it
        # with a single layer has no effect and makes PyTorch emit a
        # UserWarning, so request it only when the LSTM is actually stacked.
        lstm_dropout = 0.3 if self.num_layers_lstm > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=self.cnn_filter_size,
            hidden_size=self.hidden_size_lstm,
            num_layers=self.num_layers_lstm,
            bidirectional=True,
            dropout=lstm_dropout,
            batch_first=True,
        )
        # d_model is twice the LSTM hidden size because the LSTM is bidirectional.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.hidden_size_lstm * 2,
            dim_feedforward=512,
            nhead=self.num_heads_self_attn,
        )
        # The classifier sees mean and std pooled over time, hence 2 * (2 * hidden).
        self.emotion_layer = nn.Linear(self.hidden_size_lstm * 4, self.num_emo_classes)

    def forward(self, inputs):
        """Map a (batch, 128, time) feature tensor to (batch, 7) class scores."""
        out = self.conv_1(inputs)
        out = self.max_pooling_1(out)
        out = self.bn(out)
        out = self.conv_2(out)
        # max_pooling_2 is intentionally skipped here (it was commented out in the original).
        out = self.bn(out)
        # (batch, channels, time) -> (batch, time, channels) for the batch_first LSTM.
        out = out.permute(0, 2, 1)
        out, (final_hidden_state, final_cell_state) = self.lstm(out)
        # NOTE(review): encoder_layer is built without batch_first, i.e. it
        # expects (seq, batch, feature), but receives the LSTM's
        # (batch, time, feature) output, so self-attention runs along the
        # batch axis. Left unchanged because existing weights were presumably
        # trained with this layout — confirm before changing.
        out = self.encoder_layer(out)
        # Statistics pooling over axis 1, then concatenate along the feature axis.
        mean = torch.mean(out, 1)
        std = torch.std(out, 1)
        stat = torch.cat((mean, std), 1)
        pred_emo = self.emotion_layer(stat)
        return pred_emo

In [None]:
def open_prepare(path, feats_type):
  """Read a WAV file, force it to a fixed length and extract features.

  Clips with fewer than 23879 sample values are zero-padded at the end;
  longer ones are cut to their first 23879 entries.

  Args:
    path: location of the WAV file, passed straight to wavfile.read.
    feats_type: 'mfcc' selects mfcc(); any other value falls through to
      melspec().

  Returns:
    Whatever the selected feature function returns for the fixed-length clip.
  """
  target_len = 23879
  # The sample rate reported by the file is discarded; only the data is used.
  # NOTE(review): mfcc() hard-codes 8000 Hz — confirm the recordings match.
  _, samples = wavfile.read(path)

  missing = target_len - np.size(samples)
  if missing > 0:
    clip = np.append(samples, np.zeros(missing))
  else:
    clip = samples[:target_len]

  extractor = mfcc if feats_type == 'mfcc' else melspec
  return extractor(clip)

In [None]:
def predict(features, feats_type):
  """Load the checkpoint for ``feats_type``, classify one clip and print its emotion label.

  Args:
    features: 2-D feature array for a single clip, as returned by
      open_prepare().
    feats_type: 'mfcc' or 'melspec'; selects both the checkpoint file and
      the axis order fed to the network.

  Raises:
    ValueError: if ``feats_type`` is not one of the two supported values
      (this used to surface later as an UnboundLocalError on ``model``).
  """
  # feats_type -> (checkpoint path, permutation applied once a trailing unit
  # axis has been appended; that unit axis ends up first, as the batch axis).
  setups = {
    'mfcc': ('drive/My Drive/model_mfcc13.pth', (2, 1, 0)),
    'melspec': ('drive/My Drive/model_spec.pth', (2, 0, 1)),
  }
  if feats_type not in setups:
    raise ValueError("feats_type must be 'mfcc' or 'melspec', got %r" % (feats_type,))
  checkpoint_path, axis_order = setups[feats_type]

  features = torch.Tensor(features).unsqueeze(-1).permute(*axis_order)

  # torch.load unpickles arbitrary Python objects: only point it at trusted files.
  model = torch.load(checkpoint_path)
  # Inference only: eval() puts BatchNorm/Dropout layers into inference
  # behaviour, and no_grad() skips autograd bookkeeping.
  model.eval()
  with torch.no_grad():
    out = model(features)
  out = torch.argmax(out)
  print(emotions[out.item()])

In [None]:
# End-to-end demo: extract features from one recording on Drive and print the predicted emotion.
# The same feats_type is passed to both calls so the features match the checkpoint that predict() loads.
feats_type = 'mfcc' #'mfcc' or 'melspec'
inp=open_prepare('drive/My Drive/YAF_wire_sad.wav', feats_type)
predict(inp, feats_type)

(1, 23879)
sad
