In [4]:
import pickle
import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd
import os
from transformers import BertTokenizer,  BertForSequenceClassification, AdamW, BertConfig
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data
from tensorboardX import SummaryWriter
# from torchvggish import vggish, vggish_input
import sys
import random
import csv
from sklearn.metrics import confusion_matrix,classification_report
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import seaborn as sns
from datetime import datetime
from utils import * 
from scipy import signal
from scipy.io import wavfile
import librosa
import librosa.display

ModuleNotFoundError: No module named 'numpy'

In [5]:
import pandas as pd
from utils import *
# Collect the wav paths of all five IEMOCAP sessions, then join them with the
# label table on the utterance id (`sessionID`).
# `file_search` comes from `utils`; it is presumably expected to add the files
# found under `path` to `list_files` in place -- TODO confirm.
list_files = []
for session_number in range(1, 6):
    sess_name = f"Session{session_number}"
    path = f"data/IEMOCAP_full_release/{sess_name}/sentences/wav/"
    file_search(path, list_files)
    list_files = sorted(list_files)
    # Running total of files collected so far (not the per-session count).
    print(f"{sess_name}, #sum_file: {len(list_files)}")

# Utterance id = file name without directory and without extension.
list_files_cut = [wav_path.split('.')[-2].split('/')[-1] for wav_path in list_files]
df = pd.DataFrame({'path': list_files, 'sessionID': list_files_cut})
df1 = pd.read_csv('speech_data/processed_digital_labels_head.csv')
# Inner join: only utterances that have a label row are kept.
merged_df = df.merge(df1, on='sessionID', how='inner')
merged_df.head()

Session1, #sum_file: 1820
Session2, #sum_file: 3633
Session3, #sum_file: 5769
Session4, #sum_file: 7873
Session5, #sum_file: 10043


Unnamed: 0,path,sessionID,label
0,data/IEMOCAP_full_release/Session1/sentences/w...,Ses01F_impro01_F001,2
1,data/IEMOCAP_full_release/Session1/sentences/w...,Ses01F_impro01_F002,2
2,data/IEMOCAP_full_release/Session1/sentences/w...,Ses01F_impro01_F003,-1
3,data/IEMOCAP_full_release/Session1/sentences/w...,Ses01F_impro01_F004,-1
4,data/IEMOCAP_full_release/Session1/sentences/w...,Ses01F_impro01_F005,2


In [None]:
def audio2spectrogram(filepath):
    """Read a wav file, compute its log spectrogram, plot it and return it.

    The sample rate, the spectrogram shape and its type are printed as
    debugging output. The plot shows the transposed spectrogram with the
    origin in the lower-left corner.

    NOTE(review): relies on `plt` being in scope; the matplotlib import in the
    import cell is commented out, so it presumably arrives via
    `from utils import *` -- confirm.

    Args:
        filepath: Path to the wav file, passed to `scipy.io.wavfile.read`.

    Returns:
        The second value returned by `log_specgram` (the log spectrogram).
    """
    rate, samples = wavfile.read(filepath, mmap=True)
    print('sample rate', rate)
    log_spec = log_specgram(samples, rate)[1]
    print('spectrogram shape', log_spec.shape)
    print(type(log_spec))
    plt.imshow(log_spec.T, aspect='auto', origin='lower')
    return log_spec

def audio2wave(filepath):
    """Plot the raw samples of a wav file on a new 5x5 figure.

    Args:
        filepath: Path to the wav file, passed to `scipy.io.wavfile.read`.

    Returns:
        None; the function only draws the plot.
    """
    plt.figure(figsize=(5, 5))
    _rate, samples = wavfile.read(filepath, mmap=True)
    plt.plot(samples)

def log_specgram(audio, sample_rate, window_size=40, step_size=20, eps=1e-10):
    """Compute a log-scaled spectrogram with a Hann window.

    Args:
        audio: Sample array handed to `scipy.signal.spectrogram`.
        sample_rate: Sampling frequency of `audio` in Hz.
        window_size: Segment length in milliseconds (converted to samples).
        step_size: Value in milliseconds converted to samples and passed as
            `noverlap`, i.e. the overlap between neighbouring segments.
        eps: Offset added before the logarithm to avoid `log(0)`.

    Returns:
        Tuple `(freqs, log_spec)` where `freqs` are the frequency bins and
        `log_spec` is the transposed float32 spectrogram after `np.log`.
    """
    segment_len = int(round(window_size * sample_rate / 1e3))
    overlap_len = int(round(step_size * sample_rate / 1e3))
    print('nperseg', segment_len)
    print('noverlap', overlap_len)
    freqs, _times, power = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=segment_len,
        noverlap=overlap_len,
        detrend=False,
    )
    log_power = np.log(power.T.astype(np.float32) + eps)
    return freqs, log_power

In [None]:
## Sample audio file

In [None]:
# Load the first collected wav file as the running example for the demo cells.
# NOTE(review): the name `data` rebinds the `torch.utils.data` module imported
# in the import cell; any later use of that module under this name will break.
data, sr = librosa.load(list_files[0])

In [None]:
# Inline audio player for the example clip.
# NOTE(review): `ipd` is not imported in the visible import cell -- presumably
# `IPython.display` provided through `from utils import *`; confirm.
ipd.Audio(data, rate=sr)

In [None]:
# Create and display a log-mel spectrogram of the example clip:
# 128 mel bands up to 8 kHz, power converted to dB before plotting.
plt.figure(figsize=(10, 5))
spectrogram = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
log_spectrogram = librosa.power_to_db(spectrogram)
librosa.display.specshow(log_spectrogram, y_axis='mel', sr=sr, x_axis='time')
plt.title('Mel spectrogram')
plt.colorbar(format='%+02.0f dB')

In [None]:
# Compute 30 MFCCs for the example clip and plot them in the first row of a
# 3-row subplot grid, then show an audio player.
# NOTE(review): the global `mfcc` bound here has the same name as the `mfcc()`
# helper defined in the feature-extraction cell; re-running this cell after
# that one replaces the function with an array.
mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=30)
plt.figure(figsize=(16, 10))
plt.subplot(3, 1, 1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()
ipd.Audio(data, rate=sr)

### Data Augmentation

In [None]:
def noise(data):
    """Return `data` with additive Gaussian noise.

    The noise amplitude is a random fraction (uniform in [0, 0.035)) of the
    maximum value of `data`. One noise sample is drawn per element along the
    first axis.

    Args:
        data: Array of audio samples.

    Returns:
        A new array `data + amplitude * gaussian_noise`.
    """
    scale = 0.035*np.random.uniform()*np.amax(data)
    jitter = np.random.normal(size=data.shape[0])
    return data + scale*jitter
def stretch(data, rate=0.8):
    """Time-stretch an audio signal without changing its pitch.

    Args:
        data: Array of audio samples.
        rate: Stretch factor forwarded to `librosa.effects.time_stretch`.

    Returns:
        The time-stretched signal returned by librosa.
    """
    # `rate` is keyword-only in recent librosa releases; passing it by name
    # works on older releases as well.
    return librosa.effects.time_stretch(data, rate=rate)
def shift(data):
    """Circularly shift `data` by a random number of samples.

    The offset is drawn uniformly from [-5, 5), scaled by 1000 and truncated
    to an integer, so samples rolled off one end reappear at the other.

    Args:
        data: Array of audio samples.

    Returns:
        The rolled array.
    """
    offset = int(1000 * np.random.uniform(low=-5, high=5))
    return np.roll(data, offset)
def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift an audio signal.

    Args:
        data: Array of audio samples.
        sampling_rate: Sampling rate of `data`, forwarded as `sr`.
        pitch_factor: Forwarded as `n_steps` to `librosa.effects.pitch_shift`.

    Returns:
        The pitch-shifted signal returned by librosa.
    """
    # `sr` and `n_steps` are keyword-only in recent librosa releases; passing
    # them by name works on older releases as well.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

In [None]:
# Normal audio: waveform plot and inline player for the unmodified example clip.
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=data, sr=sr)
ipd.Audio(data, rate=sr)

In [None]:
# Audio with noise: apply `noise()` to the example clip, then plot and play it.
# The result differs on every run because `noise()` draws random values.
x = noise(data)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=x, sr=sr)
ipd.Audio(x, rate=sr)

In [None]:
# Stretched audio: apply `stretch()` with its default rate, then plot and play.
x = stretch(data)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=x, sr=sr)
ipd.Audio(x, rate=sr)

In [None]:
# Shifted audio: apply the random circular `shift()`, then plot and play.
x = shift(data)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=x, sr=sr)
ipd.Audio(x, rate=sr)

In [None]:
# Pitched audio: apply `pitch()` with its default pitch factor, then plot and play.
x = pitch(data, sr)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=x, sr=sr)
ipd.Audio(x, rate=sr)

### Feature Extraction

In [None]:
def zcr(data, frame_length, hop_length):
    """Compute the frame-wise zero-crossing rate of an audio signal.

    Args:
        data: Array of audio samples.
        frame_length: Number of samples per analysis frame.
        hop_length: Number of samples between successive frames.

    Returns:
        The librosa result with singleton dimensions removed.
    """
    # The original call misspelled the function name (`zero_crosssing_rate`).
    rates = librosa.feature.zero_crossing_rate(
        y=data, frame_length=frame_length, hop_length=hop_length
    )
    return np.squeeze(rates)

def rmse(data, frame_length, hop_length):
    """Compute the frame-wise root-mean-square energy of an audio signal.

    Args:
        data: Array of audio samples.
        frame_length: Number of samples per analysis frame.
        hop_length: Number of samples between successive frames.

    Returns:
        The librosa result with singleton dimensions removed.
    """
    # `y` is keyword-only in recent librosa releases; naming it works on older
    # releases as well.
    energy = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)
    return np.squeeze(energy)

def mfcc(data, sr, frame_length=2048, hop_length=512, flatten=True):
    """Compute MFCC features for an audio signal.

    Args:
        data: Array of audio samples.
        sr: Sampling rate of `data`.
        frame_length: FFT window size, forwarded as `n_fft`.
        hop_length: Number of samples between successive frames.
        flatten: When true, return the transposed coefficients as a flat
            1-D vector; otherwise return the transposed matrix (squeezed).

    Returns:
        The transposed MFCC matrix, flattened unless `flatten` is false.
    """
    # Arguments are passed by name because they are keyword-only in recent
    # librosa releases. `frame_length`/`hop_length` used to be ignored here;
    # forwarding them keeps the framing consistent with `zcr` and `rmse`
    # (the defaults match librosa's own defaults, so default calls are unchanged).
    coeffs = librosa.feature.mfcc(y=data, sr=sr, n_fft=frame_length, hop_length=hop_length)
    return np.squeeze(coeffs.T) if not flatten else np.ravel(coeffs.T)

def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
    """Build one flat feature vector for an audio signal.

    The vector is the concatenation of the zero-crossing rate, the RMS energy
    and the flattened MFCCs, in that order.

    Args:
        data: Array of audio samples.
        sr: Sampling rate of `data`.
        frame_length: Number of samples per analysis frame.
        hop_length: Number of samples between successive frames.

    Returns:
        1-D numpy array with all features stacked horizontally.
    """
    # Start from an empty array, as the original did, so the result dtype is
    # unchanged. The original called an undefined `rcr`; the zero-crossing
    # helper is named `zcr`.
    result = np.array([])
    result = np.hstack((result,
                        zcr(data, frame_length, hop_length),
                        rmse(data, frame_length, hop_length),
                        mfcc(data, sr, frame_length, hop_length)))
    return result

def get_features(path, duration=2.5, offset=0.6):
    """Load an audio file and extract features for it and three augmentations.

    The rows of the result are, in order: the original clip, the clip with
    noise, the pitch-shifted clip, and the pitch-shifted clip with noise.

    Args:
        path: Path of the audio file, passed to `librosa.load`.
        duration: Maximum number of seconds to load.
        offset: Number of seconds to skip at the start of the file.

    Returns:
        2-D numpy array with four rows, one feature vector per variant.
    """
    data, sr = librosa.load(path, duration=duration, offset=offset)

    # The pitch shift was previously computed twice with identical arguments;
    # compute it once and reuse it for the noisy variant.
    pitched_audio = pitch(data, sr)
    # Noise is applied in the same order as before (clean clip first).
    variants = (data, noise(data), pitched_audio, noise(pitched_audio))

    # Forward the actual sampling rate rather than relying on the default of
    # `extract_features` happening to match what `librosa.load` returned.
    return np.vstack([extract_features(variant, sr) for variant in variants])




In [8]:
# Report how many CPU cores are available for the parallel feature extraction.
import multiprocessing as mp 
print("Number of processors: ", mp.cpu_count())

Number of processors:  24


In [None]:
# Extract features for all files in parallel and time the whole run.
# The timer starts here and is read again after the results are flattened.
import timeit
start = timeit.default_timer()
def process_feature(path, emotion):
    """Extract all feature rows for one file and pair each with its label.

    Args:
        path: Path of the audio file, passed to `get_features`.
        emotion: Label to attach to every feature row of this file.

    Returns:
        Tuple `(rows, labels)`: the list of feature rows and a list of the
        same length holding `emotion` repeated.
    """
    rows = list(get_features(path))
    labels = [emotion] * len(rows)
    return rows, labels

# Run `process_feature` over every (path, label) pair on all available cores.
# NOTE(review): `Parallel` / `delayed` are not imported in the visible import
# cell -- presumably joblib names provided through `from utils import *`; confirm.
paths = merged_df['path']
emotions = merged_df['label']
jobs = (delayed(process_feature)(path, emotion) for path, emotion in zip(paths, emotions))
results = Parallel(n_jobs=-1)(jobs)

# Flatten the per-file (rows, labels) pairs into two parallel lists.
X = [row for file_rows, _ in results for row in file_rows]
Y = [label for _, file_labels in results for label in file_labels]
stop = timeit.default_timer()
print('Time: ', stop - start)

In [None]:
# Persist the extracted features together with their utterance id and label.
speech_emo = pd.DataFrame(X)
# Every file contributes several rows to X (the original clip plus its
# augmentations), while `merged_df` has one row per file. Assigning the Series
# directly aligns on the index, which tags only the first len(merged_df) rows
# (with the wrong ids) and leaves the rest NaN. Repeat each id once per feature
# row produced for that file instead.
session_ids = [
    session_id
    for session_id, (file_rows, _file_labels) in zip(merged_df['sessionID'], results)
    for _row in file_rows
]
assert len(session_ids) == len(speech_emo) == len(Y), "feature rows, ids and labels are out of sync"
speech_emo['sessionID'] = session_ids
speech_emo['label'] = Y
speech_emo.to_csv('speech_data/speech_feature.csv', index=False)

In [None]:
# Reload the cached feature table and replace missing values with 0.
# The per-column NaN flags are printed before and after the fill as a check.
speech_emo_raw = pd.read_csv('speech_data/speech_feature.csv')
print(speech_emo_raw.isna().any())
speech_emo = speech_emo_raw.fillna(0)
print(speech_emo.isna().any())

In [None]:
# Split the feature table into the numeric feature matrix and the label vector.
# The original cell read from an undefined `Emotions` frame and an 'Emotions'
# column; the table built in this notebook is `speech_emo` and its label column
# is 'label'. The string 'sessionID' column is dropped as well, because it is
# an identifier rather than a feature and cannot be scaled downstream.
X = speech_emo.drop(columns=['sessionID', 'label']).values
Y = speech_emo['label'].values

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# One-hot encode the labels; the encoder expects a single column (n_samples, 1).
# NOTE(review): this cell overwrites `Y`, so running it twice re-encodes the
# already one-hot matrix.
encoder = OneHotEncoder()
label_column = np.array(Y).reshape(-1, 1)
Y = encoder.fit_transform(label_column).toarray()

In [None]:
# Sanity check: the label matrix and the feature matrix should have the same
# number of rows.
print(Y.shape)
X.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Add a trailing singleton axis: (n_samples, n_features) -> (n_samples, n_features, 1).
# NOTE(review): this cell runs before the StandardScaler cell, so `X_train` /
# `X_test` are built from the unscaled arrays.
X_train = np.expand_dims(x_train, axis=2)
X_test = np.expand_dims(x_test, axis=2)

In [None]:
# Standardise the features: fit on the training split only and apply the same
# transform to the test split so no test statistics leak into training.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# `X_train` / `X_test` were built before scaling, and the scaler returns new
# arrays, so they would still hold the raw values. Rebuild the 3-D arrays from
# the scaled data.
X_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
X_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
class ParallelModel(nn.Module):
    """Two-branch classifier: a 2-D CNN and a Transformer encoder in parallel.

    Both branches read the same input tensor. The flattened CNN output and the
    time-averaged Transformer output are concatenated and fed to one linear
    layer that produces the class logits.

    The final linear layer expects 320 input features (flattened CNN output
    plus the 64-dim Transformer embedding), so the input size is constrained
    accordingly; the original comment describes the layout as
    (batch, channel, freq, time).
    """

    # (in_channels, out_channels, pool_size) for the four convolution stages.
    _CONV_STAGES = ((1, 16, 2), (16, 32, 4), (32, 64, 4), (64, 64, 4))

    @staticmethod
    def _conv_stage(in_channels, out_channels, pool):
        """Return the layers of one stage: conv3x3 -> BN -> ReLU -> max-pool -> dropout."""
        return [
            nn.Conv2d(in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=pool, stride=pool),
            nn.Dropout(p=0.3),
        ]

    def __init__(self,num_emotions):
        """Build both branches and the output head.

        Args:
            num_emotions: Number of output classes.
        """
        super().__init__()
        # CNN branch: the stages are flattened into a single Sequential so the
        # layer indices run 0..19.
        conv_layers = []
        for in_channels, out_channels, pool in self._CONV_STAGES:
            conv_layers.extend(self._conv_stage(in_channels, out_channels, pool))
        self.conv2Dblock = nn.Sequential(*conv_layers)

        # Transformer branch: the input is pooled down first, then encoded.
        self.transf_maxpool = nn.MaxPool2d(kernel_size=[2,4], stride=[2,4])
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=64,
            nhead=4,
            dim_feedforward=512,
            dropout=0.4,
            activation='relu',
        )
        self.transf_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)

        # Output head: linear layer, dropout with p=0, softmax over classes.
        self.out_linear = nn.Linear(320,num_emotions)
        self.dropout_linear = nn.Dropout(p=0)
        self.out_softmax = nn.Softmax(dim=1)

    def forward(self,x):
        """Run both branches and classify.

        Args:
            x: Input tensor with a singleton channel dimension at index 1.

        Returns:
            Tuple `(output_logits, output_softmax)`.
        """
        # CNN branch: keep the batch dimension, flatten everything else.
        cnn_features = self.conv2Dblock(x).flatten(start_dim=1)

        # Transformer branch: pool, drop the channel axis, then move the last
        # axis to the front because the encoder expects (time, batch, embedding).
        pooled = self.transf_maxpool(x).squeeze(1)
        sequence = pooled.permute(2,0,1)
        encoded = self.transf_encoder(sequence)
        transformer_features = encoded.mean(dim=0)

        # Join both embeddings and classify.
        joint = torch.cat((cnn_features, transformer_features), dim=1)
        output_logits = self.dropout_linear(self.out_linear(joint))
        output_softmax = self.out_softmax(output_logits)
        return output_logits, output_softmax