In [1]:
!pip install Signal_Analysis

Collecting Signal_Analysis
  Downloading Signal_Analysis-0.1.26.tar.gz (378 kB)
[K     |████████████████████████████████| 378 kB 863 kB/s 
Collecting peakutils
  Downloading PeakUtils-1.3.3-py3-none-any.whl (7.7 kB)
Building wheels for collected packages: Signal-Analysis
  Building wheel for Signal-Analysis (setup.py) ... [?25l- \ | done
[?25h  Created wheel for Signal-Analysis: filename=Signal_Analysis-0.1.26-py3-none-any.whl size=14536 sha256=11ba6699ae1ef0f9a8caf6911994f14b3519e4fe4071523075e0057b3dff82c5
  Stored in directory: /root/.cache/pip/wheels/72/da/25/128af0db67fe61f8282e790d94387346357c063d72522661d6
Successfully built Signal-Analysis
Installing collected packages: peakutils, Signal-Analysis
Successfully installed Signal-Analysis-0.1.26 peakutils-1.3.3


In [2]:
import numpy as np
import pandas as pd

import ast
import os
from tqdm.notebook import tqdm
import time

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Sound Processing
import librosa
from Signal_Analysis.features.signal import get_F_0, get_HNR

# Training Data Preparation
from sklearn.model_selection import train_test_split

# Number of rows pre-allocated in data_X_rnn, data_X_dense and data_y.
NUM_SAMPLES = 2380
# Width of the per-frame feature rows stored in data_X_rnn.
RNN_FEATS = 150
# Width of the per-utterance feature vector stored in data_X_dense.
DENSE_FEATS = 43
# Length of the time axis of data_X_rnn; shorter sequences leave trailing zeros.
MAX_TIMESTEP = 1403
# Width of the one-hot label rows in data_y.
NUM_EMOTIONS = 4
# Window length (in samples) used by the manual framing loop and the librosa calls.
N_FFT = 4096
# Step (in samples) between the starts of consecutive windows.
HOP_LENGTH = 1024
# Label order: an emotion's position in this list is its one-hot index.
EMOTIONS = ['ang', 'hap', 'neu', 'sad']
# Sampling rate handed to every feature extractor — presumably the rate of the stored audio vectors; TODO confirm.
SR = 48000
# Number of CSV rows read per chunk.
CHUNKSIZE = 100

def extract_HSF(lld):
    """Summarise low-level descriptors with eight global statistics.

    Every statistic is taken over all elements of ``lld`` at once (no
    per-axis reduction), so the result always has eight entries.

    Args:
        lld: NumPy array of low-level descriptor values.

    Returns:
        1-D array ordered as
        ``[mean, min, max, variance, range, 25th pct, median, 75th pct]``.
    """
    stats = [lld.mean(), lld.min(), lld.max(), lld.var()]
    # Range is max minus min, reusing the values computed above.
    stats.append(np.subtract(stats[2], stats[1]))
    stats.extend(np.quantile(lld, q) for q in (0.25, 0.5, 0.75))
    return np.asarray(stats)

def extract_LLD_from_subaudio(subaudio, fs):
    """Compute the two frame-level descriptors for a single analysis window.

    Args:
        subaudio: Array of samples belonging to one frame.
        fs: Sampling rate, used to convert the frame length to seconds.

    Returns:
        Array ``[energy, zero_crossings]``: the sum of squared samples
        divided by the frame duration in seconds, and the number of adjacent
        sample pairs whose product is negative.
    """
    # The tiny offset keeps the division finite for an empty frame.
    duration = subaudio.shape[0] / fs + 1e-14
    energy = np.square(subaudio).sum() / duration

    # A negative product of neighbouring samples marks a sign change.
    neighbour_products = subaudio[1:] * subaudio[:-1]
    crossings = np.count_nonzero(neighbour_products < 0)

    return np.asarray([energy, crossings])

def extract_LLD_from_audio(audio, fs):
    """Extract descriptors that are computed from the whole audio signal.

    Args:
        audio: Array of audio samples for one utterance.
        fs: Sampling rate of ``audio``.

    Returns:
        Tuple ``(mfcc, mfcc_hsf, lpc, spect, spect_hsf, others)``:
        ``mfcc`` and ``spect`` are the transposed MFCC matrix and the
        transposed dB-scaled mel-spectrogram, ``mfcc_hsf`` and ``spect_hsf``
        are their ``extract_HSF`` summaries, ``lpc`` holds the order-16
        linear-prediction coefficients, and ``others`` is ``[f0, hnr]``.
    """
    # Arguments are passed by keyword: librosa >= 0.10 rejects the positional
    # form, while the keyword form is accepted by older releases as well.
    # MFCC
    mfcc = librosa.feature.mfcc(
        y = audio, sr = fs, n_fft = N_FFT, hop_length = HOP_LENGTH, center = False
    ).transpose()
    mfcc_hsf = extract_HSF(mfcc)

    # LPC
    lpc = librosa.lpc(audio, order = 16)

    # Mel-Spectrogram (converted to dB relative to its own maximum)
    spect = librosa.feature.melspectrogram(
        y = audio, sr = fs, n_fft = N_FFT, hop_length = HOP_LENGTH, center = False
    )
    spect = librosa.power_to_db(spect, ref = np.max).transpose()
    spect_hsf = extract_HSF(spect)

    # Other features: only the first element returned by get_F_0 is kept.
    f0 = get_F_0(audio, fs)[0]
    hnr = get_HNR(audio, fs)

    return np.asarray(mfcc), np.asarray(mfcc_hsf), np.asarray(lpc), np.asarray(spect), np.asarray(spect_hsf), np.asarray([f0, hnr])

def extract_LLD(audio, fs):
    """Build the sequence features and the utterance-level features for one clip.

    The signal is cut into windows of ``N_FFT`` samples taken every
    ``HOP_LENGTH`` samples; each window contributes the two descriptors from
    ``extract_LLD_from_subaudio``, which are then joined column-wise with the
    MFCC and mel-spectrogram frames.

    Args:
        audio: Array of audio samples for one utterance.
        fs: Sampling rate of ``audio``.

    Returns:
        Tuple ``(rnn_feats, dense_feats)``: a 2-D array with one row per
        window, and a 1-D array joining the statistical summaries, the LPC
        coefficients and the ``[f0, hnr]`` pair.
    """
    frame_count = (audio.shape[0] - N_FFT) // HOP_LENGTH + 1
    framewise_lld = np.zeros((frame_count, 2))
    frame_starts = range(0, frame_count * HOP_LENGTH, HOP_LENGTH)
    for frame_no, start in enumerate(frame_starts):
        window = audio[start:start + N_FFT]
        framewise_lld[frame_no, :] = extract_LLD_from_subaudio(window, fs)
    framewise_lld_hsf = extract_HSF(framewise_lld)

    mfcc, mfcc_hsf, lpc, spect, spect_hsf, others = extract_LLD_from_audio(audio, fs)

    # The manual framing must yield as many frames as the librosa features.
    assert framewise_lld.shape[0] == mfcc.shape[0] == spect.shape[0]

    rnn_feats = np.hstack((framewise_lld, mfcc, spect))
    dense_parts = (framewise_lld_hsf, mfcc_hsf, lpc, spect_hsf, others)
    dense_feats = np.concatenate(dense_parts)
    return rnn_feats, dense_feats

In [3]:
# with pd.read_csv('/kaggle/input/iemocap-audio-vectors-csv/final_df.csv', usecols = ['emotion', 'audio_vector'], chunksize = CHUNKSIZE) as reader:
#     for chunk_idx, chunk in enumerate(reader):
#         print(chunk_idx)
#         print(chunk.shape[0])
#         print(chunk['emotion'].unique())
#         #audio_sizes = chunk['audio_vector'].apply(lambda x: len(ast.literal_eval(x)))
#         #print(np.max(audio_sizes.values))
#         print('\n\n')

In [4]:
# For creating data_X_rnn, data_X_dense & data_y in one streaming pass over the CSV.
# Sequences shorter than MAX_TIMESTEP keep trailing zeros along the time axis.
data_X_rnn = np.zeros((NUM_SAMPLES, MAX_TIMESTEP, RNN_FEATS), dtype = np.float64)
data_X_dense = np.zeros((NUM_SAMPLES, DENSE_FEATS), dtype = np.float64)
data_y = np.zeros((NUM_SAMPLES, NUM_EMOTIONS), dtype = np.uint8)
idx = 0

# Row i is the one-hot code of EMOTIONS[i]; built once instead of once per sample.
one_hot_codes = np.identity(NUM_EMOTIONS, dtype = np.uint8)
# Ceiling division keeps the progress total exact even when NUM_SAMPLES is a multiple of CHUNKSIZE.
num_chunks = (NUM_SAMPLES + CHUNKSIZE - 1) // CHUNKSIZE

with pd.read_csv('/kaggle/input/iemocap-audio-vectors-csv/final_df.csv', usecols = ['emotion', 'audio_vector'], chunksize = CHUNKSIZE) as reader:
    for chunk in tqdm(reader, total = num_chunks):
        # Iterating the two columns directly avoids building a Series per row.
        for emotion, audio_vector in zip(chunk['emotion'], chunk['audio_vector']):
            # audio_vector is the textual form of a list of samples; literal_eval parses it without executing code.
            rnn_feats, dense_feats = extract_LLD(np.asarray(ast.literal_eval(audio_vector)), SR)
            data_X_rnn[idx, :rnn_feats.shape[0], :] = rnn_feats
            data_X_dense[idx, :] = dense_feats
            data_y[idx, :] = one_hot_codes[EMOTIONS.index(emotion)]
            idx += 1

# Fewer rows than expected would otherwise leave all-zero samples and labels unnoticed.
if idx < NUM_SAMPLES:
    print(f'Only {idx} of {NUM_SAMPLES} rows were filled; the remaining rows of the arrays are all zeros.')

  0%|          | 0/24 [00:00<?, ?it/s]

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 



  r_x = r_a / r_w
  thres = thres * (np.max(y) - np.min(y)) + np.min(y)
  a = op(a[slice1], a[slice2])


<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'nump

  r_x = r_a / r_w


<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'numpy.float64'> 

<class 'nump

In [5]:
# Write each array to its own .npy file in the current working directory.
for out_name, out_array in (
    ('data_X_rnn.npy', data_X_rnn),
    ('data_X_dense.npy', data_X_dense),
    ('data_y.npy', data_y),
):
    with open(out_name, 'wb') as save_file:
        np.save(save_file, out_array)