In [2]:
#************************************************************************************
# Rezwan Matin
# Thesis B
# Filename: ML_RNN_RAVDESS_1.py
# Date: 4/12/20
#
# Objective:
# Testing stuff for building the first RNN model.
#
#*************************************************************************************

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa as rosa
import glob
import os
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve
from sklearn.metrics import confusion_matrix
import itertools
import tensorflow as tf
import tensorflow.keras as keras

In [20]:
# Directory that holds all RAVDESS audio files.
path = r'C:/Books/Texas State Books/Fall 2019/Thesis A/Corpus/Simulated/RAVDESS/All'

# Names of the audio files in 'path'. os.listdir() returns entries in arbitrary order and may
# include non-audio entries, so keep only WAV files and sort them for a reproducible order.
file_list = sorted(name for name in os.listdir(path) if name.lower().endswith('.wav'))

# Length (no. of frames) of each sample (audio), in the same order as 'file_list'.
num_frames = []

for filename in file_list:

    # Read WAV file. 'rosa.core.load' returns the audio signal in 'sig' and the sampling frequency in 'fs'.
    sig, fs = rosa.core.load(os.path.join(path, filename), sr=None)

    # 'rosa.feature.mfcc' extracts 26 MFCCs from the signal; the second axis of the result is the frame axis.
    mfcc_feat = rosa.feature.mfcc(y=sig, sr=fs, n_mfcc=26)

    num_frames.append(mfcc_feat.shape[1])

# Show the list containing the frame lengths of all the samples
num_frames

[310,
 341,
 323,
 310,
 338,
 313,
 366,
 329,
 307,
 332,
 295,
 341,
 298,
 326,
 316,
 351,
 360,
 326,
 316,
 326,
 360,
 341,
 357,
 348,
 313,
 338,
 329,
 316,
 341,
 338,
 357,
 341,
 301,
 360,
 291,
 332,
 301,
 313,
 323,
 351,
 329,
 332,
 304,
 335,
 373,
 323,
 351,
 341,
 307,
 354,
 329,
 307,
 291,
 320,
 388,
 332,
 313,
 351,
 301,
 338,
 288,
 326,
 323,
 338,
 329,
 326,
 313,
 323,
 338,
 338,
 345,
 341,
 298,
 338,
 326,
 304,
 338,
 323,
 351,
 329,
 304,
 345,
 298,
 341,
 295,
 335,
 320,
 335,
 326,
 326,
 310,
 332,
 345,
 332,
 360,
 338,
 332,
 360,
 360,
 357,
 379,
 379,
 326,
 338,
 313,
 335,
 307,
 363,
 310,
 351,
 335,
 348,
 329,
 326,
 345,
 341,
 379,
 332,
 373,
 335,
 338,
 376,
 351,
 351,
 385,
 385,
 351,
 320,
 313,
 338,
 313,
 354,
 298,
 335,
 335,
 363,
 332,
 320,
 345,
 335,
 376,
 335,
 382,
 329,
 329,
 360,
 360,
 341,
 388,
 373,
 329,
 326,
 326,
 341,
 313,
 354,
 295,
 320,
 351,
 357,
 351,
 341,
 313,
 313,
 357,
 335,
 392

In [31]:
import statistics

# Median of the number of frames over all samples. Per the original note, this is intended to cap the
# maximum number of frames per sample, which in turn is to be used as the number of RNN units.
median_num_frames = statistics.median(num_frames)

# Mean of the number of frames over all samples. This is just to cross-check with the Median value.
average_num_frames = statistics.mean(num_frames)

# Only the last expression of a cell is displayed, so show both values together as (mean, median).
average_num_frames, median_num_frames

345.0

313

1440

In [122]:
# Location of one RAVDESS recording, kept in 'path'.
path = 'C:/Books/Texas State Books/Fall 2019/Thesis A/Corpus/Simulated/RAVDESS/Original/Actor_01/03-01-06-02-02-01-01.wav'
# Load the WAV file: 'sig' receives the audio signal and 'fs' its sampling frequency.
sig, fs = rosa.core.load(path=path, sr=None)

In [123]:
# Compute 26 MFCCs for the loaded signal and keep them in 'mfcc_feat', then report the array shape.
mfcc_feat = rosa.feature.mfcc(
    y=sig,
    sr=fs,
    n_mfcc=26,
)
np.shape(mfcc_feat)

(26, 373)

In [119]:
# Swap the two axes so that every row is one audio frame and every column is one MFCC feature.
transp_mfcc_feat = np.transpose(mfcc_feat)
np.shape(transp_mfcc_feat)

(373, 26)

In [120]:
# Note: The 'cap frame number' is the limit we set for the number of frames for each sample, so that
# all samples have equal dimensions. It is defined once here instead of being repeated as a literal.
# NOTE(review): 345 is the value used in this notebook - confirm it matches the statistic chosen as the cap.
MAX_FRAMES = 345


def fit_to_frame_cap(frames, cap):
    """Return `frames` with exactly `cap` rows.

    Parameters
    ----------
    frames : 2-D numpy array whose rows are audio frames and whose columns are features.
    cap : int, the required number of rows.

    Returns
    -------
    A 2-D array with `cap` rows and the same number of columns as `frames`.
    Shorter inputs are padded at the end; every padded row holds the column-wise
    mean of the input (np.pad mode 'mean'). Longer inputs lose the rows from
    index `cap` onwards. Inputs that already have `cap` rows come back unchanged.
    """
    n_rows = frames.shape[0]

    if n_rows < cap:
        # The pad widths are ((before, after) for rows, (before, after) for columns):
        # add rows only at the end and leave the columns untouched.
        return np.pad(frames, ((0, cap - n_rows), (0, 0)), 'mean')

    # Slicing drops every frame beyond the cap and is a no-op when n_rows == cap.
    return frames[:cap]


transp_mfcc_feat = fit_to_frame_cap(transp_mfcc_feat, MAX_FRAMES)

transp_mfcc_feat.shape

(345, 26)

In [76]:
# Small check of how 'shape[1]' (the number of columns) behaves on a 2x2 array.
a = np.array([[1, 2], [3, 4]])
print('2' if a.shape[1] == 2 else 'Not 2')

2


In [124]:
# Compute the zero crossing rate of the signal with 'rosa.feature.zero_crossing_rate()'.
# No averaging or transposing is applied here; the result is used as returned.
zcross_feat = rosa.feature.zero_crossing_rate(y=sig)

# Join the two 2D arrays along axis 0 (rows) into a single 2D array called 'mfcczcr_feat':
# the MFCC rows come first, followed by the zero crossing rate row(s).
mfcczcr_feat = np.concatenate((mfcc_feat, zcross_feat), axis=0)

mfcczcr_feat

(27, 373)

In [133]:
# Collapse the 2D feature matrix into a 1D copy in row-major ('C') order: all of row 0, then row 1, and so on.
mfcczcr_feat_flatten = mfcczcr_feat.flatten(order='C')
mfcczcr_feat_flatten

array([-4.01062029e+02, -4.16607037e+02, -4.45170190e+02, ...,
        6.98242188e-02,  7.76367188e-02,  5.32226562e-02])

In [134]:
# Display the combined (MFCC + zero crossing rate) 2D feature matrix.
mfcczcr_feat

array([[-4.01062029e+02, -4.16607037e+02, -4.45170190e+02, ...,
        -5.76538669e+02, -5.76020147e+02, -5.75995278e+02],
       [ 1.37126208e+02,  1.35369861e+02,  1.13092790e+02, ...,
         0.00000000e+00,  4.95275435e-01,  6.55174458e-01],
       [ 3.37038494e+01,  3.35070271e+01,  1.89263917e+01, ...,
         0.00000000e+00, -1.79259504e-02,  3.72893311e-01],
       ...,
       [-1.80748844e+01, -1.30548078e+01, -1.63043887e+01, ...,
         0.00000000e+00,  1.02658765e-01, -8.82195396e-02],
       [ 1.79392462e+00,  1.99604618e-01, -1.01305639e+01, ...,
         0.00000000e+00,  3.94081535e-01,  2.13723516e-01],
       [ 1.41601562e-02,  1.95312500e-02,  2.24609375e-02, ...,
         6.98242188e-02,  7.76367188e-02,  5.32226562e-02]])

In [164]:
# Start from an empty 1D float array and insert the flattened feature vector at index 0.
# The result 'r1' is a new 1D array holding exactly the values of the flattened vector.
r = np.array([], dtype=np.float64)
r1 = np.insert(r, 0, mfcczcr_feat_flatten, axis=0)
np.shape(r1)

(10071,)

In [165]:
# Insert the flattened feature vector once more, this time before index 1 of 'r1'.
# np.insert splices the values into the 1D array, so 'r2' stays 1D (twice as long as 'r1')
# and does not become a 2-row matrix.
r2 = np.insert(r1, 1, mfcczcr_feat_flatten, axis=0)
np.shape(r2)

(20142,)

In [9]:
# Toy 2x10 integer array holding 1..20, used to experiment with reshape/transpose below.
j = np.arange(1, 21).reshape(2, 10)
j

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]])

In [10]:
# Reshape 'j' into 3 dimensions, ordered as (layers, rows, columns): 2 layers of 5 rows by 2 columns.
k = j.reshape(2, 5, 2)
k

array([[[ 1,  2],
        [ 3,  4],
        [ 5,  6],
        [ 7,  8],
        [ 9, 10]],

       [[11, 12],
        [13, 14],
        [15, 16],
        [17, 18],
        [19, 20]]])

In [12]:
# Transpose with the axis order fully reversed. This is what tf.transpose does when no 'perm'
# is given; for the 3D array 'k' that permutation is [2, 1, 0], written out explicitly here.
l = tf.transpose(k, perm=[2, 1, 0])
l

<tf.Tensor: shape=(2, 5, 2), dtype=int32, numpy=
array([[[ 1, 11],
        [ 3, 13],
        [ 5, 15],
        [ 7, 17],
        [ 9, 19]],

       [[ 2, 12],
        [ 4, 14],
        [ 6, 16],
        [ 8, 18],
        [10, 20]]])>

In [13]:
# Keep the first (layer) axis in place and swap the last two axes, so each layer goes from (5, 2) to (2, 5).
m = tf.transpose(a=k, perm=(0, 2, 1))
m

<tf.Tensor: shape=(2, 2, 5), dtype=int32, numpy=
array([[[ 1,  3,  5,  7,  9],
        [ 2,  4,  6,  8, 10]],

       [[11, 13, 15, 17, 19],
        [12, 14, 16, 18, 20]]])>