In [1]:
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_io as tfio
from tqdm import tqdm

from data_loader import data_loader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

## Yamnet imports 
import params as yamnet_params
import yamnet_modified  as yamnet_model
import features as features_lib

In [2]:
params = yamnet_params.Params()

class_names = yamnet_model.class_names(
    './yamnet_class_map.csv'
)

# Data loading

In [3]:
dl = data_loader(
    Crema_path='../../Datasets/Crema/',
    Ravdess_path='../../Datasets/Ravdess/',
    Savee_path='../../Datasets/Savee/',
    Tess_path='../../Datasets/Tess/'
)

In [4]:
X, y = dl.get_numpy('tess', pad=True, max_len=40_000)

100%|██████████| 2800/2800 [03:49<00:00, 12.19it/s]


In [5]:
train_data, train_label, val_data, val_label, test_data, test_label = dl.split_numpy(X, y)

In [6]:
# encoder, train_label, val_label, test_label = dl.ohe_labels(train_label, val_label, test_label)

In [6]:
train_data.shape, train_label.shape, val_data.shape, val_label.shape, test_data.shape, test_label.shape

((1960, 40000), (1960,), (420, 40000), (420,), (420, 40000), (420,))

In [8]:
# train_data = train_data.tolist()
# train_label = train_label.tolist()

In [7]:
train_ds = tf.data.Dataset.from_tensor_slices((train_data, train_label))

In [8]:
val_ds = tf.data.Dataset.from_tensor_slices((train_data, train_label))

In [9]:
test_ds = tf.data.Dataset.from_tensor_slices((test_data, test_label))

In [11]:
train_ds.element_spec

(TensorSpec(shape=(40000,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.string, name=None))

In [12]:
val_ds.element_spec

(TensorSpec(shape=(40000,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.string, name=None))

In [13]:
test_ds.element_spec

(TensorSpec(shape=(40000,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.string, name=None))

In [14]:
# This function recived the wav file and each wav file divid to frames with 
#96ms longe and 10ms hope.for each frame the lable is the label of the
#main audio file.Then a batch of these frames is used as input to the yamnet model
# def yamnet_frames_model_transfer1(wav_data):
#     waveform_padded = features_lib.pad_waveform(wav_data, params)
#     log_mel_spectrogram, features = features_lib.waveform_to_log_mel_spectrogram_patches(
#         waveform_padded, params)
#     # num_embeddings = tf.shape(features)[0]
#     return log_mel_spectrogram

def yamnet_frames_model_transfer1(wav_data, class_names):
    waveform_padded = features_lib.pad_waveform(wav_data, params)
    log_mel_spectrogram, features = features_lib.waveform_to_log_mel_spectrogram_patches(
        waveform_padded, params)
    return log_mel_spectrogram, class_names


In [15]:
train_ds = train_ds.map(yamnet_frames_model_transfer1)
val_ds = val_ds.map(yamnet_frames_model_transfer1)
test_ds = test_ds.map(yamnet_frames_model_transfer1)

In [16]:
print(train_ds.element_spec)
print(val_ds.element_spec)
print(test_ds.element_spec)

(TensorSpec(shape=(288, 64), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))
(TensorSpec(shape=(288, 64), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))
(TensorSpec(shape=(288, 64), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))


In [None]:
# train_data_mod = [yamnet_frames_model_transfer1(train_data[i]) for i in tqdm(range(len(train_data)))]
# val_data_mod = [yamnet_frames_model_transfer1(val_data[i]) for i in tqdm(range(len(val_data)))]
# test_data_mod = [yamnet_frames_model_transfer1(test_data[i]) for i in tqdm(range(len(test_data)))]

# train_data_mod = np.array(train_data_mod)
# val_data_mod = np.array(val_data_mod)
# test_data_mod = np.array(test_data_mod)

# train_data_mod.shape, val_data_mod.shape, test_data_mod.shape

In [None]:
# train_data_mod = tf.convert_to_tensor(train_data_mod)
# val_data_mod = tf.convert_to_tensor(val_data_mod)
# test_data_mod = tf.convert_to_tensor(test_data_mod)

In [None]:
# train_data_mod = tf.expand_dims(train_data_mod, axis=-1)
# val_data_mod = tf.expand_dims(val_data_mod, axis=-1)
# test_data_mod = tf.expand_dims(test_data_mod, axis=-1)

In [None]:
# train_data_mod.shape, val_data_mod.shape, test_data_mod.shape

# Building Model

In [17]:
#load yamnet model. yamnet_frames_model_transfer1 is modified version of the
#yamnet_frames_model_transfer in yamnet.py file in order to be able the 
#train yamnet from scratch

classes = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
yamnet = yamnet_model.yamnet_frames_model_transfer(params, len(classes))

preloaded_layers = yamnet.layers.copy()
preloaded_weights = []

for pre in preloaded_layers:
        preloaded_weights.append(pre.get_weights())    



In [18]:
#load the weights from pretrain model except for the last layer and
#check which layer used the pretrain weights
# store weights before loading pre-trained weights

chkp=True

if chkp==True:
# load pre-trained weights(fine tuning the model)
#load the weights from pretrain model except for the last layer
    yamnet.load_weights('./yamnet.h5',by_name=True)
 #   yamnet.load_weights('D:/bat_n/yamnet_2.h5',by_name=True)
    for layer, pre in zip(yamnet.layers, preloaded_weights):
        weights = layer.get_weights()
        if weights:
            if np.array_equal(weights, pre):
                print('not loaded', layer.name)
            # else:
            #     print('loaded', layer.name)


  a1, a2 = asarray(a1), asarray(a2)


In [19]:
# NAME='./yamnet.h5'
# checkpoint = tf.keras.callbacks.ModelCheckpoint(NAME, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=10,
                                            restore_best_weights=True,
                                            verbose=1)

# tensorboard=tf.keras.callbacks.TensorBoard(
#     log_dir='D:/bat_n/logs')

yamnet.compile(optimizer='adam', 
               loss="categorical_crossentropy", 
               metrics=[
                        'accuracy',
                        tf.keras.metrics.Recall(),
                        tf.keras.metrics.Precision()
                ]
)

yamnet.summary()

tf.keras.utils.plot_model(
    yamnet,
    to_file='yamnet.png',
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=True,
    dpi=96,
    layer_range=True,
    show_layer_activations=True
)

Model: "yamnet_frames"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 reshape (Reshape)           (None, 96, 64, 1)         0         
                                                                 
 layer1/conv (Conv2D)        (None, 48, 32, 32)        288       
                                                                 
 layer1/conv/bn (BatchNormal  (None, 48, 32, 32)       96        
 ization)                                                        
                                                                 
 layer1/relu (ReLU)          (None, 48, 32, 32)        0         
                                                                 
 layer2/depthwise_conv (Dept  (None, 48, 32, 32)       288       
 hwiseConv2D)                                        

In [21]:

yamnet.fit(
    train_ds,
    epochs=100, 
    validation_data= val_ds,
    callbacks=[
        # checkpoint,
        # tensorboard,
        callback
    ]
)

Epoch 1/100


ValueError: in user code:

    File "c:\Users\samin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\samin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\samin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\samin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\samin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer "reshape" "                 f"(type Reshape).
    
    Cannot reshape a tensor with 18432 elements to shape [288,96,64,1] (1769472 elements) for '{{node yamnet_frames/reshape/Reshape}} = Reshape[T=DT_FLOAT, Tshape=DT_INT32](IteratorGetNext, yamnet_frames/reshape/Reshape/shape)' with input shapes: [288,64], [4] and with input tensors computed as partial shapes: input[1] = [288,96,64,1].
    
    Call arguments received by layer "reshape" "                 f"(type Reshape):
      • inputs=tf.Tensor(shape=(288, 64), dtype=float32)


In [None]:
#test the model
test_res = yamnet.evaluate(test_ds)

test_res

In [None]:

#dir_="D:/bat_n/df_test_b.csv"
#dir_="D:/bat_n/df_test_n.csv"
#dir_="D:/bat_n/df_test_uk.csv"
import os
dir_="D:/bat_n/norfolk_test_files.csv"
df_test_b=pd.read_csv(dir_)
base_data_path='D:/bat_n/wav/'
full_path = df_test_b['filename'].apply(lambda row: os.path.join(base_data_path, row))
df_test_b= df_test_b.assign(filename=full_path)

full_path = df_test_b['filename'].apply(lambda row: ( row+ '.wav'))

df_test_b= df_test_b.assign(filename=full_path)

filenames=df_test_b['filename']
targets=df_test_b['target']
df_test_b['fold']=1
folds=df_test_b['fold']


#the directory contained the .wav files

test_b = tf.data.Dataset.from_tensor_slices((filenames, targets,folds))
test_b= test_b.map(load_wav_for_map)
test_b = test_b.map(yamnet_frames_model_transfer1).unbatch()
remove_fold_column = lambda embedding, label, fold: (embedding, label)
test_b = test_b.map(remove_fold_column)
test_b = test_b.cache().batch(32).prefetch( tf.data.experimental.AUTOTUNE)

evaluate= yamnet.evaluate(test_b)







dir_="D:/bat_n/df_test_uk.csv"
dir_="D:/bat_n/uk_test_files1.csv"
df_test_b=pd.read_csv(dir_)


filenames=df_test_b['filename']
targets=df_test_b['target']
df_test_b['fold']=1
folds=df_test_b['fold']


#the directory contained the .wav files

test_b = tf.data.Dataset.from_tensor_slices((filenames, targets,folds))
test_b= test_b.map(load_wav_for_map)
test_b = test_b.map(yamnet_frames_model_transfer1).unbatch()
cached_ds = main_ds.cache()
test_b_train = cached_ds.filter(lambda embedding, label, fold: fold <2)
test_b_val = cached_ds.filter(lambda embedding, label, fold: fold ==3)
test_b_test = cached_ds.filter(lambda embedding, label, fold: fold == 4)

# remove the folds column now that it's not needed anymore
remove_fold_column = lambda embedding, label, fold: (embedding, label)

test_b_train= test_b_train.map(remove_fold_column)

test_b_val= test_b_val.map(remove_fold_column)

test_b_test= test_b_test.map(remove_fold_column)


#X_train = list(map(lambda x: x[0], train_ds))
#y_train = list(map(lambda x: x[1], train_ds))

#creat a batch of size 32 of frames with size (96,64)
#we have to suffle the train set to avoid the frames from the same audio on one batch
train_ds = test_b_train.cache().shuffle(1000).batch(32).prefetch( tf.data.experimental.AUTOTUNE)
val_ds = test_b_val.cache().batch(32).prefetch( tf.data.experimental.AUTOTUNE)
test_ds = test_b_test.cache().batch(32).prefetch( tf.data.experimental.AUTOTUNE)



#test n

'''
dir_="D:/bat_n/df_test_n.csv"
dir_="D:/bat_n/df_test_uk.csv"
df_test_b=pd.read_csv(dir_)

filenames=df_test_b['filename']
targets=df_test_b['target']
folds=df_test_b['fold']
l=[]
for j in range(1,5):
    print((j-1),'--',j*175)
    for i in range(0,175):
        
        l.append(j)
        
        
        
folds=l[:len(df_test_b)]        
        
    

test_b = tf.data.Dataset.from_tensor_slices((filenames, targets,folds))
test_b= test_b.map(load_wav_for_map)

test_b = test_b.map(yamnet_frames_model_transfer1)#.unbatch()


cached_ds = test_b.cache()
test_b_train = cached_ds.filter(lambda embedding, label, fold: fold <2)
test_b_val = cached_ds.filter(lambda embedding, label, fold: fold ==3)
test_b_test = cached_ds.filter(lambda embedding, label, fold: fold <= 2)

# remove the folds column now that it's not needed anymore
remove_fold_column = lambda embedding, label, fold: (embedding, label)

test_b_train= test_b_train.map(remove_fold_column)

test_b_val= test_b_val.map(remove_fold_column)

test_b_test= test_b_test.map(remove_fold_column)



#creat a batch of size 32 of frames with size (96,64)
#we have to suffle the train set to avoid the frames from the same audio on one batch
train_ds = test_b_train.cache().shuffle(1000).batch(32).prefetch( tf.data.experimental.AUTOTUNE)
val_ds = test_b_val.cache().batch(32).prefetch( tf.data.experimental.AUTOTUNE)
test_ds = test_b_test.cache().batch(32).prefetch( tf.data.experimental.AUTOTUNE)


yamnet=yamnet_model.yamnet_frames_model_transfer(params)

preloaded_layers = yamnet.layers.copy()
preloaded_weights = []
for pre in preloaded_layers:
        preloaded_weights.append(pre.get_weights())    


#load the weights from pretrain model except for the last layer and
#check which layer used the pretrain weights
# store weights before loading pre-trained weights
chkp==True
if chkp==True:
# load pre-trained weights(fine tuning the model)
#load the weights from pretrain model except for the last layer
    yamnet.load_weights('D:/bat_n/yamnet_2.h5',by_name=True)
    for layer, pre in zip(yamnet.layers, preloaded_weights):
        weights = layer.get_weights()
        if weights:
            if np.array_equal(weights, pre):
                print('not loaded', layer.name)
            else:
                print('loaded', layer.name)





yamnet.compile(optimizer='adam', 
               loss='sparse_categorical_crossentropy', 
               metrics=['accuracy'])
yamnet.fit(train_ds,epochs=20)






loss= yamnet.evaluate(test_ds)





from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np

SAMPLE_RATE = 16000
X=list(map(lambda x: x[0], test_b))
X=np.array(X)
#y_train = list(map(lambda x: x[1], train_ds))
augment = Compose([
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
        ])
augmented_samples = augment(samples=X, sample_rate=16000)
    
    
'''