<a href="https://colab.research.google.com/github/RobertoAlessandri/CNN_DOA/blob/main/NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

2.8.0


### Load the Dataset

In [None]:
#timit_rir = keras.datasets.fashion_mnist

#(train_audio, train_labels), (test_audio, test_labels) = timit_rir.load_data()

Loading the dataset returns 4 NumPy arrays:
* train_audio, train_labels are the training set.
* test_audio, test_labels are the test set.

Each audio sample has a size of 512 * 14, with values in the range [ to ] (bounds still to be filled in). Labels are an array of integers ranging from 0 to 11 (or from 0 to 35 in the 36-class setup). Each audio sample is mapped to a single direction, so we have to define the class names:

In [None]:
# Direction labels, one per class, stored as strings (presumably angles in
# degrees): a coarse grid with a step of 30 (12 classes) and a fine grid with
# a step of 10 (36 classes), both starting at 0 and stopping before 360.
class_names12 = [str(angle) for angle in range(0, 360, 30)]
class_names36 = [str(angle) for angle in range(0, 360, 10)]


## Brief Data Exploration

In [None]:
# Shapes of the training split (audio array first, then its labels).
print(f'Training set audio dimension: {train_audio.shape}')
print(f'Training set label dimension: {train_labels.shape}')

# Shapes of the test split, in the same order.
print(f'Test set audio dimension: {test_audio.shape}')
print(f'Test set label dimension: {test_labels.shape}')

## Data Pre-Processing

In [None]:
# Display a single training example as an image, with a colour bar showing
# its value range before normalisation.
sample_index = 5000
plt.figure()
plt.imshow(train_audio[sample_index])
plt.colorbar()
plt.grid(False)
plt.show()

We normalize the data to the range 0 to 1 before feeding it to the neural network model (?)

In [None]:
# Scale both splits by the maximum of the *raw* training set.
# The factor is captured once, before train_audio is overwritten: after the
# rescaling below, train_audio.max() evaluates to 1 (for a non-zero maximum),
# so re-evaluating it for the test split would leave test_audio unnormalised.
train_audio_max = train_audio.max()

train_audio = train_audio / train_audio_max

test_audio = test_audio / train_audio_max

In [None]:
# Display the same training example again; the colour bar now reflects the
# value range after normalisation.
normalized_sample = train_audio[5000]
plt.figure()
plt.imshow(normalized_sample)
plt.colorbar()
plt.grid(False)
plt.show()

In [None]:
# Preview the first 25 training examples on a 5x5 grid of subplots, with
# ticks and grid lines removed so only the images are visible.
grid_rows, grid_cols = 5, 5
plt.figure(figsize=(10, 10))
for cell in range(grid_rows * grid_cols):
    # Subplot positions are 1-based.
    plt.subplot(grid_rows, grid_cols, cell + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_audio[cell], cmap=plt.cm.binary)
plt.show()

## Network Architecture

- Each convolutional layer uses 64 convolution kernels with the size of 3*3, to learn local correlations between local T-F regions.
- BN layer is used after each convolutional layer to improve the stability of the network and speed up the convergence of the network.
- The activation function of convolutional layers and fully connected layers is ReLU.
- Between the convolutional layer and the fully connected layer, and after each fully connected layer, a dropout procedure with rate 0.5 is used to avoid overfitting.
- Size and number of convolutional kernels = ?
- Number of nodes in the fully connected layers = 512?
- input = SI features
- The fully connected layer combines all the features extracted by the convolution layer to reduce the input 2D feature matrix to a 1D feature vector to facilitate the output layer for classification processing.
- The SoftMax function is used to perform classification
- The final source DOA is estimated by maximizing the posterior probability
- In the CNN training, the cross-entropy function is used as the loss function
- We employ Adam as the optimizer
- Initial learning rate is set to be 10^-3
- Maximum number of epochs = 100
- Early stopping with a patience of 10 epochs measured on the validation set is also used to prevent overfitting.



In [None]:
# Hyper-parameters shared by the layers below.
filters = 64                 # number of kernels in each convolutional layer
kernel_size = (3,3)
strides = (1,1)
input_shape = (14, 511, 10)  # per-example input shape, channels last
rate = 0.5                   # dropout rate used by every dropout layer
K = 12 # Then we will test with K = 36

# Architecture: 2 x (Conv2D + ReLU -> BatchNorm) -> Flatten -> Dropout
#               -> 2 x (Dense(512) + ReLU -> Dropout) -> Dense(K) + softmax.
model = keras.Sequential ([
  # 1st convolutional layer (ReLU) followed by batch normalization; this layer
  # also declares the input shape of the network.
  keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, activation='relu', input_shape=input_shape, padding='valid', data_format = 'channels_last', use_bias = True, name='conv1'),
  keras.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=False, scale=False, beta_initializer="zeros", gamma_initializer="ones", moving_mean_initializer="zeros", moving_variance_initializer="ones", beta_regularizer=None, gamma_regularizer=None, beta_constraint=None, gamma_constraint=None, name = 'bn1'),
  # 2nd convolutional layer (ReLU) followed by batch normalization
  keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, activation='relu', padding='valid', data_format = 'channels_last', use_bias = True, name='conv2'),
  keras.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=False, scale=False, beta_initializer="zeros", gamma_initializer="ones", moving_mean_initializer="zeros", moving_variance_initializer="ones", beta_regularizer=None, gamma_regularizer=None, beta_constraint=None, gamma_constraint=None, name = 'bn2'),
  # Collapse the feature maps into one vector per example. Without this step
  # the Dense layers act on the last axis only, so the network output keeps
  # the spatial dimensions instead of being a single K-way distribution per
  # example, which does not match one integer label per example.
  keras.layers.Flatten(name='flatten'),
  # dropout procedure with rate 0.5 between the convolutional and dense parts
  keras.layers.Dropout(rate, noise_shape=None, seed=None, name = 'dn1'),
  # 1st fully connected layer w ReLU & dropout procedure with rate 0.5
  keras.layers.Dense(512, activation = 'relu', name = 'fc1'),
  keras.layers.Dropout(rate, noise_shape=None, seed=None, name = 'dn2'),
  # 2nd fully connected layer w ReLU & dropout procedure with rate 0.5
  keras.layers.Dense(512, activation = 'relu', name = 'fc2'),
  keras.layers.Dropout(rate, noise_shape=None, seed=None, name = 'dn3'),
  # Output layer: softmax over the K classes
  keras.layers.Dense(K, activation = 'softmax', name = 'output'),
])


In [None]:
# Print the layer-by-layer summary of the model: layer name and type,
# output shape and parameter count.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1 (Conv2D)              (None, 12, 509, 64)       5824      
                                                                 
 bn1 (BatchNormalization)    (None, 12, 509, 64)       128       
                                                                 
 conv2 (Conv2D)              (None, 10, 507, 64)       36928     
                                                                 
 bn2 (BatchNormalization)    (None, 10, 507, 64)       128       
                                                                 
 dn1 (Dropout)               (None, 10, 507, 64)       0         
                                                                 
 fc1 (Dense)                 (None, 10, 507, 512)      33280     
                                                                 
 dn2 (Dropout)               (None, 10, 507, 512)     

## Compiling the Model


In [None]:
# Adam optimizer with an initial learning rate of 1e-3.
learning_rate = 1e-3
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Cross-entropy loss for integer class labels. The accuracy metric is
# registered explicitly: without any metric, model.evaluate() returns only
# the scalar loss, and unpacking it into (test_loss, test_acc) fails.
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy']
)

## Training the Model

In [None]:
# Train for at most 100 epochs, stopping early once the validation loss has
# not improved for 10 consecutive epochs (the training setup described in
# this notebook's architecture notes).
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# NOTE(review): the 10% hold-out fraction is an assumption, not taken from
# the notebook. Keras takes the validation samples from the end of the
# arrays before shuffling, so confirm the training data is not ordered by
# class (or pass an explicit validation_data instead).
model.fit(train_audio, train_labels, epochs=100,
          validation_split=0.1,
          callbacks=[early_stopping])

## Evaluating Accuracy

In [None]:
# Score the model on the test split and report the resulting accuracy.
evaluation = model.evaluate(test_audio, test_labels, verbose=2)
test_loss, test_acc = evaluation

print(f'\nTest accuracy: {test_acc}')

## Predictions

In [None]:
# Run the model on the whole test set. `predictions` is indexed per test
# example below, each entry being the model output for that example.
predictions = model.predict(test_audio)

In [None]:
# Compare the model output with the ground truth for a single test example.
audio_idx = 0 # index of the test example to inspect
model_output = predictions[audio_idx]
print('Model output:', model_output)
# The predicted class is the position of the largest output value.
print('Predicted label:', np.argmax(model_output))
print('Ground truth label:', test_labels[audio_idx])

In [None]:
def plot_image(i, predictions_array, true_label, img, class_names=None):
    """Draw example ``i`` on the current axes with a prediction caption.

    The caption reads "<predicted class> <confidence>% (<true class>)" and is
    blue when the prediction matches the ground truth, red otherwise.

    Args:
        i: Index of the example inside ``true_label`` and ``img``.
        predictions_array: Model output for example ``i`` (one score per
            class); the largest entry is taken as the prediction.
        true_label: Sequence of ground-truth class indices for all examples.
        img: Sequence of examples; ``img[i]`` is passed to ``plt.imshow``.
        class_names: Optional sequence mapping class index to display name.
            When omitted, ``class_names12`` is used if it has as many entries
            as ``predictions_array``, otherwise ``class_names36``.
    """
    if class_names is None:
        # No global ``class_names`` exists in this notebook; choose the label
        # list whose length matches the number of model outputs.
        if len(predictions_array) == len(class_names12):
            class_names = class_names12
        else:
            class_names = class_names36

    true_label, img = true_label[i], img[i]

    # Bare image: no grid and no axis ticks.
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])

    plt.imshow(img, cmap=plt.cm.binary)

    predicted_label = np.argmax(predictions_array)
    color = 'blue' if predicted_label == true_label else 'red'

    plt.xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                100*np.max(predictions_array),
                                class_names[true_label]),
                                color=color)

def plot_value_array(i, predictions_array, true_label):
    """Draw the per-class scores of one example as a bar chart.

    All bars are grey, except the predicted class (red) and the true class
    (blue). When both coincide the bar ends up blue, because the true-class
    colour is applied last.

    Args:
        i: Index of the example inside ``true_label``.
        predictions_array: Model output for example ``i`` (one score per
            class).
        true_label: Sequence of ground-truth class indices for all examples.
    """
    true_label = true_label[i]
    # Take the number of bars from the model output rather than a fixed 10,
    # so the chart works for any number of classes (12, 36, ...).
    num_classes = len(predictions_array)

    plt.grid(False)
    plt.xticks(range(num_classes))
    plt.yticks([])
    thisplot = plt.bar(range(num_classes), predictions_array, color="#777777")
    plt.ylim([0, 1])
    predicted_label = np.argmax(predictions_array)

    thisplot[predicted_label].set_color('red')
    thisplot[true_label].set_color('blue')

In [None]:
# Overview of the first test examples: each example occupies two adjacent
# panels, the image with its caption on the left and the bar chart of the
# model output on the right.
num_rows, num_cols = 5, 3
num_audio = num_rows * num_cols
panels_per_row = 2 * num_cols
plt.figure(figsize=(2 * panels_per_row, 2 * num_rows))
for i in range(num_audio):
    left_panel = 2 * i + 1  # subplot positions are 1-based
    plt.subplot(num_rows, panels_per_row, left_panel)
    plot_image(i, predictions[i], test_labels, test_audio)
    plt.subplot(num_rows, panels_per_row, left_panel + 1)
    plot_value_array(i, predictions[i], test_labels)
plt.tight_layout()
plt.show()