IMPORTING THE HORSES

In [1]:
import numpy as np
import tensorflow as tf
import os
import plotly.express as px
from sklearn.model_selection import train_test_split

2023-08-21 11:43:29.964621: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Ask TensorFlow which GPU devices it can see and show the result.
physical_devices = tf.config.list_physical_devices(device_type='GPU')
print(physical_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2023-08-21 11:43:32.169613: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-21 11:43:32.198820: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-21 11:43:32.199025: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

EDA

In [3]:
# Directory and file names of the feature / label arrays.
path = r'path'
input_file1 = 'dev.npy'
input_file2 = 'dev_labels.npy'

# Both files are read with the same options; allow_pickle=True lets NumPy
# unpickle object arrays, so only load files from a trusted source.
utterances, phoneme_states = (
    np.load(os.path.join(path, file_name), allow_pickle=True, encoding='bytes')
    for file_name in (input_file1, input_file2)
    )

We have a total of 1103 utterances and their respective 1103 phoneme vectors

In [4]:
# Report the outer shape of both object arrays.
print(f'Number of utterances: {utterances.shape} \nNumber of phoneme vectors: {phoneme_states.shape}')

Number of utterances: (1103,) 
Number of phoneme vectors: (1103,)


In [5]:
# Preview the shapes of the first five utterances and their label vectors.
for position, (utterance, states) in enumerate(zip(utterances[:5], phoneme_states[:5]), start=1):
    print(f'Shape of utterance {position}: {utterance.shape} with number of phoneme states: {states.shape}')
print('and so on ...')

Shape of utterance 1: (388, 40) with number of phoneme states: (388,)
Shape of utterance 2: (416, 40) with number of phoneme states: (416,)
Shape of utterance 3: (467, 40) with number of phoneme states: (467,)
Shape of utterance 4: (482, 40) with number of phoneme states: (482,)
Shape of utterance 5: (493, 40) with number of phoneme states: (493,)
and so on ...


Each utterance is a matrix of shape (number of frames) x (number of frequency bands), where the first dimension is of variable length while the second dimension, the number of frequency bands, is fixed and equal to 40. Each utterance is also called a spectrogram: every second of audio is represented by 100 frames, each a 40-dimensional mel-spectral vector. For example, the first utterance (388 frames) is a 3.88-second recording, the next one (416 frames) is 4.16 seconds long, and so on.

In [6]:
# Frame count (first axis) of every utterance, in dataset order.
num_frames = [utterance.shape[0] for utterance in utterances]
print('Maximum number of frames in an utterance:', max(num_frames))

Maximum number of frames in an utterance: 1738


In [7]:
# Histogram of utterance lengths (in frames); the plotly update_* methods
# return the figure, so the styling calls are chained.
fig = (
    px.histogram(x=num_frames, nbins=40)
    .update_layout(showlegend=False, width=1000, height=600, template="plotly_dark")
    .update_xaxes(title_text='Frames')
    .update_yaxes(title_text='Count')
    )
fig.show()

In [8]:
# Distinct phoneme-state labels in first-seen order; dict keys keep insertion
# order and drop repeats.
unique_phonemes = list(dict.fromkeys(
    state
    for utterance_states in phoneme_states
    for state in utterance_states
    ))
print(len(unique_phonemes))

138


There are a total of 46 fundamental phonemes, and for each phoneme there are 3 phoneme states. Therefore there are 46 x 3 = 138 phoneme states in total. The 46 phonemes of the English language are given below:

["+BREATH+", "+COUGH+", "+NOISE+", "+SMACK+", "+UH+", "+UM+", "AA", "AE", "AH", "AO", "AW", "AY", "B", "CH", "D", "DH", "EH", "ER", "EY", "F", "G", "HH", "IH", "IY", "JH", "K", "L", "M", "N", "NG", "OW", "OY", "P", "R", "S", "SH", "SIL", "T", "TH", "UH", "UW", "V", "W", "Y", "Z", "ZH"]

In [9]:
def show_mel_spectrogram(utterances, index=0):
    """Plot ``utterances[index]`` as a heat map with time on the x axis.

    The utterance is transposed before plotting so its first axis (frames)
    runs horizontally. Tick labels are the frame index divided by 100, and the
    title reports the frame count divided by 100 as the recording length.

    Args:
        utterances: Indexable collection of 2-D arrays.
        index: Position of the utterance to display.
    """
    spectrogram = utterances[index]
    frame_count = spectrogram.shape[0]
    # One tick every 50 frames, labelled in seconds.
    tick_positions = list(range(0, frame_count, 50))
    tick_labels = [position/100 for position in tick_positions]

    fig = px.imshow(
        spectrogram.T,
        color_continuous_scale="thermal",
        labels={
            'x': 'Time (seconds)',
            'y': 'Frequency Band',
            'color': 'Energy (log)',
            }
        )
    fig.update_xaxes(side="bottom")
    fig.update_layout(
        xaxis=dict(
            tickmode='array',
            tickvals=tick_positions,
            ticktext=tick_labels
            ),
        title=f'Mel Spectrogram of a {round(frame_count/100, 2)}s recording',
        width=1300,
        height=350,
        template="plotly_dark"
        )
    fig.show()

In [10]:
# Display the spectrogram of the utterance at position 620.
show_mel_spectrogram(utterances, index=620)

PREPARING DATASETS

In [11]:
# Fixed seed so the 85/15 train/validation split is reproducible.
rs = 25
(
    train_utterances,
    val_utterances,
    train_phoneme_states,
    val_phoneme_states,
) = train_test_split(utterances, phoneme_states, test_size=0.15, random_state=rs)
print(f'Training array: {train_utterances.shape} \nValidation array: {val_utterances.shape}')

Training array: (937,) 
Validation array: (166,)


BUILDING MLP

In [68]:
class SpeechDataGeneratorMLP(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields flattened context windows for the MLP.

    Every frame of every utterance is one sample. A sample consists of the
    frame itself plus ``context_size`` frames on either side (zero-filled
    where the window runs past the start or end of the utterance), flattened
    into a vector of ``num_features`` values. Before windowing, the scalar
    ``mean(utterance) + 1e-8`` is subtracted from the utterance.

    Args:
        dataset: Tuple ``(utterances, labels)``; ``utterances[i]`` is a 2-D
            array indexed ``[frame, frequency]`` and ``labels[i][frame]`` is
            the target of that frame.
        batch_size: Number of samples per batch.
        context_size: Number of neighbouring frames taken on each side.
        shuffle: If true, the sample order is shuffled once at construction
            and again at the end of every epoch.
    """

    def __init__(
        self,
        dataset,
        batch_size=512,
        context_size=12,
        shuffle=True,
        ):
        self.dataX, self.dataY = dataset
        self.batch_size = batch_size
        self.context_size = context_size
        self.shuffle = shuffle
        # Only the first `freq_limit` frequency bands are used.
        self.freq_limit = 40
        self.num_frequencies = self.dataX[0][:,:self.freq_limit].shape[1]
        self.num_features = self.num_frequencies*(2*self.context_size + 1)
        # Per-utterance normalisation offsets, filled lazily so each mean is
        # computed once instead of once per frame.
        self._offsets = {}
        # One (utterance index, frame index) pair per sample.
        self.idxMap = [
            (utterance_idx, frame_idx)
            for utterance_idx, utterance in enumerate(self.dataX)
            for frame_idx in range(utterance.shape[0])
            ]
        # on_epoch_end only runs after an epoch has finished, so without this
        # the first epoch would see the frames in utterance order.
        if self.shuffle:
            np.random.shuffle(self.idxMap)

    def _utterance_offset(self, utterance_idx):
        """Return (and cache) the scalar subtracted from one utterance."""
        offset = self._offsets.get(utterance_idx)
        if offset is None:
            utterance = self.dataX[utterance_idx][:,:self.freq_limit]
            offset = np.mean(utterance) + 1e-8
            self._offsets[utterance_idx] = offset
        return offset

    def __getitem__(
        self,
        batch_idx
        ):
        """Build batch ``batch_idx``.

        Returns:
            Tuple ``(batchedX, batchedY)`` with shapes
            ``(n, num_features)`` and ``(n,)``, where ``n`` is the number of
            samples in the requested slice (``batch_size`` for every index
            below ``len(self)``).
        """
        context = self.context_size
        window = 2*context + 1
        batch_idxMap = self.idxMap[batch_idx*self.batch_size : (batch_idx+1)*self.batch_size]
        num_samples = len(batch_idxMap)
        # Zero-initialised so window positions outside the utterance stay 0,
        # which is what zero-padding the normalised utterance would give.
        batchedX = np.zeros((num_samples, window, self.num_frequencies))
        batchedY = np.empty(num_samples)
        for row, (utterance_idx, frame_idx) in enumerate(batch_idxMap):
            utterance = self.dataX[utterance_idx]
            # Clip the window [frame_idx - context, frame_idx + context] to
            # the frames that actually exist.
            first = max(frame_idx - context, 0)
            last = min(frame_idx + context + 1, utterance.shape[0])
            start = first - (frame_idx - context)
            batchedX[row, start:start + (last - first)] = (
                utterance[first:last, :self.freq_limit]
                - self._utterance_offset(utterance_idx)
                )
            batchedY[row] = self.dataY[utterance_idx][frame_idx]
        return batchedX.reshape(num_samples, self.num_features), batchedY

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return len(self.idxMap) // self.batch_size

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when enabled."""
        if self.shuffle:
            np.random.shuffle(self.idxMap)

In [69]:
# Batch size and context width shared by the MLP generators.
batch_size = 256
context_size = 12

# Build the training and validation generators with identical settings.
train_data_generator, val_data_generator = (
    SpeechDataGeneratorMLP(
        dataset=split,
        batch_size=batch_size,
        context_size=context_size,
        shuffle=True
        )
    for split in (
        (train_utterances, train_phoneme_states),
        (val_utterances, val_phoneme_states),
        )
    )

In [70]:
class FullyConnectedBlock(tf.keras.layers.Layer):
    """Dense -> SiLU -> dropout -> optional batch normalisation.

    Dropout is delegated to ``tf.keras.layers.Dropout`` so that it is only
    active when the layer runs with ``training=True``; calling
    ``tf.nn.dropout`` directly would also drop units during validation and
    inference.

    Args:
        size: Number of units of the dense layer.
        include_bn: Whether to finish with ``BatchNormalization``.
        dropout_rate: Fraction of activations dropped while training.
        beta: ``beta`` argument forwarded to ``tf.nn.silu``.
    """

    def __init__(
        self,
        size,
        include_bn=True,
        dropout_rate=0.275,
        beta=1
        ):
        super().__init__()
        self.dense = tf.keras.layers.Dense(size)
        self.act = tf.nn.silu
        self.silu_beta = beta
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.include_bn = include_bn
        if self.include_bn:
            self.bn = tf.keras.layers.BatchNormalization()

    def call(
        self,
        input_tensor,
        training = False
        ):
        """Apply the block; ``training`` switches dropout and batch-norm mode."""
        x = self.dense(input_tensor)
        x = self.act(x, beta=self.silu_beta)
        x = self.dropout(x, training=training)
        if self.include_bn:
            return self.bn(x, training=training)
        return x

class ResidualBlock(tf.keras.layers.Layer):
    """Stack of ``FullyConnectedBlock``s with an identity skip connection.

    The input is added to the output of the stack, passed through SiLU and
    layer-normalised. Because the skip is a plain addition, the last entry of
    ``block_sizes`` must equal the width of the input.

    Args:
        block_sizes: Unit counts of the consecutive fully connected blocks.
    """

    def __init__(
        self,
        block_sizes
        ):
        super().__init__()
        self.fc_blocks = [FullyConnectedBlock(size) for size in block_sizes]
        self.act = tf.nn.silu
        self.norm = tf.keras.layers.LayerNormalization()

    def call(
        self,
        input_tensor,
        training=False,
        ):
        """Run the stack, add the skip connection, activate and normalise."""
        beta = 1
        # Call the blocks directly instead of wrapping them in a new
        # tf.keras.Sequential object on every forward pass.
        x = input_tensor
        for fc_block in self.fc_blocks:
            x = fc_block(x, training=training)
        x = self.act(x + input_tensor, beta=beta)
        return self.norm(x)
    
class MLPModel(tf.keras.Model):
    """Residual MLP that maps a flattened context window to class logits.

    Args:
        blocks_sizes: One list of unit counts per ``ResidualBlock``.
        train_data_generator: Generator whose ``num_features`` and
            ``batch_size`` define the input layer built by ``model()``.
        num_classes: Number of output logits.
    """

    def __init__(
        self,
        blocks_sizes,
        train_data_generator,
        num_classes=len(unique_phonemes)
        ):
        super().__init__()
        self.train_data_generator = train_data_generator
        self.model_layers = [ResidualBlock(block_sizes) for block_sizes in blocks_sizes]
        self.nonresidual_layers = [FullyConnectedBlock(i, include_bn=False) for i in [512]]
        self.classifier = tf.keras.layers.Dense(num_classes, name='Output_Layer')

    def call(
        self,
        input_tensor,
        training=False
        ):
        """Residual stack -> non-residual block(s) -> logits (no softmax)."""
        x = tf.keras.Sequential(self.model_layers)(input_tensor, training=training)
        x = tf.keras.Sequential(self.nonresidual_layers)(x, training=training)
        return self.classifier(x)

    def model(
        self
        ):
        """Return a functional ``tf.keras.Model`` wrapping this network.

        ``training=None`` is passed on purpose. An explicit ``training=False``
        given while building a functional graph is stored with the layer call
        and reused at run time, which would keep batch normalisation (and any
        training-dependent layer) in inference mode during ``fit``. With
        ``None`` the mode is chosen by Keras for each call.
        """
        x = tf.keras.layers.Input(
            # A one-element tuple; ``(n)`` without the comma is just an int.
            shape=(self.train_data_generator.num_features,),
            batch_size=self.train_data_generator.batch_size,
            name="Input_Layer"
            )
        return tf.keras.Model(
            inputs=[x],
            outputs=self.call(x, training=None)
            )

In [71]:
# Hidden widths of the four residual blocks; each block projects back to the
# input width so the skip connection can be added.
residual_hidden_widths = [2048, 1024, 1024, 512]
mlp = MLPModel(
    blocks_sizes=[
        [hidden_width, train_data_generator.num_features]
        for hidden_width in residual_hidden_widths
        ],
    train_data_generator=train_data_generator,
    num_classes=len(unique_phonemes)
    ).model()

# Re-wrap the graph between the first layer's input and the last layer's output.
base_input = mlp.layers[0].input
base_output = mlp.layers[-1].output
mlp = tf.keras.Model(inputs=base_input, outputs=base_output)
mlp.summary()

Model: "model_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_Layer (InputLayer)    [(256, 1000)]             0         
                                                                 
 sequential_35 (Sequential)  (256, 1000)               9267040   
                                                                 
 sequential_36 (Sequential)  (256, 512)                512512    
                                                                 
 Output_Layer (Dense)        (256, 138)                70794     
                                                                 
Total params: 9850346 (37.58 MB)
Trainable params: 9833130 (37.51 MB)
Non-trainable params: 17216 (67.25 KB)
_________________________________________________________________


In [72]:
# Training length and the learning-rate range covered over that length.
epochs = 10
initial_learning_rate = 1e-3
final_learning_rate = 9e-4
# Per-epoch multiplier chosen so that initial * factor**epochs == final.
learning_rate_decay_factor = (final_learning_rate / initial_learning_rate)**(1/epochs)
steps_per_epoch = len(train_data_generator)

# staircase=True: the rate drops once per `steps_per_epoch` optimiser steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    steps_per_epoch,
    learning_rate_decay_factor,
    staircase=True
    )

# The network outputs raw logits, hence from_logits=True.
mlp.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule),
    metrics=['accuracy']
    )

In [73]:
# Train the MLP and keep the history object for later inspection.
print('Training for context =', context_size)
mlp_history = mlp.fit(
    train_data_generator,
    epochs=epochs,
    validation_data=val_data_generator,
    )

Training for context = 12
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


BUILDING CONV1D

In [41]:
class SpeechDataGeneratorConv(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields 2-D context windows for the conv nets.

    Every frame of every utterance is one sample. A sample is the frame plus
    ``context_size`` frames on either side (zero-filled where the window runs
    past the start or end of the utterance), kept as a
    ``(num_timesteps, num_frequencies)`` matrix. Before windowing, the scalar
    ``mean(utterance) + 1e-8`` is subtracted from the utterance.

    Args:
        dataset: Tuple ``(utterances, labels)``; ``utterances[i]`` is a 2-D
            array indexed ``[frame, frequency]`` and ``labels[i][frame]`` is
            the target of that frame.
        batch_size: Number of samples per batch.
        context_size: Number of neighbouring frames taken on each side.
        shuffle: If true, the sample order is shuffled once at construction
            and again at the end of every epoch.
    """

    def __init__(
        self,
        dataset,
        batch_size=256,
        context_size=20,
        shuffle=True,
        ):
        self.dataX, self.dataY = dataset
        self.batch_size = batch_size
        self.context_size = context_size
        self.shuffle = shuffle
        self.num_frequencies = self.dataX[0].shape[1]
        self.num_timesteps = 2*self.context_size + 1
        # Per-utterance normalisation offsets, filled lazily so each mean is
        # computed once instead of once per frame.
        self._offsets = {}
        # One (utterance index, frame index) pair per sample.
        self.idxMap = [
            (utterance_idx, frame_idx)
            for utterance_idx, utterance in enumerate(self.dataX)
            for frame_idx in range(utterance.shape[0])
            ]
        # on_epoch_end only runs after an epoch has finished, so without this
        # the first epoch would see the frames in utterance order.
        if self.shuffle:
            np.random.shuffle(self.idxMap)

    def _utterance_offset(self, utterance_idx):
        """Return (and cache) the scalar subtracted from one utterance."""
        offset = self._offsets.get(utterance_idx)
        if offset is None:
            offset = np.mean(self.dataX[utterance_idx]) + 1e-8
            self._offsets[utterance_idx] = offset
        return offset

    def __getitem__(
        self,
        batch_idx
        ):
        """Build batch ``batch_idx``.

        Returns:
            Tuple ``(batchedX, batchedY)`` with shapes
            ``(n, num_timesteps, num_frequencies)`` and ``(n,)``, where ``n``
            is the number of samples in the requested slice (``batch_size``
            for every index below ``len(self)``).
        """
        context = self.context_size
        batch_idxMap = self.idxMap[batch_idx*self.batch_size : (batch_idx+1)*self.batch_size]
        num_samples = len(batch_idxMap)
        # Zero-initialised so window positions outside the utterance stay 0,
        # which is what zero-padding the normalised utterance would give.
        batchedX = np.zeros((num_samples, self.num_timesteps, self.num_frequencies))
        batchedY = np.empty(num_samples)
        for row, (utterance_idx, frame_idx) in enumerate(batch_idxMap):
            utterance = self.dataX[utterance_idx]
            # Clip the window [frame_idx - context, frame_idx + context] to
            # the frames that actually exist.
            first = max(frame_idx - context, 0)
            last = min(frame_idx + context + 1, utterance.shape[0])
            start = first - (frame_idx - context)
            batchedX[row, start:start + (last - first)] = (
                utterance[first:last] - self._utterance_offset(utterance_idx)
                )
            batchedY[row] = self.dataY[utterance_idx][frame_idx]
        return batchedX, batchedY

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return len(self.idxMap) // self.batch_size

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when enabled."""
        if self.shuffle:
            np.random.shuffle(self.idxMap)

In [42]:
# Batch size and context width shared by the Conv1D generators.
batch_size = 256
context_size = 20

# Build the training and validation generators with identical settings.
train_data_generator, val_data_generator = (
    SpeechDataGeneratorConv(
        dataset=split,
        batch_size=batch_size,
        context_size=context_size,
        shuffle=True
        )
    for split in (
        (train_utterances, train_phoneme_states),
        (val_utterances, val_phoneme_states),
        )
    )

In [60]:
class Conv1DBlock(tf.keras.layers.Layer):
    """Conv1D ('same' padding) -> SiLU -> dropout -> optional batch norm.

    Dropout is delegated to ``tf.keras.layers.Dropout`` so that it is only
    active when the layer runs with ``training=True``; calling
    ``tf.nn.dropout`` directly would also drop activations during validation
    and inference.

    Args:
        out_channels: Number of convolution filters.
        kernel_size: Width of the convolution kernel.
        include_bn: Whether to finish with ``BatchNormalization``.
        dropout_rate: Fraction of activations dropped while training.
        beta: ``beta`` argument forwarded to ``tf.nn.silu``.
    """

    def __init__(
        self,
        out_channels,
        kernel_size=3,
        include_bn=True,
        dropout_rate=0.1,
        beta=0.9
        ):
        super().__init__()
        self.conv = tf.keras.layers.Conv1D(out_channels, kernel_size, padding='same')
        self.act = tf.nn.silu
        self.silu_beta = beta
        self.include_bn = include_bn
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        if self.include_bn:
            self.bn = tf.keras.layers.BatchNormalization()

    def call(
        self,
        input_tensor,
        training=False
        ):
        """Apply the block; ``training`` switches dropout and batch-norm mode."""
        x = self.conv(input_tensor)
        x = self.act(x, beta=self.silu_beta)
        x = self.dropout(x, training=training)
        if self.include_bn:
            return self.bn(x, training=training)
        return x
    
class ResidualConv1DBlock(tf.keras.layers.Layer):
    """Stack of ``Conv1DBlock``s with a projected skip connection and pooling.

    The input is projected by a kernel-size-1 convolution to the channel
    count of the last block, added to the output of the stack, passed through
    SiLU and max-pooled.

    Args:
        block_channels: Filter counts of the consecutive convolution blocks.
    """

    def __init__(
        self,
        block_channels
        ):
        super().__init__()
        self.cnn_blocks = [Conv1DBlock(out_channels) for out_channels in block_channels]
        self.pooling = tf.keras.layers.MaxPooling1D()
        self.skip_connection = tf.keras.layers.Conv1D(block_channels[-1], 1, padding='same')
        self.act = tf.nn.silu

    def call(
        self,
        input_tensor,
        training=False,
        ):
        """Run the stack, add the projected skip, activate and pool."""
        beta = 0.9
        # Call the blocks directly instead of wrapping them in a new
        # tf.keras.Sequential object on every forward pass.
        x = input_tensor
        for cnn_block in self.cnn_blocks:
            x = cnn_block(x, training=training)
        x = self.act(x + self.skip_connection(input_tensor), beta=beta)
        return self.pooling(x)

class CNN1DModel(tf.keras.Model):
    """1-D residual CNN that maps a context window to class logits.

    Args:
        blocks_channels: One list of filter counts per ``ResidualConv1DBlock``.
        train_data_generator: Generator whose ``num_timesteps``,
            ``num_frequencies`` and ``batch_size`` define the input layer
            built by ``model()``.
        num_classes: Number of output logits.
    """

    def __init__(
        self,
        blocks_channels,
        train_data_generator,
        num_classes=len(unique_phonemes)
        ):
        super().__init__()
        self.train_data_generator = train_data_generator
        self.res_blocks = [ResidualConv1DBlock(block_channels) for block_channels in blocks_channels]
        self.nonresidual_layers = [FullyConnectedBlock(size, include_bn=False) for size in [512]]
        # Constructed but currently not applied in `call` (Flatten is used).
        self.global_pool = tf.keras.layers.GlobalAveragePooling1D(name='GAP_Layer')
        self.flatten = tf.keras.layers.Flatten(name='Flatten_Layer')
        self.classifier = tf.keras.layers.Dense(num_classes, name='Output_Layer')

    def call(
        self,
        input_tensor,
        training=False
        ):
        """Residual conv stack -> flatten -> dense block(s) -> logits."""
        x = tf.keras.Sequential(self.res_blocks)(input_tensor, training=training)
        # x = self.global_pool(x)
        x = self.flatten(x)
        x = tf.keras.Sequential(self.nonresidual_layers)(x, training=training)
        return self.classifier(x)

    def model(
        self
        ):
        """Return a functional ``tf.keras.Model`` wrapping this network.

        ``training=None`` is passed on purpose. An explicit ``training=False``
        given while building a functional graph is stored with the layer call
        and reused at run time, which would keep batch normalisation (and any
        training-dependent layer) in inference mode during ``fit``. With
        ``None`` the mode is chosen by Keras for each call.
        """
        x = tf.keras.layers.Input(
            shape=(self.train_data_generator.num_timesteps, self.train_data_generator.num_frequencies),
            batch_size=self.train_data_generator.batch_size,
            name="Input_Layer"
            )
        return tf.keras.Model(
            inputs=[x],
            outputs=self.call(x, training=None)
            )

In [65]:
# Two residual blocks, each with the same 64 -> 128 -> 256 filter progression.
cnn1d = CNN1DModel(
    blocks_channels=[[64, 128, 256] for _ in range(2)],
    train_data_generator=train_data_generator,
    num_classes=len(unique_phonemes)
    ).model()

# Re-wrap the graph between the first layer's input and the last layer's output.
base_input = cnn1d.layers[0].input
base_output = cnn1d.layers[-1].output
cnn1d = tf.keras.Model(inputs=base_input, outputs=base_output)
cnn1d.summary()

Model: "model_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_Layer (InputLayer)    [(256, 41, 40)]           0         
                                                                 
 sequential_33 (Sequential)  (256, 10, 256)            383360    
                                                                 
 Flatten_Layer (Flatten)     (256, 2560)               0         
                                                                 
 sequential_34 (Sequential)  (256, 512)                1311232   
                                                                 
 Output_Layer (Dense)        (256, 138)                70794     
                                                                 
Total params: 1765386 (6.73 MB)
Trainable params: 1763594 (6.73 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________


In [66]:
# Training length and the learning-rate range covered over that length.
epochs = 5
initial_learning_rate = 1e-3
final_learning_rate = 9e-4
# Per-epoch multiplier chosen so that initial * factor**epochs == final.
learning_rate_decay_factor = (final_learning_rate / initial_learning_rate)**(1/epochs)
steps_per_epoch = len(train_data_generator)

# staircase=True: the rate drops once per `steps_per_epoch` optimiser steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    steps_per_epoch,
    learning_rate_decay_factor,
    staircase=True
    )

# The network outputs raw logits, hence from_logits=True.
cnn1d.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule),
    metrics=['accuracy']
    )

In [67]:
# Train the 1-D CNN and keep the history object for later inspection.
print('Training for context =', context_size)
cnn1d_history = cnn1d.fit(
    train_data_generator,
    epochs=epochs,
    validation_data=val_data_generator,
    )

Training for context = 20
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


BUILDING CONV2D

In [16]:
# Batch size and context width shared by the Conv2D generators.
batch_size = 256
context_size = 20

# Build the training and validation generators with identical settings.
train_data_generator, val_data_generator = (
    SpeechDataGeneratorConv(
        dataset=split,
        batch_size=batch_size,
        context_size=context_size,
        shuffle=True
        )
    for split in (
        (train_utterances, train_phoneme_states),
        (val_utterances, val_phoneme_states),
        )
    )

In [19]:
class Conv2DBlock(tf.keras.layers.Layer):
    """Conv2D ('same' padding) -> SiLU -> dropout -> optional batch norm.

    Dropout is delegated to ``tf.keras.layers.Dropout`` so that it is only
    active when the layer runs with ``training=True``; calling
    ``tf.nn.dropout`` directly would also drop activations during validation
    and inference.

    Args:
        out_channels: Number of convolution filters.
        kernel_size: Size of the convolution kernel.
        include_bn: Whether to finish with ``BatchNormalization``.
        dropout_rate: Fraction of activations dropped while training.
        beta: ``beta`` argument forwarded to ``tf.nn.silu``.
    """

    def __init__(
        self,
        out_channels,
        kernel_size=3,
        include_bn=True,
        dropout_rate=0.15,
        beta=0.9
        ):
        super().__init__()
        self.conv = tf.keras.layers.Conv2D(out_channels, kernel_size, padding='same')
        self.act = tf.nn.silu
        self.silu_beta = beta
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.include_bn = include_bn
        if self.include_bn:
            self.bn = tf.keras.layers.BatchNormalization()

    def call(
        self,
        input_tensor,
        training=False
        ):
        """Apply the block; ``training`` switches dropout and batch-norm mode."""
        x = self.conv(input_tensor)
        x = self.act(x, beta=self.silu_beta)
        x = self.dropout(x, training=training)
        if self.include_bn:
            return self.bn(x, training=training)
        return x
    
class ResidualConv2DBlock(tf.keras.layers.Layer):
    """Stack of ``Conv2DBlock``s with a projected skip connection and pooling.

    The input is projected by a 1x1 convolution to the channel count of the
    last block, added to the output of the stack, passed through SiLU and
    max-pooled.

    Args:
        block_channels: Filter counts of the consecutive convolution blocks.
    """

    def __init__(
        self,
        block_channels
        ):
        super().__init__()
        self.cnn_blocks = [Conv2DBlock(out_channels) for out_channels in block_channels]
        self.pooling = tf.keras.layers.MaxPooling2D()
        self.skip_connection = tf.keras.layers.Conv2D(block_channels[-1], 1, padding='same')
        self.act = tf.nn.silu

    def call(
        self,
        input_tensor,
        training=False,
        ):
        """Run the stack, add the projected skip, activate and pool."""
        beta = 0.9
        # Call the blocks directly instead of wrapping them in a new
        # tf.keras.Sequential object on every forward pass.
        x = input_tensor
        for cnn_block in self.cnn_blocks:
            x = cnn_block(x, training=training)
        x = self.act(x + self.skip_connection(input_tensor), beta=beta)
        return self.pooling(x)

class CNN2DModel(tf.keras.Model):
    """2-D residual CNN that maps a context window to class logits.

    The ``(timesteps, frequencies)`` window gets a trailing channel axis of
    size 1 and is then treated as a single-channel image.

    Args:
        blocks_channels: One list of filter counts per ``ResidualConv2DBlock``.
        train_data_generator: Generator whose ``num_timesteps``,
            ``num_frequencies`` and ``batch_size`` define the input layer
            built by ``model()``.
        num_classes: Number of output logits.
    """

    def __init__(
        self,
        blocks_channels,
        train_data_generator,
        num_classes=len(unique_phonemes)
        ):
        super().__init__()
        self.train_data_generator = train_data_generator
        # `resize` and `gap` are constructed but currently not applied in `call`.
        self.resize = tf.keras.layers.Resizing(28,28)
        self.res_blocks = [ResidualConv2DBlock(block_channels) for block_channels in blocks_channels]
        self.nonresidual_layers = [FullyConnectedBlock(size, include_bn=False) for size in [512]]
        self.gap = tf.keras.layers.GlobalAveragePooling2D(name='GAP_Layer')
        self.flatten = tf.keras.layers.Flatten(name='Flatten_Layer')
        self.classifier = tf.keras.layers.Dense(num_classes, name='Output_Layer')

    def call(
        self,
        input_tensor,
        training=False
        ):
        """Add channel axis -> residual conv stack -> flatten -> dense -> logits."""
        x = tf.expand_dims(input_tensor, axis=-1)
        # x = self.resize(x)
        x = tf.keras.Sequential(self.res_blocks)(x, training=training)
        # x = self.gap(x)
        x = self.flatten(x)
        x = tf.keras.Sequential(self.nonresidual_layers)(x, training=training)
        return self.classifier(x)

    def model(
        self
        ):
        """Return a functional ``tf.keras.Model`` wrapping this network.

        ``training=None`` is passed on purpose. An explicit ``training=False``
        given while building a functional graph is stored with the layer call
        and reused at run time, which would keep batch normalisation (and any
        training-dependent layer) in inference mode during ``fit``. With
        ``None`` the mode is chosen by Keras for each call.
        """
        x = tf.keras.layers.Input(
            shape=(self.train_data_generator.num_timesteps, self.train_data_generator.num_frequencies),
            batch_size=self.train_data_generator.batch_size,
            name="Input_Layer"
            )
        return tf.keras.Model(
            inputs=[x],
            outputs=self.call(x, training=None)
            )

In [22]:
# Three residual blocks, each with the same 16 -> 32 filter progression.
cnn2d = CNN2DModel(
    blocks_channels=[[16, 32] for _ in range(3)],
    train_data_generator=train_data_generator,
    num_classes=len(unique_phonemes)
    ).model()

# Re-wrap the graph between the first layer's input and the last layer's output.
base_input = cnn2d.layers[0].input
base_output = cnn2d.layers[-1].output
cnn2d = tf.keras.Model(inputs=base_input, outputs=base_output)
cnn2d.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_Layer (InputLayer)    [(256, 41, 40)]           0         
                                                                 
 tf.expand_dims_2 (TFOpLamb  (256, 41, 40, 1)          0         
 da)                                                             
                                                                 
 sequential_3 (Sequential)   (256, 5, 5, 32)           26080     
                                                                 
 Flatten_Layer (Flatten)     (256, 800)                0         
                                                                 
 sequential_4 (Sequential)   (256, 512)                410112    
                                                                 
 Output_Layer (Dense)        (256, 138)                70794     
                                                           

In [23]:
# Training length and the learning-rate range covered over that length.
epochs = 5
initial_learning_rate = 1e-3
final_learning_rate = 9e-4
# Per-epoch multiplier chosen so that initial * factor**epochs == final.
learning_rate_decay_factor = (final_learning_rate / initial_learning_rate)**(1/epochs)
steps_per_epoch = len(train_data_generator)

# staircase=True: the rate drops once per `steps_per_epoch` optimiser steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    steps_per_epoch,
    learning_rate_decay_factor,
    staircase=True
    )

# The network outputs raw logits, hence from_logits=True.
cnn2d.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule),
    metrics=['accuracy']
    )

In [24]:
# Train the 2-D CNN and keep the history object for later inspection.
print('Training for context =', context_size)
cnn2d_history = cnn2d.fit(
    train_data_generator,
    epochs=epochs,
    validation_data=val_data_generator,
    )

Training for context = 20
Epoch 1/5


2023-08-21 11:45:17.583679: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel_3/sequential_3/residual_conv2d_block_4/sequential/conv2d_block_8/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2023-08-21 11:45:18.310019: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-08-21 11:45:18.756195: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-08-21 11:45:18.757082: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-08-21 11:45:18.757148: W tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.cc:109] Couldn't get ptxas version : FAILED_PRECONDITION: Couldn't get ptxas/nvlink version string: INTERNAL: Couldn't invoke ptxas --version
2023-08-21 11:45:18.757839: I tensorflow/tsl/platform



2023-08-21 11:55:41.132527: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel_3/sequential_3/residual_conv2d_block_4/sequential/conv2d_block_8/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
