In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense,Conv2D,MaxPool2D,Flatten,BatchNormalization,Dropout
from keras.optimizers import Adam,SGD
import numpy as np
import matplotlib.pyplot as plt
import os
import wave
from tqdm import tqdm

In [None]:
# Root directory that receives one sub-folder of spectrogram PNGs per class label.
data = 'audio_to_image/'
def create_image(wav_file,count):
    """Render the spectrogram of ``wav_file`` and save it as ``<data>/<label>/<count>.png``.

    Parameters
    ----------
    wav_file : str
        Path to a WAV recording. The class label is the part of the file name
        before the first underscore; adapt the split below if your files use
        a different naming convention.
    count : int or str
        Value used as the PNG file name. The caller must keep it unique,
        otherwise an existing image with the same label is overwritten.

    Notes
    -----
    The raw frames are decoded as 16-bit integers -- this assumes 16-bit PCM,
    single-channel recordings; TODO confirm before using other datasets.
    """
    # The context manager closes the file handle even if reading fails.
    # readframes(-1) returns every frame as a single bytestring.
    with wave.open(wav_file,'r') as wav:
        samples = np.frombuffer(wav.readframes(-1),np.int16)
        frame_rate = wav.getframerate()

    class_label = os.path.basename(wav_file).split('_')[0]
    output_directory = os.path.join(data, class_label)
    os.makedirs(output_directory, exist_ok=True)

    # The second positional parameter of plt.specgram is NFFT (the FFT window
    # length), so the sampling rate has to be passed by keyword as Fs.
    # Images rendered before this change used the frame rate as NFFT and must
    # be regenerated so that every image shares the same rendering settings.
    plt.specgram(samples,Fs=frame_rate)
    try:
        plt.savefig(os.path.join(output_directory, f'{count}.png'))
    finally:
        # Always release the figure; this function is called once per file.
        plt.close()


In [None]:
path = 'free-spoken-digit-dataset-master/recordings/'
# os.listdir returns entries in arbitrary order, so sort the listing to make
# the image numbering reproducible across runs and machines (other cells refer
# to generated files by number). Anything that is not a WAV file is skipped
# because wave.open would raise on it.
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.wav'))
# Wrap the list itself (not the enumerate object) so tqdm can read its length
# and show a total and an ETA.
for i,f in enumerate(tqdm(files)):
    create_image(os.path.join(path,f),i)

In [None]:
# Read one generated spectrogram back as an array to inspect what was written.
# The file name depends on the numbering assigned when the images were generated.
arr =plt.imread('audio_to_image/1/301.png')

In [None]:
# Dimensions of the loaded PNG. The stored output shows 4 channels, whereas
# the datasets built later load the images with color_mode='rgb'.
arr.shape

(480, 640, 4)

In [127]:
# NOTE(review): ImageDataGenerator is not used in the cells visible here.
from keras.preprocessing.image import ImageDataGenerator

# Input geometry and hyper-parameters shared by the dataset and model cells.
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256
BATCH_SIZE = 64
CHANNELS = 3
CLASSES = 10

# Build the training and validation splits from the same image folder.
# Both calls receive identical settings (including the seed and the 80/20
# split fraction); only `subset` differs. The training split is created first.
train_dataset, val_dataset = (
    tf.keras.preprocessing.image_dataset_from_directory(
        directory=os.path.join('audio_to_image'),
        subset=subset_name,
        validation_split=0.2,
        seed=0,
        shuffle=True,
        color_mode='rgb',
        image_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
        batch_size=BATCH_SIZE,
    )
    for subset_name in ("training", "validation")
)

Found 3003 files belonging to 10 classes.
Using 2403 files for training.
Found 3003 files belonging to 10 classes.
Using 600 files for validation.


In [None]:
# First CNN: three stride-2 convolution stages, each followed by batch
# normalisation and 2x2 max pooling, then a dense head (256 -> 128) with
# dropout and a softmax over the CLASSES outputs.
model = Sequential([
    tf.keras.layers.Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS)),

    # Convolution stage 1
    Conv2D(32,3,strides=2,padding='same',activation='relu'),
    BatchNormalization(),
    MaxPool2D(pool_size=(2,2),strides=2),

    # Convolution stage 2
    Conv2D(64,3,strides=2,padding='same',activation='relu'),
    BatchNormalization(),
    MaxPool2D(pool_size=(2,2),strides=2),

    # Convolution stage 3
    Conv2D(128,3,strides=2,padding='same',activation='relu'),
    BatchNormalization(),
    MaxPool2D(pool_size=(2,2),strides=2),

    # Classifier head
    Flatten(),
    Dense(256,activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(128,activation='relu'),
    Dropout(0.4),
    Dense(CLASSES,activation='softmax'),
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_22 (Conv2D)          (None, 128, 128, 32)      896       
                                                                 
 batch_normalization_32 (Ba  (None, 128, 128, 32)      128       
 tchNormalization)                                               
                                                                 
 max_pooling2d_22 (MaxPooli  (None, 64, 64, 32)        0         
 ng2D)                                                           
                                                                 
 conv2d_23 (Conv2D)          (None, 32, 32, 64)        18496     
                                                                 
 batch_normalization_33 (Ba  (None, 32, 32, 64)        256       
 tchNormalization)                                               
                                                      

In [None]:
# Sparse categorical cross-entropy with RMSprop; accuracy is tracked per epoch.
model.compile(loss='sparse_categorical_crossentropy',optimizer=tf.keras.optimizers.RMSprop(),metrics=['accuracy'])

# train_dataset and val_dataset are already batched (BATCH_SIZE) when they are
# created, and Keras requires that batch_size is not specified for dataset
# inputs, so the argument is omitted here.
history = model.fit(train_dataset, epochs=20, validation_data=val_dataset)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
def predict(wav_file):
    """Classify a WAV recording with the current global ``model``.

    The recording is rendered as a spectrogram, saved to ``out.png`` in the
    working directory (overwriting any previous file), reloaded as an RGB
    image at the model's input size and passed through ``model``.

    Parameters
    ----------
    wav_file : str
        Path to the WAV recording. The frames are decoded as 16-bit integers
        -- assumes 16-bit PCM, single-channel audio; TODO confirm.

    Returns
    -------
    int
        Index of the highest-scoring class in the model output.
        NOTE(review): presumably this equals the digit because the class
        folders are named 0-9 -- confirm against the dataset's class order.
    """
    # The context manager closes the file handle even if reading fails.
    # readframes(-1) returns every frame as a single bytestring.
    with wave.open(wav_file,'r') as wav:
        samples = np.frombuffer(wav.readframes(-1),np.int16)
        frame_rate = wav.getframerate()

    # The second positional parameter of plt.specgram is NFFT, so the sampling
    # rate has to be passed by keyword as Fs. The rendering must match the one
    # used for the images the model was trained on; regenerate those images if
    # they were rendered with different settings.
    plt.specgram(samples,Fs=frame_rate)
    try:
        plt.savefig('out.png')
    finally:
        plt.close()

    # Reload the saved figure with three colour channels at the resolution the
    # network was built for.
    # NOTE(review): the training datasets do not set an interpolation mode, so
    # 'nearest' here may differ from the training-time resize -- confirm.
    image = tf.keras.utils.load_img(
        'out.png',
        color_mode='rgb',
        target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
        interpolation='nearest',
    )
    # The model consumes batches, so add a leading batch axis of size 1.
    batch = np.expand_dims(image,axis=0)
    return int(np.argmax(model.predict(batch)))


In [132]:
# Load one generated spectrogram with three colour channels at the model's
# input resolution. The size comes from the shared constants so it cannot
# drift from the shape the network was built with.
# NOTE(review): this file lives in the folder that also feeds the training
# split, so it may not be a held-out example -- confirm before reading the
# result as a generalisation check.
image = tf.keras.utils.load_img(
    'audio_to_image/9/2786.png',
    color_mode='rgb',
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
    interpolation='nearest',
    keep_aspect_ratio=False
)
# The model consumes batches, so add a leading batch axis of size 1.
out = np.expand_dims(image,axis=0)
# Index of the highest-scoring class for this image.
np.argmax(model.predict(out))



7

In [125]:
# Full model output for the `out` batch: one score per class for the single
# image it contains.
model.predict(out)



array([[1.1432675e-04, 8.5397250e-10, 8.6906837e-10, 3.3344670e-13,
        8.9090103e-01, 8.3814934e-02, 2.4816871e-02, 5.3972023e-08,
        3.5279038e-04, 2.3688623e-08]], dtype=float32)

In [130]:
# Second CNN variant. Only the first convolution uses stride 2; every
# convolution and every pooling layer is followed by batch normalisation, and
# the head is a single 256-unit dense layer with 50% dropout.
# Note: this rebinds `model` (and `history` below), replacing the network
# defined earlier in the notebook.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS)),

    # Convolution stage 1
    tf.keras.layers.Conv2D(32, 3, strides=2, padding='same', activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.BatchNormalization(),

    # Convolution stage 2
    tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.BatchNormalization(),

    # Convolution stage 3
    tf.keras.layers.Conv2D(128, 3, padding='same', activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.BatchNormalization(),

    # Classifier head
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(CLASSES, activation='softmax'),
])

# Same loss, optimiser and metric as the first model.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(),
    metrics=['accuracy'],
)

# Ten epochs on the training split, validated on the validation split after
# each epoch; the per-epoch metrics are kept in `history`.
history = model.fit(train_dataset, epochs=10, validation_data=val_dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
