In [3]:
import os
import librosa
import numpy as np
import pandas as pd 
from PIL import Image
import sounddevice as sd
from sklearn.model_selection import train_test_split

# Dataset src: https://www.kaggle.com/datasets/warcoder/cats-vs-dogs-vs-birds-audio-classification?resource=download

### Word to Text

Human speech is made up of numerous frequencies; which we interpret in a logarithmic way. Hence while frequencies that are a set distance apart from each other may not be distingusiahble to us, ones that are logarithmically distributed are identifiable. This is the basic premise behind the development of mel spectrograms vs normal ones. Spectrograms are also extremely helpful in sound classification as a form of encoding (and displaying) sound based information along time. This piece of code leverages the abilities of mel spectrograms to build a Convolution Neural Network (CNN) that can be used to classify the sounded audio of discrete words and thus output them as text.

In [4]:
folder_path = os.path.join(os.curdir,r'Animals') # downloaded into the folder Animals (in the same directory as this file)
assert os.path.exists(folder_path) # checks that the folder exists in directory
r_state = 27 # to be used as random state

In [5]:
data = []
for k in os.listdir(folder_path): # for the folders (classes) and files in the Animals folder
    subfolder = os.path.join(folder_path, k)
    if os.path.isfile(subfolder): # skips files
        pass 
    else:
        for e in os.listdir(subfolder): 
            data.append((os.path.join(subfolder, e),k)) # and gets the complete (tho still relative) path of each file and the class

In [6]:
df_paths = pd.DataFrame(data) # creates a dataframe for those
df_paths.columns = ["Filepath","Class"]
df_paths["Class"].value_counts(normalize = True)*100 # pretty even split wrt classes

dog     34.426230
cat     33.934426
bird    31.639344
Name: Class, dtype: float64

In [7]:
def scale_minmax(X, min=0.0, max=1.0): # normalization 
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

def to_mel_img(file): # function that gets mel spectrogram of an audio file (in 2D array)
    s,f = librosa.load(file)
    melspectrogram = librosa.feature.melspectrogram(y=s,sr=f)
    mels = np.log(melspectrogram+1e-10)
    img_arr = scale_minmax(mels,0,255).astype(np.uint8) # normalized from 0 - 255 (img)
    img_arr = np.flip(img_arr,axis=0) # flipped to have low values down below
    return img_arr

In [8]:
counts = df_paths["Filepath"].apply(lambda x: (to_mel_img(x).shape)).value_counts() # most of the arrays are of a specific shape
counts # all arrays need to be standardised to this form (most common one chosen -> 128,44) : will redefine to_mel_img

(128, 44)    536
(128, 39)     15
(128, 43)     14
(128, 41)     13
(128, 37)      8
(128, 32)      6
(128, 35)      5
(128, 33)      5
(128, 23)      2
(128, 26)      2
(128, 34)      2
(128, 42)      1
(128, 29)      1
Name: Filepath, dtype: int64

In [9]:
duration = df_paths["Filepath"].apply(lambda x:librosa.get_duration(y=librosa.load(x)[0],sr=librosa.load(x)[1]))
print(sorted(set(map(lambda x:round(x,2),duration.value_counts().index)))) # files are of length of about 1/2 to 1 second

[0.51, 0.6, 0.65, 0.73, 0.74, 0.75, 0.77, 0.79, 0.81, 0.84, 0.85, 0.88, 0.89, 0.9, 0.93, 0.94, 0.97, 0.98, 1.0]


In [10]:
duration.value_counts() # most of them hover at the 1 second mark --> ties in with the shape of such files

1.000000    536
0.981406      9
0.882404      7
0.896009      7
0.938730      7
0.928844      6
0.725397      6
0.835964      5
0.975283      5
0.743084      4
0.853379      3
0.789524      3
0.810703      2
0.512018      2
0.597415      2
0.768027      2
0.885397      1
0.970703      1
0.650204      1
0.746712      1
Name: Filepath, dtype: int64

In [11]:
def to_mel_img(file,shape=None):
    s,f = librosa.load(file)
    melspectrogram = librosa.feature.melspectrogram(y=s,sr=f)
    mels = np.log(melspectrogram+1e-10)
    img_arr = scale_minmax(mels,0,255).astype(np.uint8)
    if shape==None:
        img_arr = np.flip(img_arr,axis=0)
        return img_arr
    else: # added code
        img = Image.fromarray(np.uint8(img_arr)) # creates an image from the array
        w,h = shape
        resized_img = img.resize((h, w), Image.LANCZOS) # resizes the image accordingly 
        # (LANCZOS --> algo to smoothly interpolate values)
        resized_arr = np.array(resized_img)
        resized_arr = np.flip(resized_arr,axis=0)
        return resized_arr

In [12]:
df_paths["Filepath"].apply(lambda x: (to_mel_img(x,(128,44))).shape).value_counts() # all values now standardised

(128, 44)    610
Name: Filepath, dtype: int64

In [13]:
sf_paths = pd.DataFrame({"Img_arr":df_paths["Filepath"].apply(lambda x: (to_mel_img(x,(128,44))))}) # passed to a new dataframe

In [14]:
classes = sorted(set(df_paths["Class"].values))
encoding_dic = {k:i for i,k in enumerate(classes)} # with the class values encoded
sf_paths["Encoded_class"] = df_paths["Class"].map(encoding_dic)

sf_paths

Unnamed: 0,Img_arr,Encoded_class
0,"[[91, 79, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[[10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0
2,"[[59, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,"[[40, 28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,"[[13, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0
...,...,...
605,"[[5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",2
606,"[[39, 28, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
607,"[[55, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
608,"[[67, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2


In [15]:
encoding_dic # number to which each value maps to

{'bird': 0, 'cat': 1, 'dog': 2}

In [16]:
x, x_test, y, y_test = train_test_split(sf_paths[["Img_arr"]],sf_paths["Encoded_class"],test_size=0.2,train_size=0.8,
    stratify=sf_paths["Encoded_class"],random_state=r_state) # split into train and test
x_train, x_val, y_train, y_val = train_test_split(x,y,test_size = 0.25,train_size =0.75,stratify=y,random_state=r_state)
# from train split --> further split into a proper train set and validation set

# x and y splits are wrapped together into single dfs shown below
train_data = pd.concat([x_train,y_train],axis=1)
val_data = pd.concat([x_val,y_val],axis=1)
test_data = pd.concat([x_test,y_test],axis=1) 

In [17]:
## Part on training the model
import torch
import torch.nn as nn
from tqdm import tqdm
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader

In [18]:
torch.manual_seed(r_state) # manual state set

<torch._C.Generator at 0x1dcd5285b70>

In [19]:
class CNN(nn.Module): # CNN model created by pytorch
    def __init__(self,shape,n_classes):
        super(CNN,self).__init__()
        self.conv1 = nn.Conv2d(1,16,3,padding=1)
        self.pool = nn.MaxPool2d(2,2)
        self.conv2 = nn.Conv2d(16,32,3,padding=1)
        self.convs = nn.Sequential(
            self.conv1,
            nn.ReLU(),
            self.pool,
            self.conv2,
            nn.ReLU(),
            self.pool
        )
        self._to_linear = None
        self._determine_linear_input(shape)
        self.fc1 = nn.Linear(self._to_linear,128)
        self.fc2 = nn.Linear(128,n_classes)
    
    def _determine_linear_input(self, shape):
        # Automatic way to determine length of layers
        with torch.no_grad():
            o = self.convs(torch.zeros(1, *shape))
        self._to_linear = int(o.numel())

    def forward(self,x):
        x=self.pool(F.relu(self.conv1(x)))
        x=self.pool(F.relu(self.conv2(x)))
        x=x.view(-1,self._to_linear)
        x=F.relu(self.fc1(x))
        x=self.fc2(x)
        return x



In [20]:
class ImageDataset(Dataset): #parse pd dataframe to be converted to tensors for pytorch
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        image = self.dataframe.iloc[idx]['Img_arr']
        label = self.dataframe.iloc[idx]['Encoded_class']
        image = image / 255.0 # normalisation of image
        image = torch.tensor(image, dtype=torch.float32).unsqueeze(0) # number of channels (1) added to the front
        label = torch.tensor(label, dtype=torch.long)
        return image, label

batch_size = 16

train_torch = DataLoader(ImageDataset(train_data), batch_size=batch_size, shuffle=False)
val_torch = DataLoader(ImageDataset(val_data), batch_size=batch_size, shuffle=False)
test_torch = DataLoader(ImageDataset(test_data) , batch_size=batch_size, shuffle=False)

In [21]:
def train_one_epoch(model, train, criterion, optimizer, device): # for training an epoch
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm(train, desc='Training'):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train)

def validate_one_epoch(model, val, criterion, device): # to run on the validation set per epoch
    model.eval()
    running_loss = 0.0
    correct = 0 # for accuracy testing
    total = 0
    all_preds = [] # for f1 score evaluation
    all_labels = []
    with torch.no_grad():
        for inputs, labels in tqdm(val, desc='Validation'):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())  # move to CPU and convert to numpy array
            all_labels.extend(labels.cpu().numpy())
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    f1 = f1_score(all_labels, all_preds,average='weighted')
    return running_loss / len(val), accuracy, f1

In [22]:
shape = (1,*x_train.iloc[0].values[0].shape)
classes = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN(shape,classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_torch, criterion, optimizer, device)
    val_loss, val_accuracy, val_f1 = validate_one_epoch(model, val_torch, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs} - '
          f'Train Loss: {train_loss:.4f}, '
          f'Validation Loss: {val_loss:.4f}, '
          f'Validation Accuracy: {val_accuracy:.4f}',
          f'Validation f1-score: {val_f1:.4f}\n')

Training: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 54.77it/s]


Epoch 1/10 - Train Loss: 1.1195, Validation Loss: 1.0962, Validation Accuracy: 0.3443 Validation f1-score: 0.1763



Training: 100%|██████████| 23/23 [00:01<00:00, 18.31it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 38.42it/s]


Epoch 2/10 - Train Loss: 1.0936, Validation Loss: 1.0851, Validation Accuracy: 0.4344 Validation f1-score: 0.3379



Training: 100%|██████████| 23/23 [00:01<00:00, 15.25it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 41.19it/s]


Epoch 3/10 - Train Loss: 1.0620, Validation Loss: 1.0387, Validation Accuracy: 0.4754 Validation f1-score: 0.4334



Training: 100%|██████████| 23/23 [00:01<00:00, 16.56it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 29.21it/s]


Epoch 4/10 - Train Loss: 0.9519, Validation Loss: 0.8990, Validation Accuracy: 0.5574 Validation f1-score: 0.5423



Training: 100%|██████████| 23/23 [00:01<00:00, 14.87it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 38.99it/s]


Epoch 5/10 - Train Loss: 0.7734, Validation Loss: 0.7014, Validation Accuracy: 0.7459 Validation f1-score: 0.7411



Training: 100%|██████████| 23/23 [00:01<00:00, 17.40it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 39.24it/s]


Epoch 6/10 - Train Loss: 0.6072, Validation Loss: 0.5436, Validation Accuracy: 0.8279 Validation f1-score: 0.8266



Training: 100%|██████████| 23/23 [00:01<00:00, 17.36it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 44.03it/s]


Epoch 7/10 - Train Loss: 0.4711, Validation Loss: 0.4285, Validation Accuracy: 0.8852 Validation f1-score: 0.8846



Training: 100%|██████████| 23/23 [00:01<00:00, 12.58it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 35.11it/s]


Epoch 8/10 - Train Loss: 0.3929, Validation Loss: 0.3520, Validation Accuracy: 0.9344 Validation f1-score: 0.9342



Training: 100%|██████████| 23/23 [00:01<00:00, 15.45it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 33.77it/s]


Epoch 9/10 - Train Loss: 0.3420, Validation Loss: 0.3219, Validation Accuracy: 0.9262 Validation f1-score: 0.9262



Training: 100%|██████████| 23/23 [00:01<00:00, 15.51it/s]
Validation: 100%|██████████| 8/8 [00:00<00:00, 41.74it/s]

Epoch 10/10 - Train Loss: 0.2862, Validation Loss: 0.2810, Validation Accuracy: 0.9180 Validation f1-score: 0.9185






In [23]:
def audio_pred(path,label_name_dic): # predict on an audio file given to it
    mel_img = to_mel_img(path,(128,44))/255 # normalises the img_arr produced by the to_mel_img fn
    image = torch.tensor(mel_img, dtype=torch.float32).unsqueeze(0).unsqueeze(0) # adds channel number and batch size sequentially
    model.eval()
    with torch.no_grad():
        output = model(image)
    predicted_classes = torch.argmax(output, dim=1)
    return label_name_dic.get(predicted_classes.tolist()[0]) # model output is used as ref in dictionary to give the name of the class

file = os.path.join(folder_path, "test_rec1.mp3") # gets the filpath of one of the test recordings (can change as wished)

prediction = audio_pred(file, {v:k for k,v in encoding_dic.items()})
y,sr = librosa.load(file) # playes the audio of the file 
sd.play(y,sr)
sd.wait()
print(f"Predicted Sound: {prediction.title()}")

Predicted Sound: Bird


### Conclusion

The test recordings 1 through 3 in the Animals folder help to determine the performance of the model from an external perspective --> and indicate how useful the visualisation of audio through mel spectrograms into model training (as a CNN) can be. Recordings 4 through 6, while helping to further prove the skill of the model by wrapping similar sounding words in with the classes given to the model to be trained; also highlight the limitations of the approach taken. While no "other" category has been shown, it is clear that the model is not sensitive to every phoneme of the spoken word. 

This can be beneficial to accomodate a wide range of pronunciations; but is also (in the case of the test recordings) proof that there ought to be a better approach of creating a model that can determine how the word sounds intrinsically and from there try and guess the correct word. I aim to investigate this by checking if a phoneme database is available/ could be made to split similar sounding words or even look into other approaches that can distinguish speech more effectively. 