<a href="https://colab.research.google.com/github/Swetha-Oruganti/Multi-Accent-Speech-Recognition/blob/main/dl_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be addressed through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# NOTE(review): this cell uses `np`, `labels` and `n_mfcc`, none of which is
# defined above it in the notebook (`n_mfcc` is assigned in the CNN
# feature-extraction cell further down). Run the cells that define them first.

# Determine the number of unique accent labels
num_labels = len(np.unique(labels))
print(f"Number of unique accent labels: {num_labels}")

# Fully connected classifier over one vector of `n_mfcc` values per sample.
# The input is declared with an explicit Input object rather than by passing
# `input_shape=` to the first Dense layer: Keras warns against the latter for
# Sequential models and recommends an Input object as the first entry.
model = Sequential([
    tf.keras.Input(shape=(n_mfcc,)),
    # Two hidden layers, each followed by dropout for regularisation
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Output layer: one softmax probability per accent label
    Dense(num_labels, activation='softmax'),
])

# Print the model summary
model.summary()

Number of unique accent labels: 8


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


**Reasoning**:
The error indicates that the labels are strings and need to be converted to numerical format before one-hot encoding. I need to encode the string labels into integers first.



**Reasoning**:
The first step is to load the data from the CSV file into a pandas DataFrame and display the first few rows to understand its structure.



In [20]:
# Re-extract features suitable for CNN (keeping temporal dimension)
n_mfcc = 13  # Number of MFCCs to extract

# One MFCC matrix per clip; axis 0 holds the n_mfcc coefficients and axis 1
# the time frames, so the width differs from clip to clip.
mfcc_sequences = [
    librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    for audio in audio_data
]
if not mfcc_sequences:
    # Fail with a clear message instead of the opaque error max() raises on
    # an empty sequence.
    raise ValueError("audio_data is empty; cannot extract MFCC features for the CNN.")

# The CNN needs a fixed input width, so every sequence is brought to the
# length of the longest one.
max_length = max(mfccs.shape[1] for mfccs in mfcc_sequences)

# Zero-pad each sequence on the right of the time axis. No truncation branch
# is needed: max_length is the maximum over these very sequences, so none can
# be longer than it.
padded_features = [
    np.pad(mfccs, pad_width=((0, 0), (0, max_length - mfccs.shape[1])), mode='constant')
    for mfccs in mfcc_sequences
]

# Stack into one array and add a trailing channel dimension for CNN input
# (a single channel, like a grayscale image).
extracted_features_cnn = np.expand_dims(np.array(padded_features), axis=-1)

print(f"Extracted features shape for CNN: {extracted_features_cnn.shape}")

Extracted features shape for CNN: (12486, 13, 1172, 1)


## Prepare data for CNN

### Subtask:
Reshape the extracted features to have the correct dimensions for a CNN input and split the data into training and test sets (the test split is also passed as validation data during training).


**Reasoning**:
Encode the string labels into integers, convert the integer encoded labels to one-hot encoding, and then split the features and labels into training and testing sets for the CNN model.



In [21]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Map every accent name to an integer class id
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Expand the integer ids into one-hot rows (one column per class)
labels_one_hot = to_categorical(labels_encoded)

# Hold out 20% of the samples; the fixed random_state keeps the split repeatable
cnn_split = train_test_split(
    extracted_features_cnn,
    labels_one_hot,
    test_size=0.2,
    random_state=42,
)
X_train_cnn, X_test_cnn, y_train_one_hot_cnn, y_test_one_hot_cnn = cnn_split

# Report the shape of each of the four resulting arrays
split_names = ("X_train_cnn", "X_test_cnn", "y_train_one_hot_cnn", "y_test_one_hot_cnn")
for split_name, split_array in zip(split_names, cnn_split):
    print(f"{split_name} shape: {split_array.shape}")

X_train_cnn shape: (9988, 13, 1172, 1)
X_test_cnn shape: (2498, 13, 1172, 1)
y_train_one_hot_cnn shape: (9988, 8)
y_test_one_hot_cnn shape: (2498, 8)


## CNN model selection

### Subtask:
Define a Convolutional Neural Network (CNN) architecture for accent recognition.


**Reasoning**:
Import the necessary layers for building a CNN from TensorFlow and Keras, define a sequential model, add convolutional, pooling, flatten, dense, and dropout layers, and print the model summary as per the instructions.



In [22]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, InputLayer

# Define the CNN model
#
# The input is declared with `tf.keras.Input(shape=...)` instead of
# `InputLayer(input_shape=...)`, whose `input_shape` argument is deprecated in
# Keras 3. The per-sample shape is taken straight from the training tensor
# (every axis after the batch axis).
#
# The output width is read from the one-hot training labels rather than from
# `num_labels`, so this cell depends only on the data-preparation cell and the
# layer can never disagree with the targets it is trained against.
model_cnn = Sequential([
    tf.keras.Input(shape=X_train_cnn.shape[1:]),

    # Two convolution + max-pooling stages
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Flatten the feature maps for the dense classifier head
    Flatten(),

    # Dense layers with Dropout
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),

    # Output layer: one softmax probability per accent class
    Dense(y_train_one_hot_cnn.shape[1], activation='softmax'),
])

# Print the model summary
model_cnn.summary()



## CNN model training

### Subtask:
Train the CNN model on the prepared features and corresponding accent labels.


**Reasoning**:
Compile and train the CNN model using the prepared data and specified parameters.



In [27]:
from tensorflow.keras.optimizers import Adam

# Training hyperparameters
epochs = 15
batch_size = 32

# Configure the optimiser, loss and reported metric for the CNN.
# categorical_crossentropy matches the one-hot encoded targets.
model_cnn.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

# Fit on the training split; the held-out split is passed as validation data
# so that val_loss / val_accuracy are reported after every epoch.
history_cnn = model_cnn.fit(
    X_train_cnn,
    y_train_one_hot_cnn,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_cnn, y_test_one_hot_cnn),
)

Epoch 1/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 273ms/step - accuracy: 0.9734 - loss: 0.1075 - val_accuracy: 0.9680 - val_loss: 0.1530
Epoch 2/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 254ms/step - accuracy: 0.9732 - loss: 0.1027 - val_accuracy: 0.9752 - val_loss: 0.0933
Epoch 3/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 251ms/step - accuracy: 0.9771 - loss: 0.0801 - val_accuracy: 0.9604 - val_loss: 0.1379
Epoch 4/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 253ms/step - accuracy: 0.9810 - loss: 0.0648 - val_accuracy: 0.9644 - val_loss: 0.1184
Epoch 5/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 254ms/step - accuracy: 0.9808 - loss: 0.0789 - val_accuracy: 0.9776 - val_loss: 0.0765
Epoch 6/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 251ms/step - accuracy: 0.9835 - loss: 0.0622 - val_accuracy: 0.9608 - val_loss: 0.1516
Epoch 7/15

## Inference with CNN

### Subtask:
Update the inference function to work with the CNN model and test it with a new audio sample.

**Reasoning**:
Adapt the `predict_accent` function into `predict_accent_cnn`, which accepts the CNN model and reshapes the features accordingly, then call it with a new audio file path to test the CNN model.

In [37]:
# Function to predict accent of a new audio file using the CNN model
def predict_accent_cnn(audio_path, model_cnn, label_encoder, sample_rate=22050, n_mfcc=13, max_length=None):
    """Predict the accent label of a single audio file with the CNN model.

    The file is loaded, amplitude-normalised, converted to MFCCs, brought to a
    fixed number of time frames and passed to ``model_cnn`` as a batch of one.

    Args:
        audio_path: Path of the audio file to classify.
        model_cnn: Model exposing ``predict`` (and ``input_shape`` when
            ``max_length`` is not given).
        label_encoder: Encoder whose ``inverse_transform`` maps a class index
            back to its accent label.
        sample_rate: Sampling rate the audio is loaded at.
        n_mfcc: Number of MFCC coefficients to extract.
        max_length: Number of time frames to pad or truncate to. When ``None``
            (the default) it is read from axis 2 of ``model_cnn.input_shape``,
            i.e. the frame axis of the (batch, n_mfcc, frames, channels) input
            built below, so the function no longer depends on a notebook
            global captured when the function was defined. If the model does
            not fix that axis either, the MFCCs are passed at their natural
            length.

    Returns:
        The predicted accent label, or ``None`` if anything failed (the error
        is printed).
    """
    try:
        if max_length is None:
            max_length = model_cnn.input_shape[2]

        # Load and preprocess audio
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        audio = librosa.util.normalize(audio)

        # Extract MFCCs, keeping the temporal dimension (axis 1)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

        # Zero-pad on the right or truncate so the frame axis has max_length entries
        if max_length is not None:
            n_frames = mfccs.shape[1]
            if n_frames < max_length:
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, max_length - n_frames)), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]

        # Add batch (front) and channel (back) dimensions for CNN input
        features = mfccs[np.newaxis, ..., np.newaxis]

        # Predict and take the most probable class of the single batch row
        prediction = model_cnn.predict(features)
        predicted_class_index = int(np.argmax(prediction[0]))
        predicted_accent = label_encoder.inverse_transform([predicted_class_index])[0]

        return predicted_accent

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None
        print(f"Error processing {audio_path}: {e}")
        return None

# Example usage with a new audio file (replace with your file path)
new_audio_path = '/content/drive/My Drive/audio_1.wav'  # Replace with your audio file path
predicted_accent_cnn = predict_accent_cnn(new_audio_path, model_cnn, label_encoder)

# predict_accent_cnn signals failure with None, so test for None explicitly
# rather than relying on truthiness (a falsy label would otherwise be
# reported as a failed prediction).
if predicted_accent_cnn is not None:
    print(f"The predicted accent using CNN is: {predicted_accent_cnn}")
else:
    print("Could not predict accent using CNN.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
The predicted accent using CNN is: american


## CNN Model Evaluation

### Subtask:
Evaluate the trained CNN model's performance on the test set.

**Reasoning**:
Evaluate the CNN model on the test data and print the test loss and accuracy.

In [28]:
# Score the trained CNN on the held-out split; verbose=0 suppresses the progress bar.
# evaluate() returns the loss followed by the compiled metric (accuracy).
loss_cnn, accuracy_cnn = model_cnn.evaluate(
    X_test_cnn,
    y_test_one_hot_cnn,
    verbose=0,
)

# Report both numbers to four decimal places
print(f"CNN Test Loss: {loss_cnn:.4f}\nCNN Test Accuracy: {accuracy_cnn:.4f}")

CNN Test Loss: 0.1058
CNN Test Accuracy: 0.9784
