<a href="https://colab.research.google.com/github/PotatoNinja14/102103165-SESS_LE1/blob/main/about-lab-eval/102103165_UmangSrivastava.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Name: **Umang Srivastava**  
Email: `usrivastava1_be21@thapar.edu`  
Roll No: **102103165**  
Group: **4CO6**  
Start Timestamp: 20240911-1000  

## Question

  Consider the paper: <https://arxiv.org/abs/1804.03209>

  1. Read and summarise the paper in about 50 words.
  2. Download the dataset in the paper, statistically analyse and
     describe it, so that it may be useful for posterity. (Include code
     snippets in your .ipynb file to evidence your analysis.)
  3. Train a classifier so that you are able to distinguish the commands
     in the dataset.
  4. Report the performance results using standard benchmarks.
  5. Record about 30 samples of each command in your voice and create a
     new dataset (including a new user id for yourself).  You may use a
     timer on your computer to synchronise.
  6. Fine tune your classifier to perform on your voice.
  7. Report the results.

## Solution

## Preamble

In [15]:
import os
import re
import wave
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
import IPython.display as display
import tensorflow_datasets as tfds
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [20]:
# !wget http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz
# !tar -xvzf speech_commands_v0.02.tar.gz
# !pip install pydub

In [16]:
path = '/content'

## Data Analysis

In [14]:
commands = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
command_counts = {command: len(os.listdir(os.path.join(path, command))) for command in commands}

df = pd.DataFrame(command_counts.items(), columns=['Command', 'Count'])
df.sort_values(by='Count', ascending=False)

Unnamed: 0,Command,Count
12,five,4052
22,zero,4052
35,yes,4044
33,seven,3998
7,no,3941
8,nine,3934
25,down,3917
10,one,3890
17,go,3880
3,two,3880


In [17]:
speaker_ids = set()
for command in commands:
    command_dir = os.path.join(path, command)
    for filename in os.listdir(command_dir):
        if filename.endswith(".wav"):
            match = re.match(r'([a-f0-9]+)_nohash_\d+', filename)
            if match:
                speaker_ids.add(match.group(1))

print(f"Number of unique speakers: {len(speaker_ids)}")

Number of unique speakers: 2618


In [18]:
durations = []
for command in commands:
    command_dir = os.path.join(path, command)
    for filename in os.listdir(command_dir):
        if filename.endswith(".wav"):
            filepath = os.path.join(command_dir, filename)
            with wave.open(filepath, 'r') as audio_file:
                frames = audio_file.getnframes()
                rate = audio_file.getframerate()
                duration = frames / float(rate)
                durations.append(duration)
durations_np = np.array(durations)

print(f"Average duration: {np.mean(durations_np)} seconds")
print(f"Min duration: {np.min(durations_np)} seconds")
print(f"Max duration: {np.max(durations_np)} seconds")

Average duration: 0.9846485000236217 seconds
Min duration: 0.2133125 seconds
Max duration: 95.183125 seconds


## Modelling

In [None]:
commands = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]

def preprocess_audio(file_path):
    raw_audio = tf.io.read_file(file_path)
    audio, _ = tf.audio.decode_wav(raw_audio)
    return tf.squeeze(audio, axis=-1)

X = []
y = []

label_mapping = {command: i for i, command in enumerate(commands)}

for command in commands:
    command_dir = os.path.join(path, command)

    for filename in os.listdir(command_dir):
        if filename.endswith('.wav'):
            file_path = os.path.join(command_dir, filename)

            audio_data = preprocess_audio(file_path)

            X.append(audio_data.numpy())
            y.append(label_mapping[command])

X = np.array(X)
y = np.array(y)

X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=16000, dtype='float32', padding='post', truncating='post')

X = X.reshape(-1, 16000, 1)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(f"Training data: {len(X_train)} samples")
print(f"Validation data: {len(X_val)} samples")
print(f"Test data: {len(X_test)} samples")


In [None]:
def preprocess_audio(audio, target_length=16000):
    audio = tf.cast(audio, tf.float32) / 32768.0
    audio_length = tf.shape(audio)[0]
    pad_size = tf.maximum(target_length - audio_length, 0)
    audio = tf.cond(pad_size > 0, lambda: tf.pad(audio, [[0, pad_size]]), lambda: audio[:target_length])
    return audio

def preprocess(example, label):
    audio = preprocess_audio(example)
    audio = tf.expand_dims(audio, axis=-1)
    return audio, label

train_data = (X_train
              .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
              .cache()
              .shuffle(1000)
              .batch(32)
              .prefetch(tf.data.AUTOTUNE))

val_data = (X_val
            .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(32)
            .prefetch(tf.data.AUTOTUNE))

test_data = (X_test
             .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
             .batch(32)
             .prefetch(tf.data.AUTOTUNE))

In [None]:
model = models.Sequential([
    layers.InputLayer(input_shape=(16000, 1)),
    layers.Conv1D(16, 9, activation='relu'),
    layers.MaxPooling1D(2),
    layers.Conv1D(32, 9, activation='relu'),
    layers.MaxPooling1D(2),
    layers.Conv1D(64, 9, activation='relu'),
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(commands), activation='softmax')
])
model.summary()
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_data,
                    validation_data=val_data,
                    epochs=6,
                    verbose=1)

In [None]:
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

print("Classification Report:\n")
print(classification_report(y_test, y_pred_classes, target_names=commands))
conf_matrix = confusion_matrix(y_test, y_pred_classes)

plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=commands, yticklabels=commands)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()