In [1]:
!pip install torch torchvision torchaudio



In [2]:
import os
import torchaudio
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
import numpy as np
# Folder that holds the .wav clips. Set the AUDIO_DIR environment variable to
# use a different location; without it the original local folder is used.
path = os.environ.get('AUDIO_DIR', r'C:\Users\21ste\Downloads\AudioClass\audio')

In [3]:
import random
import torchaudio.transforms as T

class AugmentedAudioDataset(Dataset):
    """Fixed-length mel spectrograms for every ``.wav`` file in a folder.

    Each item is loaded with torchaudio, resampled to ``sample_rate`` when the
    file uses a different rate, averaged down to one channel, padded with zeros
    or truncated to ``target_num_samples`` samples, and converted to a mel
    spectrogram. No augmentation is applied to the waveform.

    Args:
        audio_dir: Directory that is scanned (non-recursively) for ``.wav`` files.
        target_num: Number of samples every clip is padded/truncated to.
            ``None`` (the default) means three seconds, i.e. ``sample_rate * 3``.
        transform: Optional callable applied to the mel spectrogram.
        sample_rate: Sample rate, in Hz, that every clip is converted to.
        n_mels: Number of mel bands passed to ``T.MelSpectrogram``.
    """

    def __init__(self, audio_dir, target_num=None, transform=None, sample_rate=16000, n_mels=64):
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        # Honour an explicit clip length; otherwise fall back to 3 seconds
        # (16000 * 3 = 48000 samples at the default rate).
        self.target_num_samples = sample_rate * 3 if target_num is None else target_num
        # os.listdir returns names in arbitrary order; sorting keeps the
        # index -> file mapping stable between runs.
        self.audio_files = sorted(f for f in os.listdir(audio_dir) if f.endswith('.wav'))
        self.transform = transform
        self.mel_transform = T.MelSpectrogram(sample_rate=self.sample_rate, n_mels=n_mels)
        # Low-amplitude Gaussian noise buffer. __getitem__ does not use it; it is
        # kept so code that reads this attribute keeps working.
        self.noise = torch.randn(1, sample_rate) * 0.005

    def _pad_or_truncate(self, waveform):
        """Return ``waveform`` with exactly ``target_num_samples`` along dim 1."""
        num_samples = waveform.shape[1]

        if num_samples > self.target_num_samples:
            # Too long: keep only the leading samples.
            waveform = waveform[:, :self.target_num_samples]
        elif num_samples < self.target_num_samples:
            # Too short: append zeros on the right.
            padding = self.target_num_samples - num_samples
            waveform = torch.nn.functional.pad(waveform, (0, padding))

        return waveform

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from rate ``sr`` to ``self.sample_rate`` if they differ."""
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average all channels (dim 0) into one when there is more than one."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _label_for(self, file_name):
        """One-hot label: ``[1, 0]`` when the file name contains 'output', else ``[0, 1]``."""
        return np.array([1, 0]) if 'output' in file_name else np.array([0, 1])

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Return ``(mel_spec, label)`` for the file at position ``idx``."""
        file_name = self.audio_files[idx]
        waveform, sr = torchaudio.load(os.path.join(self.audio_dir, file_name))

        # Pass the rate reported by the file; comparing the target rate with
        # itself would skip resampling for every clip.
        waveform = self._resample_if_necessary(waveform, sr)
        waveform = self._mix_down_if_necessary(waveform)
        waveform = self._pad_or_truncate(waveform)

        mel_spec = self.mel_transform(waveform)  # Convert to Mel Spectrogram

        # Apply any additional transform (like resizing) to the spectrogram
        if self.transform is not None:
            mel_spec = self.transform(mel_spec)

        mel_spec = mel_spec.squeeze()

        # The label comes from the file name only, so a directory whose name
        # happens to contain 'output' cannot mark every clip as positive.
        return mel_spec, self._label_for(file_name)


In [4]:
# Wrap the clip folder in the dataset and serve it in shuffled mini-batches of 8.
data = AugmentedAudioDataset(audio_dir=path)
train_loader = DataLoader(dataset=data, shuffle=True, batch_size=8)

In [7]:
import torchaudio.transforms as T

# Transform raw audio into Mel Spectrogram (suitable for CNN input)
def preprocess_audio(audio_waveform, sample_rate=16000, n_mels=64):
    """Convert a waveform tensor to a mel spectrogram.

    Args:
        audio_waveform: Waveform tensor handed to ``T.MelSpectrogram``.
        sample_rate: Sample rate, in Hz, the transform is configured with.
        n_mels: Number of mel bands the transform is configured with.

    Returns:
        The result of applying the transform to ``audio_waveform``.
    """
    to_mel = T.MelSpectrogram(n_mels=n_mels, sample_rate=sample_rate)
    mel = to_mel(audio_waveform)
    return mel

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np

class LSTMModel(nn.Module):
    """Three stacked LSTM layers followed by a three-layer fully connected head.

    Input is ``(batch, seq_len, input_size)`` (the LSTMs use ``batch_first=True``).
    Only the last time step of the third LSTM feeds the head. The output is
    ``(batch, output_size)`` of probabilities: a softmax over the classes when
    ``output_size > 1``, or a sigmoid when ``output_size == 1``.

    Args:
        input_size: Number of features per time step.
        hidden_size1: Hidden size of the first LSTM.
        hidden_size2: Hidden size of the second LSTM.
        hidden_size3: Hidden size of the third LSTM.
        output_size: Number of output units.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
        super().__init__()

        self.lstm1 = nn.LSTM(input_size, hidden_size1, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size1, hidden_size2, batch_first=True)
        self.lstm3 = nn.LSTM(hidden_size2, hidden_size3, batch_first=True)

        self.fc1 = nn.Linear(hidden_size3, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_size)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-example probabilities of shape ``(batch, output_size)``."""
        # Each LSTM's full output sequence passes through ReLU before the next layer.
        for lstm in (self.lstm1, self.lstm2, self.lstm3):
            x, _ = lstm(x)
            x = self.relu(x)

        # Take only the output of the last time step
        x = x[:, -1, :]

        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        logits = self.fc3(x)

        # Softmax over a single logit is always exactly 1.0, which would make a
        # one-unit model output a constant; use a sigmoid in that case.
        if logits.shape[1] == 1:
            return torch.sigmoid(logits)
        return self.softmax(logits)

# The LSTM consumes batches shaped (batch, steps, input_size).
# NOTE(review): 241 presumably is the number of spectrogram frames for a
# 48000-sample clip with torchaudio's default MelSpectrogram hop length, so the
# (n_mels, frames) items from the dataset are read as 64 steps of 241 features
# — confirm if the clip length or transform settings change.
input_size = 241
hidden_size1 = 64
hidden_size2 = 128
hidden_size3 = 64
# Two classes, matching the dataset's one-hot labels ([1, 0] / [0, 1]), the
# two-entry label map and the argmax used at inference time. BCELoss also
# requires the model output and the labels to have the same shape.
output_size = 2
model = LSTMModel(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)

In [10]:
import torch

# Adjust the Mel Spectrogram to fit MobileNetV2's input (3 channels)
def mel_to_rgb_input(mel_spec):
    """Tile a spectrogram three times along dim 1 to mimic a 3-channel image.

    Args:
        mel_spec: Tensor with at most four dimensions.

    Returns:
        ``mel_spec.repeat(1, 3, 1, 1)``; the result always has four dimensions.
    """
    channel_repeats = (1, 3, 1, 1)
    return mel_spec.repeat(*channel_repeats)

In [14]:
import torch.optim as optim
num_epochs = 15

# Put the model in training mode
model.train()

# Binary cross-entropy on the model's probabilities. BCELoss requires the
# outputs and the labels to have the same shape.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    # Unpack each batch directly instead of binding it to `data`, so the
    # dataset object named `data` from the earlier cell is not overwritten.
    for inputs, labels in train_loader:
        labels = labels.float()  # BCELoss needs floating-point targets

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Average loss per batch; the max() guards against an empty loader.
    epoch_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

print("Finished Training")

Epoch [1/15], Loss: 0.0796
Epoch [2/15], Loss: 0.0295
Epoch [3/15], Loss: 0.0098
Epoch [4/15], Loss: 0.0523
Epoch [5/15], Loss: 0.0531
Epoch [6/15], Loss: 0.0531
Epoch [7/15], Loss: 0.0528
Epoch [8/15], Loss: 0.0531
Epoch [9/15], Loss: 0.0530
Epoch [10/15], Loss: 0.0528
Epoch [11/15], Loss: 0.0529
Epoch [12/15], Loss: 0.0530
Epoch [13/15], Loss: 0.0529
Epoch [14/15], Loss: 0.0529
Epoch [15/15], Loss: 0.0529
Finished Training


In [13]:
# Write the model's parameters (its state dict, not the whole module) to disk.
torch.save(obj=model.state_dict(), f='model.pth')

In [52]:
# Display names for the two classes; later cells index this list with the
# argmax of a label.
label_map = ['Detected Name', 'Random Sound']

In [53]:
# Fetch one item from the dataset and print the name of its stored label
# (the model itself is not involved in this cell).
test_data = AugmentedAudioDataset(audio_dir=path)
wave, label = test_data[1495]
label = torch.from_numpy(label)
print(label_map[int(label.argmax())])

Detected Skylar


In [48]:
# Creating a function that detects the wake word

# Switch the model to evaluation mode before it is used for inference
model.eval()

# Detect keywords (inference)
def detect_keyword(quantized_model, audio_waveform, sample_rate=16000):
    """Run one example through the model and print the detection result.

    Args:
        quantized_model: Callable model; it is given ``audio_waveform`` with a
            leading batch dimension added.
        audio_waveform: Input tensor for a single example.
        sample_rate: Accepted but not used by this function.

    The message depends on the index of the largest value in the model output:
    index 0 prints the keyword message, anything else the negative one.
    """
    batched = audio_waveform.unsqueeze(0)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        scores = quantized_model(batched)

    winner = torch.argmax(scores).item()
    message = "Keyword detected: 'Hey Skylar'" if winner == 0 else "No keyword detected"
    print(message)

In [56]:
# Print the stored label of one item, then the model's verdict for the same item.
test_data = AugmentedAudioDataset(audio_dir=path)
wave, label = test_data[1485]
label = torch.from_numpy(label)
print(label_map[int(label.argmax())])
detect_keyword(model, wave)

Detected Skylar
Keyword detected: 'Hey Skylar'


In [97]:
# The same folder again, served one item at a time in random order.
testdata = AugmentedAudioDataset(audio_dir=path)
test_loader = DataLoader(dataset=testdata, shuffle=True, batch_size=1)

In [19]:
def _mix_down_if_necessary(signal):
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)
    return signal

0.0005143990856595337

In [20]:
# Live WakeWord detection

import torch
import torchaudio
import numpy as np
import speech_recognition as sr
import torchaudio.transforms as T

# The model object from the earlier cells is reused here; nothing is loaded
# from disk. Switch it to evaluation mode for inference.
model.eval()

# Audio settings for the live pipeline (these should match the training setup)
sample_rate = 16000               # sample rate in Hz
n_mels = 64                       # number of Mel bands
target_length = sample_rate * 3   # samples in 3 seconds of audio

# Waveform -> Mel Spectrogram transform
mel_transform = T.MelSpectrogram(n_mels=n_mels, sample_rate=sample_rate)

# Recognizer from the speech_recognition package (imported as `sr`)
recognizer = sr.Recognizer()

# Function to process audio and make predictions
def predict_on_audio(audio_data, threshold=0.7):
    """Classify one recording and print a message when the keyword is detected.

    Args:
        audio_data: 1-D sequence/array of audio samples.
        threshold: Probability above which the keyword counts as detected.

    Returns:
        True when the keyword probability exceeds ``threshold``, else False.
        Empty input returns False without running the model.
    """
    # Convert audio to a (1, num_samples) float tensor
    audio_tensor = torch.tensor(audio_data).unsqueeze(0).float()

    # Nothing was recorded: there is nothing to classify.
    if audio_tensor.numel() == 0:
        return False

    # Peak-normalize, skipping all-zero (silent) input to avoid dividing by zero.
    # NOTE(review): the training dataset does not peak-normalize its clips —
    # confirm this scaling is what the model should see.
    peak = audio_tensor.abs().max()
    if peak > 0:
        audio_tensor = audio_tensor / peak

    # Truncate or zero-pad to the target length
    num_samples = audio_tensor.size(1)
    if num_samples > target_length:
        audio_tensor = audio_tensor[:, :target_length]
    elif num_samples < target_length:
        audio_tensor = torch.nn.functional.pad(audio_tensor, (0, target_length - num_samples))

    # (1, samples) -> (1, n_mels, frames): already a batch of one in the same
    # layout the training batches use, so it goes to the LSTM as-is. The LSTM
    # accepts only 2-D or 3-D input, so no extra batch or RGB channel dimension
    # is added here.
    mel_spec = mel_transform(audio_tensor)

    # Make prediction using the model
    with torch.no_grad():
        output = model(mel_spec)

    # The first output unit is the keyword class (argmax index 0 is treated as
    # the keyword elsewhere in this notebook); this also works for a model
    # with a single output unit.
    keyword_prob = output.reshape(-1)[0].item()
    detected = keyword_prob > threshold
    if detected:
        print("Keyword detected")
    return detected

# Function to capture live audio from the microphone and make predictions
def live_audio_detection():
    """Listen on the microphone in a loop and classify each captured phrase.

    Opens ``sr.Microphone`` at the module-level ``sample_rate``, calibrates the
    recognizer for ambient noise, then repeatedly records a phrase (limited to
    3 seconds), reinterprets its raw bytes as 16-bit integers and hands them,
    as float32, to ``predict_on_audio``. The loop ends on KeyboardInterrupt.
    """
    with sr.Microphone(sample_rate=sample_rate) as source:
        print("Adjusting for ambient noise... Please wait.")
        recognizer.adjust_for_ambient_noise(source)

        print("Listening for live audio... Press Ctrl+C to stop.")
        try:
            while True:
                print("Recording...")
                # phrase_time_limit caps the length of a captured phrase at 3 seconds
                audio = recognizer.listen(source, phrase_time_limit=3)

                # Raw PCM bytes -> int16 samples -> float32 samples
                pcm_bytes = audio.get_raw_data()
                samples = np.frombuffer(pcm_bytes, dtype=np.int16)
                predict_on_audio(samples.astype(np.float32))

        except KeyboardInterrupt:
            print("Stopped listening.")

# Start live audio detection; this call blocks until the kernel is interrupted (Ctrl+C)
live_audio_detection()

Adjusting for ambient noise... Please wait.
Listening for live audio... Press Ctrl+C to stop.
Recording...
Recording...
Recording...
Recording...
Recording...
Recording...
Keyword detected: 'Hey Skylar'
Recording...
Recording...
Recording...
Recording...
Recording...
Keyword detected: 'Hey Skylar'
Recording...
Keyword detected: 'Hey Skylar'
Recording...
Stopped listening.
