# Demo: Loading Pre-Trained Speech Command Model from Google Drive
This notebook demonstrates loading pre-trained model weights from Google Drive and running inference on an audio sample using the model.

### Steps:
1. Mount Google Drive.
2. Load the pre-trained model weights.
3. Prepare an audio sample for inference.
4. Perform inference and display the predicted command.


In [None]:
# Step 1: Mount Google Drive so the notebook can read the saved model weights
from google.colab import drive

# Directory inside the Colab VM where the Drive contents will appear.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

### Step 2: Load the Pre-Trained Model
Make sure to provide the correct path to your saved model weights in Google Drive.

In [None]:
import torch
import torch.nn as nn
import torchaudio

# Define the model architecture (make sure it matches the one used in training)
class M5(nn.Module):
    """1-D convolutional classifier producing per-class log-probabilities.

    Four Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d stages are followed by
    average pooling over whatever time steps remain and a single linear layer.
    Attribute names (conv1, bn1, ..., fc1) are the keys of the state dict, so
    they must stay as they are for saved weights to load.

    Args:
        n_input: Number of input channels of the first convolution.
        n_output: Number of classes emitted by the final linear layer.
        stride: Stride of the first convolution.
        n_channel: Channel width of the first two conv stages; the last two
            stages use twice this width.
    """

    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)

    def forward(self, x):
        """Run the network on a batch of waveforms.

        Args:
            x: Tensor of shape (batch, n_input, time).

        Returns:
            Log-probabilities of shape (batch, 1, n_output).
        """
        # The functional ops are reached through `nn.functional` because the
        # notebook never imports an `F` alias; using `F.` raised NameError.
        x = self.conv1(x)
        x = nn.functional.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = nn.functional.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = nn.functional.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = nn.functional.relu(self.bn4(x))
        x = self.pool4(x)
        # Collapse the remaining time steps into one by averaging over all of them.
        x = nn.functional.avg_pool1d(x, x.shape[-1])
        # (batch, channels, 1) -> (batch, 1, channels) so the linear layer
        # acts on the channel axis.
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        return nn.functional.log_softmax(x, dim=2)

# Instantiate the architecture defined above. It is the only model class in
# this notebook, and its layer names must match the keys stored in the
# checkpoint for load_state_dict to succeed.
model = M5()

# Load the pre-trained weights from Google Drive (update the path accordingly)
model_path = '/content/drive/MyDrive/Lab_Eval1/weights.pth'
# map_location='cpu' lets a checkpoint that was saved on a GPU load on a
# CPU-only runtime. NOTE: torch.load unpickles the file, so only load
# checkpoints from a source you trust.
state_dict = torch.load(model_path, map_location='cpu')
model.load_state_dict(state_dict)
# Switch to evaluation mode so BatchNorm uses its stored running statistics.
model.eval()


### Step 3: Prepare an Audio Sample for Inference
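Load an audio sample and convert it to the input format the model expects. The `M5` model defined above applies a 1-channel `Conv1d` directly to the signal, so it takes a mono waveform tensor of shape `(batch, 1, time)` rather than a spectrogram.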

In [None]:
# Load an audio file for inference (provide the path to the audio file)
waveform, sample_rate = torchaudio.load('/content/drive/MyDrive/Lab_Eval1/dataset_user_001/down_1.wav')

# The model's first layer is Conv1d(n_input=1, kernel_size=80), i.e. it consumes
# the raw waveform with a single channel. A 64-bin mel spectrogram cannot be fed
# to it, so the waveform itself is used as the input.
# torchaudio.load returns (channels, time) by default; average the channels if
# the file is not mono so that exactly one channel remains.
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# NOTE(review): inference must use the same sample rate as training. Set this to
# the training rate (the PyTorch M5 tutorial resamples to 8000 Hz); leave as
# None to feed the file at its native rate.
TARGET_SAMPLE_RATE = None
if TARGET_SAMPLE_RATE is not None and sample_rate != TARGET_SAMPLE_RATE:
    resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
    waveform = resample(waveform)

# Add the batch dimension: (1, time) -> (1, 1, time)
input_tensor = waveform.unsqueeze(0)


### Step 4: Perform Inference and Display the Predicted Command
Pass the processed audio through the model and interpret the result.

In [None]:
# Perform inference
with torch.no_grad():
    output = model(input_tensor)

# Define the list of commands (update according to your dataset)
# NOTE(review): position i in this list must be the word that class index i was
# assigned during training. The list is not in alphabetical order, so confirm
# it against the label encoding used by the training code.
commands = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'zero', 'one', 'two', 'three', 'four',
            'five', 'six', 'seven', 'eight', 'nine', 'bed', 'bird', 'cat', 'dog', 'happy', 'house', 'marvin',
            'sheila', 'tree', 'wow', 'visual', 'backward', 'forward', 'follow', 'learn']

# Fail early if the label list and the model head disagree on the class count.
assert output.shape[-1] == len(commands), (
    f'model produces {output.shape[-1]} classes but {len(commands)} labels are listed'
)

# The model returns log-probabilities of shape (batch, 1, n_classes), so the
# class axis is the last one. Reducing over dim=1 would only collapse the
# singleton axis and yield a tensor of zeros. With a single sample the result
# holds one element, which .item() converts to a plain int usable as a list index.
predicted_label = output.argmax(dim=-1).item()

# Display the predicted command
print(f'Predicted command: {commands[predicted_label]}')