In [None]:
# Mount Google Drive at /content/drive so the archives under MyDrive can be read
# by the extraction cell that follows.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile
import os

# Where the archives live (Google Drive) and where their contents are unpacked
drive_path = '/content/drive/MyDrive'
extract_path = '/content/dataset'

# Archives to unpack, one per class
zip_files = ['speech.zip', 'sing.zip', 'noise.zip']

# Unpack every archive into the same target folder; report the ones that are missing
for zip_file in zip_files:
    zip_path = os.path.join(drive_path, zip_file)
    if not os.path.exists(zip_path):
        print(f"File not found: {zip_path}")
        continue
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_path)


In [None]:
def create_labeled_data(data_folder):
    """Build ``(mel_spectrogram, label)`` pairs from a class-per-folder dataset.

    Each sub-directory of ``data_folder`` named ``sing``, ``speech`` or ``noise``
    is scanned for WAV files; every file is run through ``preprocessing`` and
    paired with the integer label of its directory.

    Args:
        data_folder: Path of the directory holding the class sub-directories.

    Returns:
        A list of ``(preprocessing(filepath), label)`` tuples with labels
        ``sing -> 0``, ``speech -> 1``, ``noise -> 2``. Entries are ordered by
        class directory name, then by file name, so repeated runs over the same
        files yield the same order.
    """
    label_map = {'sing': 0, 'speech': 1, 'noise': 2}
    data = []
    # os.listdir() returns entries in arbitrary order; sort so the dataset
    # (and any seeded split built from it) is reproducible.
    for class_name in sorted(os.listdir(data_folder)):
        label = label_map.get(class_name)
        class_path = os.path.join(data_folder, class_name)
        # Skip unknown classes and stray files that sit next to the class folders.
        if label is None or not os.path.isdir(class_path):
            continue
        for filename in sorted(os.listdir(class_path)):
            # Match the extension case-insensitively so '.WAV' files are not dropped.
            if not filename.lower().endswith('.wav'):
                continue
            filepath = os.path.join(class_path, filename)
            # Preprocess and convert to spectrogram immediately
            mel_spec = preprocessing(filepath)
            data.append((mel_spec, label))
    return data

In [None]:
import os
import pandas as pd
import numpy as np
import torchaudio
import torch
import torch.nn
from torchaudio import transforms as T

In [None]:
def create_waveforms_and_standard_waveform(filepath, sample_rate=44100):
    """Load an audio file and resample it to ``sample_rate`` when necessary.

    Args:
        filepath: Path of the audio file, passed to ``torchaudio.load``.
        sample_rate: Target sampling rate in Hz.

    Returns:
        tuple: ``(waveform, sample_rate)`` where ``waveform`` is the tensor
        returned by ``torchaudio.load`` (resampled if its native rate differed
        from the target) and ``sample_rate`` is the requested target rate.
    """
    waveform, org_samplerate = torchaudio.load(filepath)

    if org_samplerate != sample_rate:
        # Resample to the *requested* rate. The target used to be hard-coded to
        # 44100, so any other `sample_rate` was returned without being applied.
        resampler = T.Resample(orig_freq=org_samplerate, new_freq=sample_rate)
        waveform = resampler(waveform)

    return waveform, sample_rate


In [None]:
def create_single_channel(waveform):
    """Down-mix a multi-channel waveform to one channel.

    Args:
        waveform: Tensor whose first dimension indexes channels.

    Returns:
        The input unchanged when it has at most one channel; otherwise the
        mean over the channel dimension, kept as a leading dimension of size 1.
    """
    channel_count = waveform.shape[0]
    if channel_count <= 1:
        return waveform
    return waveform.mean(dim=0, keepdim=True)

In [None]:
def create_melspectogram(waveform, sample_rate=44100, n_fft=1024, hop_length=512, n_mels=64):
    """Compute a mel spectrogram of ``waveform`` with ``torchaudio``'s ``MelSpectrogram``.

    Args:
        waveform: Audio tensor handed to the transform.
        sample_rate: Sampling rate of ``waveform`` in Hz.
        n_fft: FFT size forwarded to ``MelSpectrogram`` (default 1024).
        hop_length: Hop between frames forwarded to ``MelSpectrogram`` (default 512).
        n_mels: Number of mel bands forwarded to ``MelSpectrogram`` (default 64).

    Returns:
        The tensor produced by applying the transform to ``waveform``.
    """
    # The defaults reproduce the values that used to be hard-coded here, so
    # existing two-argument callers get the same spectrogram as before.
    mel_transform = T.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    return mel_transform(waveform)

In [None]:
def trim_or_pad(waveform, max_duraction=5, sample_rate=44100):
    """Force ``waveform`` to exactly ``max_duraction`` seconds along its last axis.

    Longer inputs are truncated; shorter (or equal-length) inputs are
    right-padded with zeros.

    Args:
        waveform: Audio tensor whose last dimension is the sample axis.
        max_duraction: Target duration in seconds; may be fractional.
        sample_rate: Sampling rate in Hz used to convert seconds to samples.

    Returns:
        tuple: ``(waveform, sample_rate)`` with the fixed-length waveform and
        the sampling rate passed in.
    """
    # Slicing and padding need an integer sample count; a fractional duration
    # (e.g. 2.5 s) would otherwise produce a float and raise.
    max_len = int(round(max_duraction * sample_rate))
    current_len = waveform.shape[-1]

    if current_len > max_len:
        waveform = waveform[..., :max_len]
    else:
        # (0, n) pads only the end of the last (sample) axis.
        waveform = torch.nn.functional.pad(waveform, (0, max_len - current_len))

    return waveform, sample_rate

In [None]:
def preprocessing(data, sample_rate=44100, duration=5):
    """Turn an audio file into a standardized mel spectrogram.

    Pipeline: load and resample, down-mix to one channel, trim or pad to
    ``duration`` seconds, compute the mel spectrogram, then standardize it
    with its own global mean and standard deviation.

    Args:
        data: Path of the audio file to process.
        sample_rate: Target sampling rate in Hz.
        duration: Clip length in seconds after trimming or padding.

    Returns:
        The mel spectrogram shifted by its mean and divided by its standard
        deviation plus 1e-6.
    """
    audio, rate = create_waveforms_and_standard_waveform(data, sample_rate)
    audio = create_single_channel(audio)
    audio, rate = trim_or_pad(audio, duration, rate)

    spectrogram = create_melspectogram(audio, rate)
    centered = spectrogram - spectrogram.mean()
    # The small epsilon keeps the division finite for a constant spectrogram.
    return centered / (spectrogram.std() + 1e-6)

In [None]:
import torch
import torch.nn as nn
import torchaudio
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import random

class AudioDataset(Dataset):
    """Dataset over precomputed ``(mel_spectrogram, label)`` pairs.

    Each item is returned as a float32 spectrogram (given a leading channel
    dimension when the stored tensor is 2-D) and a ``torch.long`` label.
    With ``augment=True`` each access independently applies, with probability
    one half each, time masking, frequency masking and additive Gaussian noise.
    """

    def __init__(self, data, augment=False):
        self.data = data
        self.augment = augment

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        spec, target = self.data[idx]

        # Work on a private copy so the stored spectrogram is never modified.
        spec = spec.clone().detach()
        if spec.dim() == 2:
            spec = spec.unsqueeze(0)
        spec = spec.float()

        if self.augment:
            spec = self._augment(spec)

        return spec, torch.tensor(target, dtype=torch.long)

    @staticmethod
    def _augment(spec):
        """Randomly mask time/frequency stripes and add noise to ``spec``."""
        # Time masking (axis=2 is time)
        if random.random() > 0.5:
            max_time_width = random.randint(10, 20)
            spec = torchaudio.functional.mask_along_axis(
                spec, max_time_width, mask_value=0, axis=2)

        # Frequency masking (axis=1 is frequency)
        if random.random() > 0.5:
            max_freq_width = random.randint(5, 10)
            spec = torchaudio.functional.mask_along_axis(
                spec, max_freq_width, mask_value=0, axis=1)

        # Low-amplitude Gaussian noise
        if random.random() > 0.5:
            spec = spec + torch.randn_like(spec) * 0.01

        return spec


In [None]:
# CNN model
class AudioCNN(nn.Module):
    """Convolutional classifier mapping a 1-channel spectrogram to 3 class logits.

    ``conv`` stacks three Conv-BatchNorm-ELU-MaxPool-Dropout stages
    (32, 64, 128 channels) and a final Conv-BatchNorm-ELU stage (256 channels)
    reduced to a 1x1 map by adaptive average pooling. ``fc`` flattens that map
    and applies a 128-unit hidden layer followed by the 3-way output layer.
    """

    def __init__(self):
        super().__init__()

        # Stages are created in network order so layer indices inside
        # `self.conv` (and therefore state_dict keys) follow the layout above.
        feature_layers = []
        for in_ch, out_ch in ((1, 32), (32, 64), (64, 128)):
            feature_layers += [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ELU(),
                nn.MaxPool2d(2),
                nn.Dropout2d(0.3),
            ]
        feature_layers += [
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ELU(),
            nn.AdaptiveAvgPool2d((1, 1)),
        ]
        self.conv = nn.Sequential(*feature_layers)

        classifier_layers = [
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ELU(),
            nn.Dropout(0.5),
            nn.Linear(128, 3),
        ]
        self.fc = nn.Sequential(*classifier_layers)

    def forward(self, x):
        """Return class logits for a batch of spectrograms ``x``."""
        return self.fc(self.conv(x))

In [None]:
# 1. Create labeled data with spectrograms
data = create_labeled_data('/content/dataset')
if not data:
    # Fail here with a clear message instead of an opaque error from the
    # splitter or the data loaders further down.
    raise RuntimeError("No labeled .wav files found under '/content/dataset'")

# Full-dataset loader; the training loop uses train_loader / val_loader instead.
dataset = AudioDataset(data)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# 2. Split into train/val
# A fixed random_state keeps the validation set identical across runs, so
# validation accuracies from different runs are comparable. Stratifying on the
# label keeps the class proportions the same in both splits.
labels = [label for _, label in data]
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# 3. Create datasets (augmentation on the training split only)
train_dataset = AudioDataset(train_data, augment=True)
val_dataset = AudioDataset(val_data, augment=False)

# 4. Create dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

# Model setup
model = AudioCNN()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0001,
    weight_decay=1e-5
)

In [None]:
import torch
import os
from datetime import datetime

# Create save directory with timestamp
save_dir = f'saved_models_new{datetime.now().strftime("%Y%m%d_%H%M%S")}'
os.makedirs(save_dir, exist_ok=True)

# Training configuration
best_val_acc = 0
patience = 5        # epochs without a new best validation accuracy before stopping
trigger_times = 0   # consecutive epochs without improvement so far
early_stop = False

for epoch in range(20):
    # Training phase
    model.train()
    train_loss = 0
    for mel_spec, label in train_loader:
        # Add a channel dimension when a batch arrives as (batch, freq, time)
        if len(mel_spec.shape) == 3:
            mel_spec = mel_spec.unsqueeze(1)

        outputs = model(mel_spec)
        loss = criterion(outputs, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss, correct, total = 0, 0, 0
    with torch.no_grad():
        for mel_spec, label in val_loader:
            if len(mel_spec.shape) == 3:
                mel_spec = mel_spec.unsqueeze(1)
            outputs = model(mel_spec)
            loss = criterion(outputs, label)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == label).sum().item()
            total += label.size(0)

    # Calculate metrics (losses are averaged per batch, accuracy per sample)
    val_acc = 100 * correct / total
    avg_train_loss = train_loss/len(train_loader)
    avg_val_loss = val_loss/len(val_loader)

    print(f"Epoch {epoch+1:02d}, "
          f"Train Loss: {avg_train_loss:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}, "
          f"Val Accuracy: {val_acc:.2f}%")

    # Early stopping and model saving logic
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        trigger_times = 0  # Reset patience counter

        # Save best model versions
        torch.save(model.state_dict(), f'{save_dir}/best_model_weights.pth')
        # NOTE: this pickles the whole module object, so loading it requires the
        # AudioCNN class to be importable; the state-dict files above and below
        # do not have that constraint.
        torch.save(model, f'{save_dir}/best_full_model.pth')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_accuracy': val_acc,
            'val_loss': avg_val_loss,
            'train_loss': avg_train_loss,
        }, f'{save_dir}/best_checkpoint.pth')

        print(f"↳ New best model saved (acc: {val_acc:.2f}%)")
    else:
        trigger_times += 1
        print(f"↳ No improvement ({trigger_times}/{patience})")
        if trigger_times >= patience:
            print("!!! Early stopping triggered !!!")
            early_stop = True
            # Leave the loop immediately. Checking the flag at the top of the
            # next iteration would first advance `epoch`, making 'final_epoch'
            # and the "Stopped at epoch" message one epoch too high.
            break

# Final save regardless of early stopping
# The weights are those of the last trained epoch; 'final_accuracy' holds the
# best validation accuracy seen during the run.
torch.save({
    'final_model_state_dict': model.state_dict(),
    'final_accuracy': best_val_acc,
    'final_epoch': epoch,
    'early_stopped': early_stop,
}, f'{save_dir}/final_model.pth')

# Training summary
print("\n=== Training Complete ===")
print(f"Best validation accuracy: {best_val_acc:.2f}%")
print(f"Stopped at epoch {epoch+1} {'(early stopped)' if early_stop else ''}")
print(f"\nSaved models in '{save_dir}':")
print(f"- best_model_weights.pth (state dict)")
print(f"- best_full_model.pth (complete model)")
print(f"- best_checkpoint.pth (full training state)")
print(f"- final_model.pth (final trained model)")

Epoch 01, Train Loss: 1.0835, Val Loss: 0.9023, Val Accuracy: 60.08%
↳ New best model saved (acc: 60.08%)
Epoch 02, Train Loss: 0.8721, Val Loss: 0.6213, Val Accuracy: 79.47%
↳ New best model saved (acc: 79.47%)
Epoch 03, Train Loss: 0.7592, Val Loss: 0.5593, Val Accuracy: 78.71%
↳ No improvement (1/5)
Epoch 04, Train Loss: 0.7153, Val Loss: 0.5666, Val Accuracy: 74.14%
↳ No improvement (2/5)
Epoch 05, Train Loss: 0.6486, Val Loss: 0.4488, Val Accuracy: 88.21%
↳ New best model saved (acc: 88.21%)
Epoch 06, Train Loss: 0.5948, Val Loss: 0.4530, Val Accuracy: 85.93%
↳ No improvement (1/5)
Epoch 07, Train Loss: 0.5843, Val Loss: 0.3532, Val Accuracy: 90.87%
↳ New best model saved (acc: 90.87%)
Epoch 08, Train Loss: 0.5728, Val Loss: 0.4082, Val Accuracy: 87.83%
↳ No improvement (1/5)
Epoch 09, Train Loss: 0.5235, Val Loss: 0.3500, Val Accuracy: 90.11%
↳ No improvement (2/5)
Epoch 10, Train Loss: 0.4991, Val Loss: 0.3163, Val Accuracy: 91.25%
↳ New best model saved (acc: 91.25%)
Epoch 11, 