In [1]:
!pip install torch torchvision 



In [2]:
import pandas as pd

# Load the metadata file (adjust path based on your folder)
metadata_path = 'UrbanSound8K/metadata/UrbanSound8K.csv'
df = pd.read_csv(metadata_path)
print(df.head())

      slice_file_name    fsID  start        end  salience  fold  classID  \
0    100032-3-0-0.wav  100032    0.0   0.317551         1     5        3   
1  100263-2-0-117.wav  100263   58.5  62.500000         1     5        2   
2  100263-2-0-121.wav  100263   60.5  64.500000         1     5        2   
3  100263-2-0-126.wav  100263   63.0  67.000000         1     5        2   
4  100263-2-0-137.wav  100263   68.5  72.500000         1     5        2   

              class  
0          dog_bark  
1  children_playing  
2  children_playing  
3  children_playing  
4  children_playing  


In [3]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define data transformations
data_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224 for ResNet
    transforms.ToTensor(),          # Convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet normalization
])

# Load the dataset from the spectrograms folder
dataset = datasets.ImageFolder('spectrograms/', transform=data_transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Print dataset info
print(f"Number of samples: {len(dataset)}")
print(f"Classes: {dataset.classes}")

Number of samples: 8732
Classes: ['air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music']


In [10]:
import time
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import torchvision.models as models

# Data setup (from previous steps)
data_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
dataset = datasets.ImageFolder('spectrograms/', transform=data_transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)  # Use 4 workers for 4 cores

# Model and training (from previous steps)
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, 10)
device = torch.device('cpu')  # Force CPU for this test
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train_model(dataloader, epochs=5):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        avg_loss = running_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

# Time the run
start_time = time.time()
train_model(dataloader)
print(f"Time taken: {time.time() - start_time:.2f} seconds")



Epoch 1/5, Loss: 0.7015
Epoch 2/5, Loss: 0.3400
Epoch 3/5, Loss: 0.2331
Epoch 4/5, Loss: 0.1691
Epoch 5/5, Loss: 0.1351
Time taken: 1772.34 seconds


In [13]:
!pip install scikit-learn



In [16]:
from azureml.core import Run
import torch
import numpy as np

# Get the current run context (this works in an Azure ML experiment)
run = Run.get_context()

# Set model to evaluation mode
model.eval()
all_preds = []
all_labels = []

# Disable gradient computation for inference
with torch.no_grad():
    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert to numpy arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Compute basic metrics
accuracy = np.mean(all_preds == all_labels)
run.log('Accuracy', accuracy)

# Log confusion matrix (as a flattened array for now)
confusion = np.zeros((10, 10), dtype=int)  # 10 classes in UrbanSound8K
for i in range(len(all_labels)):
    confusion[all_labels[i]][all_preds[i]] += 1
run.log_table('Confusion Matrix', {'row': range(10), 'col': range(10), 'value': confusion.flatten()})

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix (logged to Azure ML run):")
print(confusion)

Attempted to log scalar metric Accuracy:
0.9636967475950526
Attempted to log table metric Confusion Matrix:
{'row': range(0, 10), 'col': range(0, 10), 'value': array([970,   0,   2,   1,   4,   9,   0,   2,   6,   6,   0, 425,   0,
         3,   0,   0,   0,   0,   0,   1,   0,   0, 872,  47,   0,   1,
         0,   0,   0,  80,   1,   0,   0, 988,   1,   3,   0,   0,   1,
         6,   3,   0,   0,  26, 945,   3,   0,  22,   0,   1,   2,   0,
         0,   1,   0, 995,   0,   0,   1,   1,   0,   0,   1,  11,   0,
         0, 355,   0,   0,   7,   2,   1,   0,   0,   6,   0,   0, 990,
         0,   1,   0,   1,   8,  16,   1,   2,   0,   2, 898,   1,   1,
         0,   1,   2,   0,   0,   0,   7,  12, 977])}
Accuracy: 0.9637
Confusion Matrix (logged to Azure ML run):
[[970   0   2   1   4   9   0   2   6   6]
 [  0 425   0   3   0   0   0   0   0   1]
 [  0   0 872  47   0   1   0   0   0  80]
 [  1   0   0 988   1   3   0   0   1   6]
 [  3   0   0  26 945   3   0  22   0   1]
 [  2  

In [18]:
# Save model
torch.save(model.state_dict(), 'model.pth')

# Register model (in a new cell after evaluation)
from azureml.core import Model
model = Model.register(workspace=Workspace.from_config(),
                      model_path='model.pth',
                      model_name='urban8k_cnn')
print(f"Model registered: {model.name} version: {model.version}")

Registering model urban8k_cnn
Model registered: urban8k_cnn version: 1
