In [26]:
import torch
import torchvision.models as models
import numpy as np
import pandas as pd
# Example: Using ResNet50 as the CNN backbone
backbone = models.resnet50(pretrained=True)
# Replace the final fully connected layer with an identity transform
backbone.fc = torch.nn.Identity()



In [27]:
class ProjectionHead(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim=512, output_dim=128):
        super(ProjectionHead, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Example: Creating a projection head
# Assuming the output features of your backbone has 2048 dimensions
projection_head = ProjectionHead(input_dim=2048)


In [28]:
class NTXentLoss(torch.nn.Module):
    def __init__(self, temperature, device):
        super(NTXentLoss, self).__init__()
        self.temperature = temperature
        self.device = device
        self.criterion = torch.nn.CrossEntropyLoss().to(device)

    def forward(self, z_i, z_j):
        N, Z = z_i.size()  # Batch size and feature dimension

        # Concatenate the positive pairs
        z = torch.cat((z_i, z_j), dim=0)

        # Calculate cosine similarity
        sim = torch.mm(z, z.T) / self.temperature
        sim_i_j = torch.diag(sim, N)
        sim_j_i = torch.diag(sim, -N)

        # Create positive and negative masks
        positive_mask = torch.cat((sim_j_i, sim_i_j), dim=0).reshape(2 * N, 1)
        negative_mask = sim > -1e6  # Mask to remove self-similarity

        labels = torch.from_numpy(np.array([range(N), range(N)])).view(2 * N).to(self.device)
        loss = self.criterion(sim, labels)

        return loss

# Example: Creating the NT-Xent Loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
contrastive_loss = NTXentLoss(temperature=0.5, device=device)


In [29]:
class SimCLR(torch.nn.Module):
    def __init__(self, backbone, projection_head):
        super(SimCLR, self).__init__()
        self.backbone = backbone
        self.projection_head = projection_head

    def forward(self, x_i, x_j):
        h_i = self.backbone(x_i)
        h_j = self.backbone(x_j)

        z_i = self.projection_head(h_i)
        z_j = self.projection_head(h_j)

        return z_i, z_j

# Instantiate the SimCLR model
simclr_model = SimCLR(backbone, projection_head)


In [30]:
# Add a linear layer for classification (after training SimCLR model)
class Classifier(torch.nn.Module):
    def __init__(self, feature_dim, num_classes):
        super(Classifier, self).__init__()
        self.fc = torch.nn.Linear(feature_dim, num_classes)

    def forward(self, x):
        x = self.fc(x)
        return x


num_classes = 10  # Set the number of classes in UrbanSound8K
feature_dim= 2048
classifier = Classifier(feature_dim, num_classes).to(device)

In [31]:
import torchvision.transforms as transforms

def get_simclr_transformations(size, s=1):
    """Return a set of data transformations for SimCLR.

    Args:
    - size (int): Size of the square crop.
    - s (float): Strength of color jitter, typically between 0.5 and 1.5.

    Returns:
    - A torchvision transforms module.
    """
    color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s)

    data_transforms = transforms.Compose([
        transforms.RandomResizedCrop(size=size),
        transforms.RandomHorizontalFlip(),
        color_jitter,
        transforms.RandomGrayscale(p=0.2),
        transforms.ToTensor(),
    ])
    return data_transforms

# Example usage
transform = get_simclr_transformations(size=224)


In [32]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

class UrbanSoundDataset(Dataset):
    def __init__(self, root_dir, fold, csv_file, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.fold = fold
        self.annotations = pd.read_csv(csv_file)
        # Filter the annotations for the current fold
        self.current_fold_annotations = self.annotations[self.annotations['fold'] == self.fold]

    def __len__(self):
        return len(self.current_fold_annotations)

    def __getitem__(self, idx):
        img_filename = self.current_fold_annotations.iloc[idx]['slice_file_name']
        img_path = os.path.join(self.root_dir, f'fold{self.fold}', img_filename)
        image = Image.open(img_path).convert('RGB')
        label = self.current_fold_annotations.iloc[idx]['classID']

        # Apply the transformation twice to get two augmented versions of the same image
        xi = self.transform(image)
        xj = self.transform(image)

        return xi, xj, label


# Instantiate the dataset
dataset = UrbanSoundDataset(root_dir='./archive/', fold=1, csv_file="./archive/UrbanSound8K.csv", transform=transform)

# DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [33]:
num_epochs = 15
num_folds = 10
root_dir = './archive/'
feature_dim = 2048
num_classes = 10
batch_size = 32
csv_file = "./archive/UrbanSound8K.csv"
#base_lr = 0.3 * (batch_size / 256)  # Adjust batch_size according to your setup
base_lr = 0.001
weight_decay = 1e-6

# Training and validation loop
for fold in range(num_folds):
    print(f"Starting fold {fold+1}")

    # Setup training and validation data loaders
    train_dataset = UrbanSoundDataset(root_dir=root_dir, fold=fold+1, csv_file=csv_file, transform=transform)
    val_dataset = UrbanSoundDataset(root_dir=root_dir, fold=fold+1, csv_file=csv_file, transform=transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    # Initialize model, optimizer, and loss for each fold
    simclr_model = SimCLR(backbone, projection_head).to(device)
    classifier = Classifier(feature_dim, num_classes).to(device)
    optimizer = torch.optim.Adam(list(simclr_model.parameters()) + list(classifier.parameters()), lr=0.001)#, weight_decay=weight_decay)
    contrastive_loss = NTXentLoss(temperature=0.5, device=device)
    #scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0)

    # Training loop for the current fold
    for epoch in range(num_epochs):
        simclr_model.train()
        classifier.train()
        train_loss = 0
        """
        if epoch < 10:
            lr_scale = min(1., float(epoch + 1) / 10.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * base_lr"""

        for (xi, xj, labels) in train_loader:
            xi, xj, labels = xi.to(device), xj.to(device), labels.to(device)

            # Forward pass
            zi, zj = simclr_model(xi, xj)

            # Compute contrastive loss
            features = simclr_model.backbone(xi)  # Get features from one of the augmented images
            classifier_output = classifier(features)
            loss_contrastive = contrastive_loss(zi.long(), zj.long())

            # loss_contrastive = contrastive_loss(zi, zj)
            # loss_classifier = torch.nn.functional.cross_entropy(classifier_output, labels)
            # loss_classifier = torch.nn.functional.cross_entropy(classifier_output, labels.long())
            loss_classifier = torch.nn.functional.cross_entropy(classifier_output, labels.long())

            loss = loss_contrastive + loss_classifier

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #scheduler.step()

            train_loss += loss.item()

        # Optional: Print the current learning rate
        #current_lr = scheduler.get_last_lr()[0]
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss/len(train_loader)}")
        #print(f"Epoch [{epoch+1}/{num_epochs}], Current LR: {current_lr}")

        # Validation step
        simclr_model.eval()
        classifier.eval()
        val_accuracy = 0
        total = 0
        correct = 0
        with torch.no_grad():
            val_loss = 0
            for (xi, xj, labels) in val_loader:
                xi, xj, labels = xi.to(device), xj.to(device), labels.to(device)
                zi, zj = simclr_model(xi, xj)
                loss_contrastive = contrastive_loss(zi, zj)

                features = simclr_model.backbone(xi)
                outputs = classifier(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                val_loss += loss.item()

            print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss/len(val_loader)}")
            accuracy = 100 * correct / total
            print(f"Fold {fold+1}, Validation Accuracy: {accuracy}%")

    # Save model after each fold
    torch.save({'simclr_model': simclr_model.state_dict(),
                'classifier': classifier.state_dict()},
               f'simclr_classifier_urbansound8k_fold{fold+1}.pth')


Starting fold 1


RuntimeError: expected scalar type Long but found Int

# logistic regression 

In [46]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import pandas as pd
from PIL import Image
import os

class LogisticRegression(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(LogisticRegression, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        x = self.fc(x)
        return x

class UrbanSoundDataset(Dataset):
    def __init__(self, root_dir, fold, csv_file, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.fold = fold
        self.annotations = pd.read_csv(csv_file)
        self.current_fold_annotations = self.annotations[self.annotations['fold'] == self.fold]

    def __len__(self):
        return len(self.current_fold_annotations)

    def __getitem__(self, idx):
        img_filename = self.current_fold_annotations.iloc[idx]['slice_file_name']
        img_path = os.path.join(self.root_dir, f'fold{self.fold}', img_filename)
        image = Image.open(img_path).convert('RGB')
        label = self.current_fold_annotations.iloc[idx]['classID']

        if self.transform:
            image = self.transform(image)

        return image, label

transform = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 
])

dataset = UrbanSoundDataset(root_dir='./archive/', fold=1, csv_file="./archive/UrbanSound8K.csv", transform=transform)

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

input_dim = 2048  
num_classes = 10  
lr_model = LogisticRegression(input_dim, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lr_model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    lr_model.train()
    total_loss = 0

    for images, labels in dataloader:
        images = images.to(device)
        labels = labels.to(device)

        features = simclr_model.backbone(images)  
        outputs = lr_model(features)

        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss/len(dataloader)}")
   
    lr_model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            features = simclr_model.backbone(images)
            outputs = lr_model(features)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        print(f"Validation Accuracy: {accuracy}%")



Epoch [1/15], Training Loss: 1.6911799332925253
Validation Accuracy: 36.99885452462772%
Epoch [2/15], Training Loss: 0.9044251101357597
Validation Accuracy: 47.99541809851088%
Epoch [3/15], Training Loss: 0.6573553553649357
Validation Accuracy: 52.00458190148912%
Epoch [4/15], Training Loss: 0.523825249501637
Validation Accuracy: 54.29553264604811%
Epoch [5/15], Training Loss: 0.428718876093626
Validation Accuracy: 55.670103092783506%
Epoch [6/15], Training Loss: 0.367296858557633
Validation Accuracy: 54.29553264604811%
Epoch [7/15], Training Loss: 0.3491870766239507
Validation Accuracy: 54.63917525773196%
Epoch [8/15], Training Loss: 0.2965950902019228
Validation Accuracy: 53.608247422680414%
Epoch [9/15], Training Loss: 0.2929704732128552
Validation Accuracy: 55.09736540664376%
Epoch [10/15], Training Loss: 0.268234039789864
Validation Accuracy: 55.09736540664376%
Epoch [11/15], Training Loss: 0.22181155505989278
Validation Accuracy: 56.93012600229095%
Epoch [12/15], Training Loss: 0

# SVM

In [52]:
# Training loop
for epoch in range(num_epochs):
    svm_model = svm.LinearSVC()  
    train_features = []
    train_labels = []

    for images, labels in train_loader:
        images = images.to(device)

        features = simclr_model.backbone(images)
        train_features.extend(features.detach().cpu().numpy()) 
        train_labels.extend(labels.cpu().numpy())

    svm_model.fit(train_features, train_labels)

val_features = []
val_labels = []

for images, labels in val_loader:
    images = images.to(device)

    features = simclr_model.backbone(images)
    val_features.extend(features.detach().cpu().numpy()) 
    val_labels.extend(labels.cpu().numpy())

val_predictions = svm_model.predict(val_features)
accuracy = (val_predictions == val_labels).mean() * 100
print(f"Validation Accuracy: {accuracy}%")




Validation Accuracy: 62.54295532646048%


# XGBClassifier

In [54]:
from xgboost import XGBClassifier

for epoch in range(num_epochs):
    xgb_model = XGBClassifier() 
    train_features = []
    train_labels = []

    for images, labels in train_loader:
        images = images.to(device)

        features = simclr_model.backbone(images)
        train_features.extend(features.detach().cpu().numpy())  
        train_labels.extend(labels.cpu().numpy())

    xgb_model.fit(train_features, train_labels)

    val_features = []
    val_labels = []

    for images, labels in val_loader:
        images = images.to(device)

        features = simclr_model.backbone(images)
        val_features.extend(features.detach().cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

    val_predictions = xgb_model.predict(val_features)
    accuracy = (val_predictions == val_labels).mean() * 100
    print(f"Epoch [{epoch+1}/{num_epochs}], Validation Accuracy: {accuracy}%")


Epoch [1/15], Validation Accuracy: 54.41008018327606%
Epoch [2/15], Validation Accuracy: 50.40091638029782%
Epoch [3/15], Validation Accuracy: 50.28636884306987%
Epoch [4/15], Validation Accuracy: 51.20274914089347%
Epoch [5/15], Validation Accuracy: 51.317296678121416%
Epoch [6/15], Validation Accuracy: 50.85910652920962%
Epoch [7/15], Validation Accuracy: 52.80641466208477%
Epoch [8/15], Validation Accuracy: 52.80641466208477%
Epoch [9/15], Validation Accuracy: 50.74455899198167%
Epoch [10/15], Validation Accuracy: 52.119129438717074%
Epoch [11/15], Validation Accuracy: 51.08820160366552%
Epoch [12/15], Validation Accuracy: 53.264604810996566%
Epoch [13/15], Validation Accuracy: 50.51546391752577%
Epoch [14/15], Validation Accuracy: 53.264604810996566%
Epoch [15/15], Validation Accuracy: 50.85910652920962%


# RFC

In [55]:
from sklearn.ensemble import RandomForestClassifier

for epoch in range(num_epochs):
    rf_model = RandomForestClassifier() 
    train_features = []
    train_labels = []

    for images, labels in train_loader:
        images = images.to(device)

        features = simclr_model.backbone(images)
        train_features.extend(features.detach().cpu().numpy()) 
        train_labels.extend(labels.cpu().numpy())

    rf_model.fit(train_features, train_labels)

    val_features = []
    val_labels = []

    for images, labels in val_loader:
        images = images.to(device)

        features = simclr_model.backbone(images)
        val_features.extend(features.detach().cpu().numpy()) 
        val_labels.extend(labels.cpu().numpy())

    val_predictions = rf_model.predict(val_features)
    accuracy = (val_predictions == val_labels).mean() * 100
    print(f"Epoch [{epoch+1}/{num_epochs}], Validation Accuracy: {accuracy}%")


Epoch [1/15], Validation Accuracy: 52.119129438717074%
Epoch [2/15], Validation Accuracy: 54.41008018327606%
Epoch [3/15], Validation Accuracy: 54.29553264604811%
Epoch [4/15], Validation Accuracy: 53.95189003436426%
Epoch [5/15], Validation Accuracy: 55.55555555555556%
Epoch [6/15], Validation Accuracy: 54.86827033218786%
Epoch [7/15], Validation Accuracy: 56.24284077892325%
Epoch [8/15], Validation Accuracy: 54.06643757159221%
Epoch [9/15], Validation Accuracy: 53.15005727376862%
Epoch [10/15], Validation Accuracy: 53.95189003436426%
Epoch [11/15], Validation Accuracy: 54.52462772050401%
Epoch [12/15], Validation Accuracy: 51.890034364261176%
Epoch [13/15], Validation Accuracy: 54.98281786941581%
Epoch [14/15], Validation Accuracy: 56.013745704467354%
Epoch [15/15], Validation Accuracy: 56.24284077892325%


# KNN

In [56]:
from sklearn.neighbors import KNeighborsClassifier

for epoch in range(num_epochs):
    knn_model = KNeighborsClassifier()
    train_features = []
    train_labels = []

    for images, labels in train_loader:
        images = images.to(device)

        features = simclr_model.backbone(images)
        train_features.extend(features.detach().cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    knn_model.fit(train_features, train_labels)

    val_features = []
    val_labels = []

    for images, labels in val_loader:
        images = images.to(device)

        features = simclr_model.backbone(images)
        val_features.extend(features.detach().cpu().numpy()) 
        val_labels.extend(labels.cpu().numpy())


    val_predictions = knn_model.predict(val_features)
    accuracy = (val_predictions == val_labels).mean() * 100
    print(f"Epoch [{epoch+1}/{num_epochs}], Validation Accuracy: {accuracy}%")


Epoch [1/15], Validation Accuracy: 68.8430698739977%
Epoch [2/15], Validation Accuracy: 68.49942726231386%
Epoch [3/15], Validation Accuracy: 66.09392898052691%
Epoch [4/15], Validation Accuracy: 69.41580756013745%
Epoch [5/15], Validation Accuracy: 70.2176403207331%
Epoch [6/15], Validation Accuracy: 67.58304696449026%
Epoch [7/15], Validation Accuracy: 69.18671248568155%
Epoch [8/15], Validation Accuracy: 69.18671248568155%
Epoch [9/15], Validation Accuracy: 68.38487972508591%
Epoch [10/15], Validation Accuracy: 69.18671248568155%
Epoch [11/15], Validation Accuracy: 69.3012600229095%
Epoch [12/15], Validation Accuracy: 68.27033218785796%
Epoch [13/15], Validation Accuracy: 67.12485681557845%
Epoch [14/15], Validation Accuracy: 68.04123711340206%
Epoch [15/15], Validation Accuracy: 67.92668957617411%
