<a href="https://colab.research.google.com/github/SIDLAD/CS-F425-Project/blob/main/Model1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
RunningInColab = 'google.colab' in str(get_ipython())
if RunningInColab:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Imports
from torch.utils.data import Dataset, DataLoader
import os
import torch
import torchaudio
import torchaudio.transforms as transforms

In [10]:
# Hyperparameters
TARGET_SAMPLE_RATE = 16000
TARGET_LENGTH_SECONDS = 4
BATCH_SIZE = 32
EPOCHS = 20
FREQ_MASK_PARAM = 15
TIME_MASK_PARAM = 25

NUM_SAMPLES = TARGET_LENGTH_SECONDS/TARGET_SAMPLE_RATE
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define transformations
transformations = [
    transforms.FrequencyMasking(freq_mask_param=FREQ_MASK_PARAM),
    transforms.TimeMasking(time_mask_param=TIME_MASK_PARAM),
    transforms.MelSpectrogram(
            sample_rate=TARGET_SAMPLE_RATE,
            n_fft = 1024,
            hop_length = 512,
            n_mels = 64
        )
]

In [13]:
class_mapping = {}

class AudioDataset(Dataset):
  def __init__(self,
               data_dir,
               transformations = transformations,
               target_sample_rate = TARGET_SAMPLE_RATE,
               num_samples = NUM_SAMPLES,
               device = device
               ):
    self.data_dir = data_dir
    self.classes = sorted(os.listdir(data_dir))
    self.file_paths = []
    self.targets = []
    self.transformations = transformations
    self.target_sample_rate = target_sample_rate
    self.num_samples = num_samples
    self.device = device

    createMapping = False
    if(len(class_mapping) == 0):createMapping = True

    for i,class_name in enumerate(self.classes):
      class_dir = os.path.join(data_dir,class_name)
      if(createMapping):class_mapping[class_name] = i

      for filename in os.listdir(class_dir):
        filepath = os.path.join(class_dir,filename)
        self.file_paths.append(filepath)
        self.targets.append(class_mapping[class_name])

  def __len__(self):
    return len(self.file_paths)

  def __getitem__(self,idx):
    audio_path = self.file_paths[idx]
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = waveform.to(self.device)
    waveform = self._resample_if_necessary(waveform,sample_rate)
    waveform = self._mix_down_if_necessary(waveform)
    waveform = self._cut_if_necessary(waveform)
    waveform = self._right_pad_if_necessary(waveform)

    if self.transformations:
      for transformation in self.transformations:
        waveform = transformation.to(self.device)(waveform)

    label = self.targets[idx]
    return waveform, label

  def _resample_if_necessary(self, waveform,sample_rate):
    if sample_rate != self.target_sample_rate:
      resampler = torchaudio.transforms.Resample(sample_rate,self.target_sample_rate)
      waveform = resampler(waveform)
    return waveform

  def _mix_down_if_necessary(self,waveform):
    if waveform.shape[0] > 1:
      waveform = torch.mean(waveform,dim = 0,keepdim = True)

  #If the video was longer than TARGET_LENGTH_SECONDS, then we are cropping it to that many seconds by removing seconds equally from both the start and the end sides
  def _cut_if_necessary(self,waveform):
    if waveform.shape[1] > self.num_samples:
      mid = (waveform.shape[1] - 1)//2
      nsby2 = self.num_samples//2
      waveform = waveform[:,mid - nsby2 + 1: mid + self.num_samples - nsby2 + 1]
    return waveform

  def _right_pad_if_necessary(self,waveform):
    num_samples = waveform.shape[1]
    if num_samples < self.num_samples:
      num_missing_samples = self.num_samples - num_samples
      last_dim_padding = (0,num_missing_samples)
      waveform = torch.nn.functional.pad(waveform,last_dim_padding)
    return waveform

# Define data directories
train_dir: str
val_dir:str
if RunningInColab:
    train_dir = "/content/drive/MyDrive/audio_dataset/train"
    val_dir = "/content/drive/MyDrive/audio_dataset/val"
else:
    train_dir = "audio_dataset/train"
    val_dir = "audio_dataset/val"

train_dataset = AudioDataset(train_dir)
val_dataset = AudioDataset(val_dir)

train_loader = DataLoader(
    train_dataset,batch_size = BATCH_SIZE, shuffle = True
)
val_loader = DataLoader(
    val_dataset,batch_size = BATCH_SIZE
)

In [14]:
#TODO, not working. The above cell seems to do good. Pushing changes to github


import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.models.resnet import ResNet, BasicBlock

# Hyperparameters
NUM_EPOCHS = 20
LEARNING_RATE = 0.001
NUM_CLASSES = len(train_dataset.classes)  # Replace with the number of classes in your dataset
RESNET_LAYERS = [2, 2, 2, 2]  # Number of layers in each ResNet block

# Assuming you have your data loaders set up
# train_loader, val_loader

# Define the ResNet model
class ResNetAudio(ResNet):
    def __init__(self, block, layers, num_classes):
        super(ResNetAudio, self).__init__(block, layers)
        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

model = ResNetAudio(BasicBlock, RESNET_LAYERS, NUM_CLASSES)
model = model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr=LEARNING_RATE)  # Using RMSprop

# Training function
def train(model, train_loader, optimizer, criterion, epoch):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Training Loss: {epoch_loss:.4f}')

# Validation function
def validate(model, val_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(val_loader)
    accuracy = 100 * correct / total
    print(f'Validation Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%')
    return accuracy

# Training loop
best_accuracy = 0.0

for epoch in range(NUM_EPOCHS):
    train(model, train_loader, optimizer, criterion, epoch)
    accuracy = validate(model, val_loader, criterion)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'best_model.pth')

print(f'Best Validation Accuracy: {best_accuracy:.2f}%')

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same