In [1]:
import os
import gc
import ast
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import torchaudio
import IPython.display as ipd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import models

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [67]:
class config:
    seed = 42
    num_fold = 1
    sample_rate = 16000
    n_fft = 1024
    hop_length = 512
    n_mels = 64
    duration = 5
    num_classes = 6
    train_batch_size = 16
    valid_batch_size = 16
    model_name = 'swin_v2_s'
    epochs = 50
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    learning_rate = 1e-6

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(config.seed)

In [4]:
df = pd.read_csv("/scratch/network/mk8574/audio_sentiment_challenge/data/train.csv")
df.head()

Unnamed: 0,id,path,label
0,TRAIN_0000,./train/TRAIN_0000.wav,1
1,TRAIN_0001,./train/TRAIN_0001.wav,2
2,TRAIN_0002,./train/TRAIN_0002.wav,4
3,TRAIN_0003,./train/TRAIN_0003.wav,5
4,TRAIN_0004,./train/TRAIN_0004.wav,4


In [5]:
signal, sr = torchaudio.load('/scratch/network/mk8574/audio_sentiment_challenge/data/train/TRAIN_0001.wav')
print(signal.shape)
print(sr)

torch.Size([1, 41642])
16000


In [68]:
import augly.audio as audaugs
import augly.utils as utils

aug = audaugs.Compose([
    audaugs.AddBackgroundNoise(p = 0.1),
    audaugs.Clip(duration_factor = 0.7),
#    audaugs.TimeStretch(rate = 3.0),
#    audaugs.Speed(factor = 3.0),
    audaugs.Harmonic(p = 0.5),
    audaugs.InvertChannels(),
    audaugs.OneOf([audaugs.Clicks(p = 0.6),
                   audaugs.InsertInBackground(offset_factor = 0.25, p = 0.6)
                   ])
#    audaugs.ToMono()
])

In [69]:
from augly.audio.utils import validate_and_load_audio

In [84]:
class AudioSentDataset(Dataset):
    def __init__(self, df, transformation, target_sample_rate, duration, mode):
        self.audio_paths = df['path'].values
        if mode in ["train","valid"]:
            self.labels = df['label'].values
        self.transformation = transformation # transformation
        self.target_sample_rate = target_sample_rate # sample rate
        self.num_samples = target_sample_rate * duration
        self.mode = mode # ['train', 'valid', 'test']
        
    def __len__(self):
        return len(self.audio_paths)
    
    def __getitem__(self, index):
        audio_path = os.path.join('/scratch/network/mk8574/audio_sentiment_challenge/data', self.audio_paths[index])

        #signal, sr = torchaudio.load(audio_path) # loaded the audio
        signal, sr = validate_and_load_audio(audio_path)
        
        # Now we first checked if the sample rate is same as TARGET_SAMPLE_RATE and if it not equal we perform resampling
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        signal = torch.Tensor(signal)
        # IN CASE DATA IS STEREO:
        # Next we check the number of channels of the signal
        #signal -> (num_channels, num_samples) - Eg.-(2, 14000) -> (1, 14000)
        # if signal.shape[0]>1:
        #     signalnu = torch.mean(signal, axis=0, keepdim=True)
        
        print(signal.shape)
        # Lastly we check the number of samples of the signal
        #signal -> (num_channels, num_samples) - Eg.-(1, 14000) -> (1, self.num_samples)
        # If it is more than the required number of samples, we truncate the signal
        if signal.shape[0] > self.num_samples:
            signal = signal[:, :self.num_samples]
        
        # If it is less than the required number of samples, we pad the signal
        if signal.shape[0]<self.num_samples:
            num_missing_samples = self.num_samples - signal.shape[0]
            last_dim_padding = (0, num_missing_samples)
            signal = F.pad(signal, last_dim_padding)
        signal = np.array(signal)
        signal, _ = aug(signal, sample_rate = sr, metadata = [])
        # Finally all the process has been done and now we will extract mel spectrogram from the signal
        mel = self.transformation(signal)
        
        # For pretrained models, we need 3 channel image, so for that we concatenate the extracted mel
        image = torch.cat([mel, mel, mel])
    
        # Normalize the image
        max_val = torch.abs(image).max()
        
        image = image / max_val
        image=image[0]
        
        
        if self.mode in ['train', 'valid']:
            label = torch.tensor(self.labels[index])
            return image, label
        
        else:
            return image

In [7]:
from sklearn.model_selection import train_test_split

mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=config.sample_rate, 
                                                      n_fft=config.n_fft, 
                                                      hop_length=config.hop_length, 
                                                      n_mels=config.n_mels)
# Function to get data according to the folds
def get_data():
    df = pd.read_csv('/scratch/network/mk8574/audio_sentiment_challenge/data/train.csv')
    train_df, valid_df = train_test_split(df, test_size = 0.2, shuffle = True)
    
    train_dataset = AudioSentDataset(train_df, mel_spectrogram, config.sample_rate, config.duration, mode = 'train')
    valid_dataset = AudioSentDataset(valid_df, mel_spectrogram, config.sample_rate, config.duration, mode = 'valid')
    
    train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=config.valid_batch_size, shuffle=True)
    
    return train_loader, valid_loader

In [66]:
class BirdCLEFResnet(nn.Module):
    def __init__(self):
        super(BirdCLEFResnet, self).__init__()
        self.base_model = models.__getattribute__(config.model_name)(pretrained=True)
        
        #self.base_model = torchaudio.models.hubert_pretrain_base(num_classes=6) 
        #for param in self.base_model.parameters():
            #param.requires_grad = False
            
        #in_features = self.base_model.head.out_features
        
        #self.base_model.head.out_features = nn.Linear(1028, config.num_classes)

    def forward(self, x,labels):
        x = self.base_model(x,labels)
        return x, labels

In [60]:
BirdCLEFResnet()

BirdCLEFResnet(
  (base_model): HuBERTPretrainModel(
    (wav2vec2): Wav2Vec2Model(
      (feature_extractor): FeatureExtractor(
        (conv_layers): ModuleList(
          (0): ConvLayerBlock(
            (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
            (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          )
          (1-4): 4 x ConvLayerBlock(
            (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          )
          (5-6): 2 x ConvLayerBlock(
            (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          )
        )
      )
      (encoder): Encoder(
        (feature_projection): FeatureProjection(
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (projection): Linear(in_features=512, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (pos_conv_embed): Convolutional

In [71]:
def loss_fn(outputs, labels):
    SMOOTH = 1e-10
    
    return nn.CrossEntropyLoss(label_smoothing = 0.3)(outputs + SMOOTH, labels)

def train(model, data_loader, optimizer, scheduler, device, epoch):
    model.train()
    
    running_loss = 0
    loop = tqdm(data_loader, position=0)
    for i, (mels, labels) in enumerate(loop):
        mels = mels.to(device)
        labels = labels.to(device)
        
        outputs = model(mels)
        _, preds = torch.max(outputs, 1)
        
        
        
        loss = loss_fn(outputs, labels)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        
        optimizer.step()
        optimizer.zero_grad()
        
        if scheduler is not None:
            scheduler.step()
            
        running_loss += loss.item()
        
        loop.set_description(f"Epoch [{epoch+1}/{config.epochs}]")
        loop.set_postfix(loss=loss.item())

    return running_loss / len(data_loader)

In [72]:
def valid(model, data_loader, device, epoch):
    model.eval()
    
    running_loss = 0
    pred = []
    label = []
    
    loop = tqdm(data_loader, position=0)
    for mels, labels in loop:
        mels = mels.to(device)
        labels = labels.to(device)
        
        outputs = model(mels)
        _, preds = torch.max(outputs, 1)
        
        loss = loss_fn(outputs, labels)
        # print('Outputs:', outputs)
        # print('Labels:', labels)
            
        running_loss += loss.item()
        
        pred.extend(preds.view(-1).cpu().detach().numpy())
        label.extend(labels.view(-1).cpu().detach().numpy())
        
        loop.set_description(f"Epoch [{epoch+1}/{config.epochs}]")
        loop.set_postfix(loss=loss.item())
        
    valid_f1 = f1_score(label, pred, average='macro')
    label = torch.Tensor(label)
    pred = torch.Tensor(pred)
    valid_acc = (label == pred).float().sum() / label.shape[0]
    
    return running_loss/len(data_loader), valid_f1, valid_acc

In [73]:
a = torch.Tensor([1, 2, 3])
b = torch.Tensor([1, 4, 5])

print((a == b).float().sum())

tensor(1.)


In [74]:
def run():
    train_loader, valid_loader = get_data()
    
    model = BirdCLEFResnet().to(config.device)
    
    optimizer = Adam(model.parameters(), lr=config.learning_rate)
    
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=1e-5, T_max=10)
    
    best_valid_f1 = 0
    for epoch in range(config.epochs):
        train_loss = train(model, train_loader, optimizer, scheduler, config.device, epoch)
        valid_loss, valid_f1, valid_acc = valid(model, valid_loader, config.device, epoch)
        
        print(f"Validation F1 - {valid_f1}, Accuracy - {valid_acc}")
        torch.save(model.state_dict(), f'./model.bin')
        print(f"Saved model checkpoint at ./model.bin")

    return best_valid_f1

In [85]:
from torch import autograd
autograd.set_detect_anomaly(True)
run()

  0%|          | 0/250 [00:00<?, ?it/s]

torch.Size([28829])


  0%|          | 0/250 [00:00<?, ?it/s]


TypeError: 'int' object is not callable

In [None]:
mfcc = torchaudio.transforms.MFCC(sample_rate = config.sample_rate,
                                 n_mfcc = 20,
                                 log_mels = False)

In [None]:
def predict():
    PATH = "/scratch/network/mk8574/audio_sentiment_challenge/baseline_dy/model_20231122015145.bin"
    
    test_df = pd.read_csv('../data/test.csv')
    
    
    model = BirdCLEFResnet().to(config.device)
    
    model.load_state_dict(torch.load(PATH))
    
    test_dataset = AudioSentDataset(test_df, mfcc, config.sample_rate, config.duration, mode = 'test')
    test_loader = DataLoader(test_dataset, batch_size=32,shuffle=False)
    
    
    test_df = test_df.drop(['path'], axis = 1)
    ans = []
    for i, mels in enumerate(tqdm(test_loader, position=0)):   
        mels = mels.to(config.device)

        mels = torch.argmax(model(mels), dim = 1)
        
        ans.append(mels)
    print(ans)
    z = [y.item() for x in ans for y in x]

    test_df['label'] = z
    test_df.to_csv('submission.csv', index = False)
    
    print(test_df)
    
    return

In [None]:
predict()