# Packages

In [1]:
import scipy
import librosa
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm
import scipy.io.wavfile
import time
import IPython
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import Subset
import json
from utils import AudioProcessing, audioPreprocessing, voting
from dataset import audioDataSet
from models import Net, effnetv2_xl, MobileNetV3_Large
from helper import train_audio, evaluate_audio


# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(backend)
print('Using device:', device)

# Data Preprocessing

In [2]:
# Load the training annotation table and preview its first rows.
train_csv = './files/train.csv'
gt = pd.read_csv(train_csv)
gt.head()

Unnamed: 0,id,container id,scenario,background,illumination,width at the top,width at the bottom,height,depth,container capacity,container mass,filling type,filling level,filling density,filling mass,object mass,handover starting frame,handover start timestamp,handover hand,action,nframes,folder_num,file_name,num,subject,filling_type,filling_level,back,light,camera_id,start,end
0,0,2,2,1,0,69.0,42.0,72.0,-1.0,185.0,2.0,2,1,0.82,76.0,78.0,-1,-1,-1,1.0,291576,2,s2_fi2_fu1_b1_l0,70,2,2,1,1,0,2,0.75,3.5
1,1,7,0,0,0,193.0,193.0,241.0,69.0,3209.397,59.0,0,0,0.0,0.0,59.0,-1,-1,-1,0.0,118483,7,s0_fi0_fu0_b0_l0,0,0,0,0,0,0,2,-1.0,-1.0
2,2,2,0,1,0,69.0,42.0,72.0,-1.0,185.0,2.0,3,1,1.0,93.0,95.0,-1,-1,-1,1.0,572008,2,s0_fi3_fu1_b1_l0,22,0,3,1,1,0,2,3.4,6.5
3,3,8,0,1,0,135.0,135.0,164.0,56.0,1239.84,31.0,0,0,0.0,0.0,31.0,-1,-1,-1,0.0,141680,8,s0_fi0_fu0_b1_l0,2,0,0,0,1,0,2,-1.0,-1.0
4,4,4,1,1,0,88.0,56.0,91.0,-1.0,296.0,86.0,1,1,0.34,45.0,131.0,-1,-1,-1,1.0,138681,4,s1_fi1_fu1_b1_l0,34,1,1,1,1,0,2,0.75,1.8


In [4]:
# Root of the challenge data on the mounted Drive and the raw training audio inside it.
base_path = '/content/drive/MyDrive/COSRMAL_CHALLENGE/'
audio_folder = '/content/drive/MyDrive/COSRMAL_CHALLENGE/train/audio/'

# Everything derived from the audio lives under <base_path>/audios.
audios_dir = os.path.join(base_path, 'audios')
os.makedirs(audios_dir, exist_ok=True)
mfcc_path = os.path.join(audios_dir, 'mfcc')
raw_path = os.path.join(audios_dir, 'raw')

# NOTE(review): presumably extracts features from every file in audio_folder using the
# annotations in gt and stores them under mfcc_path -- confirm in utils.py.
audioPreprocessing(audio_folder, gt, base_path, mfcc_path)

100%|██████████| 684/684 [09:43<00:00,  1.04s/it]

# Train

In [None]:
# Build the dataset object that every training cell in this notebook splits into
# train/validation subsets.
# NOTE(review): presumably reads the features produced by audioPreprocessing under
# base_path -- confirm in dataset.py.
mydataset = audioDataSet(base_path)

## Net

In [98]:
# --- Hyper-parameters for the baseline `Net` classifier ---
bs = 100
train_split = 0.8
lr = 1e-5
epochs = 200
split_seed = 42  # fixes which samples land in the train vs. validation subset
n_samples = len(mydataset)
model = Net().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,  weight_decay=1e-5)

# Best validation loss and best number of correct validation predictions so far.
best_loss = float('inf')
best_acc = 0

num_train = int(train_split * n_samples)
num_val = n_samples - num_train

# A dedicated, seeded generator makes the partition identical on every run, so a
# saved checkpoint can always be traced back to the validation set it was picked on.
split_generator = torch.Generator().manual_seed(split_seed)
train_set, val_set = torch.utils.data.random_split(
    mydataset, [num_train, num_val], generator=split_generator)

assert len(train_set) == num_train, "train subset has an unexpected size"
assert len(val_set) == num_val, "validation subset has an unexpected size"

train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
# Shuffling brings nothing to evaluation; a fixed order keeps validation deterministic.
val_loader = DataLoader(val_set, batch_size=bs, shuffle=False)

# The loss module is loop-invariant, so build it once instead of once per epoch.
val_criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
  loss_train, correct_train = train_audio(model, train_loader, optimizer, device)
  loss_val, correct_val = evaluate_audio(model, val_loader, device, criterion = val_criterion)
  print("Epoch {}/{} train loss:{:.4f} train acc:{:.2f}% ".format(epoch+1,epochs, loss_train, 100 * correct_train/num_train))
  print("Epoch {}/{} val loss:{:.4f} val acc:{:.2f}% ".format(epoch+1,epochs, loss_val, 100 * correct_val/num_val))

  # Two independent checkpoints: lowest validation loss and highest validation accuracy.
  # The whole module is pickled (not just the state_dict), matching the original format.
  if loss_val < best_loss:
    best_loss = loss_val
    torch.save(model, os.path.join(base_path, 'audios', "best_loss.pth"))

  if correct_val > best_acc:
    best_acc = correct_val
    torch.save(model, os.path.join(base_path, 'audios', "best_val.pth"))


Epoch 1/200 train loss:1.0360 train acc:73.44% 
Epoch 1/200 val loss:0.9534 val acc:79.19% 
Epoch 2/200 train loss:0.9431 train acc:80.36% 
Epoch 2/200 val loss:0.9258 val acc:81.64% 
Epoch 3/200 train loss:0.9241 train acc:81.98% 
Epoch 3/200 val loss:0.9357 val acc:80.56% 
Epoch 4/200 train loss:0.9132 train acc:82.80% 
Epoch 4/200 val loss:0.9041 val acc:84.02% 
Epoch 5/200 train loss:0.8858 train acc:86.28% 
Epoch 5/200 val loss:1.0089 val acc:72.62% 
Epoch 6/200 train loss:0.8672 train acc:87.78% 
Epoch 6/200 val loss:0.8978 val acc:84.50% 
Epoch 7/200 train loss:0.8605 train acc:88.34% 
Epoch 7/200 val loss:0.9934 val acc:74.57% 
Epoch 8/200 train loss:0.8526 train acc:89.10% 
Epoch 8/200 val loss:0.8889 val acc:85.34% 
Epoch 9/200 train loss:0.8470 train acc:89.68% 
Epoch 9/200 val loss:0.8650 val acc:87.79% 
Epoch 10/200 train loss:0.8426 train acc:90.15% 
Epoch 10/200 val loss:0.8547 val acc:88.86% 
Epoch 11/200 train loss:0.8390 train acc:90.55% 
Epoch 11/200 val loss:0.8496 

## MobileNet

In [105]:
# --- Hyper-parameters for the MobileNetV3-Large classifier ---
# (MobileNetV3_Large is already imported in the notebook's import cell.)
bs = 100
train_split = 0.8
lr = 1e-3
epochs = 200
split_seed = 42  # fixes which samples land in the train vs. validation subset
n_samples = len(mydataset)
model = MobileNetV3_Large(num_classes=3).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,  weight_decay=1e-5)

# Best validation loss and best number of correct validation predictions so far.
best_loss = float('inf')
best_acc = 0

num_train = int(train_split * n_samples)
num_val = n_samples - num_train

# A dedicated, seeded generator makes the partition identical on every run, so a
# saved checkpoint can always be traced back to the validation set it was picked on.
split_generator = torch.Generator().manual_seed(split_seed)
train_set, val_set = torch.utils.data.random_split(
    mydataset, [num_train, num_val], generator=split_generator)

assert len(train_set) == num_train, "train subset has an unexpected size"
assert len(val_set) == num_val, "validation subset has an unexpected size"

train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
# Shuffling brings nothing to evaluation; a fixed order keeps validation deterministic.
val_loader = DataLoader(val_set, batch_size=bs, shuffle=False)

# The loss module is loop-invariant, so build it once instead of once per epoch.
val_criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
  loss_train, correct_train = train_audio(model, train_loader, optimizer, device)
  loss_val, correct_val = evaluate_audio(model, val_loader, device, criterion = val_criterion)
  print("{}/{} train loss:{:.4f} train acc:{:.2f}% val loss:{:.4f} val acc:{:.2f}%".format(
      epoch+1,epochs, loss_train, 100 * correct_train/num_train,
      loss_val, 100 * correct_val/num_val))

  # Two independent checkpoints: lowest validation loss and highest validation accuracy.
  # The whole module is pickled on purpose: the evaluation cell restores
  # "bv-mobile.pth" with a plain torch.load and uses the result as a model.
  if loss_val < best_loss:
    best_loss = loss_val
    torch.save(model, os.path.join(base_path, 'audios', "bl-mobile.pth"))

  if correct_val > best_acc:
    best_acc = correct_val
    torch.save(model, os.path.join(base_path, 'audios', "bv-mobile.pth"))
  




1/200 train loss:0.5775 train acc:79.35% val loss:0.5985 val acc:79.21%
2/200 train loss:0.3572 train acc:87.87% val loss:0.3697 val acc:86.89%
3/200 train loss:0.2895 train acc:89.89% val loss:0.3728 val acc:87.68%
4/200 train loss:0.2644 train acc:90.48% val loss:0.3346 val acc:88.31%
5/200 train loss:0.2458 train acc:91.26% val loss:0.5997 val acc:73.06%
6/200 train loss:0.2228 train acc:91.91% val loss:0.4331 val acc:88.59%
7/200 train loss:0.2042 train acc:92.81% val loss:0.2907 val acc:89.39%
8/200 train loss:0.1795 train acc:93.65% val loss:0.6188 val acc:78.47%
9/200 train loss:0.1554 train acc:94.29% val loss:0.2454 val acc:91.56%
10/200 train loss:0.1468 train acc:94.63% val loss:0.5624 val acc:80.92%
11/200 train loss:0.1265 train acc:95.43% val loss:0.3431 val acc:89.66%
12/200 train loss:0.1140 train acc:95.92% val loss:0.4471 val acc:88.10%
13/200 train loss:0.1017 train acc:96.37% val loss:0.3115 val acc:91.62%
14/200 train loss:0.0938 train acc:96.56% val loss:0.3609 va

## EfficientNet

In [None]:
# --- Hyper-parameters for the EfficientNetV2-XL classifier ---
my_save_path = '/content/drive/MyDrive/COSRMAL_CHALLENGE'
bs = 100
train_split = 0.8
lr = 1e-4
epochs = 200
split_seed = 42  # fixes which samples land in the train vs. validation subset
n_samples = len(mydataset)
model = effnetv2_xl().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,  weight_decay=1e-5)

# Tracked for parity with the other training cells; only best_acc drives checkpointing here.
best_loss = float('inf')
best_acc = 0

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first improvement is written.
checkpoint_dir = os.path.join(my_save_path, 'audios', 'efficient')
os.makedirs(checkpoint_dir, exist_ok=True)

num_train = int(train_split * n_samples)
num_val = n_samples - num_train

# A dedicated, seeded generator makes the partition identical on every run, so a
# saved checkpoint can always be traced back to the validation set it was picked on.
split_generator = torch.Generator().manual_seed(split_seed)
train_set, val_set = torch.utils.data.random_split(
    mydataset, [num_train, num_val], generator=split_generator)

assert len(train_set) == num_train, "train subset has an unexpected size"
assert len(val_set) == num_val, "validation subset has an unexpected size"

train_loader = DataLoader(train_set,
                          batch_size=bs,
                          shuffle=True,
                          num_workers=1)
# Shuffling brings nothing to evaluation; a fixed order keeps validation deterministic.
val_loader = DataLoader(val_set,
                        batch_size=bs,
                        shuffle=False,
                        num_workers=1)

# The loss module is loop-invariant, so build it once instead of once per epoch.
val_criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
  loss_train, correct_train = train_audio(model, train_loader, optimizer, device)
  loss_val, correct_val = evaluate_audio(model, val_loader, device, criterion = val_criterion)

  print("{}/{} train loss:{:.4f} train acc:{:.2f}% val loss:{:.4f} val acc:{:.2f}%".format(
      epoch+1,epochs, loss_train, 100 * correct_train/num_train,
      loss_val, 100 * correct_val/num_val))

  # Each new best validation accuracy is written to its own file, named after the
  # accuracy reached; only the state_dict is stored for this model.
  if correct_val > best_acc:
    best_acc = correct_val
    torch.save(model.state_dict(),
               os.path.join(checkpoint_dir,
                            "XL-{:.2f}.pth".format(100 * correct_val/num_val)))

# Evaluation

In [None]:
# Restore the MobileNet checkpoint with the best validation accuracy. The training
# cell wrote it with torch.save(model, ...), so the file holds the whole pickled
# module and torch.load returns a ready-to-use model.
# SECURITY: torch.load unpickles arbitrary objects -- only load checkpoints you created.
# NOTE: PyTorch >= 2.6 defaults to weights_only=True; loading a full pickled module
# there additionally requires weights_only=False.
# map_location remaps tensors saved on a GPU, so loading also works on a CPU-only runtime.
model_pretrained = torch.load(os.path.join(base_path, 'audios', "bv-mobile.pth"),
                              map_location=device)
model_pretrained.to(device)
model_pretrained.eval()  # inference mode for dropout / batch-norm layers

voting_dir = '/content/drive/MyDrive/COSRMAL_CHALLENGE/audios'

# NOTE(review): presumably classifies each recording in audio_folder and writes
# "voting.json" into voting_dir (the following cell reads it from there) -- confirm in utils.py.
voting(audio_folder, voting_dir, model_pretrained, device, save_size=64)

In [121]:
# Read the voting results stored in voting_dir. The context manager closes the file
# handle even when JSON parsing fails (the bare open() used before never closed it).
with open(os.path.join(voting_dir, "voting.json")) as f:
  vote_js = json.load(f)

# DataFrame() puts the top-level JSON keys on the columns; transpose so that each
# top-level entry becomes one row.
vote = pd.DataFrame(vote_js).T
vote.head()

Unnamed: 0,data_num,file,count_pred,final_pred,pred
0,0,0,"[19, 0, 19, 0]",2,"[0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
1,1,1,"[13, 0, 0, 0]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,2,2,"[56, 0, 0, 21]",3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,3,"[16, 0, 0, 0]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,4,"[7, 9, 0, 0]",1,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]"


In [136]:
# Compare the voted prediction of every recording with its annotated filling type.
gt = pd.read_csv('files/train.csv')
labels = gt['filling_type'].to_numpy()
preds = vote['final_pred'].to_numpy()

# The comparison is positional, so a length mismatch would silently produce a
# meaningless score; fail loudly instead.
# NOTE(review): this also assumes vote rows are in the same order as train.csv -- confirm.
assert len(labels) == len(preds), (
    "prediction/label count mismatch: {} labels vs {} predictions".format(len(labels), len(preds)))

# NOTE(review): train.csv presumably includes recordings the model was trained on, so
# this number is not a held-out estimate -- confirm before reporting it.
acc = np.sum(labels == preds) / len(labels)
print('Acc: {:.2f}%'.format(100 * acc))

Acc: 100.00%
