# Data Preprocessing

In [None]:
import scipy
import librosa
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm
import scipy.io.wavfile
import time
import IPython
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import Subset
import json
from torchinfo import summary
from utils import AudioProcessing, audioPreprocessing, audioPreprocessing_t1, voting
from models import Net, effnetv2_xl, MobileNetV3_Large, CNN_LSTM, CNN_LSTM_att
from dataset import MyLSTMDataset
from helper import train_lstm, evaluate_audio

# Select the compute device once for the whole notebook: the GPU when CUDA is
# available, otherwise the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print('Using device:', device)

In [None]:
# Load the training annotations table and preview its first rows.
train_csv_path = 'files/train.csv'
gt = pd.read_csv(train_csv_path)
gt.head()

Unnamed: 0,id,container id,scenario,background,illumination,width at the top,width at the bottom,height,depth,container capacity,container mass,filling type,filling level,filling density,filling mass,object mass,handover starting frame,handover start timestamp,handover hand,action,nframes,folder_num,file_name,num,subject,filling_type,filling_level,back,light,camera_id,start,end
0,0,2,2,1,0,69.0,42.0,72.0,-1.0,185.0,2.0,2,1,0.82,76.0,78.0,-1,-1,-1,1.0,291576,2,s2_fi2_fu1_b1_l0,70,2,2,1,1,0,2,0.75,3.5
1,1,7,0,0,0,193.0,193.0,241.0,69.0,3209.397,59.0,0,0,0.0,0.0,59.0,-1,-1,-1,0.0,118483,7,s0_fi0_fu0_b0_l0,0,0,0,0,0,0,2,-1.0,-1.0
2,2,2,0,1,0,69.0,42.0,72.0,-1.0,185.0,2.0,3,1,1.0,93.0,95.0,-1,-1,-1,1.0,572008,2,s0_fi3_fu1_b1_l0,22,0,3,1,1,0,2,3.4,6.5
3,3,8,0,1,0,135.0,135.0,164.0,56.0,1239.84,31.0,0,0,0.0,0.0,31.0,-1,-1,-1,0.0,141680,8,s0_fi0_fu0_b1_l0,2,0,0,0,1,0,2,-1.0,-1.0
4,4,4,1,1,0,88.0,56.0,91.0,-1.0,296.0,86.0,1,1,0.34,45.0,131.0,-1,-1,-1,1.0,138681,4,s1_fi1_fu1_b1_l0,34,1,1,1,1,0,2,0.75,1.8


In [None]:
# Load the effnetv2_xl checkpoint and hand it to audioPreprocessing_t1 together
# with the training audio folder, the annotations table and the two output
# folders (T2_mid / T2_pred), which are created here if missing.
# NOTE(review): what audioPreprocessing_t1 writes into those folders is defined
# in utils.py and is not visible from this notebook.

# Every path is derived from base_path so relocating the project only requires
# changing this one line.
base_path = '/content/drive/MyDrive/COSRMAL_CHALLENGE/'
efficient = os.path.join(base_path, 'audios', 'efficient', 'XL-97.14.pth')
audio_folder = os.path.join(base_path, 'train', 'audio')
T2_mid_dir = os.path.join(base_path, 'T2_mid')
T2_pred_dir = os.path.join(base_path, 'T2_pred')
os.makedirs(T2_mid_dir, exist_ok=True)
os.makedirs(T2_pred_dir, exist_ok=True)

model = effnetv2_xl()
# map_location keeps the load working on the CPU fallback: without it, a
# checkpoint saved from a CUDA model fails to deserialize on a machine that
# has no GPU.
model.load_state_dict(torch.load(efficient, map_location=device))
model.to(device)
model.eval()  # inference only

audioPreprocessing_t1(audio_folder, gt, T2_mid_dir, T2_pred_dir, model, device)

# Train

In [None]:
# Build the dataset used by the training cells below from the project folder
# and the 'filling_level' column of the annotations table (as a NumPy array).
filling_level_labels = gt['filling_level'].to_numpy()
myDataSet = MyLSTMDataset(base_path, filling_level_labels)

953


## CNN_LSTM

In [None]:
# Train CNN_LSTM on myDataSet and keep the checkpoint with the best validation
# score (ties broken by the training score) in <base_path>/audios/best_lstm.pth.
bs = 16
train_split = 0.8  # NOTE(review): unused here; the split sizes below are hard-coded
lr = 1e-4
epochs = 200
n_samples = len(myDataSet)
assert n_samples == 684, "684"

model = CNN_LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,  weight_decay=1e-5)
# The loss module is loop-invariant, so build it once instead of every epoch.
criterion = nn.CrossEntropyLoss()

best_loss = float('inf')
best_acc = 0
# Initialised up front: previously best_train was only assigned inside the
# "new best" branch, so the tie-break comparison raised NameError on a fresh
# kernel whenever the first epoch had 0 correct validation samples.
best_train = 0

num_train = 584
num_val = n_samples - num_train

# A seeded generator makes the train/validation split identical on every run,
# so validation scores are comparable between runs.
split_generator = torch.Generator().manual_seed(0)
train_set, val_set = torch.utils.data.random_split(
    myDataSet, [num_train, num_val], generator=split_generator)

assert len(train_set) == num_train, "Same"
assert len(val_set) == num_val, "Same"


train_loader   = DataLoader(train_set,
                            batch_size=bs,
                            shuffle=True)
val_loader   = DataLoader(val_set,
                          batch_size=bs,
                          shuffle=False)

# Make sure the checkpoint folder exists before training starts, so a missing
# directory cannot fail the first torch.save after a full epoch of work.
checkpoint_dir = os.path.join(base_path, 'audios')
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, "best_lstm.pth")

for epoch in range(epochs):
  #start_time = time.time()
  # Both helpers are used as returning (loss, count); the counts are divided by
  # the split sizes below, so they are presumably numbers of correct
  # predictions -- confirm against helper.py.
  loss_train, correct_train = train_lstm(model, train_loader, optimizer, device)
  loss_val, correct_val = evaluate_audio(model, val_loader, criterion = criterion)
  #elapsed_time = time.time() - start_time
  print("Epoch {}/{} train loss:{:.4f} train acc:{:.2f}% ".format(epoch+1,epochs, loss_train, 100 * correct_train/num_train))
  print("Epoch {}/{} val loss:{:.4f} val acc:{:.2f}% ".format(epoch+1,epochs, loss_val, 100 * correct_val/num_val))

  # Save when validation improves, or when it ties the best validation score
  # with a strictly better training score.
  is_new_best = correct_val > best_acc
  is_better_tie = correct_val == best_acc and best_train < correct_train
  if is_new_best or is_better_tie:
    best_acc = correct_val
    best_train = correct_train
    torch.save(model, checkpoint_path)

    
  




Epoch 1/200 train loss:1.0452 train acc:46.40% 
Epoch 1/200 val loss:0.9191 val acc:41.00% 
Epoch 2/200 train loss:0.9919 train acc:50.00% 
Epoch 2/200 val loss:0.9260 val acc:61.00% 
Epoch 3/200 train loss:0.9881 train acc:49.14% 
Epoch 3/200 val loss:0.8794 val acc:58.00% 
Epoch 4/200 train loss:0.9932 train acc:51.54% 
Epoch 4/200 val loss:0.8525 val acc:66.00% 
Epoch 5/200 train loss:0.9659 train acc:49.66% 
Epoch 5/200 val loss:0.8356 val acc:60.00% 
Epoch 6/200 train loss:0.8339 train acc:46.92% 
Epoch 6/200 val loss:0.7376 val acc:67.00% 
Epoch 7/200 train loss:0.7594 train acc:64.21% 
Epoch 7/200 val loss:0.6884 val acc:63.00% 
Epoch 8/200 train loss:0.6278 train acc:68.32% 
Epoch 8/200 val loss:0.5749 val acc:76.00% 
Epoch 9/200 train loss:0.6022 train acc:63.36% 
Epoch 9/200 val loss:0.5800 val acc:73.00% 
Epoch 10/200 train loss:0.5727 train acc:64.38% 
Epoch 10/200 val loss:0.5843 val acc:73.00% 
Epoch 11/200 train loss:0.5532 train acc:64.73% 
Epoch 11/200 val loss:0.5212 

## CNN_LSTM_ATT

In [None]:
# Train CNN_LSTM_att on myDataSet and keep the checkpoint with the best
# validation score (ties broken by the training score) in
# <base_path>/audios/best_lstm_att.pth.
bs = 16
train_split = 0.8  # NOTE(review): unused here; the split sizes below are hard-coded
lr = 1e-4
epochs = 200
n_samples = len(myDataSet)
assert n_samples == 684, "684"

model = CNN_LSTM_att().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,  weight_decay=1e-5)
# The loss module is loop-invariant, so build it once instead of every epoch.
criterion = nn.CrossEntropyLoss()

best_loss = float('inf')
best_acc = 0
# Reset explicitly: previously best_train was only assigned inside the
# "new best" branch, so this cell either raised NameError on a fresh kernel or
# silently compared against a stale value left behind by an earlier cell.
best_train = 0

num_train = 584
num_val = n_samples - num_train

# A seeded generator makes the train/validation split identical on every run,
# so validation scores are comparable between runs.
split_generator = torch.Generator().manual_seed(0)
train_set, val_set = torch.utils.data.random_split(
    myDataSet, [num_train, num_val], generator=split_generator)

assert len(train_set) == num_train, "Same"
assert len(val_set) == num_val, "Same"


train_loader   = DataLoader(train_set,
                            batch_size=bs,
                            shuffle=True)
val_loader   = DataLoader(val_set,
                          batch_size=bs,
                          shuffle=False)

# Make sure the checkpoint folder exists before training starts, so a missing
# directory cannot fail the first torch.save after a full epoch of work.
checkpoint_dir = os.path.join(base_path, 'audios')
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, "best_lstm_att.pth")

for epoch in range(epochs):
  #start_time = time.time()
  # device is passed explicitly so this call matches the CNN_LSTM training
  # cell, which calls train_lstm(model, train_loader, optimizer, device).
  # NOTE(review): confirm against helper.train_lstm's signature.
  loss_train, correct_train = train_lstm(model, train_loader, optimizer, device)
  loss_val, correct_val = evaluate_audio(model, val_loader, criterion = criterion)
  #elapsed_time = time.time() - start_time
  print("Epoch {}/{} train loss:{:.4f} train acc:{:.2f}% ".format(epoch+1,epochs, loss_train, 100 * correct_train/num_train))
  print("Epoch {}/{} val loss:{:.4f} val acc:{:.2f}% ".format(epoch+1,epochs, loss_val, 100 * correct_val/num_val))

  # if loss_val < best_loss:
  #   best_loss = loss_val
  #   torch.save(model, os.path.join(base_path, 'audios', "best_loss.pth"))

  # Save when validation improves, or when it ties the best validation score
  # with a strictly better training score.
  is_new_best = correct_val > best_acc
  is_better_tie = correct_val == best_acc and best_train < correct_train
  if is_new_best or is_better_tie:
    best_acc = correct_val
    best_train = correct_train
    torch.save(model, checkpoint_path)

    
  




Epoch 1/200 train loss:1.0357 train acc:42.64% 
Epoch 1/200 val loss:0.9610 val acc:35.00% 
Epoch 2/200 train loss:1.0011 train acc:51.54% 
Epoch 2/200 val loss:1.0445 val acc:59.00% 
Epoch 3/200 train loss:0.9815 train acc:48.97% 
Epoch 3/200 val loss:1.0195 val acc:54.00% 
Epoch 4/200 train loss:0.9543 train acc:55.65% 
Epoch 4/200 val loss:0.9913 val acc:57.00% 
Epoch 5/200 train loss:0.9740 train acc:49.83% 
Epoch 5/200 val loss:0.9485 val acc:58.00% 
Epoch 6/200 train loss:0.8934 train acc:51.88% 
Epoch 6/200 val loss:0.7982 val acc:51.00% 
Epoch 7/200 train loss:0.7878 train acc:55.14% 
Epoch 7/200 val loss:0.6595 val acc:74.00% 
Epoch 8/200 train loss:0.7241 train acc:61.47% 
Epoch 8/200 val loss:0.6126 val acc:70.00% 
Epoch 9/200 train loss:0.5849 train acc:67.64% 
Epoch 9/200 val loss:0.6113 val acc:68.00% 
Epoch 10/200 train loss:0.6306 train acc:64.04% 
Epoch 10/200 val loss:0.5440 val acc:67.00% 
Epoch 11/200 train loss:0.5608 train acc:66.95% 
Epoch 11/200 val loss:0.5514 