## Fait par : Taha Tamir - Ayoub Alalou

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd
import torch
import torch.nn as  nn
import torch.nn.functional as F
import math, random
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
import glob# data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/sample_submission.csv
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/train.csv
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/test.csv
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/wk5zgcbhqstuo12fyxn0a.wav
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/sbead2m8u0wifyplht6v.wav
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/ilfd9v3pks75b8x6wy2a.wav
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/g534z18s0txbhciwqlnk.wav
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/qt6vl4jushkf7g5webcym.wav
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/m8g5sz61ec7uftob2na3.wav
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/o6013rbhaz94tlqmg8nc.wav
/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/mys93xou2

In [2]:
def open(audio_file):
    """Load an audio file with torchaudio.

    Note: defining this at module level shadows Python's built-in ``open``
    for every cell that runs afterwards.

    Args:
        audio_file: Path handed straight to ``torchaudio.load``.

    Returns:
        A ``(signal, sample_rate)`` tuple as produced by ``torchaudio.load``.
    """
    waveform, sample_rate = torchaudio.load(audio_file)
    return waveform, sample_rate
def resample(aud, newsr):
    """Resample an audio tuple to a new sample rate.

    Args:
        aud: ``(signal, sample_rate)`` tuple; the signal is indexed as
            ``[channel, time]``.
        newsr: Target sample rate.

    Returns:
        ``aud`` itself when it is already at ``newsr``; otherwise a new
        ``(resampled_signal, newsr)`` tuple.
    """
    waveform, orig_sr = aud

    # Already at the requested rate: hand back the very same tuple.
    if orig_sr == newsr:
        return aud

    # The first channel is always resampled on its own.
    first = torchaudio.transforms.Resample(orig_sr, newsr)(waveform[:1, :])
    if waveform.shape[0] <= 1:
        return (first, newsr)

    # Remaining channel(s) are resampled separately and stacked back underneath.
    rest = torchaudio.transforms.Resample(orig_sr, newsr)(waveform[1:, :])
    return (torch.cat([first, rest]), newsr)
def rechannel(aud, new_channel):
    """Convert an audio tuple to the requested number of channels.

    Source channels are spread evenly over the target channels: target
    channel ``k`` copies source channel ``(k * n_source) // new_channel``.
    This keeps the two conversions the notebook relied on (anything -> 1
    keeps the first channel, stereo -> 3 gives ``[ch0, ch0, ch1]``) and also
    works for mono input and for any other target count.

    Args:
        aud: ``(signal, sample_rate)`` tuple; signal is ``[channel, time]``.
        new_channel: Desired number of channels (must be >= 1).

    Returns:
        ``aud`` itself when the channel count already matches, otherwise a
        new ``(signal, sample_rate)`` tuple with ``new_channel`` channels.

    Raises:
        ValueError: If ``new_channel`` is smaller than 1 or the signal has no
            channels to copy from.
    """
    sig, sr = aud
    num_channels = sig.shape[0]

    if num_channels == new_channel:
        # Nothing to do
        return aud

    if new_channel < 1:
        raise ValueError(f"new_channel must be >= 1, got {new_channel}")
    if num_channels == 0:
        raise ValueError("cannot rechannel a signal that has no channels")

    # Map every target channel onto one source channel.
    source_idx = [(k * num_channels) // new_channel for k in range(new_channel)]
    resig = sig[source_idx, :]

    return (resig, sr)

In [3]:
def pad_trunc(aud, max_ms):
    """Force a signal to a fixed duration by truncating or zero-padding.

    Longer signals are cut at the end. Shorter signals are padded with zeros
    on both sides; the split between leading and trailing padding is drawn
    at random with ``random.randint``.

    Args:
        aud: ``(signal, sample_rate)`` tuple; signal is ``[channel, time]``.
        max_ms: Target duration in milliseconds.

    Returns:
        ``(signal, sample_rate)`` where the signal has exactly
        ``sample_rate * max_ms // 1000`` samples per channel.
    """
    sig, sr = aud
    num_rows, sig_len = sig.shape
    # Multiply before dividing so rates that are not a multiple of 1000
    # (e.g. 44100 Hz) are not rounded down to whole kHz first.
    max_len = int(sr * max_ms // 1000)

    if sig_len > max_len:
        # Truncate the signal to the given length
        sig = sig[:, :max_len]

    elif sig_len < max_len:
        # Length of padding to add at the beginning and end of the signal
        pad_begin_len = random.randint(0, max_len - sig_len)
        pad_end_len = max_len - sig_len - pad_begin_len

        # Pad with 0s that share the signal's dtype and device
        pad_begin = sig.new_zeros((num_rows, pad_begin_len))
        pad_end = sig.new_zeros((num_rows, pad_end_len))

        sig = torch.cat((pad_begin, sig, pad_end), 1)

    return (sig, sr)

In [4]:
def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    """Turn an audio tuple into a Mel spectrogram expressed in decibels.

    Args:
        aud: ``(signal, sample_rate)`` tuple.
        n_mels: Number of Mel bands, forwarded to ``MelSpectrogram``.
        n_fft: FFT size, forwarded to ``MelSpectrogram``.
        hop_len: Hop length, forwarded to ``MelSpectrogram`` as ``hop_length``.

    Returns:
        The output of ``AmplitudeToDB(top_db=80)`` applied to the Mel
        spectrogram, laid out as ``[channel, n_mels, time]``.
    """
    waveform, sample_rate = aud

    # Mel power spectrogram of the waveform.
    to_mel = transforms.MelSpectrogram(sample_rate, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)
    mel = to_mel(waveform)

    # Decibel scale, with top_db fixed at 80.
    to_db = transforms.AmplitudeToDB(top_db=80)
    return to_db(mel)
def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    """Mask random frequency bands and time spans of a spectrogram.

    Each mask is filled with the mean of the incoming spectrogram. The mask
    size parameter is ``max_mask_pct`` times the size of the masked axis.

    Args:
        spec: Spectrogram with three dimensions; the last two are read as
            ``n_mels`` and ``n_steps``.
        max_mask_pct: Fraction of an axis used as the mask size parameter.
        n_freq_masks: How many frequency masks to apply.
        n_time_masks: How many time masks to apply.

    Returns:
        The spectrogram after all frequency masks, then all time masks.
    """
    _, n_mels, n_steps = spec.shape
    fill_value = spec.mean()

    # (transform class, mask size parameter, repetitions), in application order.
    masking_plan = (
        (transforms.FrequencyMasking, max_mask_pct * n_mels, n_freq_masks),
        (transforms.TimeMasking, max_mask_pct * n_steps, n_time_masks),
    )

    masked = spec
    for mask_cls, mask_param, repeats in masking_plan:
        for _ in range(repeats):
            masked = mask_cls(mask_param)(masked, fill_value)

    return masked

In [5]:
# Load the training metadata; later cells read its 'id' and 'label' columns.
df = pd.read_csv('/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/train.csv')

In [6]:
# Paths of every entry in the training audio directory.
mu_f = glob.glob('/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/train/*')
def get_path(id):
    """Return the training file path that belongs to ``id``.

    A path whose file name (with or without its extension) equals ``id`` is
    preferred. When there is no such path, the first path that merely
    contains ``id`` as a substring is returned, so an id that happens to be
    part of another path no longer wins over the exact file.

    Args:
        id: Clip identifier taken from the CSV.

    Returns:
        The matching path from ``mu_f``, or ``None`` when nothing matches.
    """
    partial_match = None
    for path in mu_f:
        file_name = os.path.basename(path)
        if id == file_name or id == os.path.splitext(file_name)[0]:
            return path
        # Remember the first loose match in case no exact one shows up.
        if partial_match is None and id in path:
            partial_match = path
    return partial_match

In [7]:
# Resolve every row's id to its audio file path; ids without a match get None.
df['file_path']= df['id'].apply(get_path)

In [8]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
class SoundDS(Dataset):
    """Dataset that turns the clips listed in a DataFrame into Mel spectrograms.

    Item ``idx`` always corresponds to row ``idx`` of ``df`` (by position), so
    predictions collected in loader order line up with the DataFrame rows.

    Args:
        df: DataFrame with a ``file_path`` column and, in training mode, a
            ``label`` column.
        data_path: List of audio file paths. Only used as a fallback source
            of paths when ``df`` has no ``file_path`` column.
        train: When True items are ``(spectrogram, label)``; when False only
            the spectrogram is returned.
        augment: When True (and ``train`` is True) the spectrogram is passed
            through ``spectro_augment`` before being returned. Defaults to
            False, which returns the plain spectrogram.
    """

    def __init__(self, df, data_path, train=True, augment=False):
        self.df = df
        self.data_path = data_path
        self.duration = 2970   # clip length in milliseconds, passed to pad_trunc
        self.sr = 48000        # sample rate every clip is resampled to
        self.channel = 3       # channel count every clip is converted to
        self.train = train
        self.augment = augment

    # ----------------------------
    # Number of items in dataset
    # ----------------------------
    def __len__(self):
        return len(self.df)

    # ----------------------------
    # Path and label of the i'th row
    # ----------------------------
    def _locate(self, idx):
        """Return ``(audio_file, class_id)`` for row ``idx``; label is None outside training."""
        if 'file_path' not in self.df.columns:
            if self.train:
                raise KeyError("df needs a 'file_path' column to look up labels")
            # No per-row path available: fall back to the raw path list.
            return self.data_path[idx], None

        audio_file = self.df['file_path'].iloc[idx]
        if not isinstance(audio_file, str):
            raise FileNotFoundError(f"no audio file was resolved for dataset row {idx}")

        class_id = self.df['label'].iloc[idx] if self.train else None
        return audio_file, class_id

    # ----------------------------
    # Get i'th item in dataset
    # ----------------------------
    def __getitem__(self, idx):
        audio_file, class_id = self._locate(idx)

        aud = open(audio_file)
        # Bring every clip to the same sample rate and channel count first;
        # otherwise pad_trunc would still yield arrays of different lengths.
        reaud = resample(aud, self.sr)
        rechan = rechannel(reaud, self.channel)

        dur_aud = pad_trunc(rechan, self.duration)
        sgram = spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)

        # Masking is opt-in and never applied outside training mode.
        if self.train and self.augment:
            sgram = spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

        if self.train:
            return sgram, class_id
        return sgram

In [9]:
from torch.utils.data import random_split

myds = SoundDS(df, mu_f)

# Random 83:17 split between training and validation
num_items = len(myds)
num_train = round(num_items * 0.83)
num_val = num_items - num_train
# A seeded generator keeps the split identical across "Restart & Run All".
split_rng = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(myds, [num_train, num_val], generator=split_rng)

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=32, shuffle=False)

In [10]:
import torch
import torch.nn as  nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    """Three-convolution residual block (1x1, 3x3, 1x1).

    The last convolution widens the feature map to
    ``out_channels * expansion`` channels. ``i_downsample``, when given, is
    applied to the shortcut branch before the addition.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels used by the first two convolutions.
        i_downsample: Optional module applied to the shortcut.
        stride: Stride of the 3x3 convolution.
    """

    expansion = 4

    def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
        super().__init__()
        widened = out_channels * self.expansion

        # 1x1 convolution down to the working width
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.batch_norm1 = nn.BatchNorm2d(out_channels)

        # 3x3 convolution; the only layer that carries the stride
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.batch_norm2 = nn.BatchNorm2d(out_channels)

        # 1x1 convolution up to the expanded width
        self.conv3 = nn.Conv2d(out_channels, widened, kernel_size=1, stride=1, padding=0)
        self.batch_norm3 = nn.BatchNorm2d(widened)

        self.i_downsample = i_downsample
        self.stride = stride
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the three conv stages, add the shortcut and apply the final ReLU."""
        shortcut = x.clone()

        out = self.relu(self.batch_norm1(self.conv1(x)))
        out = self.relu(self.batch_norm2(self.conv2(out)))
        out = self.batch_norm3(self.conv3(out))

        # Project the shortcut when a downsample module was supplied
        if self.i_downsample is not None:
            shortcut = self.i_downsample(shortcut)

        out += shortcut
        return self.relu(out)

class Block(nn.Module):
    """Two-convolution residual block (3x3, 3x3).

    Only the first convolution carries ``stride``; the second keeps the
    spatial size so the main branch and a shortcut that is strided once end
    up with matching shapes. Each convolution has its own batch norm.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by both convolutions.
        i_downsample: Optional module applied to the shortcut.
        stride: Stride of the first 3x3 convolution.
    """

    expansion = 1

    def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
        super(Block, self).__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride, bias=False)
        self.batch_norm1 = nn.BatchNorm2d(out_channels)
        # stride is fixed at 1 here: striding twice would shrink the main
        # branch more than the shortcut and break the addition.
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, stride=1, bias=False)
        self.batch_norm2 = nn.BatchNorm2d(out_channels)

        self.i_downsample = i_downsample
        self.stride = stride
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run both conv stages, add the shortcut and apply the final ReLU."""
        identity = x.clone()

        x = self.relu(self.batch_norm1(self.conv1(x)))
        x = self.batch_norm2(self.conv2(x))

        if self.i_downsample is not None:
            identity = self.i_downsample(identity)

        x += identity
        x = self.relu(x)
        return x

In [11]:
# Module-level sigmoid applied to the network output in ResNet.forward.
m = nn.Sigmoid()
class ResNet(nn.Module):
    """ResNet-style classifier assembled from a configurable residual block.

    Args:
        ResBlock: Block class; must expose an ``expansion`` attribute and
            accept ``(in_channels, out_channels, i_downsample, stride)``.
        layer_list: Number of blocks in each of the four stages.
        num_classes: Size of the final linear layer's output.
        num_channels: Channels of the input tensor.
    """

    def __init__(self, ResBlock, layer_list, num_classes, num_channels=3):
        super().__init__()
        self.in_channels = 64

        # Stem: 7x7 strided convolution, batch norm, ReLU, max pooling
        self.conv1 = nn.Conv2d(num_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.batch_norm1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size = 3, stride=2, padding=1)

        # (planes, stride) of the four residual stages, registered as layer1..layer4
        stage_plan = ((64, 1), (128, 2), (256, 2), (512, 2))
        for position, (planes, stride) in enumerate(stage_plan):
            stage = self._make_layer(ResBlock, layer_list[position], planes=planes, stride=stride)
            setattr(self, f"layer{position + 1}", stage)

        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512*ResBlock.expansion, num_classes)

    def forward(self, x):
        """Return sigmoid scores of shape ``(batch, num_classes)``."""
        x = self.max_pool(self.relu(self.batch_norm1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        return m(self.fc(x))

    def _make_layer(self, ResBlock, blocks, planes, stride=1):
        """Build one stage of ``blocks`` residual blocks and update ``self.in_channels``."""
        out_channels = planes*ResBlock.expansion

        # The first block needs a projected shortcut whenever it changes the
        # spatial size or the channel count.
        shortcut = None
        if not (stride == 1 and self.in_channels == out_channels):
            shortcut = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels)
            )

        stage = [ResBlock(self.in_channels, planes, i_downsample=shortcut, stride=stride)]
        self.in_channels = out_channels

        # Remaining blocks keep the shape, so they use the plain identity shortcut.
        stage.extend(ResBlock(self.in_channels, planes) for _ in range(blocks-1))

        return nn.Sequential(*stage)

In [12]:
def ResNet50(num_classes, channels=3):
    """Build a ResNet made of Bottleneck blocks with stage depths 3, 4, 6, 3.

    Args:
        num_classes: Number of outputs of the final linear layer.
        channels: Number of input channels.

    Returns:
        The constructed ``ResNet`` instance.
    """
    stage_depths = [3,4,6,3]
    return ResNet(Bottleneck, stage_depths, num_classes, channels)
# Build the network with a single output unit.
myModel = ResNet50(1)
# Use the first CUDA GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Display the device of the first parameter to confirm where the model lives.
next(myModel.parameters()).device

device(type='cuda', index=0)

In [13]:
def training(model, train_dl, num_epochs,optim):
    """Train a binary classifier with BCE loss and Adam, printing per-epoch stats.

    The model is switched to training mode at the start of every epoch, and
    batches are moved to whichever device the model's parameters live on.

    Args:
        model: Network whose output is a ``(batch, 1)`` tensor of
            probabilities (``nn.BCELoss`` is applied to it directly).
        train_dl: Iterable of batches supporting ``len()``; each batch is
            indexed as ``batch[0]`` (inputs) and ``batch[1]`` (0/1 labels).
        num_epochs: Number of passes over ``train_dl``.
        optim: Learning rate handed to ``torch.optim.Adam`` (despite the
            name, this is not an optimizer object).

    Returns:
        None. One ``Epoch: ..., Loss: ..., Accuracy: ...`` line is printed
        per epoch.
    """
    # Follow the model's own device instead of relying on a notebook global.
    run_device = next(model.parameters()).device

    # Loss function and optimizer
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=optim)

    # Repeat for each epoch
    for epoch in range(num_epochs):
        # An earlier evaluation may have left the model in eval mode.
        model.train()

        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        # Repeat for each batch in the training set
        for data in train_dl:
            inputs, labels = data[0].to(run_device), data[1].to(run_device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1).to(torch.float))
            loss.backward()
            optimizer.step()

            # Keep stats for Loss and Accuracy
            running_loss += loss.item()

            # Threshold the probabilities at 0.5 in a single tensor operation.
            predicted = (outputs.detach() >= 0.5).to(labels.dtype).squeeze(1)

            # Count of predictions that matched the target label
            correct_prediction += (predicted == labels).sum().item()
            total_prediction += outputs.shape[0]

        # Print stats at the end of the epoch
        num_batches = len(train_dl)
        avg_loss = running_loss / num_batches
        acc = correct_prediction/total_prediction
        print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')


In [14]:
# Number of passes over the training set.
num_epochs=10   # Just for demo, adjust this higher.
# The fourth argument (1e-4) is forwarded to Adam as its learning rate.
training(myModel, train_dl, num_epochs,0.0001)

Epoch: 0, Loss: 0.70, Accuracy: 0.50
Epoch: 1, Loss: 0.70, Accuracy: 0.55
Epoch: 2, Loss: 0.53, Accuracy: 0.76
Epoch: 3, Loss: 0.48, Accuracy: 0.78
Epoch: 4, Loss: 0.40, Accuracy: 0.79
Epoch: 5, Loss: 0.34, Accuracy: 0.83
Epoch: 6, Loss: 0.22, Accuracy: 0.93
Epoch: 7, Loss: 0.23, Accuracy: 0.91
Epoch: 8, Loss: 0.20, Accuracy: 0.94
Epoch: 9, Loss: 0.13, Accuracy: 0.95


In [15]:
def inference (model, val_dl):
    """Evaluate a binary classifier and print its accuracy.

    The model is put in eval mode (and left there), gradients are disabled,
    and batches are moved to whichever device the model's parameters live on.

    Args:
        model: Network whose output is a ``(batch, 1)`` tensor of
            probabilities.
        val_dl: Iterable of batches; each batch is indexed as ``batch[0]``
            (inputs) and ``batch[1]`` (0/1 labels).

    Returns:
        None. Prints ``Accuracy: ..., Total items: ...``.
    """
    # Follow the model's own device instead of relying on a notebook global.
    run_device = next(model.parameters()).device

    model.eval()
    correct_prediction = 0
    total_prediction = 0

    # Disable gradient updates
    with torch.no_grad():
        for data in val_dl:
            inputs, labels = data[0].to(run_device), data[1].to(run_device)

            # Get predictions, thresholded at 0.5 in a single tensor operation
            outputs = model(inputs)
            predicted = (outputs >= 0.5).to(labels.dtype).squeeze(1)

            correct_prediction += (predicted == labels).sum().item()
            total_prediction += outputs.shape[0]

    acc = correct_prediction/total_prediction
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Evaluate the trained model on the validation loader and print its accuracy.
myModel.eval()
inference(myModel, val_dl)

Accuracy: 0.83, Total items: 24


In [16]:
# Test metadata and the paths of every entry in the test audio directory.
test = pd.read_csv('/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/test.csv')
mu_ft = glob.glob('/kaggle/input/moroccan-darija-trigger-word-classification-ed-2/data/test/*')
def get_path(id):
    """Return the test file path that belongs to ``id``.

    A path whose file name (with or without its extension) equals ``id`` is
    preferred. When there is no such path, the first path that merely
    contains ``id`` as a substring is returned.

    Args:
        id: Clip identifier taken from the CSV.

    Returns:
        The matching path from ``mu_ft``, or ``None`` when nothing matches.
    """
    partial_match = None
    for path in mu_ft:
        file_name = os.path.basename(path)
        if id == file_name or id == os.path.splitext(file_name)[0]:
            return path
        # Remember the first loose match in case no exact one shows up.
        if partial_match is None and id in path:
            partial_match = path
    return partial_match
# Resolve every test row's id to its audio file path.
test['file_path']= test['id'].apply(get_path)

In [17]:
# Dataset over the test clips; train=False makes it return spectrograms without labels.
testds = SoundDS(test, mu_ft,False)

# Test data loader; shuffling is left at its default (off).
# batch_size=82 — presumably the number of test clips, so one batch covers them all (TODO confirm).
test_dl = torch.utils.data.DataLoader(testds, batch_size=82)

In [18]:
# Switch to eval mode once, before the loop.
myModel.eval()

# Collect the thresholded predictions of every batch, not just the last one.
batch_predictions = []
with torch.no_grad():
    for data in test_dl:
        # Put the input features on the same device as the model
        inputs = data.to(device)

        # Get probabilities and threshold them at 0.5 (kept as 0./1. floats)
        probabilities = myModel(inputs)
        batch_predictions.append((probabilities >= 0.5).to(probabilities.dtype))

# One (num_items, 1) tensor in loader order, still on `device`.
outputs = torch.cat(batch_predictions)
print(outputs.shape)

torch.Size([82, 1])


In [19]:
# Sum of the thresholded 0/1 predictions, i.e. how many clips were predicted as 1.
outputs.sum()

tensor(58., device='cuda:0')

In [20]:
# Attach the predictions as a new column, after moving them to the CPU.
# NOTE(review): this assumes the rows of `outputs` follow the same order as the rows of `test` — confirm.
test['label']=outputs.cpu()

In [22]:
# Play back the clip whose index label is 72 as a manual spot-check.
Audio(test['file_path'][72])

In [23]:
# Cast the predicted labels to integers.
test = test.astype({'label': 'int'})

In [24]:
# Drop the helper path column by keyword: the positional `axis` argument of
# DataFrame.drop is deprecated and no longer accepted by pandas 2.
submission = test.drop(columns='file_path')
# Write the submission file without the index column.
submission.to_csv('submission.csv',index=False)

  """Entry point for launching an IPython kernel.
