# Single Played/Recorded Sample Pair Validation

Given a *played* audio file named `played_audio.wav` and a *recorded* audio file named `rec_audio.wav`, this notebook extracts their Mel-spectrograms and runs a prediction on the pair. Processing times are measured for benchmarking purposes.

### Instructions

- All needed files are within the `monotest-samples.zip` file. This archive contains:
  - A pair of *played* and *recorded* audio files (`played_audio.wav` and `rec_audio.wav`)
  - The `monotest.csv` file. It is a dataset file that only contains one entry, that is, the Mel-Spectrograms generated from the `played_audio.wav`/`rec_audio.wav` audio pair. The label in this .csv file is ignored by this code.
  - The `111-th04-100acc.pt` file. It is a model that was pretrained with our solution, and that got 100% accuracy on the testing dataset, that is, it correctly classified all samples within that dataset. Please note that this model never saw the `played_audio.wav` and `rec_audio.wav` included in this archive.
- Extract the archive on your local machine and move all of its files into the same directory as this notebook. If you run this on Google Colab, you can put them in the `/content/` folder, that is, the directory already shown in the left panel when Google Colab is started.

### Running this on Windows
It works as long as you install the following requirements (other versions of these packages are not tested, but they might work):

- Librosa 0.9.2
- Matplotlib 3.6.0
- Numba 0.56.2
- Numpy 1.23.3
- Pandas 1.5.0
- Pillow 9.2.0
- Resampy 0.4.2
- Scikit-Image 0.19.3
- Scikit-Learn 1.1.2
- Torch 1.12.1
- Torchvision 0.13.1

### Running this on Raspberry Pi 4 Model B (armv7l)

Getting all the packages to work correctly on Raspberry Pi was quite tricky. Here are some brief instructions on how to do so:

1. sudo apt-get install llvm
2. LLVM_CONFIG=/usr/bin/llvm-config sudo pip3 install llvmlite==0.32.1
3. sudo apt-get install libatlas3-base
4. sudo pip3 install numba==0.49
5. sudo pip3 install librosa==0.8.0
6. Install the remaining packages listed below, in any order:

- Matplotlib 3.0.2
- Numpy 1.21.6
- Pandas 1.3.5
- Pillow 9.2.0
- Resampy 0.2.2
- Scikit-Image 0.19.3
- Scikit-Learn 1.0.2
- Torch 1.7.0 (from https://github.com/Kashu7100/pytorch-armv7l )
- Torchvision 0.8.0 (from https://github.com/Kashu7100/pytorch-armv7l )

Please install the exact versions stated above if you want to run this solution on Raspberry Pi, or you will probably encounter several problems during the installation process or during the execution of this notebook.

In [None]:
import librosa
import numpy as np
import pandas as pd
import os
import skimage.io
from PIL import Image
import torch
import torch.nn as nn
from torchvision import transforms
import time

# Prefix used to locate the dataset CSV and the image files it lists
# ("" means paths are used as given, relative to the working directory).
base_path = ""
# Audio file passed to preprocess() as the "played" sample (num == 1).
played_audio_path = "played_audio.wav"
# Audio file passed to preprocess() as the "recorded" sample (num == 2).
rec_audio_path = "rec_audio.wav"

def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale the values of ``X`` into the range [``min``, ``max``].

    Args:
        X: NumPy array to rescale.
        min: Value the smallest element of ``X`` is mapped to.
        max: Value the largest element of ``X`` is mapped to.

    Returns:
        A new array with the same shape as ``X``. When every element of
        ``X`` is equal, all output elements are ``min``.
    """
    # NOTE: ``min``/``max`` shadow the builtins; the names are kept because
    # they are part of the function's keyword interface.
    lowest = X.min()
    value_range = X.max() - lowest
    if value_range == 0:
        # Constant input: the plain formula would evaluate 0/0 and fill the
        # result with NaN, so map everything to the lower bound instead.
        X_std = np.zeros_like(X, dtype=float)
    else:
        X_std = (X - lowest) / value_range
    return X_std * (max - min) + min

def trim(img, offset=0):
    """Return ``img`` without its first ``offset`` columns (second axis)."""
    kept_columns = slice(offset, None)
    return img[:, kept_columns]

def preprocess(audiofile, num):
    """Load an audio file and save its Mel-spectrogram image as a PNG.

    Args:
        audiofile: Path of the audio file to load.
        num: 1 writes ``played_audio_melspec.png``; any other value writes
            ``rec_audio_melspec.png``. The value is also forwarded to
            ``generate_mel_spectrogram``.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    samples, sample_rate = librosa.load(audiofile, sr=None)
    spectrogram_img = generate_mel_spectrogram(samples, sample_rate, num)

    output_name = "played_audio_melspec.png" if num == 1 else "rec_audio_melspec.png"
    print(f"Saving: {output_name}")
    skimage.io.imsave(output_name, spectrogram_img)

def generate_mel_spectrogram(y, sr, num):
    """Convert an audio signal into an 8-bit Mel-spectrogram image array.

    Args:
        y: Audio samples, passed to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``y``.
        num: 1 keeps every column; any other value drops the first 20
            columns of the image.

    Returns:
        A ``uint8`` array: the dB-scaled Mel-spectrogram stretched to
        0..255, with its row order reversed and its values inverted.
    """
    power_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    decibels = librosa.power_to_db(power_spec, ref=np.max)
    grayscale = scale_minmax(decibels, 0, 255).astype(np.uint8)

    # Reverse the rows, then invert the intensities (uint8 arithmetic).
    inverted = 255 - np.flip(grayscale, axis=0)

    leading_columns_to_drop = 0 if num == 1 else 20
    return trim(inverted, leading_columns_to_drop)

if __name__ == "__main__":
    # Generate both spectrogram images and report the wall-clock time taken.
    ppStartTime = time.time()
    preprocess(played_audio_path, 1)
    preprocess(rec_audio_path, 2)
    ppElapsed = time.time() - ppStartTime
    print(f"Preprocessing done in {ppElapsed} seconds.")
    print()

# - - - - - - - - - DATASET DESCRIPTION - - - - - - - - - - #

class SiameseDataset():
    """Map-style dataset of image pairs listed in a header-less CSV file.

    Each CSV row is read as: column 0 = path of the first image,
    column 1 = path of the second image, column 2 = integer label.
    The CSV path and both image paths are joined onto the module-level
    ``base_path``.
    """

    def __init__(self, training_csv=None, transform=None):
        """Read the CSV that lists the image pairs.

        Args:
            training_csv: Path of the CSV file, relative to ``base_path``.
            transform: Optional callable applied to each loaded image.

        Raises:
            TypeError: If ``training_csv`` is None.
        """
        if training_csv is None:
            raise TypeError(
                "training_csv must be the path of the dataset CSV file, not None"
            )
        # os.path.join (instead of string concatenation) keeps this lookup
        # consistent with __getitem__ and works when base_path has no
        # trailing path separator.
        csv_path = os.path.join(base_path, training_csv)
        self.train_df = pd.read_csv(csv_path, header=None)
        self.transform = transform

    @staticmethod
    def _load_grayscale(path):
        """Open the image at ``path`` and return a grayscale ("L") copy."""
        # The context manager closes the underlying file; convert() returns
        # a separate image object that stays usable afterwards.
        with Image.open(path) as img:
            return img.convert("L")

    def __getitem__(self, index):
        """Return ``(img1, img2, label, img1_path, img2_path)`` for one row.

        ``label`` is a float32 tensor of shape ``(1,)`` built from the
        integer in column 2.
        """
        img1_path = os.path.join(base_path, self.train_df.iat[index, 0])
        img2_path = os.path.join(base_path, self.train_df.iat[index, 1])
        img1 = self._load_grayscale(img1_path)
        img2 = self._load_grayscale(img2_path)

        if self.transform is not None:
            img1 = self.transform(img1)
            img2 = self.transform(img2)

        label = torch.from_numpy(
            np.array([int(self.train_df.iat[index, 2])], dtype=np.float32)
        )
        return img1, img2, label, img1_path, img2_path

    def __len__(self):
        """Return the number of rows (image pairs) in the CSV."""
        return len(self.train_df)

# - - - - - - - - - NEURAL NETWORK DESCRIPTION - - - - - - - - - - #
    
class SiameseNetwork(nn.Module):
    """Siamese network: one shared CNN + MLP branch applied to two inputs.

    The convolutional part has five stages, each made of
    Conv2d -> ReLU -> MaxPool2d -> BatchNorm2d -> Dropout2d. The fully
    connected part maps 456 flattened features to a 20-value embedding.
    For a 1x128x650 input the last stage outputs 12x2x19 = 456 values.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel_size) of each conv stage.
        # Layers are appended in a fixed order so that their positions
        # inside the Sequential (and thus the state_dict keys) stay the same.
        conv_specs = (
            (1, 60, 7),
            (60, 48, 7),
            (48, 36, 5),
            (36, 24, 5),
            (24, 12, 3),
        )
        conv_layers = []
        for in_channels, out_channels, kernel in conv_specs:
            conv_layers.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=kernel, padding=1)
            )
            conv_layers.append(nn.ReLU(inplace=True))
            conv_layers.append(nn.MaxPool2d(2, stride=2))
            conv_layers.append(nn.BatchNorm2d(out_channels))
            conv_layers.append(nn.Dropout2d(p=.25))
        self.cnn1 = nn.Sequential(*conv_layers)

        # Widths of the fully connected head; every Linear is followed
        # by a ReLU.
        fc_widths = (456, 300, 100, 20)
        fc_layers = []
        for in_features, out_features in zip(fc_widths[:-1], fc_widths[1:]):
            fc_layers.append(nn.Linear(in_features, out_features))
            fc_layers.append(nn.ReLU(inplace=True))
        self.fc1 = nn.Sequential(*fc_layers)

    def forward_once(self, x):
        """Compute the embedding of a single batch of inputs."""
        features = self.cnn1(x)
        # Flatten everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        return self.fc1(flat)

    def forward(self, input1, input2):
        """Return the embeddings of both inputs, using the same weights."""
        embedding1 = self.forward_once(input1)
        embedding2 = self.forward_once(input2)
        return embedding1, embedding2

# - - - - - - - - - VALIDATION - - - - - - - - - - #

def validate(dataloader, last, threshold, network):
    """Run the network on every pair in ``dataloader`` and print a verdict.

    For each batch the two embeddings are compared with the pairwise
    distance; the pair is reported as "Malicious!" when the distance is
    greater than or equal to ``threshold`` and as "Benign." otherwise.

    Args:
        dataloader: Iterable yielding
            ``(x0, x1, label, fullname1, fullname2)`` batches. Each batch
            must hold exactly one pair, because the distance is read with
            ``.item()``. The label is ignored.
        last: Unused; kept so existing callers keep working.
        threshold: Distance at or above which a pair is flagged.
        network: Callable module returning one embedding per input.

    Returns:
        None. Results are printed to standard output.
    """
    # Use the device the network's parameters live on instead of relying on
    # a global ``device`` that only exists when the file runs as a script.
    first_param = next(network.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for x0, x1, _label, fullname1, fullname2 in dataloader:
            output1, output2 = network(x0.to(run_device), x1.to(run_device))
            distance = torch.nn.functional.pairwise_distance(output1, output2).item()
            prediction = "Malicious!" if distance >= threshold else "Benign."
            print("Now evaluating: " + str(fullname1) + " and " + str(fullname2))
            print("Predicted Pairwise Distance: ", distance)
            print("Prediction: ", prediction)
            print("")

if __name__ == "__main__":
    # --- Setup: network, dataset, loader and pretrained weights ---
    setupStartTime = time.time()
    device = torch.device('cpu')
    test_th = 0.4
    model_to_load = "111-th04-100acc.pt"

    net = SiameseNetwork()
    # The label in this csv file is not used.
    test_dataset = SiameseDataset(
        "monotest.csv",
        transform=transforms.Compose([
            transforms.Resize((128, 650)),
            transforms.ToTensor(),
        ]),
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=True,
        num_workers=1,
    )
    net.load_state_dict(torch.load(model_to_load, map_location=device))
    net.eval()
    print(f"Network initialized in {time.time() - setupStartTime} seconds.")
    print()

    # --- Prediction on the single played/recorded pair ---
    validationStartTime = time.time()
    validate(test_dataloader, True, test_th, net)
    print(f"Prediction elaborated in {time.time() - validationStartTime} seconds.")
  