# Small Dataset

To get started on the problem, we first build a small, balanced subset of the data (1,000 Aves and 1,000 Mammalia recordings) that we can use for testing.

In [1]:
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torchaudio
import torchaudio.transforms as T
from tqdm.notebook import tqdm

In [2]:
# Load the full animal-sound metadata table and preview its first rows.
metadata_csv = "../datasets/AnimalSoundFull.csv"
df = pd.read_csv(metadata_csv)
df.head()

Unnamed: 0,gbifID,identifier,species,genus,family,class,phylum,file_name
0,1572324720,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
1,1572324719,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
2,1572324718,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
3,1572324717,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
4,1572324716,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...


In [3]:
# Keep only the rows whose class is "Aves", renumbering them from 0.
is_aves = df["class"].eq("Aves")
df_aves = df.loc[is_aves].reset_index(drop=True)
df_aves.head()

Unnamed: 0,gbifID,identifier,species,genus,family,class,phylum,file_name
0,1572324720,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
1,1572324719,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
2,1572324718,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
3,1572324717,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
4,1572324716,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...


In [4]:
# Keep only the rows whose class is "Mammalia", renumbering them from 0.
is_mammalia = df["class"].eq("Mammalia")
df_mammalia = df.loc[is_mammalia].reset_index(drop=True)
df_mammalia.head()

Unnamed: 0,gbifID,identifier,species,genus,family,class,phylum,file_name
0,1451082269,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...
1,1451082268,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...
2,1451082267,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...
3,1451082266,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...
4,1451082265,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...


In [5]:
# (rows, columns) of each class subset, to see how many recordings each class offers.
(df_aves.shape, df_mammalia.shape)

((13737, 8), (2147, 8))

In [6]:
# Draw the same number of random rows from each class so the subset is balanced.
num_samples = 1000
np.random.seed(42)  # seed NumPy's global RNG before sampling so the draw is repeatable

df_small_aves = df_aves.sample(n=num_samples)
df_small_mammalia = df_mammalia.sample(n=num_samples)

# Columns not used by the rest of this notebook; gbifID, class and file_name remain.
unused_columns = ["identifier", "species", "genus", "family", "phylum"]

df_small = (
    pd.concat([df_small_aves, df_small_mammalia])
    .drop(columns=unused_columns)
    .reset_index(drop=True)
)
df_small

Unnamed: 0,gbifID,class,file_name
0,1052813851,Aves,Chordata/Aves/Turdidae/Turdus/Turdus_philomelo...
1,1229950816,Aves,Chordata/Aves/Accipitridae/Aquila/Aquila_rapax...
2,779855502,Aves,Chordata/Aves/Fringillidae/Fringilla/Fringilla...
3,1229950910,Aves,Chordata/Aves/Accipitridae/Buteo/Buteo_rufinus...
4,779853615,Aves,Chordata/Aves/Emberizidae/Emberiza/Emberiza_ho...
...,...,...,...
1995,779847924,Mammalia,Chordata/Mammalia/Callitrichidae/Callithrix/Ca...
1996,779854254,Mammalia,Chordata/Mammalia/Equidae/Equus/Equus_caballus...
1997,779847968,Mammalia,Chordata/Mammalia/Camelidae/Camelus/Camelus_ba...
1998,779852429,Mammalia,Chordata/Mammalia/Sciuridae/Cynomys/Cynomys_lu...


In [7]:
def getSpectrogram(row, audio_root="../data/", output_root="../spectrograms/small_dataset/"):
  """Render the mel spectrogram of one recording and save it as a JPEG image.

  The audio file is loaded with torchaudio, converted to a 128-band mel power
  spectrogram, mapped to decibels with librosa and drawn as a borderless
  1024x256 pixel image (no axes, no padding).

  Parameters
  ----------
  row : pandas.Series or mapping
      Must provide ``file_name`` (audio path relative to ``audio_root``),
      ``class`` (label; lower-cased and used as the output sub-folder) and
      ``gbifID`` (used as the image file name).
  audio_root : str, optional
      Prefix prepended to ``row["file_name"]`` to locate the audio file.
  output_root : str, optional
      Folder under which ``<class>/<gbifID>.jpg`` is written. The class
      sub-folder is created if it does not exist yet.

  Returns
  -------
  None
      The function is called for its side effect (the image on disk).
  """
  wf, sample_rate = torchaudio.load(audio_root + row["file_name"])

  n_fft = 1024
  win_length = None
  hop_length = 512
  n_mels = 128

  # The transform is built per call because it depends on the sample rate
  # of the file that was just loaded.
  mel_spectrogram = T.MelSpectrogram(
      sample_rate=sample_rate,
      n_fft=n_fft,
      win_length=win_length,
      hop_length=hop_length,
      center=True,
      pad_mode="reflect",
      power=2.0,
      norm='slaney',
      onesided=True,
      n_mels=n_mels,
      mel_scale="htk",
  )

  # Keep only index 0 of the leading axis (the first channel, given
  # torchaudio.load's default channels-first layout).
  melspec = mel_spectrogram(wf)[0]
  # Hand librosa an explicit NumPy array instead of relying on implicit
  # tensor-to-array conversion.
  melspec_db = librosa.power_to_db(melspec.numpy())

  # Make sure the per-class output folder exists; saving would otherwise
  # fail with FileNotFoundError on a fresh checkout.
  class_dir = os.path.join(output_root, row["class"].lower())
  os.makedirs(class_dir, exist_ok=True)
  file_name = os.path.join(class_dir, str(row["gbifID"]) + ".jpg")

  height = 128*2
  width = height*4
  dpi = 100

  fig = plt.figure(frameon=False, figsize=(width/dpi, height/dpi), dpi=dpi)
  try:
    # A single axes covering the whole canvas, with ticks and frame hidden,
    # so the saved image contains nothing but the spectrogram.
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)

    ax.imshow(melspec_db, origin='lower', aspect="auto")

    fig.savefig(file_name)
  finally:
    # Always release this specific figure, even when rendering or saving
    # fails, so a long batch run does not accumulate open figures.
    plt.close(fig)

  return

In [8]:
# Register tqdm's `progress_apply` on pandas objects; `desc` is the label shown on the bar.
tqdm.pandas(desc="Creating Spectrograms")

In [9]:
# Generate one spectrogram image per row of the small dataset. getSpectrogram
# returns nothing useful, so the resulting Series is bound to `_` and ignored.
_ = df_small.progress_apply(
    getSpectrogram,
    axis=1,
)

Creating Spectrograms:   0%|          | 0/2000 [00:00<?, ?it/s]



In [10]:
# Persist the sampled subset (without the row index) for later use.
small_dataset_csv = "../datasets/Aves-Mammalia.csv"
df_small.to_csv(small_dataset_csv, index=False)