# Small Dataset for LSTM

We are going to use a small subset for testing the LSTM model.

In [1]:
import pandas as pd
import numpy as np

import torchaudio
import torchaudio.transforms as T

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import librosa

import h5py

In [2]:
df = pd.read_csv("../datasets/AnimalSoundFull.csv")
df.head()

Unnamed: 0,gbifID,identifier,species,genus,family,class,phylum,file_name
0,1572324720,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
1,1572324719,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
2,1572324718,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
3,1572324717,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
4,1572324716,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...


In [3]:
df_aves = df[df["class"] == "Aves"].reset_index(drop=True)
df_aves.head()

Unnamed: 0,gbifID,identifier,species,genus,family,class,phylum,file_name
0,1572324720,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
1,1572324719,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
2,1572324718,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
3,1572324717,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
4,1572324716,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...


In [4]:
df_mammalia = df[df["class"] == "Mammalia"].reset_index(drop=True)
df_mammalia.head()

Unnamed: 0,gbifID,identifier,species,genus,family,class,phylum,file_name
0,1451082269,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...
1,1451082268,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...
2,1451082267,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...
3,1451082266,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...
4,1451082265,http://www.tierstimmenarchiv.de/recordings/Pan...,Panthera leo,Panthera,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Panthera/Panthera_le...


In [5]:
df_aves.shape, df_mammalia.shape

((13737, 8), (2147, 8))

In [6]:
num_samples = 1000
np.random.seed(42)

df_small_aves = df_aves.sample(n=num_samples)
df_small_mammalia = df_mammalia.sample(n=num_samples)

df_small = pd.concat([df_small_aves, df_small_mammalia]).reset_index(drop=True).drop(columns=["identifier",
                                                                                              "species",
                                                                                              "genus",
                                                                                              "family",
                                                                                              "phylum"
                                                                                             ])
df_small

Unnamed: 0,gbifID,class,file_name
0,1052813851,Aves,Chordata/Aves/Turdidae/Turdus/Turdus_philomelo...
1,1229950816,Aves,Chordata/Aves/Accipitridae/Aquila/Aquila_rapax...
2,779855502,Aves,Chordata/Aves/Fringillidae/Fringilla/Fringilla...
3,1229950910,Aves,Chordata/Aves/Accipitridae/Buteo/Buteo_rufinus...
4,779853615,Aves,Chordata/Aves/Emberizidae/Emberiza/Emberiza_ho...
...,...,...,...
1995,779847924,Mammalia,Chordata/Mammalia/Callitrichidae/Callithrix/Ca...
1996,779854254,Mammalia,Chordata/Mammalia/Equidae/Equus/Equus_caballus...
1997,779847968,Mammalia,Chordata/Mammalia/Camelidae/Camelus/Camelus_ba...
1998,779852429,Mammalia,Chordata/Mammalia/Sciuridae/Cynomys/Cynomys_lu...


In [7]:
"""
  n_fft = 1024
  win_length = None
  hop_length = 512
  n_mels = 128

  mel_spectrogram = T.MelSpectrogram(
      sample_rate=sample_rate,
      n_fft=n_fft,
      win_length=win_length,
      hop_length=hop_length,
      center=True,
      pad_mode="reflect",
      power=2.0,
      norm='slaney',
      onesided=True,
      n_mels=n_mels,
      mel_scale="htk",
  )
  
  melspec = mel_spectrogram(wf)[0]
  
  print(melspec.shape)
  
  
  height = 128*2
  width = height*4
  dpi = 100
  
  fig = plt.figure(frameon=False, figsize=(width/dpi, height/dpi), dpi=dpi)

  
  ax = plt.Axes(fig, [0., 0., 1., 1.])
  ax.set_axis_off()
  fig.add_axes(ax)
  
  im = ax.imshow(librosa.power_to_db(melspec), origin='lower', aspect="auto")
  
  file_name = "../spectrograms/small_dataset/" + str(row["gbifID"]) + ".jpg"
  
  plt.savefig(file_name)
  plt.close()
  """

'\n  n_fft = 1024\n  win_length = None\n  hop_length = 512\n  n_mels = 128\n\n  mel_spectrogram = T.MelSpectrogram(\n      sample_rate=sample_rate,\n      n_fft=n_fft,\n      win_length=win_length,\n      hop_length=hop_length,\n      center=True,\n      pad_mode="reflect",\n      power=2.0,\n      norm=\'slaney\',\n      onesided=True,\n      n_mels=n_mels,\n      mel_scale="htk",\n  )\n  \n  melspec = mel_spectrogram(wf)[0]\n  \n  print(melspec.shape)\n  \n  \n  height = 128*2\n  width = height*4\n  dpi = 100\n  \n  fig = plt.figure(frameon=False, figsize=(width/dpi, height/dpi), dpi=dpi)\n\n  \n  ax = plt.Axes(fig, [0., 0., 1., 1.])\n  ax.set_axis_off()\n  fig.add_axes(ax)\n  \n  im = ax.imshow(librosa.power_to_db(melspec), origin=\'lower\', aspect="auto")\n  \n  file_name = "../spectrograms/small_dataset/" + str(row["gbifID"]) + ".jpg"\n  \n  plt.savefig(file_name)\n  plt.close()\n  '

In [8]:
def getMFCC(row, file):  
  y, sample_rate = librosa.load("../data/" + row.file_name)
  MFCC = librosa.feature.mfcc(y=y, sr=sample_rate)
  file.create_dataset(str(row.gbifID), data=MFCC)
  
  return 

In [9]:
tqdm.pandas(desc="Creating MFCC")

In [10]:
out_file = h5py.File("test_npz.h5", "w")

In [11]:
_ = df_small.progress_apply(getMFCC, file=out_file, axis=1)

out_file.close()

Creating MFCC:   0%|          | 0/2000 [00:00<?, ?it/s]



































































































In [12]:
with h5py.File("test_npz.h5", "r")as f:
  print(f.keys())

<KeysViewHDF5 ['1052802722', '1052802725', '1052802740', '1052804067', '1052804092', '1052805991', '1052806142', '1052806147', '1052806480', '1052806614', '1052806620', '1052806641', '1052806664', '1052807345', '1052807388', '1052807398', '1052807406', '1052807411', '1052807418', '1052807424', '1052807427', '1052807460', '1052810323', '1052810500', '1052810505', '1052810868', '1052811296', '1052811391', '1052811397', '1052811398', '1052811400', '1052811402', '1052811404', '1052811405', '1052811419', '1052811428', '1052811441', '1052811451', '1052811461', '1052811475', '1052811482', '1052811493', '1052811502', '1052811510', '1052811511', '1052811518', '1052811526', '1052811543', '1052811552', '1052811568', '1052811630', '1052811637', '1052811762', '1052811775', '1052811880', '1052811893', '1052812243', '1052812301', '1052812305', '1052812635', '1052812766', '1052813185', '1052813198', '1052813282', '1052813283', '1052813477', '1052813492', '1052813494', '1052813609', '1052813617', '1052

In [13]:
for a in loaded.keys():
  print(a)

NameError: name 'loaded' is not defined

In [None]:
df_small.head()

In [None]:
#df_small.to_csv("../datasets/Aves-Mammalia.csv", index=False)