# Small Dataset with a random selection from the whole dataset

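To get started on the problem, we first work with a small random subset of the full dataset: we sample 2,000 recordings and render a mel-spectrogram image for each of them for testing.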

In [1]:
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torchaudio
import torchaudio.transforms as T
from tqdm.notebook import tqdm

In [2]:
# Load the metadata table of the complete dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../datasets/AnimalSoundFull.csv")
df.head(n=5)

Unnamed: 0,gbifID,identifier,species,genus,family,class,phylum,file_name
0,1572324720,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
1,1572324719,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
2,1572324718,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
3,1572324717,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...
4,1572324716,http://www.tierstimmenarchiv.de/recordings/Cre...,Crex crex,Crex,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Crex/Crex_crex/15723247...


In [3]:
num_samples = 2000
seed = 42

# Kept so that any later code relying on NumPy's global RNG remains seeded.
np.random.seed(seed)

# Seed the draw explicitly rather than through the global RNG state, so the
# subset is the same no matter what ran before this cell. The `identifier`
# column (source URL of the recording) is dropped and the index is renumbered
# from 0.
df_small = (
    df.sample(n=num_samples, random_state=seed)
    .drop(columns=["identifier"])
    .reset_index(drop=True)
)
df_small

Unnamed: 0,gbifID,species,genus,family,class,phylum,file_name
0,1230246828,Pycnonotus nigricans,Pycnonotus,Pycnonotidae,Aves,Chordata,Chordata/Aves/Pycnonotidae/Pycnonotus/Pycnonot...
1,779853992,Emberiza hortulana,Emberiza,Emberizidae,Aves,Chordata,Chordata/Aves/Emberizidae/Emberiza/Emberiza_ho...
2,779849670,Cervus elaphus,Cervus,Cervidae,Mammalia,Chordata,Chordata/Mammalia/Cervidae/Cervus/Cervus_elaph...
3,1269848168,Numenius phaeopus,Numenius,Scolopacidae,Aves,Chordata,Chordata/Aves/Scolopacidae/Numenius/Numenius_p...
4,779860246,Neofelis nebulosa,Neofelis,Felidae,Mammalia,Chordata,Chordata/Mammalia/Felidae/Neofelis/Neofelis_ne...
...,...,...,...,...,...,...,...
1995,779852839,Dryobates minor,Dryobates,Picidae,Aves,Chordata,Chordata/Aves/Picidae/Dryobates/Dryobates_mino...
1996,991883728,Coccothraustes coccothraustes,Coccothraustes,Fringillidae,Aves,Chordata,Chordata/Aves/Fringillidae/Coccothraustes/Cocc...
1997,779865264,Rallus aquaticus,Rallus,Rallidae,Aves,Chordata,Chordata/Aves/Rallidae/Rallus/Rallus_aquaticus...
1998,1229953095,Colinus virginianus,Colinus,Odontophoridae,Aves,Chordata,Chordata/Aves/Odontophoridae/Colinus/Colinus_v...


In [4]:
def getSpectrogram(row, data_dir="../data/", out_dir="../spectrograms/small_dataset_broad/"):
  """Render the mel spectrogram of one recording and save it as a JPEG image.

  Loads the audio file referenced by ``row["file_name"]`` from ``data_dir``,
  computes a 128-band mel power spectrogram of its first channel, converts it
  to decibels and writes it as an axis-free image named ``<gbifID>.jpg`` into
  ``out_dir``. The figure is 10.24 x 2.56 inches at 100 dpi, i.e. 1024 x 256
  pixels when saved with matplotlib's default "figure" save DPI.

  Args:
    row: Record (e.g. a DataFrame row) providing ``file_name`` (audio path
      relative to ``data_dir``) and ``gbifID`` (used as the image file name).
    data_dir: Directory that contains the audio files.
    out_dir: Directory the image is written to; it is created if missing.

  Returns:
    None. The function is called for its side effect of writing the image.

  Raises:
    Whatever ``torchaudio.load`` raises when the audio file cannot be read,
    and whatever matplotlib raises when the image cannot be written.
  """
  wf, sample_rate = torchaudio.load(os.path.join(data_dir, row["file_name"]))

  # The transform depends on the sample rate of this particular file, so it
  # has to be built per call.
  mel_spectrogram = T.MelSpectrogram(
      sample_rate=sample_rate,
      n_fft=1024,
      win_length=None,
      hop_length=512,
      center=True,
      pad_mode="reflect",
      power=2.0,
      norm='slaney',
      onesided=True,
      n_mels=128,
      mel_scale="htk",
  )

  # Only the first channel of the waveform is rendered.
  melspec = mel_spectrogram(wf)[0]

  # Image geometry: two pixel rows per mel band, width four times the height.
  height = 128*2
  width = height*4
  dpi = 100

  # Saving fails if the target directory does not exist, so create it first.
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)
  file_name = os.path.join(out_dir, str(row["gbifID"]) + ".jpg")

  fig = plt.figure(frameon=False, figsize=(width/dpi, height/dpi), dpi=dpi)
  try:
    # A single axes spanning the whole canvas with its frame and ticks hidden,
    # so the saved image contains nothing but the spectrogram.
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)

    ax.imshow(librosa.power_to_db(melspec.numpy()), origin='lower', aspect="auto")

    # Save this specific figure rather than pyplot's "current" figure.
    fig.savefig(file_name)
  finally:
    # Always release the figure; thousands are created in one run and a
    # failure while drawing or saving must not leak them.
    plt.close(fig)

In [5]:
# Hook tqdm into pandas so that `progress_apply` (used in the next cell) is
# available and shows a progress bar labelled "Creating Spectrograms".
tqdm.pandas(desc="Creating Spectrograms")

In [6]:
# Call getSpectrogram once per row (axis=1) of the sampled subset, with a
# progress bar. The function is run for its side effect only; assigning the
# result to `_` discards it and keeps the cell from displaying it.
_ = df_small.progress_apply(getSpectrogram, axis=1)

Creating Spectrograms:   0%|          | 0/2000 [00:00<?, ?it/s]



In [7]:
# Persist the metadata of the sampled subset. The row index is left out of
# the file (index=False).
df_small.to_csv(path_or_buf="../datasets/smallDatasetBroad.csv", index=False)