In [1]:
import torch, torchvision
import sys # Python system library needed to load custom functions
import math # module with access to mathematical functions
import os # for changing the directory

import numpy as np  # for performing calculations on numerical arrays
import pandas as pd  # home of the DataFrame construct, _the_ most important object for Data Science
import librosa
import matplotlib.pyplot as plt  # allows creation of insightful plots

# Make the project's helper packages importable.
# The entries are stored as absolute paths: a relative sys.path entry depends on
# the working directory at the moment it is resolved, and this notebook changes
# the working directory further down. The membership check keeps the cell
# idempotent, so re-running it does not pile up duplicate entries.
for _helper_dir in ('../audio_preprocessing', '../src', '../model_training_utils'):
    _helper_dir = os.path.abspath(_helper_dir)
    if _helper_dir not in sys.path:
        sys.path.append(_helper_dir)


import preprocessing_func_2
from generator_to_dataset_2 import NormalisedDataSet
from gdsc_utils import PROJECT_DIR
import model_training
import model_eval

# Switch the working directory to PROJECT_DIR (imported from gdsc_utils) so that
# the relative 'data/...' paths used by the cells below are resolved from there.
os.chdir(PROJECT_DIR) # changing our directory to root

In [2]:
# df_train = pd.read_csv('data/small_train_with_path.csv')
# df_val = pd.read_csv('data/small_val_with_path.csv')
# df_train.head()

In [3]:
# df = pd.concat([df_train, df_val], ignore_index=True)
# df.head()

In [4]:
# Load the metadata table; its 'path' and 'label' columns are the inputs of the
# chunking step further down in this notebook.
df = pd.read_csv('data/metadata.csv')

In [5]:
# Preview the first rows to check the available columns before processing.
df.head()

Unnamed: 0,file_name,unique_file,path,species,label,subset,sample_rate,num_frames,length
0,Roeselianaroeselii_XC751814-dat028-019_edit1.wav,Roeselianaroeselii_XC751814-dat028-019,data/train/Roeselianaroeselii_XC751814-dat028-...,Roeselianaroeselii,56,train,44100,4586400,104.0
1,Roeselianaroeselii_XC752367-dat006-010.wav,Roeselianaroeselii_XC752367-dat006-010,data/train/Roeselianaroeselii_XC752367-dat006-...,Roeselianaroeselii,56,train,44100,337571,7.654671
2,Yoyettacelis_GBIF2465208563_IN36000894_50988.wav,Yoyettacelis_GBIF2465208563_IN36000894_50988,data/train/Yoyettacelis_GBIF2465208563_IN36000...,Yoyettacelis,64,train,44100,220500,5.0
3,Gomphocerippusrufus_XC752285-dat001-045.wav,Gomphocerippusrufus_XC752285-dat001-045,data/train/Gomphocerippusrufus_XC752285-dat001...,Gomphocerippusrufus,26,train,44100,693715,15.730499
5,Phaneropteranana_XC755717-221013-Phaneroptera-...,Phaneropteranana_XC755717-221013-Phaneroptera-...,data/train/Phaneropteranana_XC755717-221013-Ph...,Phaneropteranana,41,train,44100,88200,2.0


In [6]:
import soundfile as sf

def process_and_save_wav(paths, labels, chunk_size:int=66150, wav_max_amplitude:float=0.5, sr=44100,
                         output_dir:str="data/big_data_processed_train_and_val",
                         max_chunks_per_file:int=50, seed=None):
    """Cut every recording into chunks and save each chunk as its own wav file.

    Each input file is loaded with ``preprocessing_func_2.load_wav`` and scaled with
    ``normalise_wav``; ``find_wav_peaks`` is then asked for peaks. When it returns
    ``None`` the recording goes through ``process_small_wav`` and is saved as a
    single file. Otherwise one file is written per chunk produced by
    ``split_wav_by_peaks``, using at most ``max_chunks_per_file`` randomly chosen
    peaks.

    Files are named ``<output_dir>/<n>.wav`` where ``n`` is a counter that starts
    at 0 and runs across all inputs. Existing files with the same name are
    handed to ``sf.write`` again, i.e. a re-run reuses the same names.

    Parameters
    ----------
    paths : iterable of str
        Locations of the source wav files.
    labels : iterable
        One label per entry of ``paths``; pairs are formed with ``zip``, so the
        longer of the two is silently truncated.
    chunk_size : int
        Passed to the preprocessing helpers as the chunk length and as the
        minimum distance between peaks.
    wav_max_amplitude : float
        Target amplitude handed to ``normalise_wav``.
    sr : int
        Sample rate written into the output files. The audio is not resampled
        here. NOTE(review): assumed to match what ``load_wav`` returns - confirm.
    output_dir : str
        Directory receiving the chunk files; created if it does not exist.
    max_chunks_per_file : int
        Upper bound on the number of peaks (and therefore chunks) kept per
        recording.
    seed : int or None
        When given, the random peak selection uses ``np.random.default_rng(seed)``
        and is reproducible. ``None`` keeps the previous behaviour of drawing
        from NumPy's global random state.

    Returns
    -------
    tuple[list[str], list]
        The paths of the written files and, in the same order, their labels.
    """
    # sf.write does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    # Both np.random (global state) and a Generator expose .permutation().
    rng = np.random if seed is None else np.random.default_rng(seed)

    new_paths = []
    new_labels = []

    def _save_chunk(samples, label):
        # One entry is appended per written file, so the list length is the
        # running file index.
        target = f"{output_dir}/{len(new_paths)}.wav"
        sf.write(target, samples, sr)
        new_paths.append(target)
        new_labels.append(label)

    for (path, label) in zip(paths, labels):
        wav = preprocessing_func_2.load_wav(path=path)
        wav = preprocessing_func_2.normalise_wav(wav=wav, wav_max_amplitude=wav_max_amplitude)
        peaks = preprocessing_func_2.find_wav_peaks(wav=wav, distance_between_peaks=chunk_size)

        if peaks is None:
            # No peaks reported: store the whole (processed) recording as one chunk.
            small_wav = preprocessing_func_2.process_small_wav(wav=wav, chunk_size=chunk_size)
            _save_chunk(small_wav, label)
            continue

        if len(peaks) > max_chunks_per_file:
            # Cap the number of chunks per recording with a random subset of peaks.
            peaks = rng.permutation(peaks)[:max_chunks_per_file]

        for chunk in preprocessing_func_2.split_wav_by_peaks(wav=wav, peaks=peaks, chunk_size=chunk_size):
            _save_chunk(chunk, label)

    return new_paths, new_labels

In [7]:
# Pull the source file locations and their labels out of the metadata frame as
# plain Python lists.
paths, labels = (list(df[column]) for column in ("path", "label"))

# Chunk every recording and write the chunks to disk; the call returns the
# written file paths together with the label of each chunk.
new_paths, new_labels = process_and_save_wav(paths=paths, labels=labels)

In [8]:
# Column layout of the index table: one row per written chunk file.
my_dict = dict(file_path=new_paths, label=new_labels)

# Build the table and store it next to the data so later steps can load it.
processed_data_df = pd.DataFrame(data=my_dict)
processed_data_df.to_csv(path_or_buf="data/big_data_processed_train_and_val.csv")

In [9]:
# Display the table of written chunk files and their labels.
# NOTE(review): the saved output under this cell lists files under
# data/big_data_processed/, while process_and_save_wav writes to
# data/big_data_processed_train_and_val/ - the output looks stale; re-run the
# notebook to refresh it.
processed_data_df

Unnamed: 0,file_path,label
0,data/big_data_processed/0.wav,56
1,data/big_data_processed/1.wav,56
2,data/big_data_processed/2.wav,56
3,data/big_data_processed/3.wav,56
4,data/big_data_processed/4.wav,56
...,...,...
21871,data/big_data_processed/21871.wav,27
21872,data/big_data_processed/21872.wav,27
21873,data/big_data_processed/21873.wav,24
21874,data/big_data_processed/21874.wav,24
