In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
import os
import shutil
import soundfile as sf

The **ESC-50 dataset** is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification.

The dataset consists of 5-second-long recordings organized into 50 semantical classes (with 40 examples per class) loosely arranged into 5 major categories:

| <sub>Animals</sub> | <sub>Natural soundscapes & water sounds </sub> | <sub>Human, non-speech sounds</sub> | <sub>Interior/domestic sounds</sub> | <sub>Exterior/urban noises</sub> |
| :--- | :--- | :--- | :--- | :--- |
| <sub>Dog</sub> | <sub>Rain</sub> | <sub>Crying baby</sub> | <sub>Door knock</sub> | <sub>Helicopter</sub></sub> |
| <sub>Rooster</sub> | <sub>Sea waves</sub> | <sub>Sneezing</sub> | <sub>Mouse click</sub> | <sub>Chainsaw</sub> |
| <sub>Pig</sub> | <sub>Crackling fire</sub> | <sub>Clapping</sub> | <sub>Keyboard typing</sub> | <sub>Siren</sub> |
| <sub>Cow</sub> | <sub>Crickets</sub> | <sub>Breathing</sub> | <sub>Door, wood creaks</sub> | <sub>Car horn</sub> |
| <sub>Frog</sub> | <sub>Chirping birds</sub> | <sub>Coughing</sub> | <sub>Can opening</sub> | <sub>Engine</sub> |
| <sub>Cat</sub> | <sub>Water drops</sub> | <sub>Footsteps</sub> | <sub>Washing machine</sub> | <sub>Train</sub> |
| <sub>Hen</sub> | <sub>Wind</sub> | <sub>Laughing</sub> | <sub>Vacuum cleaner</sub> | <sub>Church bells</sub> |
| <sub>Insects (flying)</sub> | <sub>Pouring water</sub> | <sub>Brushing teeth</sub> | <sub>Clock alarm</sub> | <sub>Airplane</sub> |
| <sub>Sheep</sub> | <sub>Toilet flush</sub> | <sub>Snoring</sub> | <sub>Clock tick</sub> | <sub>Fireworks</sub> |
| <sub>Crow</sub> | <sub>Thunderstorm</sub> | <sub>Drinking, sipping</sub> | <sub>Glass breaking</sub> | <sub>Hand saw</sub> |

In [2]:
metadata = pd.read_csv('ESC-50-master/meta/esc50.csv')

metadata.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [3]:
metadata = metadata[['category','filename']].copy()

metadata.head()

Unnamed: 0,category,filename
0,dog,1-100032-A-0.wav
1,chirping_birds,1-100038-A-14.wav
2,vacuum_cleaner,1-100210-A-36.wav
3,vacuum_cleaner,1-100210-B-36.wav
4,thunderstorm,1-101296-A-19.wav


In [4]:
metadata['category'].unique()

array(['dog', 'chirping_birds', 'vacuum_cleaner', 'thunderstorm',
       'door_wood_knock', 'can_opening', 'crow', 'clapping', 'fireworks',
       'chainsaw', 'airplane', 'mouse_click', 'pouring_water', 'train',
       'sheep', 'water_drops', 'church_bells', 'clock_alarm',
       'keyboard_typing', 'wind', 'footsteps', 'frog', 'cow',
       'brushing_teeth', 'car_horn', 'crackling_fire', 'helicopter',
       'drinking_sipping', 'rain', 'insects', 'laughing', 'hen', 'engine',
       'breathing', 'crying_baby', 'hand_saw', 'coughing',
       'glass_breaking', 'snoring', 'toilet_flush', 'pig',
       'washing_machine', 'clock_tick', 'sneezing', 'rooster',
       'sea_waves', 'siren', 'cat', 'door_wood_creaks', 'crickets'],
      dtype=object)

In [5]:
metadata.shape

(2000, 2)

In [6]:
animal_categories = [
    'dog',
    'chirping_biards',
    'crow','sheep',
    'frog',
    'cow',
    'insects',
    'hen',
    'pig',
    'rooster',
    'cat'
]

animal_metadata = metadata[metadata['category'].isin(animal_categories)]

animal_metadata['category'].unique()




array(['dog', 'crow', 'sheep', 'frog', 'cow', 'insects', 'hen', 'pig',
       'rooster', 'cat'], dtype=object)

In [7]:
animal_metadata.shape

(400, 2)

create folder if does not exist

In [8]:
folder_path = 'cleaned_audio'
skip_delete = False

# Check if the folder does not exist
if not os.path.exists(folder_path):
    # Create the folder
    os.makedirs(folder_path)
    result = f"Folder '{folder_path}' was created."
else:
    # If skip_delete is False, delete everything in the folder
    if not skip_delete:
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                result = f'Failed to delete {file_path}. Reason: {e}'
        result = f"Existing contents of '{folder_path}' have been deleted."
    else:
        result = "Skipping deletion as per 'skip_delete' flag."

result



"Existing contents of 'cleaned_audio' have been deleted."

In [9]:
source_directory = 'ESC-50-master/audio'

def adjust_volume(data, volume_change):
    return data * volume_change

if not skip_delete:
    for index, row in animal_metadata.iterrows():
        filename = row['filename']
        category = row['category']

        #Copy original file over
        source_file = os.path.join(source_directory, filename)
        destination_file = os.path.join(folder_path, filename)
        shutil.copy2(source_file, destination_file)

        # try:
        #     y, sr = librosa.load(source_file, sr=None)

        #     #Adjust volumes
        #     y_higher = adjust_volume(y,2)
        #     y_lower = adjust_volume(y,0.5)

        #     #Generate new filenames
        #     base, ext = os.path.splitext(filename)
        #     higher_volume_filename = f"{base}_higher{ext}"
        #     lower_volume_filename = f"{base}_lower{ext}"

        #     #Save the adjusted audio files
        #     higher_file_path = os.path.join(folder_path, higher_volume_filename)
        #     lower_file_path = os.path.join(folder_path, lower_volume_filename)
        #     sf.write(higher_file_path, y_higher, sr)
        #     sf.write(lower_file_path, y_lower, sr)

        #     # Add new entries to the dataframe
        #     new_rows = [
        #         {'category': category, 'filename': higher_volume_filename},
        #         {'category': category, 'filename': lower_volume_filename}
        #     ]
        #     animal_metadata = pd.concat([animal_metadata, pd.DataFrame(new_rows)], ignore_index=True)

        # except Exception as e:
        #     print(f"Error processing file {filename}: {e}")

        result = "Files copied successfully."
else:
    result = "Skipping file copying and manipulation as per 'skip_delete' flag."

result

'Files copied successfully.'

In [10]:
animal_metadata.shape

(400, 2)

1. Mel-frequency Cepstral Coefficients (MFCCs)
Why? MFCCs are very effective in capturing the timbral aspects of sound. They've been widely used in speech recognition and can differentiate various types of sounds including animal vocalizations. The coefficients effectively capture the rate changes in spectral bands and are useful for identifying different kinds of animal sounds.
2. Spectral Features
Spectral Centroid: Useful for identifying the "brightness" of a sound, which can differentiate between high-pitched and low-pitched animal calls.
Spectral Bandwidth: Provides information about the spread of the spectrum, which can help distinguish between broad and narrow-band sounds.
Spectral Contrast: Useful for identifying different sound textures and can help differentiate between sounds in different environments or contexts.
Spectral Rolloff: It indicates the frequency below which a certain percentage of the total spectral energy is contained. It can be used to distinguish between harmonic (musical) and inharmonic sounds.
3. Chroma Features
Chroma STFT: Captures the harmonic content of audio which might be useful for some species where the harmonic structure of the sound is important for classification.
4. Rhythm Features
Tempo: While not always directly applicable to animal sounds, the tempo can be useful in contexts where the rhythm of sounds (like the repetitive calls of some species) plays a role in identification.
5. Zero-Crossing Rate (ZCR)
Why? It measures the rate of sign changes along a signal. Useful for distinguishing between percussive sounds and more sustained sounds. Animal calls often have distinctive patterns that can be captured by ZCR.
6. Root Mean Square (RMS) Energy
Why? It provides a measure of the sound's energy over time. For animal sound classification, variations in energy can help differentiate between vocalizations.
7. Time-domain Features
Although less common than frequency-domain features for this type of classification, features like waveform shape and amplitude envelope could provide useful insights into the dynamics of animal vocalizations.

In [11]:
def extract_features(file_path):
    # Load the audio file
    y, sr = librosa.load(file_path, sr=None)
    
    # Extracting various audio features
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
    zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))
    rms_energy = np.mean(librosa.feature.rms(y=y))
    
    # Flatten the feature arrays into a single list
    features = np.hstack((mfccs, spectral_centroid, spectral_bandwidth, spectral_contrast, spectral_rolloff, chroma_stft, zero_crossing_rate, rms_energy))
    return features

# Prepare to collect the features
features = []

# Extract features for each audio file in the DataFrame
for _, row in animal_metadata.iterrows():
    filename = row['filename']
    source_file = os.path.join(folder_path, filename)
    try:
        features.append(extract_features(source_file))
    except Exception as e:
        print(f"Error processing {source_file}: {e}")
        features.append([np.nan]*37)  # Adjusted for the total number of features

# Define feature names (adjusted for clarity and correctness)
feature_names = (
    ['mfcc_' + str(i) for i in range(1, 14)] +
    ['spectral_centroid', 'spectral_bandwidth'] +
    ['spectral_contrast_' + str(i) for i in range(1, 8)] +
    ['spectral_rolloff'] +
    ['chroma_stft_' + str(i) for i in range(1, 13)] +
    ['zero_crossing_rate', 'rms_energy']
)

# Convert the list of features to a DataFrame
features_df = pd.DataFrame(features, columns=feature_names)

# Concatenate with the original DataFrame (check alignment)
animal_metadata = pd.concat([animal_metadata.reset_index(drop=True), features_df], axis=1)

# Now 'animal_metadata' contains your original data plus the new audio features

In [12]:
animal_metadata

Unnamed: 0,category,filename,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,chroma_stft_5,chroma_stft_6,chroma_stft_7,chroma_stft_8,chroma_stft_9,chroma_stft_10,chroma_stft_11,chroma_stft_12,zero_crossing_rate,rms_energy
0,dog,1-100032-A-0.wav,-581.739929,8.207122,-6.658803,-4.290906,-3.034478,0.259279,-1.963946,-0.271577,...,0.015058,0.018501,0.017698,0.028559,0.063234,0.053491,0.022580,0.027154,0.007329,0.007518
1,crow,1-103298-A-9.wav,-173.182404,131.663696,-23.350657,44.626896,-16.704086,34.943077,13.708267,24.017622,...,0.732421,0.713032,0.681489,0.679387,0.704024,0.667303,0.626718,0.625275,0.041329,0.123869
2,dog,1-110389-A-0.wav,-586.657959,11.902005,-0.987902,-1.792529,-2.177405,0.969337,0.288988,-0.362819,...,0.071546,0.073381,0.075201,0.076470,0.066509,0.061248,0.062431,0.067577,0.002298,0.006422
3,sheep,1-121951-A-8.wav,-551.090820,160.014633,-20.451891,19.595392,11.168517,7.803196,5.037781,7.483987,...,0.783631,0.827221,0.755697,0.738392,0.667673,0.627172,0.567690,0.553862,0.020001,0.009328
4,frog,1-15689-A-4.wav,-417.220917,53.747379,-38.695053,2.486163,2.382831,23.929157,1.585072,7.268365,...,0.564025,0.605285,0.561806,0.552817,0.604001,0.601423,0.627455,0.737398,0.076411,0.042017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,crow,5-261325-A-9.wav,-209.718155,170.387741,-70.376450,-10.876296,-8.433419,34.003033,23.409658,14.701620,...,0.589676,0.632856,0.593397,0.550534,0.481719,0.436234,0.364745,0.336989,0.056753,0.092404
396,hen,5-263831-A-6.wav,-255.692429,169.110596,-30.520596,-12.869926,-0.491376,-3.083074,-5.353972,6.771190,...,0.533731,0.527566,0.556591,0.560734,0.461802,0.365441,0.341314,0.418589,0.051378,0.085000
397,hen,5-263831-B-6.wav,-270.886200,166.410309,-42.401577,-7.616354,-3.115259,3.510293,-1.538110,9.906310,...,0.507882,0.417974,0.388774,0.519711,0.406317,0.334552,0.343435,0.361146,0.053308,0.085106
398,sheep,5-61635-A-8.wav,-193.228912,130.766678,-38.089508,7.711935,-7.598218,27.439699,-22.992725,16.840811,...,0.545188,0.405764,0.309511,0.250844,0.233278,0.301266,0.433649,0.565833,0.075558,0.060704
