In [155]:
import os
import fnmatch
import pandas as pd
import shutil
from pydub import AudioSegment

In [156]:
# path constructor
def path_constructor(parent, child):
    """Return the path obtained by joining ``parent`` and ``child``.

    Thin wrapper around ``os.path.join`` used for every path in this notebook.
    """
    joined = os.path.join(parent, child)
    return joined

In [157]:
rootFolder = "bird_data"
# both data folders sit directly under the root folder
jsonFolder, audioFolder = (
    path_constructor(rootFolder, sub) for sub in ("json", "audio")
)

In [158]:
# names of every entry in the audio folder (os.listdir returns them in arbitrary order)
audioFiles = os.listdir(audioFolder)

In [159]:
# load the bird list from birdList.csv in the root folder
bird_df = pd.read_csv(path_constructor(rootFolder, "birdList.csv"))

In [160]:
# display the loaded bird list
bird_df

Unnamed: 0,common_name,scientific_name
0,American Goldfinch,Spinus tristis
1,American Robin,Turdus migratorius
2,Barn Swallow,Hirundo rustica
3,Blue Jay,Cyanocitta cristata
4,Blue-grey Gnatcatcher,Polioptila caerulea
5,Broad-winged Hawk,Buteo platypterus
6,Carolina Chickadee,Poecile carolinensis
7,Carolina Wren,Thryothorus ludovicianus
8,Cedar Waxwing,Bombycilla cedrorum
9,Eastern Bluebird,Sialia sialis


In [161]:
# replace blanks with _ in strings
def replace_(alist):
    """Return a new list in which every string of ``alist`` has its spaces
    replaced by underscores; the input is left unmodified."""
    return [text.replace(" ", "_") for text in alist]
    

In [162]:
# common names with blanks replaced by underscores
birds = replace_(bird_df["common_name"])
birds

['American_Goldfinch',
 'American_Robin',
 'Barn_Swallow',
 'Blue_Jay',
 'Blue-grey_Gnatcatcher',
 'Broad-winged_Hawk',
 'Carolina_Chickadee',
 'Carolina_Wren',
 'Cedar_Waxwing',
 'Eastern_Bluebird',
 'Northern_Cardinal',
 'Rose-breasted_Grosbeak',
 'Ruby-crowned_Kinglet',
 'Tufted_Titmouse']

In [163]:
# create a list of existing audio files for a bird
def audioList(fileList, birdName, file_extension):
    """Return the entries of ``fileList`` that belong to one bird.

    A file name is kept when it matches the glob pattern
    ``<birdName>*XC*.<file_extension>``; the input order is preserved.
    """
    pattern = f'{birdName}*XC*.{file_extension}'
    return [name for name in fileList if fnmatch.fnmatch(name, pattern)]

In [164]:
# check the number of audio files per bird
# one count per bird, in the same order as the rows of bird_df
number_of_files = [len(audioList(audioFiles, bird, "mp3")) for bird in birds]

bird_df["number_of_files"] = number_of_files

In [165]:
# display the bird list together with the number_of_files column
bird_df

Unnamed: 0,common_name,scientific_name,number_of_files
0,American Goldfinch,Spinus tristis,110
1,American Robin,Turdus migratorius,302
2,Barn Swallow,Hirundo rustica,475
3,Blue Jay,Cyanocitta cristata,296
4,Blue-grey Gnatcatcher,Polioptila caerulea,179
5,Broad-winged Hawk,Buteo platypterus,66
6,Carolina Chickadee,Poecile carolinensis,109
7,Carolina Wren,Thryothorus ludovicianus,172
8,Cedar Waxwing,Bombycilla cedrorum,101
9,Eastern Bluebird,Sialia sialis,64


In [166]:
# we select only the birds with more than 100 audio files
birds_df = (
    bird_df
    .loc[bird_df["number_of_files"].gt(100)]  # strictly more than 100 files
    .reset_index(drop=True)
)

In [167]:
# display the birds that passed the file-count filter
birds_df

Unnamed: 0,common_name,scientific_name,number_of_files
0,American Goldfinch,Spinus tristis,110
1,American Robin,Turdus migratorius,302
2,Barn Swallow,Hirundo rustica,475
3,Blue Jay,Cyanocitta cristata,296
4,Blue-grey Gnatcatcher,Polioptila caerulea,179
5,Carolina Chickadee,Poecile carolinensis,109
6,Carolina Wren,Thryothorus ludovicianus,172
7,Cedar Waxwing,Bombycilla cedrorum,101
8,Northern Cardinal,Cardinalis cardinalis,173
9,Ruby-crowned Kinglet,Regulus calendula,116


In [168]:
# keep only the first 10 birds. A positional slice is used instead of an
# in-place drop of label 10 so the cell is idempotent: re-running it does not
# raise a KeyError, and it does not depend on the frame having exactly 11 rows.
birds_df = birds_df.iloc[:10].reset_index(drop=True)
birds_df

Unnamed: 0,common_name,scientific_name,number_of_files
0,American Goldfinch,Spinus tristis,110
1,American Robin,Turdus migratorius,302
2,Barn Swallow,Hirundo rustica,475
3,Blue Jay,Cyanocitta cristata,296
4,Blue-grey Gnatcatcher,Polioptila caerulea,179
5,Carolina Chickadee,Poecile carolinensis,109
6,Carolina Wren,Thryothorus ludovicianus,172
7,Cedar Waxwing,Bombycilla cedrorum,101
8,Northern Cardinal,Cardinalis cardinalis,173
9,Ruby-crowned Kinglet,Regulus calendula,116


In [145]:
# we create a dict with the first 100 audio files per bird
selected_birds = replace_(birds_df["common_name"])
# bird name -> its first 100 matching mp3 file names
audioDict = {
    bird: audioList(audioFiles, bird, "mp3")[:100]
    for bird in selected_birds
}

In [169]:
# check the number of files per bird
for bird, files in audioDict.items():
    # one line per bird: name followed by how many files were selected
    print(bird, len(files))

American_Goldfinch 100
American_Robin 100
Barn_Swallow 100
Blue_Jay 100
Blue-grey_Gnatcatcher 100
Carolina_Chickadee 100
Carolina_Wren 100
Cedar_Waxwing 100
Northern_Cardinal 100
Ruby-crowned_Kinglet 100


In [170]:
# dataframe with the audio files names
# each dict key becomes a column holding that bird's file names
audioData_df = pd.DataFrame(audioDict)
audioData_df.head()

Unnamed: 0,American_Goldfinch,American_Robin,Barn_Swallow,Blue_Jay,Blue-grey_Gnatcatcher,Carolina_Chickadee,Carolina_Wren,Cedar_Waxwing,Northern_Cardinal,Ruby-crowned_Kinglet
0,American_Goldfinch_XC114342.mp3,American_Robin_XC112597.mp3,Barn_Swallow_XC113501.mp3,Blue_Jay_XC110056.mp3,Blue-grey_Gnatcatcher_XC130506.mp3,Carolina_Chickadee_XC111140.mp3,Carolina_Wren_XC112512.mp3,Cedar_Waxwing_XC121795.mp3,Northern_Cardinal_XC110059.mp3,Ruby-crowned_Kinglet_XC131366.mp3
1,American_Goldfinch_XC124312.mp3,American_Robin_XC114082.mp3,Barn_Swallow_XC123032.mp3,Blue_Jay_XC114085.mp3,Blue-grey_Gnatcatcher_XC130507.mp3,Carolina_Chickadee_XC112508.mp3,Carolina_Wren_XC116314.mp3,Cedar_Waxwing_XC121796.mp3,Northern_Cardinal_XC112830.mp3,Ruby-crowned_Kinglet_XC131957.mp3
2,American_Goldfinch_XC133564.mp3,American_Robin_XC114083.mp3,Barn_Swallow_XC123035.mp3,Blue_Jay_XC116372.mp3,Blue-grey_Gnatcatcher_XC130508.mp3,Carolina_Chickadee_XC112509.mp3,Carolina_Wren_XC122447.mp3,Cedar_Waxwing_XC121797.mp3,Northern_Cardinal_XC130962.mp3,Ruby-crowned_Kinglet_XC138055.mp3
3,American_Goldfinch_XC133565.mp3,American_Robin_XC122037.mp3,Barn_Swallow_XC123036.mp3,Blue_Jay_XC116373.mp3,Blue-grey_Gnatcatcher_XC130511.mp3,Carolina_Chickadee_XC112510.mp3,Carolina_Wren_XC125657.mp3,Cedar_Waxwing_XC121798.mp3,Northern_Cardinal_XC130966.mp3,Ruby-crowned_Kinglet_XC156726.mp3
4,American_Goldfinch_XC141469.mp3,American_Robin_XC130246.mp3,Barn_Swallow_XC123066.mp3,Blue_Jay_XC116374.mp3,Blue-grey_Gnatcatcher_XC130512.mp3,Carolina_Chickadee_XC112633.mp3,Carolina_Wren_XC125658.mp3,Cedar_Waxwing_XC121799.mp3,Northern_Cardinal_XC141210.mp3,Ruby-crowned_Kinglet_XC159571.mp3


In [171]:
# csv of the audio files per bird
audioData_df.to_csv(
    path_constructor(rootFolder, "audioData_mp3_.csv"),
    index=False,  # the row index carries no information, leave it out of the file
)

In [172]:
# copy the selected audio files into the mp3_files folder
destination = path_constructor(rootFolder, "mp3_files")
# Create the folder first: when the destination does not exist, shutil.copy
# treats it as a file name, so every copy would overwrite one single file.
os.makedirs(destination, exist_ok=True)
for b in selected_birds:
    for file in audioData_df[b]:
        origin = path_constructor(audioFolder, file)
        shutil.copy(origin, destination)

In [173]:
# folders used for the mp3 -> wav conversion
mp3Folder = path_constructor(rootFolder, "mp3_files")
wavFolder = path_constructor(rootFolder, "wav_files")
# Bind the listing to its own name only: also assigning it to audioFiles would
# replace the source-folder listing that the counting and selection cells read,
# so re-running those cells would silently give different results.
mp3Files = os.listdir(mp3Folder)

In [174]:
# convert every mp3 file in mp3Folder to a wav file with the same base name in wavFolder
os.makedirs(wavFolder, exist_ok=True)  # the export cannot write into a missing folder
for file in mp3Files:
    # splitext instead of file[:-4]: does not assume a 4-character extension
    stem, extension = os.path.splitext(file)
    if extension.lower() != ".mp3":
        # skip stray entries (e.g. hidden system files) that are not mp3 audio
        continue
    origin = path_constructor(mp3Folder, file)
    destination = path_constructor(wavFolder, f'{stem}.wav')
    sound = AudioSegment.from_file(origin)
    sound.export(destination, format='wav')
    

In [81]:
# create a dict with the wavfiles
wavFiles = os.listdir(wavFolder)
# bird name -> all of its wav file names found in the wav folder
wavDict = {bird: audioList(wavFiles, bird, "wav") for bird in selected_birds}

In [175]:
# each dict key becomes a column holding that bird's wav file names
wavData_df = pd.DataFrame(wavDict)
wavData_df.head()

Unnamed: 0,American_Goldfinch,American_Robin,Barn_Swallow,Blue_Jay,Blue-grey_Gnatcatcher,Carolina_Chickadee,Carolina_Wren,Cedar_Waxwing,Northern_Cardinal,Ruby-crowned_Kinglet
0,American_Goldfinch_XC114342.wav,American_Robin_XC112597.wav,Barn_Swallow_XC113501.wav,Blue_Jay_XC110056.wav,Blue-grey_Gnatcatcher_XC130506.wav,Carolina_Chickadee_XC111140.wav,Carolina_Wren_XC112512.wav,Cedar_Waxwing_XC121795.wav,Northern_Cardinal_XC110059.wav,Ruby-crowned_Kinglet_XC129806.wav
1,American_Goldfinch_XC124312.wav,American_Robin_XC114082.wav,Barn_Swallow_XC123032.wav,Blue_Jay_XC114085.wav,Blue-grey_Gnatcatcher_XC130507.wav,Carolina_Chickadee_XC112508.wav,Carolina_Wren_XC116314.wav,Cedar_Waxwing_XC121796.wav,Northern_Cardinal_XC112830.wav,Ruby-crowned_Kinglet_XC131366.wav
2,American_Goldfinch_XC133564.wav,American_Robin_XC114083.wav,Barn_Swallow_XC123035.wav,Blue_Jay_XC116372.wav,Blue-grey_Gnatcatcher_XC130508.wav,Carolina_Chickadee_XC112509.wav,Carolina_Wren_XC122447.wav,Cedar_Waxwing_XC121797.wav,Northern_Cardinal_XC130962.wav,Ruby-crowned_Kinglet_XC131957.wav
3,American_Goldfinch_XC133565.wav,American_Robin_XC122037.wav,Barn_Swallow_XC123036.wav,Blue_Jay_XC116373.wav,Blue-grey_Gnatcatcher_XC130511.wav,Carolina_Chickadee_XC112510.wav,Carolina_Wren_XC125657.wav,Cedar_Waxwing_XC121798.wav,Northern_Cardinal_XC130966.wav,Ruby-crowned_Kinglet_XC138055.wav
4,American_Goldfinch_XC141469.wav,American_Robin_XC130246.wav,Barn_Swallow_XC123066.wav,Blue_Jay_XC116374.wav,Blue-grey_Gnatcatcher_XC130512.wav,Carolina_Chickadee_XC112633.wav,Carolina_Wren_XC125658.wav,Cedar_Waxwing_XC121799.wav,Northern_Cardinal_XC141210.wav,Ruby-crowned_Kinglet_XC156726.wav


In [176]:
# csv of the wav files per bird
wavData_df.to_csv(
    path_constructor(rootFolder, "audioData_wav_.csv"),
    index=False,
)

In [177]:
# one row per wav file: the file name and the bird label derived from it
bird_calls_df = pd.DataFrame(wavFiles, columns=["fname"])
# The label is everything before the trailing "_XC<id>.wav". Splitting on the
# last "_XC" gives the same result as dropping the final 13 characters for
# 6-digit ids, and stays correct for recording ids of any other length.
bird_calls_df["label"] = bird_calls_df["fname"].str.rsplit("_XC", n=1).str[0]
bird_calls_df.head()

Unnamed: 0,fname,label
0,American_Goldfinch_XC114342.wav,American_Goldfinch
1,American_Goldfinch_XC124312.wav,American_Goldfinch
2,American_Goldfinch_XC133564.wav,American_Goldfinch
3,American_Goldfinch_XC133565.wav,American_Goldfinch
4,American_Goldfinch_XC141469.wav,American_Goldfinch


In [179]:
# csv with the file name and label of every wav file
bird_calls_df.to_csv(
    path_constructor(rootFolder, "bird_calls.csv"),
    index=False,
)