In [8]:
import fnmatch
import glob
import os

import pandas as pd

In [9]:
# path constructor
def path_constructor(parent, child):
    """Join ``parent`` and ``child`` into one path with ``os.path.join``.

    Parameters
    ----------
    parent : str
        Leading path component (e.g. a folder).
    child : str
        Component appended to ``parent`` (a sub-folder or file name).

    Returns
    -------
    str
        The combined path, exactly as produced by ``os.path.join``.
    """
    joined = os.path.join(parent, child)
    return joined

In [30]:
# Root of the data set; the "json" and "audio" sub-folders are resolved relative to it.
rootFolder = "bird_data"
jsonFolder, audioFolder = (
    path_constructor(rootFolder, subFolder) for subFolder in ("json", "audio")
)

In [31]:
# os.listdir returns entries in arbitrary order. A later cell keeps only the
# first 100 files per bird, so the listing is sorted here to make that
# selection the same on every machine and every run.
audioFiles = sorted(os.listdir(audioFolder))

In [32]:
# Load the species table from birdList.csv inside the root data folder.
birdListPath = path_constructor(rootFolder, "birdList.csv")
bird_df = pd.read_csv(birdListPath)

In [33]:
# Display the table that was read from birdList.csv.
bird_df

Unnamed: 0,common_name,scientific_name
0,American Goldfinch,Spinus tristis
1,American Robin,Turdus migratorius
2,Barn Swallow,Hirundo rustica
3,Blue Jay,Cyanocitta cristata
4,Blue-grey Gnatcatcher,Polioptila caerulea
5,Broad-winged Hawk,Buteo platypterus
6,Carolina Chickadee,Poecile carolinensis
7,Carolina Wren,Thryothorus ludovicianus
8,Cedar Waxwing,Bombycilla cedrorum
9,Eastern Bluebird,Sialia sialis


In [67]:
# create a list of existing audio files for a bird
def audioList(fileList, birdName):
    """Return the entries of ``fileList`` that are recordings of ``birdName``.

    A file name is kept when it matches the shell-style pattern
    ``<birdName>*XC*.mp3`` (for example ``American_Robin_XC112597.mp3``).
    Matching follows ``fnmatch.fnmatch`` semantics, so case sensitivity is
    whatever the operating system uses for file names.

    Parameters
    ----------
    fileList : iterable of str
        Candidate file names.
    birdName : str
        File-name prefix identifying the bird. It is matched literally:
        any glob metacharacters it contains (``*``, ``?``, ``[``) are escaped
        instead of being interpreted as wildcards.

    Returns
    -------
    list of str
        The matching names, in the order they appear in ``fileList``.
    """
    # Escape the name so only the wildcards written in this pattern are active.
    pattern = f'{glob.escape(birdName)}*XC*.mp3'
    return fnmatch.filter(fileList, pattern)

In [74]:
# Count the audio files available for every species listed in bird_df.
# The file-name prefix is derived here from common_name (spaces -> underscores)
# instead of reading the `birds` list: that list is only built in a later cell,
# from the filtered table, so on a fresh kernel it would be undefined and its
# length would not be guaranteed to match bird_df.
number_of_files = [
    len(audioList(audioFiles, common_name.replace(" ", "_")))
    for common_name in bird_df["common_name"]
]

bird_df["number_of_files"] = number_of_files

In [75]:
# Display bird_df again, now with the number_of_files column added above.
bird_df

Unnamed: 0,common_name,scientific_name,number_of_files
0,American Goldfinch,Spinus tristis,110
1,American Robin,Turdus migratorius,302
2,Barn Swallow,Hirundo rustica,475
3,Blue Jay,Cyanocitta cristata,296
4,Blue-grey Gnatcatcher,Polioptila caerulea,179
5,Broad-winged Hawk,Buteo platypterus,66
6,Carolina Chickadee,Poecile carolinensis,109
7,Carolina Wren,Thryothorus ludovicianus,172
8,Cedar Waxwing,Bombycilla cedrorum,101
9,Eastern Bluebird,Sialia sialis,64


In [76]:
# Keep only the species with strictly more than 100 audio files and renumber
# the rows of the resulting table from 0.
has_enough_files = bird_df["number_of_files"] > 100
birds_df = bird_df[has_enough_files].reset_index(drop=True)

In [77]:
# Display the filtered table (species with more than 100 audio files).
birds_df

Unnamed: 0,common_name,scientific_name,number_of_files
0,American Goldfinch,Spinus tristis,110
1,American Robin,Turdus migratorius,302
2,Barn Swallow,Hirundo rustica,475
3,Blue Jay,Cyanocitta cristata,296
4,Blue-grey Gnatcatcher,Polioptila caerulea,179
5,Carolina Chickadee,Poecile carolinensis,109
6,Carolina Wren,Thryothorus ludovicianus,172
7,Cedar Waxwing,Bombycilla cedrorum,101
8,Northern Cardinal,Cardinalis cardinalis,173
9,Ruby-crowned Kinglet,Regulus calendula,117


In [86]:
# File-name prefixes of the selected species: the common name with every
# space replaced by an underscore. Used to look up each bird's audio files.
birds = [common_name.replace(" ", "_") for common_name in birds_df["common_name"]]
    

In [87]:
# Map each bird prefix to (at most) the first 100 of its matching audio files.
audioDict = {bird: audioList(audioFiles, bird)[:100] for bird in birds}

In [88]:
# Print every bird prefix next to the number of files kept for it.
for bird, files in audioDict.items():
    print(bird, len(files))

American_Goldfinch 100
American_Robin 100
Barn_Swallow 100
Blue_Jay 100
Blue-grey_Gnatcatcher 100
Carolina_Chickadee 100
Carolina_Wren 100
Cedar_Waxwing 100
Northern_Cardinal 100
Ruby-crowned_Kinglet 100
Tufted_Titmouse 100


In [91]:
# One column per bird, one row per kept audio file name.
audioData_df = pd.DataFrame(audioDict)
audioData_df.head()

Unnamed: 0,American_Goldfinch,American_Robin,Barn_Swallow,Blue_Jay,Blue-grey_Gnatcatcher,Carolina_Chickadee,Carolina_Wren,Cedar_Waxwing,Northern_Cardinal,Ruby-crowned_Kinglet,Tufted_Titmouse
0,American_Goldfinch_XC114342.mp3,American_Robin_XC112597.mp3,Barn_Swallow_XC113501.mp3,Blue_Jay_XC110056.mp3,Blue-grey_Gnatcatcher_XC130506.mp3,Carolina_Chickadee_XC111140.mp3,Carolina_Wren_XC112512.mp3,Cedar_Waxwing_XC121795.mp3,Northern_Cardinal_XC110059.mp3,Ruby-crowned_Kinglet_XC129806.mp3,Tufted_Titmouse_XC124762.mp3
1,American_Goldfinch_XC124312.mp3,American_Robin_XC114082.mp3,Barn_Swallow_XC123032.mp3,Blue_Jay_XC114085.mp3,Blue-grey_Gnatcatcher_XC130507.mp3,Carolina_Chickadee_XC112508.mp3,Carolina_Wren_XC116314.mp3,Cedar_Waxwing_XC121796.mp3,Northern_Cardinal_XC112830.mp3,Ruby-crowned_Kinglet_XC131366.mp3,Tufted_Titmouse_XC131036.mp3
2,American_Goldfinch_XC133564.mp3,American_Robin_XC114083.mp3,Barn_Swallow_XC123035.mp3,Blue_Jay_XC116372.mp3,Blue-grey_Gnatcatcher_XC130508.mp3,Carolina_Chickadee_XC112509.mp3,Carolina_Wren_XC122447.mp3,Cedar_Waxwing_XC121797.mp3,Northern_Cardinal_XC130962.mp3,Ruby-crowned_Kinglet_XC131957.mp3,Tufted_Titmouse_XC135702.mp3
3,American_Goldfinch_XC133565.mp3,American_Robin_XC122037.mp3,Barn_Swallow_XC123036.mp3,Blue_Jay_XC116373.mp3,Blue-grey_Gnatcatcher_XC130511.mp3,Carolina_Chickadee_XC112510.mp3,Carolina_Wren_XC125657.mp3,Cedar_Waxwing_XC121798.mp3,Northern_Cardinal_XC130966.mp3,Ruby-crowned_Kinglet_XC138055.mp3,Tufted_Titmouse_XC137054.mp3
4,American_Goldfinch_XC141469.mp3,American_Robin_XC130246.mp3,Barn_Swallow_XC123066.mp3,Blue_Jay_XC116374.mp3,Blue-grey_Gnatcatcher_XC130512.mp3,Carolina_Chickadee_XC112633.mp3,Carolina_Wren_XC125658.mp3,Cedar_Waxwing_XC121799.mp3,Northern_Cardinal_XC141210.mp3,Ruby-crowned_Kinglet_XC156726.mp3,Tufted_Titmouse_XC138137.mp3


In [92]:
# Write the per-bird audio file table to audioData.csv in the root data folder,
# without the row index.
audioDataPath = os.path.join(rootFolder, "audioData.csv")
audioData_df.to_csv(audioDataPath, index=False)