In [1]:
import os
import sys
import time
import urllib.request, json

import re
import pandas as pd

In [2]:
DATA_PATH_FILTERED = "filtered_birds_df.csv"
DATA_DIR = "audiodata/mp3"

# Create dir if it doesn't exist
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

In [3]:
def download_birds_audio(df, birdName, do_print=True):
    """
    Download the audio files from the dataframe, by birdName.
    Returns IDs that have invalid url in a list.
    """
    if do_print: print(f"Downloading for {birdName}")
    nonValidIDList = []
    newbirdName = birdName.replace(" ", "_").lower()
    path = DATA_DIR + "/" + newbirdName
    
    # Create directory if it does not exist.
    if not os.path.exists(path):
        os.makedirs(path)
    birdf = df.loc[df['birdName'] == birdName]
    
    # Downloading the file as .mp3
    for idx in range(len(birdf)):
        iD = birdf.iloc[idx]['iD']
        filename = f"{path}/{newbirdName}_{iD}.mp3"
        if not os.path.isfile(filename):
            url = f"http://{birdf.iloc[idx]['url']}"
            try:
                urllib.request.urlretrieve(url, filename)
            except:
                # Add unretrievable files' id
                nonValidIDList.append(iD)
            time.sleep(0.1) # delay between API requests

    if do_print: print("Downloading finished.")
    return nonValidIDList


def download_all_birds_audio(df, birdslist):
    """
    Download all the audio files from the dataframe.
    Returns IDs that have invalid url in a dictionary where birdName are the dict keys.
    """
    nonValidFiles = {}
    for birdName in birdslist:
        nonValidIDList = download_birds_audio(df, birdName)
        nonValidFiles[birdName] = nonValidIDList
    
    return nonValidFiles

In [4]:
df = pd.read_csv(DATA_PATH_FILTERED)

In [5]:
print(df.shape)
df.head()

(6149, 6)


Unnamed: 0,iD,quality,length,country,url,birdName
0,548389,A,59,United Kingdom,www.xeno-canto.org/548389/download,Sturnus vulgaris
1,546936,A,32,France,www.xeno-canto.org/546936/download,Sturnus vulgaris
2,546935,A,65,France,www.xeno-canto.org/546935/download,Sturnus vulgaris
3,543800,A,36,Sweden,www.xeno-canto.org/543800/download,Sturnus vulgaris
4,542770,A,44,Poland,www.xeno-canto.org/542770/download,Sturnus vulgaris


In [6]:
birdsList = list(set(df['birdName']))
NumBirdsList = [len(df.loc[df['birdName'] == bird]) for bird in birdsList]
print(birdsList)
print(NumBirdsList)

['Parus major', 'Passer domesticus', 'Columba palumbus', 'Sturnus vulgaris', 'Cyanistes caeruleus', 'Fringilla coelebs', 'Erithacus rubecula', 'Phylloscopus collybita', 'Carduelis carduelis', 'Hirundo rustica', 'Linaria cannabina', 'Regulus regulus', 'Turdus merula']
[1084, 224, 168, 198, 424, 950, 686, 750, 334, 227, 171, 287, 646]


In [7]:
# Sanity check of the urls
for url in df['url']:
    if url[:4] != "www.":
        print("Wrong url")
        break;

### Download .mp3 files

In [8]:
%%time
nonValidFiles = download_all_birds_audio(df, birdsList)

Downloading for Parus major
Downloading finished.
Downloading for Passer domesticus
Downloading finished.
Downloading for Columba palumbus
Downloading finished.
Downloading for Sturnus vulgaris
Downloading finished.
Downloading for Cyanistes caeruleus
Downloading finished.
Downloading for Fringilla coelebs
Downloading finished.
Downloading for Erithacus rubecula
Downloading finished.
Downloading for Phylloscopus collybita
Downloading finished.
Downloading for Carduelis carduelis
Downloading finished.
Downloading for Hirundo rustica
Downloading finished.
Downloading for Linaria cannabina
Downloading finished.
Downloading for Regulus regulus
Downloading finished.
Downloading for Turdus merula
Downloading finished.
Wall time: 5.98 s


### Filter unvalid URLs

In [9]:
nonValidList = []
for IDList in nonValidFiles.values():
    for ID in IDList:
        nonValidList.append(ID)

updatedf = df.loc[~df['iD'].isin(nonValidList)].reset_index(drop=True)
print(f"{len(df)-len(updatedf)} non valid elements removed")
print("Updated df shape:",updatedf.shape)
updatedf.head()

9 non valid elements removed
Updated df shape: (6140, 6)


Unnamed: 0,iD,quality,length,country,url,birdName
0,548389,A,59,United Kingdom,www.xeno-canto.org/548389/download,Sturnus vulgaris
1,546936,A,32,France,www.xeno-canto.org/546936/download,Sturnus vulgaris
2,546935,A,65,France,www.xeno-canto.org/546935/download,Sturnus vulgaris
3,543800,A,36,Sweden,www.xeno-canto.org/543800/download,Sturnus vulgaris
4,542770,A,44,Poland,www.xeno-canto.org/542770/download,Sturnus vulgaris
