# Introduction

Downloading any data avaiable from the Xeno-Canto Database for the birds of interest from our region.  
Modified from [here](https://www.kaggle.com/code/gpreda/download-birds-songs-recording-metadata)

In [41]:
import requests
import os
import pandas as pd
import json
import requests
from tqdm.notebook import tqdm
from pathlib import Path

In [2]:
OUTPUT_CSV_PATH =  '/media/olly/T7/Kaytoo/Data/xeno_canto_original_metadata.csv'
BIRD_MAP_PATH = '/media/olly/T7/Kaytoo/Data/bird_map_2012-13.csv'
SOUND_FILE_DOWNLOADS = '/media/olly/T7/Kaytoo/Data/Xeno_Canto_Soundfiles'
LOCAL_COUNTRIES = ['Australia', 'New Zealand', 'Vanuatu', 'Papua New Guinea', 'Solomon Islands', 'Fiji']

In [3]:
Path(SOUND_FILE_DOWNLOADS).mkdir(parents=True, exist_ok=True)

The functions used to download content from **xeno-canto-org** are the following:  

* **get_first_page_per_country** - get, for a specific country, the first page of response, as well as metadata for the next pages;  
* **get_page_per_country** - get a specific page per country - this is called after a first page was downloaded and is called for each subsequent page;  
* **inspect_json** - print metadata for the first response;  
* **get_recordings** - retrieve payload from a downloaded page;   
* **download_suite_from_country** - end-to-end suite for downloading content for a specific country - call the above described functions.

In [4]:
def get_first_page_per_bird(gen, sp):
    """
    @country: the country for which we download metadata content 
    @returns: the content downloaded
    """
    api_search = f"https://www.xeno-canto.org/api/2/recordings?query=gen:{gen} {sp} ssp:\"= \""
    response = requests.get(api_search)
    if response.status_code == 200:
        response_payload = json.loads(response.content)
        return response_payload
    else:
        return None

def get_page_per_bird(gen, sp, page):
    """
    @country: the country for which we download metadata content 
    @returns: the content downloaded
    """
    api_search = f"https://www.xeno-canto.org/api/2/recordings?query=gen:{gen} {sp} ssp:\"= \"&page={page}"
    response = requests.get(api_search)
    if response.status_code == 200:
        response_payload = json.loads(response.content)
        return response_payload
    else:
        return None

def get_first_page_per_country(country):
    """
    @country: the country for which we download metadata content 
    @returns: the content downloaded
    """
    api_search = f"https://www.xeno-canto.org/api/2/recordings?query=cnt:{country}"
    response = requests.get(api_search)
    if response.status_code == 200:
        response_payload = json.loads(response.content)
        return response_payload
    else:
        return None
    
def get_page_per_country(country, page):
    """
    @country: the country for which we download metadata content 
    @page: the current page to be downloaded
    @returns: the content downloaded
    """
    api_search = f"https://www.xeno-canto.org/api/2/recordings?query=cnt:{country}&page={page}"
    response = requests.get(api_search)
    if response.status_code == 200:
        response_payload = json.loads(response.content)
        return response_payload
    else:
        return None

def inspect_json(json_data):
    """
    @json_data: json data to be inspected
    """
    print(f"recordings: {json_data['numRecordings']}")
    print(f"species: {json_data['numSpecies']}")
    print(f"page: {json_data['page']}")
    print(f"number pages: {json_data['numPages']}")

def inspect_bird_json(json_data):
    print(f"recordings: {json_data['numRecordings']}")

def get_recordings(payload):
    """
    @payload: json data from which we extract the bird recordings metadata collection
    @returns: birds recordings metadata collection
    """
    return payload["recordings"]

def download_suite_from_country(country, country_initial_payload):
    """
    @country: the country for which we download metadata content 
    @country_initial_payload: the initial downloaded payload for the country (1st page). We download all the other pages.
    @returns: the content recordings (all pages, including the original one)
    """
    pages = country_initial_payload["numPages"]
    
    all_recordings = []
    all_recordings = all_recordings + get_recordings(country_initial_payload)
    for page in tqdm(range(2,pages+1)):
        payload = get_page_per_country(country, page)
        recordings = get_recordings(payload)
        all_recordings = all_recordings + recordings
    
    return all_recordings

def download_suite_for_bird(gen, sp, bird_initial_payload):
    """
    @country: the country for which we download metadata content 
    @country_initial_payload: the initial downloaded payload for the country (1st page). We download all the other pages.
    @returns: the content recordings (all pages, including the original one)
    """
    pages = bird_initial_payload["numPages"]
    
    all_recordings = []
    all_recordings = all_recordings + get_recordings(bird_initial_payload)
    for page in range(2,pages+1):
        payload = get_page_per_bird(gen, sp, page)
        recordings = get_recordings(payload)
        all_recordings = all_recordings + recordings
    
    return all_recordings

Make an e-bird to scientific name map

In [56]:
bird_name_df = pd.read_csv(BIRD_MAP_PATH)
bird_name_df.head()

Unnamed: 0,Code,CommonName,eBird,ScientificName
0,1,Silvereye,silver3,Zosterops lateralis
1,2,Bellbird,nezbel1,Anthornis melanura
2,3,Grey Warbler,gryger1,Gerygone igata
3,4,Tomtit,tomtit1,Petroica macrocephala
4,5,Tomtit,tomtit1,Petroica macrocephala


In [57]:
bird_name_df['ScientificName'].unique()

array(['Zosterops lateralis', 'Anthornis melanura', 'Gerygone igata',
       'Petroica macrocephala', 'Fringilla coelebs',
       'Rhipidura fuliginosa ', 'Rhipidura atra', 'Turdus merula ',
       'Prosthemadera novaeseelandiae', 'Acanthisitta chloris ',
       'Hemiphaga novaeseelandiae', 'Petroica longipes',
       'Petroica australis', 'Sturnus vulgaris', 'Cyanoramphus auriceps',
       'Prunella modularis', 'Ixobrychus novaezelandiae',
       'Certhia americana', 'Emberiza cirlus', 'Alectoris chukar',
       'Cacatua sulphurea', 'Fulica sp.', 'Zapornia pusilla',
       'Zapornia tabuensis', 'Urodynamis taitensis',
       'Chrysococcyx lucidus', 'Tachybaptus ruficollis',
       'Charadrius bicinctus', 'Charadrius melanops',
       'Anarhynchus obscurus', 'Anas platyrhynchos x superciliosa',
       'Hymenolaimus malacorhynchos', 'Anas superciliosa',
       'Anas platyrhynchos', 'Bubulcus ibis', 'Egretta garzetta',
       'Falco novaeseelandiae', 'Poodytes punctatus', 'Morus serrator

In [6]:
sci_ebird_map = dict(zip(bird_name_df['ScientificName'], bird_name_df['eBird']))

### 3. Application: download all metadata of recordings from a country

We are using the utility funtions to download and save the meta information for birdsongs recording for a specific country.

In [7]:
def download_save_all_meta_for_country(country):
    birds = get_first_page_per_country(country)
    inspect_json(birds)
    print(f"recordings in first batch: {len(get_recordings(birds))}")
    suite = download_suite_from_country(country, birds)
    data_df = pd.DataFrame.from_records(suite)
    print(f"suite length: {data_df.shape[0]}")
    return data_df

In [8]:
def download_save_all_meta_for_bird(gen, sp):
    birds = get_first_page_per_bird(gen, sp)
    inspect_bird_json(birds)
    print(f"recordings in first batch: {len(get_recordings(birds))}")
    suite = download_suite_for_bird(gen, sp, birds)
    data_df = pd.DataFrame.from_records(suite)
    print(f"suite length: {data_df.shape[0]}")
    return data_df

Try filtering by cnt == 'New Zealand'

In [9]:
nz_df = download_save_all_meta_for_country('New Zealand')

recordings: 203
species: 11
page: 1
number pages: 1
recordings in first batch: 203


0it [00:00, ?it/s]

suite length: 203


In [10]:
nz_df.head(3)

Unnamed: 0,id,gen,sp,ssp,group,en,rec,cnt,loc,lat,...,rmk,bird-seen,animal-seen,playback-used,temp,regnr,auto,dvc,mic,smp
0,198201,Aythya,novaeseelandiae,,birds,New Zealand Scaup,nick talbot,New Zealand,"Rotorua, Rotorua District, Bay Of Plenty",-38.1101,...,A small flock of birds swimming on Lake Rotorua.,yes,yes,no,,,no,,,44100
1,293310,Hemiphaga,novaeseelandiae,,birds,New Zealand Pigeon,Dan Lane,New Zealand,Tiritiri Matangi Island,-36.601,...,Natural song from a bird perched about 2m up i...,yes,yes,no,,,no,,,44100
2,378290,Hemiphaga,novaeseelandiae,,birds,New Zealand Pigeon,Matthias Feuersenger,New Zealand,"Blumine Island, Marlborough District, Marlborough",-41.1706,...,New Zealand Pigeon calmly sitting in mid strat...,yes,yes,no,,,no,,,44100


In [11]:
nz_df.shape

(203, 38)

In [13]:
bird_dfs = []
for bird in tqdm(sci_ebird_map.keys()):
    print(f'Downloading metadata for {sci_ebird_map[bird]}')
    bird_name = bird.split()
    if len(bird_name) == 2:
        gen, sp = bird_name
        bird_df = download_save_all_meta_for_bird(gen, sp)
        if not bird_df.empty:
            bird_dfs.append(bird_df)

df = pd.concat(bird_dfs).reset_index(drop=True)

  0%|          | 0/124 [00:00<?, ?it/s]

Downloading metadata for silver3
recordings: 78
recordings in first batch: 78
suite length: 78
Downloading metadata for nezbel1
recordings: 62
recordings in first batch: 62
suite length: 62
Downloading metadata for gryger1
recordings: 28
recordings in first batch: 28
suite length: 28
Downloading metadata for tomtit1
recordings: 12
recordings in first batch: 12
suite length: 12
Downloading metadata for comcha
recordings: 5032
recordings in first batch: 500
suite length: 5032
Downloading metadata for nezfan1
recordings: 21
recordings in first batch: 21
suite length: 21
Downloading metadata for blafan1
recordings: 11
recordings in first batch: 11
suite length: 11
Downloading metadata for eurbla
recordings: 5989
recordings in first batch: 500
suite length: 5989
Downloading metadata for tui1
recordings: 95
recordings in first batch: 95
suite length: 95
Downloading metadata for riflem1
recordings: 25
recordings in first batch: 25
suite length: 25
Downloading metadata for nezpig2
recordings: 

In [14]:
df.head()

Unnamed: 0,id,gen,sp,ssp,group,en,rec,cnt,loc,lat,...,rmk,bird-seen,animal-seen,playback-used,temp,regnr,auto,dvc,mic,smp
0,895593,Zosterops,lateralis,,birds,Silvereye,Drew Davison,Australia,"Lions Dryandra Woodland Village, Shire of Cuba...",-32.7833,...,,yes,yes,no,,,no,Samsung S22,,44100
1,841102,Zosterops,lateralis,,birds,Silvereye,Sreekar,Australia,"Coral Sea, Gladstone Regional, Queensland",-23.4423,...,,no,no,no,,,no,Zoom F3,Rode NTG2,44100
2,836020,Zosterops,lateralis,,birds,Silvereye,Sreekar,Australia,"Carindale, Brisbane City, Queensland",-27.5289,...,,no,no,no,,,no,Zoom F3,Rode NTG2,48000
3,821562,Zosterops,lateralis,,birds,Silvereye,David Boyle,New Zealand,"Taiko Camp, Chatham Island",-44.0742,...,,no,no,no,,,yes,Zoom H5,Clippy XLR EM272Z1 Matched Stereo Pair,48000
4,821561,Zosterops,lateralis,,birds,Silvereye,David Boyle,New Zealand,"Taiko Camp, Chatham Island",-44.0742,...,Both common calls,no,no,no,,,yes,Zoom H5,Clippy XLR EM272Z1 Matched Stereo Pair,48000


In [15]:
df.columns

Index(['id', 'gen', 'sp', 'ssp', 'group', 'en', 'rec', 'cnt', 'loc', 'lat',
       'lng', 'alt', 'type', 'sex', 'stage', 'method', 'url', 'file',
       'file-name', 'sono', 'osci', 'lic', 'q', 'length', 'time', 'date',
       'uploaded', 'also', 'rmk', 'bird-seen', 'animal-seen', 'playback-used',
       'temp', 'regnr', 'auto', 'dvc', 'mic', 'smp'],
      dtype='object')

In [16]:
df.shape

(41736, 38)

In [35]:
df['cnt'].unique()

array(['Australia', 'New Zealand', 'Vanuatu', 'Lithuania', 'Spain',
       'Italy', 'France', 'Portugal', 'Estonia', 'Sweden',
       'United Kingdom', 'Germany', 'Ireland', 'Poland', 'Norway',
       'Greece', 'Bulgaria', 'Russian Federation', 'Denmark',
       'Netherlands', 'Belgium', 'Hungary', 'Turkey', 'Croatia',
       'Slovakia', 'Ukraine', 'Albania', 'Finland', 'Cyprus',
       'Czech Republic', 'Romania', 'Azerbaijan', 'Georgia', 'Austria',
       'Switzerland', 'Latvia', 'Belarus', 'Bosnia Herzegovina', 'Malta',
       'Luxembourg', 'Andorra', 'Serbia', 'Morocco', 'South Africa',
       'Iran', 'Montenegro', 'India', 'China', 'Papua New Guinea',
       'Indonesia', 'Iceland', 'Kyrgyzstan', 'Jordan', 'Uzbekistan',
       'Israel', 'Libya', 'Slovenia', 'Tunisia', 'Canada',
       'Liechtenstein', 'United States', 'Argentina', 'Mexico',
       'Kazakhstan', 'Malaysia', 'Brazil', 'Guatemala', 'El Salvador',
       'Macedonia', 'Algeria', 'Mongolia', 'East Timor', 'Singapore',
  

In [36]:
df=df[df['cnt'].isin(LOCAL_COUNTRIES)]
df.shape

In [38]:
urls = df['file'].to_list()

def extract_file_extension(file_name):
    return file_name.split('.')[-1]

df['extensions'] = df['file-name'].apply(extract_file_extension)
df['new_file_name'] = 'XC' + df['id'] + '_' + df['gen'] + '_' + df['sp'] + '.' + df['extensions']
file_names = df['new_file_name'].to_list()

In [42]:
file_names[:5]

['XC895593_Zosterops_lateralis.mp3',
 'XC841102_Zosterops_lateralis.mp3',
 'XC836020_Zosterops_lateralis.mp3',
 'XC821562_Zosterops_lateralis.wav',
 'XC821561_Zosterops_lateralis.wav']

In [43]:
df.columns

Index(['id', 'gen', 'sp', 'ssp', 'group', 'en', 'rec', 'cnt', 'loc', 'lat',
       'lng', 'alt', 'type', 'sex', 'stage', 'method', 'url', 'file',
       'file-name', 'sono', 'osci', 'lic', 'q', 'length', 'time', 'date',
       'uploaded', 'also', 'rmk', 'bird-seen', 'animal-seen', 'playback-used',
       'temp', 'regnr', 'auto', 'dvc', 'mic', 'smp', 'extensions',
       'new_file_name'],
      dtype='object')

In [48]:
df[['length']].head()

Unnamed: 0,length
0,1:12
1,0:29
2,1:03
3,0:57
4,0:55


In [63]:
df_bad_fantail = df[(df['gen'] == 'Rhipidura') & (df['sp']=='atra')]
df_bad_fantail.head(3)

Unnamed: 0,id,gen,sp,ssp,group,en,rec,cnt,loc,lat,...,animal-seen,playback-used,temp,regnr,auto,dvc,mic,smp,extensions,new_file_name
5233,596215,Rhipidura,atra,,birds,Black Fantail,Scott Connop,Papua New Guinea,"Below Tari Gap, Southern Highlands Province",-5.9703,...,yes,no,,,no,,,44100,mp3,XC596215_Rhipidura_atra.mp3
5234,548451,Rhipidura,atra,,birds,Black Fantail,Nikolaos Sarikakis,Papua New Guinea,"Mount Hagen, Dei District, Western Highlands P...",-5.8582,...,unknown,unknown,,,no,,,48000,mp3,XC548451_Rhipidura_atra.mp3
5235,141932,Rhipidura,atra,,birds,Black Fantail,Frank Lambert,Papua New Guinea,"Kumul Lodge, Enga province",-5.7912,...,yes,no,,,no,,,44100,mp3,XC141932_Rhipidura_atra.mp3


Remove those dodgy foreign fantails

In [65]:
df=df[~((df['gen'] == 'Rhipidura') + (df['sp']=='atra'))]
df.shape

(1583, 40)

In [66]:
cols_to_keep = ['id', 'gen', 'sp', 'en', 'also', 'cnt','lat', 'lng', 'rec', 'type',  'length', 'new_file_name',]
df[cols_to_keep].to_csv(OUTPUT_CSV_PATH, index=False)
df[cols_to_keep].head()

Unnamed: 0,id,gen,sp,en,also,cnt,lat,lng,rec,type,length,new_file_name
0,895593,Zosterops,lateralis,Silvereye,"[Phylidonyris novaehollandiae, Anthochaera car...",Australia,-32.7833,116.972,Drew Davison,call,1:12,XC895593_Zosterops_lateralis.mp3
1,841102,Zosterops,lateralis,Silvereye,[Hypotaenidia philippensis],Australia,-23.4423,151.9148,Sreekar,song,0:29,XC841102_Zosterops_lateralis.mp3
2,836020,Zosterops,lateralis,Silvereye,[],Australia,-27.5289,153.1227,Sreekar,song,1:03,XC836020_Zosterops_lateralis.mp3
3,821562,Zosterops,lateralis,Silvereye,[],New Zealand,-44.0742,-176.6368,David Boyle,call,0:57,XC821562_Zosterops_lateralis.wav
4,821561,Zosterops,lateralis,Silvereye,[],New Zealand,-44.0742,-176.6368,David Boyle,"call, flight call",0:55,XC821561_Zosterops_lateralis.wav


In [33]:
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
                
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")

for url, name in tqdm(zip(urls,file_names), total=len(urls), desc="Downloading files"):
    save_path = os.path.join(SOUND_FILE_DOWNLOADS, name)
    download_file(url, save_path)


Downloading files:   0%|          | 0/1613 [00:00<?, ?it/s]