In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def get_species_list(num_species=-1):
    print("getting species list")
    #get page of all xento-canto species
    url = 'https://xeno-canto.org/collection/species/all'
    page = requests.get(url)
    soup = BeautifulSoup(page.text)
    species_table = soup.find_all("table", {"class": "results"})[0]
    
    #get headers of table
    headers = []
    for i in species_table.find_all('thead'):
         title = i.text
         headers.append(title)
    headers = headers[0].strip().split("\n")
    
    #add header for url and create DF
    headers.append('url')
    species_list = pd.DataFrame(columns = headers)
    
    #Get data from each row
    for j in species_table.find_all('tr'):
         #get row data
         row_data = j.find_all('td')
        
         #get all text from row
         row = [i.text for i in row_data]
        
         #get the link
         row.append(row_data[0].a["href"])
        
         #add to dataframe
         length = len(species_list)
         species_list.loc[length] = row

    #Decide to return full list or do RS     
    if (num_species == -1 or num_species > species_list.shape[0]):
        return species_list
    return species_list.sample(num_species).reset_index(drop=True)

#get_species_list()
#test1 = get_species_list(5)
#test1

In [3]:
def get_species_data(link_to_XC):
    print("getting species file metadata for: " + link_to_XC)
    #get soup for Species
    url = 'https://xeno-canto.org' + link_to_XC
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    species_table = soup.find_all("table", {"class": "results"})[0]
    
    # Obtain every title of columns with tag <th>
    headers = []
    for i in species_table.find_all('thead'):
     title = i.text
     headers.append(title)
    
    headers = headers[0].strip()
    headers = headers.split("\n")
    headers[1] = "Common name / Scientific"
    headers.append("copyright")
    headers.append("filename")
    headers.append("download url")
    
    
    species_list = pd.DataFrame(columns = headers[1:])

    #Get data from each row
    for j in species_table.find_all('tr'):
         #get row data
         row_data = j.find_all('td')
                    
         if (row_data == []):
                continue
                
         #print(row_data) 
         #get all text from row
         row = [i.text.strip() for i in row_data][1:]
         
         #get copyright, download link and filename
         copyright = row_data[12].a.span["title"]
         filename = row_data[11].a["download"]
         download_url = row_data[11].a["href"]
         row.append(copyright)
         row.append(filename)
         row.append(download_url)

         #add to dataframe
         length = len(species_list)
         species_list.loc[length] = row
    return species_list

#test2 = get_species_data(test1.iloc[0]["url"])
#test2

In [4]:
def download_by_url(file_data, download_folder):
    print(file_data)
    url = file_data[1]
    url = 'https://xeno-canto.org' + url
    r = requests.get(url, allow_redirects=True)
    path = download_folder+file_data[0]
    open(path, "wb").write(r.content)
    return path

def download_data(species_df, num_of_files, download_folder):
    print("downloading data")
    if (num_of_files != -1 and species_df.shape[0] > num_of_files):
        species_df = species_df.sample(num_of_files, replace=False)
    species_df["file_location"] = species_df[["filename","download url"]].apply(download_by_url,axis=1, args=(tuple([download_folder])))
    return species_df

#download_data(test2, 100, "./data/")

In [5]:
def download_xento_canto_data(num_species, num_of_files, download_folder):
    file_df = pd.DataFrame()
    
    species_list = get_species_list(num_species)
    for url in species_list["url"]:
        try:
            species_files = get_species_data(url)
            temp_file_df = download_data(species_files, num_of_files, download_folder)
            if (file_df.empty):
                file_df = temp_file_df.reset_index(drop=True)
            else:
                file_df = file_df.append(temp_file_df.reset_index(drop=True)).reset_index(drop=True)
        except Exception as e:
            print(url + " did not download")
            print(e)
    return file_df
        

In [6]:
metadata = download_xento_canto_data(5, 5, "./data/")
metadata

getting species list
getting species file metadata for: /species/Myrmotherula-luctuosa
downloading data
filename        XC556442 - Silvery-flanked Antwren - Myrmother...
download url                                     /556442/download
Name: 6, dtype: object
filename        XC341379 - Silvery-flanked Antwren - Myrmother...
download url                                     /341379/download
Name: 13, dtype: object
filename        XC504541 - Silvery-flanked Antwren - Myrmother...
download url                                     /504541/download
Name: 9, dtype: object
filename        XC163810 - Silvery-flanked Antwren - Myrmother...
download url                                     /163810/download
Name: 28, dtype: object
filename        XC581125 - Silvery-flanked Antwren - Myrmother...
download url                                     /581125/download
Name: 5, dtype: object
getting species file metadata for: /species/Ploceus-baglafecht
downloading data
filename        XC450382 - Baglafecht W

Unnamed: 0,Common name / Scientific,Length,Recordist,Date,Time,Country,Location,Elev. (m),Type,Remarks,Actions,Cat.nr.,copyright,filename,download url,file_location
0,Silvery-flanked Antwren (Myrmotherula luctuosa),0:16,Caio Brito,2019-06-13,08:05,Brazil,REGUA - Trilha Marrom,45,chamado,Macho adulto\nbird-seen:yes\nplayback-used:yes...,ABCDE,XC556442,Creative Commons Attribution-NonCommercial-Sha...,XC556442 - Silvery-flanked Antwren - Myrmother...,/556442/download,./data/XC556442 - Silvery-flanked Antwren - My...
1,Silvery-flanked Antwren (Myrmotherula luctuosa...,0:51,Jeremy Minns,2004-03-18,07:21,Brazil,"Mata do Crasto, Município de Santa Luzia do It...",0,"male, song",humid forest\nbird-seen:yes\nplayback-used:no\...,ABCDE,XC341379,Creative Commons Attribution-NonCommercial-Sha...,XC341379 - Silvery-flanked Antwren - Myrmother...,/341379/download,./data/XC341379 - Silvery-flanked Antwren - My...
2,Silvery-flanked Antwren (Myrmotherula luctuosa),1:33,Ciro Albano,2019-09-28,05:30,Brazil,"RPPN Estação Veracel, Santa Cruz Cabrália, Bahia",90,"adult, female, male, song",bird-seen:yes\nplayback-used:yes\n [also]\n[s...,ABCDE,XC504541,Creative Commons Attribution-NonCommercial-Sha...,XC504541 - Silvery-flanked Antwren - Myrmother...,/504541/download,./data/XC504541 - Silvery-flanked Antwren - My...
3,Silvery-flanked Antwren (Myrmotherula luctuosa),0:19,Frank Lambert,1993-03-00,?,Brazil,"Reserva Natural da Vale - Linhares, Espirito S...",70,"male, song",There are two birds singing on this track - th...,ABCDE,XC163810,Creative Commons Attribution-NonCommercial-NoD...,XC163810 - Silvery-flanked Antwren - Myrmother...,/163810/download,./data/XC163810 - Silvery-flanked Antwren - My...
4,Silvery-flanked Antwren (Myrmotherula luctuosa),0:19,Odirlei Vieira da Fonseca,2020-07-26,10:00,Brazil,"Reserva Ecológica dos Petroleiros, Tinguá, No...",110,"adult, female, song",bird-seen:yes\nplayback-used:yes\n [sono],ABCDE,XC581125,Creative Commons Attribution-NonCommercial-Sha...,XC581125 - Silvery-flanked Antwren - Myrmother...,/581125/download,./data/XC581125 - Silvery-flanked Antwren - My...
5,Baglafecht Weaver (Ploceus baglafecht),0:21,Frank Lambert,2018-11-13,16:32,Kenya,Kirinyaga County,2100,"[prob also flight calls], song",bird-seen:yes\nplayback-used:no\n [sono],ABCDE,XC450382,Creative Commons Attribution-NonCommercial-NoD...,XC450382 - Baglafecht Weaver - Ploceus baglafe...,/450382/download,./data/XC450382 - Baglafecht Weaver - Ploceus ...
6,Baglafecht Weaver (Ploceus baglafecht),0:27,Stein Ø. Nilsen,2018-12-27,06:00,Tanzania,"Igumbilo Ranch, Makete, Njombe, Mbeya Region",2600,song,Thanks to David Moyer for ID help!\nbird-seen:...,ABCDE,XC490710,Creative Commons Attribution-NonCommercial-Sha...,XC490710 - Baglafecht Weaver - Ploceus baglafe...,/490710/download,./data/XC490710 - Baglafecht Weaver - Ploceus ...
7,Baglafecht Weaver (Ploceus baglafecht reichenowi),0:10,James Bradley,2018-11-01,06:30,Kenya,"Wildebeast Camp, Nairobi",1800,call,bird-seen:yes\nplayback-used:no\n [also]\n[sono],ABCDE,XC444334,Creative Commons Attribution-NonCommercial-Sha...,XC444334 - Baglafecht Weaver - Ploceus baglafe...,/444334/download,./data/XC444334 - Baglafecht Weaver - Ploceus ...
8,Baglafecht Weaver (Ploceus baglafecht),0:14,Meena Haribal,2019-09-15,07:30,Kenya,"Karen, Nairobi",1900,song,In a suburban Wildebeest Ecocamp. Idied by Jam...,ABCDE,XC605853,Creative Commons Attribution-NonCommercial-Sha...,XC605853 - Baglafecht Weaver - Ploceus baglafe...,/605853/download,./data/XC605853 - Baglafecht Weaver - Ploceus ...
9,Baglafecht Weaver (Ploceus baglafecht),0:33,Martin St-Michel,2015-02-26,06:08,Uganda,"Rubanda, Kabale, Western Region",2400,"call, song","Habitat: garden, fields.\nbird-seen:yes\nplayb...",ABCDE,XC235919,Creative Commons Attribution-NonCommercial-Sha...,XC235919 - Baglafecht Weaver - Ploceus baglafe...,/235919/download,./data/XC235919 - Baglafecht Weaver - Ploceus ...


In [7]:
metadata.to_csv("./metadata")