In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def get_species_list(num_species=-1):
    """Scrape the xeno-canto species index into a DataFrame.

    Parameters
    ----------
    num_species : int, default -1
        Number of species to return. ``-1``, or any value larger than the
        number of scraped species, returns the full table; otherwise that
        many rows are drawn at random with ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        One row per species. Columns are the table's own header labels
        followed by ``url``, the href of the link in the row's first cell.
        Cell text is stored exactly as scraped (no whitespace stripping).

    Raises
    ------
    requests.HTTPError
        If the index page answers with an error status.
    IndexError
        If the page has no ``table.results`` element or the table has no
        ``<thead>``.
    """
    # Page listing every xeno-canto species
    url = 'https://xeno-canto.org/collection/species/all'
    page = requests.get(url)
    page.raise_for_status()

    # Name the parser explicitly (the same one get_species_data uses) so the
    # parse does not depend on whichever parser bs4 would otherwise pick.
    soup = BeautifulSoup(page.text, 'lxml')
    species_table = soup.find_all("table", {"class": "results"})[0]

    # Column labels come from the text of the first <thead>, one per line
    headers = species_table.find_all('thead')[0].text.strip().split("\n")
    headers.append('url')

    # Collect rows in a plain list and build the frame once; growing a
    # DataFrame one .loc assignment at a time is quadratic in the row count.
    rows = []
    for table_row in species_table.find_all('tr'):
        cells = table_row.find_all('td')

        # Rows without <td> cells (e.g. a header row) carry no species data
        if not cells:
            continue

        row = [cell.text for cell in cells]
        # The link to the species page sits in the first cell
        row.append(cells[0].a["href"])
        rows.append(row)

    species_list = pd.DataFrame(rows, columns=headers)

    # Return the full list, or a random sample of the requested size
    if num_species == -1 or num_species > species_list.shape[0]:
        return species_list
    return species_list.sample(num_species).reset_index(drop=True)

#get_species_list()
#test1 = get_species_list(5)
#test1

In [3]:
def _fetch_results_table(url):
    """Download ``url`` and return its first ``table.results`` element."""
    page = requests.get(url)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    return soup.find_all("table", {"class": "results"})[0]


def get_species_data(link_to_XC):
    """Scrape the recording metadata of one species, across all result pages.

    Parameters
    ----------
    link_to_XC : str
        Site-relative path of the species page; it is appended to
        ``https://xeno-canto.org``.

    Returns
    -------
    pandas.DataFrame
        One row per recording. Columns are the table's header labels from
        the second one onward (the first of those is renamed to
        ``"Common name / Scientific"``), followed by ``copyright``,
        ``filename`` and ``download url``. ``filename`` is the literal
        ``"PROTECTED SPECIES"`` when the download link has no ``download``
        attribute.

    Raises
    ------
    requests.HTTPError
        If any result page answers with an error status.
    IndexError
        If a fetched page has no ``table.results`` element.
    """
    base_url = 'https://xeno-canto.org' + link_to_XC
    species_table = _fetch_results_table(base_url)

    # Column labels come from the text of the first <thead>, one per line.
    # The first label is dropped, mirroring the first cell dropped per row.
    headers = species_table.find_all('thead')[0].text.strip().split("\n")
    headers[1] = "Common name / Scientific"
    headers += ["copyright", "filename", "download url"]
    columns = headers[1:]

    # Collect rows in a plain list and build the frame once at the end.
    rows = []
    page_count = 1
    while True:
        rows_on_page = 0
        for table_row in species_table.find_all('tr'):
            cells = table_row.find_all('td')

            # Header rows have no <td> cells
            if not cells:
                continue
            rows_on_page += 1

            # All cell text except the first cell
            row = [cell.text.strip() for cell in cells][1:]

            # Licence title, download filename and download link
            license_title = cells[12].a.span["title"]
            download_link = cells[11].a
            try:
                filename = download_link["download"]
            except (KeyError, TypeError):
                # No "download" attribute on the link
                filename = "PROTECTED SPECIES"
            download_url = download_link["href"]

            row += [license_title, filename, download_url]
            rows.append(row)

        # A page without any data row marks the end of the listing; this
        # also stops the loop if a page comes back with no rows at all.
        if rows_on_page == 0:
            break

        # Fetch the next page of recordings
        page_count += 1
        species_table = _fetch_results_table(base_url + "?pg=" + str(page_count))

    return pd.DataFrame(rows, columns=columns)

#test2 = get_species_data(test1.iloc[0]["url"])
#test2

In [4]:
def download_by_url(file_data, download_folder):
    """Download one recording and save it inside ``download_folder``.

    Parameters
    ----------
    file_data : sequence of str
        ``(filename, download url)``: the name to save the file under and
        the site-relative download path. ``download_data`` passes a
        two-element row with exactly these values.
    download_folder : str
        Directory the file is written to.

    Returns
    -------
    str or None
        Path of the written file, or ``None`` for rows whose filename is the
        ``"PROTECTED SPECIES"`` placeholder set by ``get_species_data``
        (nothing is requested or written for those).

    Raises
    ------
    requests.HTTPError
        If the download answers with an error status.
    """
    print(file_data)

    # Unpack by position; integer-key lookup on a label-indexed Series is
    # deprecated in recent pandas.
    filename, url = tuple(file_data)[:2]

    # Placeholder rows have no downloadable audio: their link is not a
    # download link, so fetching it would save a web page under this name.
    if filename == "PROTECTED SPECIES":
        return None

    r = requests.get('https://xeno-canto.org' + url, allow_redirects=True)
    # Do not save an error page as if it were a recording
    r.raise_for_status()

    path = os.path.join(download_folder, filename)
    with open(path, "wb") as audio_file:
        audio_file.write(r.content)
    return path

def download_data(species_df, num_of_files, download_folder):
    """Download the recordings listed in ``species_df``.

    Parameters
    ----------
    species_df : pandas.DataFrame
        Recording metadata with at least the ``filename`` and
        ``download url`` columns. It is not modified.
    num_of_files : int
        Maximum number of recordings to download. ``-1`` downloads all of
        them; when the frame has more rows than this, that many rows are
        sampled at random without replacement.
    download_folder : str
        Directory passed through to ``download_by_url``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected rows plus a ``file_location``
        column with the value ``download_by_url`` returned for each row.
    """
    print("downloading data")
    if num_of_files != -1 and species_df.shape[0] > num_of_files:
        species_df = species_df.sample(num_of_files, replace=False)

    if len(species_df) == 0:
        # apply(axis=1) on an empty frame does not yield a column-shaped
        # result, so build the empty column directly.
        file_locations = pd.Series(index=species_df.index, dtype=object)
    else:
        file_locations = species_df[["filename", "download url"]].apply(
            download_by_url, axis=1, args=(download_folder,)
        )

    # assign() returns a new frame, leaving the caller's frame untouched
    return species_df.assign(file_location=file_locations)

#download_data(test2, 100, "./data/")

In [5]:
def download_xento_canto_data(num_species, num_of_files, download_folder, species_list=None):
    """Scrape and download recordings for a set of species.

    Parameters
    ----------
    num_species : int
        Passed to ``get_species_list`` when no species list is supplied.
    num_of_files : int
        Per-species download limit, passed to ``download_data``.
    download_folder : str
        Directory the recordings are written to.
    species_list : pandas.DataFrame, optional
        Frame with a ``url`` column of species page paths. When omitted or
        empty, the list is fetched with ``get_species_list(num_species)``.

    Returns
    -------
    pandas.DataFrame
        Metadata of all downloaded recordings with a fresh 0..n-1 index;
        an empty frame when nothing was downloaded. Species whose scrape or
        download raises are reported on stdout and skipped.
    """
    if species_list is None or species_list.empty:
        species_list = get_species_list(num_species)

    # Collect per-species frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copying the frame per species is quadratic.
    downloaded_frames = []
    for url in species_list["url"]:
        try:
            species_files = get_species_data(url)
            downloaded_frames.append(
                download_data(species_files, num_of_files, download_folder)
            )
        except Exception as e:
            # Best effort: one failing species must not abort the whole run
            print(url + " did not download")
            print(e)

    if not downloaded_frames:
        return pd.DataFrame()
    return pd.concat(downloaded_frames, ignore_index=True)
        

In [6]:
#metadata = download_xento_canto_data(-1, -1, "./data/")
#metadata

In [7]:
#metadata.to_csv("./metadata")

In [8]:
# Scrape the full species index; -1 turns off random sampling.
test1 = get_species_list(num_species=-1)
test1

Unnamed: 0,Common name,Scientific name,Status,No.,No. Back,url
0,\n\nCommon Ostrich\n\n,Struthio camelus,,7,0,/species/Struthio-camelus
1,\n\nSomali Ostrich\n\n,Struthio molybdophanes,,1,0,/species/Struthio-molybdophanes
2,\n\nGreater Rhea\n\n,Rhea americana,,23,1,/species/Rhea-americana
3,\n\nLesser Rhea\n\n,Rhea pennata,,4,0,/species/Rhea-pennata
4,\n\nSouthern Brown Kiwi\n\n,Apteryx australis,,3,0,/species/Apteryx-australis
...,...,...,...,...,...,...
10995,\n\nDanube Wide-winged Bush-cricket\n\n,Zeuneriana amplipennis,,1,0,/species/Zeuneriana-amplipennis
10996,\n\nIbera Seedeater\n\n,Sporophila digiacomoi,,44,0,/species/Sporophila-digiacomoi
10997,\n\nWhite-tailed Cisticola\n\n,Cisticola tax.nov.kilombero_2,,37,0,/species/Cisticola-tax.nov.kilombero_2
10998,\n\nKilombero Cisticola\n\n,Cisticola tax.nov.kilombero_1,,91,1,/species/Cisticola-tax.nov.kilombero_1


In [9]:
!pip install tqdm
from tqdm import tqdm
file_df = pd.DataFrame()
for url in tqdm(test1["url"]):
    try:
        species_files = get_species_data(url)
    except:
        continue
    if (file_df.empty):
        file_df = species_files.reset_index(drop=True)
    else:
        file_df = file_df.append(species_files.reset_index(drop=True)).reset_index(drop=True)
file_df



100%|█████████████████████████████████████████████████████████████████████████| 11000/11000 [11:10:06<00:00,  3.66s/it]


Unnamed: 0,Common name / Scientific,Length,Recordist,Date,Time,Country,Location,Elev. (m),Type,Remarks,Actions,Cat.nr.,copyright,filename,download url
0,Common Ostrich (Struthio camelus australis),0:53,Frank Lambert,2019-10-30,08:05,South Africa,"Polokwane Game Reserve, Polokwane, Limpopo",1300,"call, juvenile","A small juvenile, evidently separated from its...",ABCDE,XC516153,Creative Commons Attribution-NonCommercial-NoD...,XC516153 - Common Ostrich - Struthio camelus a...,/516153/download
1,Common Ostrich (Struthio camelus),0:26,Jeremy Hegge,2014-11-20,04:00,South Africa,"Mmabolela Reserve, Limpopo",750,call,Recording modified: Frequencies above 640hz re...,ABCDE,XC208209,Creative Commons Attribution-NonCommercial-Sha...,XC208209 - Common Ostrich - Struthio camelus.mp3,/208209/download
2,Common Ostrich (Struthio camelus),0:04,Jeremy Hegge,2014-11-21,06:00,South Africa,"Mmabolela Reserve, Limpopo",750,call,Recording modified: Frequencies above 640hz br...,ABCDE,XC208128,Creative Commons Attribution-NonCommercial-Sha...,XC208128 - Common Ostrich - Struthio camelus.mp3,/208128/download
3,Common Ostrich (Struthio camelus),0:11,Derek Solomon,2010-02-09,07:00,South Africa,Hoedspruit,523,Call,African Rock Python hissing in foreground\n [s...,ABCDE,XC46725,Creative Commons Attribution-NonCommercial-NoD...,XC46725 - Common Ostrich - Struthio camelus.mp3,/46725/download
4,Common Ostrich (Struthio camelus),1:47,Morioka Zoological Park ZOOMO,2021-09-06,17:00,Japan,"Morioka Zoological Park ZOOMO, Morioka City, I...",260,"adult, female, voice during egg laying, zoo co...","Although this is a zoo collection, I share the...",ABCDE,XC675445,Creative Commons Attribution-NonCommercial-Sha...,XC675445 - Common Ostrich - Struthio camelus.mp3,/675445/download
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686325,Kilombero Cisticola (Cisticola tax.nov.kilombe...,1:58,Louis A. Hansen,2019-06-29,15:12,Tanzania,-,240,song,Sailing on the river in a dugout canoe.\n [al...,ABCDE,XC511336,Creative Commons Attribution-NonCommercial-Sha...,PROTECTED SPECIES,/new_thread.php?frontID=511336
686326,Kilombero Cisticola (Cisticola tax.nov.kilombe...,0:34,Louis A. Hansen,2019-06-29,12:00,Tanzania,-,245,song,Sailing on the river in a dugout canoe. Duet s...,ABCDE,XC511326,Creative Commons Attribution-NonCommercial-Sha...,PROTECTED SPECIES,/new_thread.php?frontID=511326
686327,Kilombero Cisticola (Cisticola tax.nov.kilombe...,0:21,Louis A. Hansen,2019-06-29,09:28,Tanzania,-,245,song,Sailing on the river in a dugout canoe. Duet s...,ABCDE,XC511308,Creative Commons Attribution-NonCommercial-Sha...,PROTECTED SPECIES,/new_thread.php?frontID=511308
686328,Kilombero Cisticola (Cisticola tax.nov.kilombe...,0:06,Louis A. Hansen,2019-06-29,10:56,Tanzania,-,245,call,Volume increase\nplayback-used:no\n [sono],ABCDE,XC515823,Creative Commons Attribution-NonCommercial-Sha...,PROTECTED SPECIES,/new_thread.php?frontID=515823
