In [1]:
!pip install lxml



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
def get_species_list(num_species=-1, url = 'https://xeno-canto.org/collection/species/all'):
    #print("getting species list")
    #get page of all xento-canto species
    
    page = requests.get(url)
    soup = BeautifulSoup(page.text)
    species_table = soup.find_all("table", {"class": "results"})[0]
    
    #get headers of table
    headers = []
    for i in species_table.find_all('thead'):
         title = i.text
         headers.append(title)
    headers = headers[0].strip().split("\n")
    
    #add header for url and create DF
    headers.append('url')
    species_list = pd.DataFrame(columns = headers)
    
    #Get data from each row
    for j in species_table.find_all('tr'):
         #get row data
         row_data = j.find_all('td')
        
         #get all text from row
         row = [i.text for i in row_data]
        
         #get the link
         row.append(row_data[0].a["href"])
        
         #add to dataframe
         length = len(species_list)
         species_list.loc[length] = row

    #Decide to return full list or do RS     
    if (num_species == -1 or num_species > species_list.shape[0]):
        return species_list
    return species_list.sample(num_species).reset_index(drop=True)

#get_species_list()
#test1 = get_species_list(5)
#test1

In [4]:
def get_species_data(link_to_XC, pg="?pg="):
    #print("getting species file metadata for: " + link_to_XC)
    #get soup for Species
    url = 'https://xeno-canto.org' + link_to_XC
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    species_table = soup.find_all("table", {"class": "results"})[0]
    
    # Obtain every title of columns with tag <th>
    headers = []
    for i in species_table.find_all('thead'):
     title = i.text
     headers.append(title)

    headers = headers[0].strip()
    headers = headers.split("\n")
    headers[1] = "Common name / Scientific"
    headers.append("copyright")
    headers.append("filename")
    headers.append("download url")


    species_list = pd.DataFrame(columns = headers[1:])
    
    
    page_count = 1
    while len(species_table.find_all('tr')) != 1:
        
        #print(len(species_table.find_all('tr')))
        
        #Get data from each row
        for j in species_table.find_all('tr'):
            #get row data
            row_data = j.find_all('td')

            if (row_data == []):
                   continue

            #print(row_data) 
            #get all text from row
            row = [i.text.strip() for i in row_data][1:]

            #get copyright, download link and filename
            copyright = row_data[12].a.span["title"]
            try:
                 filename = row_data[11].a["download"]
            except:
                filename = "PROTECTED SPECIES"
            download_url = row_data[11].a["href"]
            row.append(copyright)
            row.append(filename)
            row.append(download_url)

            #add to dataframe
            length = len(species_list)
            species_list.loc[length] = row
        
        ## Get data for next page of audio data
        page_count += 1
        url = 'https://xeno-canto.org' + link_to_XC + "pg=" + str(page_count)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')
        species_table = soup.find_all("table", {"class": "results"})[0]
    return species_list

#test2 = get_species_data(test1.iloc[0]["url"])
#test2

In [5]:
data = get_species_data("/explore?query=box%3A17.301%2C-78.652%2C18.837%2C-75.839+&dir=0&order=en&")

In [6]:
data

Unnamed: 0,Common name / Scientific,Length,Recordist,Date,Time,Country,Location,Elev. (m),Type,Remarks,Actions,Cat.nr.,copyright,filename,download url
0,American Redstart (Setophaga ruticilla),0:05,Ross Gallardy,2016-03-09,16:30,Jamaica,Ecclesdown Rd,300,"call, male",bird-seen:yes\nplayback-used:no\n [sono],ABCDE,XC308434,Creative Commons Attribution-NonCommercial-Sha...,XC308434 - American Redstart - Setophaga rutic...,/308434/download
1,American Redstart (Setophaga ruticilla),0:22,Nathan Hentze,2014-02-09,11:49,Jamaica,"Burnt Hil, Trelawny",500,call,Spontaneously calling bird moving around roads...,ABCDE,XC289892,Creative Commons Attribution-NonCommercial-Sha...,XC289892 - American Redstart - Setophaga rutic...,/289892/download
2,Arrowhead Warbler (Setophaga pharetra),0:21,Richard C. Hoyer,2010-04-11,11:54,Jamaica,"Ecclesdown, Portland",220,call,female in response to pishing\n Recording (not...,ABCDE,XC48115,Creative Commons Attribution-NonCommercial-NoD...,XC48115 - Arrowhead Warbler - Setophaga pharet...,/48115/download
3,Bahama Mockingbird (Mimus gundlachii hillii),2:30,Nathan Hentze,2014-02-09,10:01,Jamaica,"Portland Cottage, Clarendon Parish, Jamaica",4,song,Perched in low branches of tree ~10 ft above g...,ABCDE,XC203771,Creative Commons Attribution-NonCommercial-Sha...,XC203771 - Bahama Mockingbird - Mimus gundlach...,/203771/download
4,Bahama Mockingbird (Mimus gundlachii),2:31,Garrett MacDonald,2013-05-06,13:22,Jamaica,"Portland Cottage, Clarendon Parish, Jamaica",0,song,bird-seen:yes\nplayback-used:yes\n [sono],ABCDE,XC133044,Creative Commons Attribution-NonCommercial-Sha...,XC133044 - Bahama Mockingbird - Mimus gundlach...,/133044/download
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Yellow-faced Grassquit (Tiaris olivaceus),0:17,Kiirsti,2017-02-25,08:00,Jamaica,"Fonthill Beach (near Font Hill), St. Elizabet...",0,"male, song","Male singing in logwood-dominant, dry-scrub ha...",ABCDE,XC362785,Creative Commons Attribution-NonCommercial-Sha...,XC362785 - Yellow-faced Grassquit - Tiaris oli...,/362785/download
165,Yellow-shouldered Grassquit (Loxipasser anoxan...,0:19,Richard C. Hoyer,2010-04-11,10:04,Jamaica,"Ecclesdown, Portland",220,song,[sono],ABCDE,XC48144,Creative Commons Attribution-NonCommercial-NoD...,XC48144 - Yellow-shouldered Grassquit - Loxipa...,/48144/download
166,Yellow-shouldered Grassquit (Loxipasser anoxan...,0:18,Richard C. Hoyer,2010-04-12,13:21,Jamaica,"Ecclesdown, Portland",220,song,[sono],ABCDE,XC48143,Creative Commons Attribution-NonCommercial-NoD...,XC48143 - Yellow-shouldered Grassquit - Loxipa...,/48143/download
167,Yellow-shouldered Grassquit (Loxipasser anoxan...,0:05,Bobby Wilcox,2022-01-27,14:00,Jamaica,"Green Castle Estate House, Robins Bay, Saint M...",120,"male, song",About 15m away in rainforest. \nbird-seen:yes\...,ABCDE,XC713807,Creative Commons Attribution-NonCommercial-Sha...,XC713807 - Yellow-shouldered Grassquit - Loxip...,/713807/download


In [7]:
data.to_csv("Jamaica.csv")

In [8]:
data.groupby("Common name / Scientific").count().sort_values(by="Length", ascending=False).rename(columns={"Length": "Count"})[["Count"]].to_csv("Jamaica Counts")

In [9]:
data.groupby("Common name / Scientific").count().index

Index(['(?) Identity unknown', 'American Redstart (Setophaga ruticilla)',
       'Arrowhead Warbler (Setophaga pharetra)',
       'Bahama Mockingbird (Mimus gundlachii hillii)',
       'Bahama Mockingbird (Mimus gundlachii)',
       'Bananaquit (Coereba flaveola flaveola)',
       'Bananaquit (Coereba flaveola)', 'Black-billed Amazon (Amazona agilis)',
       'Black-billed Streamertail (Trochilus scitulus)',
       'Black-faced Grassquit (Tiaris bicolor marchii)',
       'Black-whiskered Vireo (Vireo altiloquus)',
       'Blue Mountain Vireo (Vireo osburni)',
       'Caribbean Dove (Leptotila jamaicensis jamaicensis)',
       'Chestnut-bellied Cuckoo (Coccyzus pluvialis)',
       'Crested Quail-Dove (Geotrygon versicolor)',
       'Greater Antillean Bullfinch (Loxigilla violacea ruficollis)',
       'Greater Antillean Bullfinch (Loxigilla violacea)',
       'Greater Antillean Elaenia (Elaenia fallax)',
       'Greater Antillean Grackle (Quiscalus niger crassirostris)',
       'Greater 

In [10]:
all_species = get_species_list()

KeyboardInterrupt: 

In [None]:
all_species["Common name"] = all_species["Common name"].apply(lambda x: x.strip())

In [11]:
jamaica_species_names = pd.Series(data.groupby("Common name / Scientific").count().index).apply(lambda x: x.split("(")[0].strip())
jamaica_species_names

0                                
1               American Redstart
2               Arrowhead Warbler
3              Bahama Mockingbird
4              Bahama Mockingbird
                 ...             
56              White-eyed Thrush
57              White-winged Dove
58           Yellow-billed Amazon
59         Yellow-faced Grassquit
60    Yellow-shouldered Grassquit
Name: Common name / Scientific, Length: 61, dtype: object

In [None]:
all_species

In [None]:
import numpy as np

In [None]:
np.array(jamaica_species_names)

In [None]:
jamaica_species_data = all_species[all_species["Common name"].isin(np.array(jamaica_species_names))]

In [None]:
jamaica_species_data.shape[0]

In [None]:
jamaica_audio_metadata = pd.DataFrame(columns=data.columns)
for url in jamaica_species_data["url"]:
    url = url + "?"
    jamaica_audio_metadata = jamaica_audio_metadata.append(get_species_data(url))
jamaica_audio_metadata

In [None]:
jamaica_audio_metadata.groupby("Common name / Scientific").count().sort_values(by="Length", ascending=False).rename(columns={"Length": "Count"})[["Count"]].to_csv("Jamaica_Counts_Worldwide_Audio.csv")

In [None]:
data.groupby("Common name / Scientific").count().sort_values(by="Length", ascending=False).rename(columns={"Length": "Count"})[["Count"]].to_csv("Jamaica_Counts_Only_In_Jamaica.csv")

In [None]:
world = jamaica_audio_metadata.groupby("Common name / Scientific").count().sort_values(by="Length", ascending=False).rename(columns={"Length": "Count"})[["Count"]]

In [None]:
world.plot(kind="hist",density=True, bins=20, title="Count of Worldwide Audio Data of Jamaica Species");

In [None]:
jamaica = data.groupby("Common name / Scientific").count().sort_values(by="Length", ascending=False).rename(columns={"Length": "Count"})[["Count"]]

In [None]:
jamaica.plot(kind="hist",density=True, bins=20, title="Count of Jamaica Audio Data of Jamaica Species");

In [None]:
data.to_csv("jamaica_meta_only.csv")

In [None]:
jamaica_audio_metadata.to_csv("jamaica_meta_world.csv")

In [26]:
jamaica_species_names = pd.Series(data.groupby("Common name / Scientific").count().index).apply(lambda x: x.split("(")[1].strip()[:-1])
jamaica_species_names

0          ?) Identity unknow
1         Setophaga ruticilla
2          Setophaga pharetra
3     Mimus gundlachii hillii
4            Mimus gundlachii
               ...           
56         Turdus jamaicensis
57           Zenaida asiatica
58           Amazona collaria
59           Tiaris olivaceus
60      Loxipasser anoxanthus
Name: Common name / Scientific, Length: 61, dtype: object

In [27]:
jamaica_species_names

0          ?) Identity unknow
1         Setophaga ruticilla
2          Setophaga pharetra
3     Mimus gundlachii hillii
4            Mimus gundlachii
               ...           
56         Turdus jamaicensis
57           Zenaida asiatica
58           Amazona collaria
59           Tiaris olivaceus
60      Loxipasser anoxanthus
Name: Common name / Scientific, Length: 61, dtype: object

In [28]:
import numpy
numpy.intersect1d(pd.read_csv("./birdnet_labels.csv", header=None)[0], jamaica_species_names)

False


  ', ', prefix, suffix=suffix, legacy=True)


array(['Coereba flaveola', 'Eupsittula nana', 'Geotrygon montana',
       'Mellisuga minima', 'Mimus gundlachii', 'Mimus polyglottos',
       'Seiurus aurocapilla', 'Setophaga americana',
       'Setophaga ruticilla', 'Tiaris olivaceus', 'Tyrannus dominicensis',
       'Vireo altiloquus', 'Zenaida asiatica'], dtype=object)

In [30]:
birdnet = pd.read_csv("./birdnet_labels.csv", header=None)
birdnet[birdnet[0].isin(jamaica_species_names)]    

Unnamed: 0,0,1
745,Coereba flaveola,Bananaquit
1189,Eupsittula nana,Olive-throated Parakeet
1293,Geotrygon montana,Ruddy Quail-Dove
1773,Mellisuga minima,Vervain Hummingbird
1820,Mimus gundlachii,Bahama Mockingbird
1823,Mimus polyglottos,Northern Mockingbird
2758,Seiurus aurocapilla,Ovenbird
2776,Setophaga americana,Northern Parula
2796,Setophaga ruticilla,American Redstart
3082,Tiaris olivaceus,Yellow-faced Grassquit


In [22]:
birdnet[0]

0         Abroscopus albogularis
1       Abroscopus superciliaris
2                 Aburria aburri
3       Acanthagenys rufogularis
4               Acanthis cabaret
                  ...           
3332            Zosterops meyeni
3333       Zosterops palpebrosus
3334      Zosterops senegalensis
3335           Zosterops simplex
3336            Zosterops virens
Name: 0, Length: 3337, dtype: object

In [23]:
jamaica_species_names

0                                
1               American Redstart
2               Arrowhead Warbler
3              Bahama Mockingbird
4              Bahama Mockingbird
                 ...             
56              White-eyed Thrush
57              White-winged Dove
58           Yellow-billed Amazon
59         Yellow-faced Grassquit
60    Yellow-shouldered Grassquit
Name: Common name / Scientific, Length: 61, dtype: object

In [None]:
def download_by_url(file_data, download_folder):
    print(file_data)
    url = file_data[1]
    url = 'https://xeno-canto.org' + url
    r = requests.get(url, allow_redirects=True)
    path = download_folder+file_data[0]
    open(path, "wb").write(r.content)
    return path

def download_data(species_df, num_of_files, download_folder):
    print("downloading data")
    if (num_of_files != -1 and species_df.shape[0] > num_of_files):
        species_df = species_df.sample(num_of_files, replace=False)
    species_df["file_location"] = species_df[["filename","download url"]].apply(download_by_url,axis=1, args=(tuple([download_folder])))
    return species_df

#download_data(test2, 100, "./data/")

In [None]:
def download_xento_canto_data(num_species, num_of_files, download_folder, species_list=pd.DataFrame()):
    file_df = pd.DataFrame()
    if (species_list.empty):
        species_list = get_species_list(num_species)
    for url in species_list["url"]:
        try:
            species_files = get_species_data(url)
            temp_file_df = download_data(species_files, num_of_files, download_folder)
            if (file_df.empty):
                file_df = temp_file_df.reset_index(drop=True)
            else:
                file_df = file_df.append(temp_file_df.reset_index(drop=True)).reset_index(drop=True)
        except Exception as e:
            print(url + " did not download")
            print(e)
    return file_df
        

In [None]:
#metadata = download_xento_canto_data(-1, -1, "./data/")
#metadata

In [None]:
#metadata.to_csv("./metadata")

In [None]:
#get_species_list()
test1 = get_species_list(-1)
test1

In [None]:
!pip install tqdm
from tqdm import tqdm
file_df = pd.DataFrame()
for url in tqdm(test1["url"]):
    try:
        species_files = get_species_data(url)
    except:
        continue
    if (file_df.empty):
        file_df = species_files.reset_index(drop=True)
    else:
        file_df = file_df.append(species_files.reset_index(drop=True)).reset_index(drop=True)
file_df

In [None]:
madreDeDios_species = pd.read_csv("C:/Users/Siloux/Downloads/MadreDeDiosBirdsXCList - MadreDeDiosBirdsXCList.csv")

In [None]:
file_df.to_csv("./all_meta_data.csv")

In [None]:
file_df["Common"] = file_df["Common name / Scientific"].apply(lambda x: x.split(" ")[0] + " "+ x.split(" ")[1])
file_df

In [None]:
file_df.groupby("Common").count().sort_values(by="Length", ascending=False)

In [None]:
file_MDD_XC = file_df.merge(madreDeDios_species, left_on="Common", right_on="Common")
file_MDD_XC

In [None]:
file_MDD_XC.groupby("Common").count().sort_values(by="Length", ascending=False)

In [None]:
test1.set_index("Common name").loc["\n\nOrange-cheeked Parrot\n\n"]

In [None]:
#url = "https://xeno-canto.org/species/Pyrilia-barrabandi"
#get_species_data("/species/Pyrilia-barrabandi")

In [None]:
file_df2 = pd.read_csv("./all_meta_data.csv")
file_df2

In [None]:
file_df

In [None]:
import re
mixed_bird = pd.read_csv("C:/Users/Siloux/Desktop/E4E/PyHa/mixed_bird/mixed_bird_manual.csv")
mixed_bird["ID"] = mixed_bird["IN FILE"].apply(lambda x: re.findall(r'\d+', x)[0])
mixed_bird

In [None]:
file_df["ID"] = file_df["Cat.nr."].apply(lambda x: re.findall(r'\d+', x)[0])

In [None]:
file_df

In [None]:
file_df[~file_df["ID"].isin(mixed_bird["ID"])]

In [None]:
file_df[file_df["Common"].isin(madreDeDios_species["Common"])]

In [None]:
mixed_bird["MANUAL ID LENGTH"] = mixed_bird["MANUAL ID"].apply(lambda x: len(x.split(" ")))
mixed_bird["Common"] = mixed_bird["MANUAL ID"].apply(lambda x: " ".join(x.split(" ")[2:]))
mixed_bird

In [None]:
file_df[(file_df["Common"].isin(madreDeDios_species["Common"])) & (~file_df["ID"].isin(mixed_bird["ID"]))]

In [None]:
unlabeled_mixed_bird = file_df[(file_df["Common"].isin(madreDeDios_species["Common"])) & (~file_df["ID"].isin(mixed_bird["ID"])) & (file_df["Common"].isin(mixed_bird["Common"]))]

In [None]:
unlabeled_mixed_bird["TIME"] = unlabeled_mixed_bird["Length"].apply(lambda x : int(x.split(":")[0]) * 60 + int(x.split(":")[1]))

In [None]:
unlabeled_mixed_bird

In [None]:
counts_of_unlabeled_mixed_bird = unlabeled_mixed_bird.groupby("Common").sum()

In [None]:
possible_species = counts_of_unlabeled_mixed_bird[counts_of_unlabeled_mixed_bird["TIME"] < 500]

In [None]:
possible_species

In [None]:
possible_species_files = unlabeled_mixed_bird[unlabeled_mixed_bird["Common"].isin(possible_species.index)]

In [None]:
mixed_bird_counts = mixed_bird.groupby("Common").count().sort_values("IN FILE", ascending=True)
mixed_bird_counts[mixed_bird_counts["IN FILE"] < 25]

In [None]:
mixed_bird_counts = mixed_bird.groupby("Common").sum().sort_values("DURATION", ascending=True)
mixed_bird_counts = mixed_bird_counts[["DURATION"]]
mixed_bird_counts

In [None]:
mixed_bird_counts2 = mixed_bird.groupby("Common").count().sort_values("IN FILE", ascending=True)
mixed_bird_counts2 = mixed_bird_counts2[["IN FILE"]]
mixed_bird_counts = mixed_bird_counts.merge(mixed_bird_counts2, left_index=True, right_index=True)

In [None]:
mixed_bird_counts["FILES_PER_SECOND"] = mixed_bird_counts["IN FILE"] / mixed_bird_counts["DURATION"]
mixed_bird_counts

In [None]:
mixed_bird_counts.head(50)

In [None]:
sum(mixed_bird_counts["FILES_PER_SECOND"])/mixed_bird_counts.shape[0] * 100

In [None]:
mixed_bird_counts[(mixed_bird_counts["DURATION"] < 75)].head(30).index

In [None]:
files_to_download= unlabeled_mixed_bird[unlabeled_mixed_bird["Common"].isin(mixed_bird_counts[(mixed_bird_counts["DURATION"] < 75)].head(20).index)]

In [None]:
sum(files_to_download["TIME"])

In [None]:
(60 * 60)/3 * 10


In [None]:
files_to_download

In [None]:
meta = download_data(files_to_download, -1, "./extra_mixed_bird/")
meta

In [None]:
mixed_bird["Common"] = mixed_bird["MANUAL ID"].apply(lambda x: " ".join(x.split(" ")[2:]))
mixed_bird