In [1]:
import pandas as pd 
pd.options.display.max_columns = 100

In [None]:
wiki_list = pd.read_csv("list_from_wikipedia.csv")
wiki_list["scientific_name"] = wiki_list["Common and binomial names"].str.extract("\((.+)\)")
wiki_list["common_name"] = wiki_list["Common and binomial names"].str.extract("([^\(]+)")
wiki_list["common_name_lower_wiki"] = wiki_list["common_name"].str.lower()
wiki_list["scientific_name_lower_wiki"] = wiki_list["scientific_name"].str.lower()

wiki_list = wiki_list.drop(["Common and binomial names", "Image", "scientific_name", "common_name"], axis=1)
wiki_list = wiki_list.rename(columns={"Status": "wiki_status"})
wiki_list.head(2)

In [None]:
british_list = pd.read_csv("the_british_list.csv")
british_list["common_name_lower_bl"] = british_list["common_name"].str.lower()
british_list["scientific_name_lower_bl"] = british_list["scientific_name"].str.lower()
british_list.head(2)

In [None]:
merged = british_list.merge(wiki_list, left_on="common_name_lower_bl", right_on="common_name_lower_wiki", how="left")
f1 = merged["scientific_name_lower_bl"] != merged["scientific_name_lower_wiki"]
cols = ["common_name_lower_bl", "common_name_lower_wiki", "scientific_name_lower_bl", "scientific_name_lower_wiki"]
# m = merged[f1]
m = merged
m = m.drop(["common_name_lower_bl", "common_name_lower_wiki"], axis=1)
m["scientific_name_lower_wiki"] = m["scientific_name_lower_wiki"].fillna("")
m.sample(3)

In [None]:
len(m)

In [10]:
import requests 
import json
def call_taxa(taxon_name):
    print(f"calling api for {taxon_name}")
    endpoint = 'https://api.inaturalist.org/v1/taxa'
    params = {
        "q": taxon_name,
        "page":1,
        "all_names": True
    }
    
    r = requests.get(endpoint, params)
    
    results = json.loads(r.text) 
    
    df = pd.DataFrame(results["results"])

    cols = ["name", "rank", "preferred_common_name", "names", "id"]
    return df[cols]

results = call_taxa("Anas crecca carolinensis")
results.head(2)

calling api for Anas crecca carolinensis


Unnamed: 0,name,rank,preferred_common_name,names,id
0,Anas crecca carolinensis,subspecies,American Green-winged Teal,"[{'is_valid': True, 'name': 'Anas crecca carol...",123676
1,Anas crecca crecca × carolinensis,infrahybrid,Eurasian × American Green-winged Teal,"[{'is_valid': True, 'name': 'Anas crecca crecc...",533247


In [None]:
def find_in_results(results, wiki_bl_row):
    
    
    # Only results that match the one of the 
    f1 = results["name"].str.lower() == wiki_bl_row["scientific_name_lower_bl"]
    f2 = results["name"].str.lower() == wiki_bl_row["scientific_name_lower_wiki"]
    f3 = results["preferred_common_name"].str.lower() == wiki_bl_row["common_name"].lower()
    
    
    
    results.loc[f3, "result_priority"] = 3
    results.loc[f2, "result_priority"] = 4
    results.loc[f1, "result_priority"] = 5
    results.loc[f3 & f2, "result_priority"] = 1
    results.loc[f3 & f1, "result_priority"] = 2
    
    
    results_f = results[f1 | f2 | f3 ].sort_values("result_priority")
    
    if len(results_f) == 0:
        if len(results) == 1:
               results_f = results
    return results_f.iloc[0, :]




In [None]:
# Check against inaturalist api - go through one at a time requesting the scientific name.  If common name matches, use that taxon id
for r in m.iterrows():
    
    index = r[0]
    row = r[1]
    print(row["common_name"])
    taxa_results = call_taxa(row["scientific_name"])
    if len(taxa_results ) == 0:
        print("no results, trying wiki neame")
        taxa_results = call_taxa(row["scientific_name_lower_wiki"])
        if len(taxa_results ) == 0:
            print("still_no_results")
        
    try:
        found = find_in_results(taxa_results, row)

        m.loc[index, "inaturalist_id"] = found.id
        m.loc[index, "inaturalist_scientific_name"] = found["name"]
        m.loc[index, "inaturalist_common_name"] = found.preferred_common_name
    except IndexError:
        print(f'Error finding {row["common_name"]}')


In [None]:

m["inaturalist_id"] = m["inaturalist_id"].fillna(-1).astype(int)

m.to_csv("birds_list_with_inaturalist_id.csv", index=False)

In [3]:
m = pd.read_csv("birds_list_with_inaturalist_id.csv")

In [6]:
m[m["inaturalist_id"] == -1]

Unnamed: 0,common_name,link,scientific_name,status_in_uk,pairs_text,field_code_1,field_code_2,roughly_how_common,scientific_name_lower_bl,wiki_status,scientific_name_lower_wiki,inaturalist_id,inaturalist_scientific_name,inaturalist_common_name
41,Green-winged Teal,http://app.bto.org/birdfacts/results/bob1842.htm,Anas carolinensis,Scarce Visitor,10 - 100 Birds,TA,GRWTE,31.62278,anas carolinensis,A scarce migrant,anas carolinensis,-1,,
99,Baillon’s Crake,http://app.bto.org/birdfacts/results/bob4110.htm,Porzana pusilla,"Occasional Breeder, Accidental",10 - 100 Records,VC,BAICR,31.62278,porzana pusilla,,,-1,,
371,Jackdaw,http://app.bto.org/birdfacts/results/bob15600.htm,Coloeus monedula,"Resident Breeder, Winter Visitor",1.3E06 Pairs,JD,JACKD,1300000.0,coloeus monedula,A resident breeding species,coloeus monedula,-1,,
437,Pallas’s Grasshopper Warbler,http://app.bto.org/birdfacts/results/bob12330.htm,Helopsaltes certhiola,Accidental,10 - 100 Records,,PAGWA,31.62278,helopsaltes certhiola,,,-1,,


In [9]:
f1 = m["common_name"] == "Jackdaw"
m.loc[f1, "inaturalist_id"] = 8000
m.loc[f1, "inaturalist_scientific_name"] = "Corvus monedula"
m.loc[f1, "inaturalist_common_name"] = "Eurasian Jackdaw"


In [11]:
f1 = m["common_name"] == "Green-winged Teal"
m.loc[f1, "inaturalist_id"] = 123676
m.loc[f1, "inaturalist_scientific_name"] = "Anas crecca carolinensis"
m.loc[f1, "inaturalist_common_name"] = "American Green-winged Teal"

In [13]:
f1 = m["common_name"] == "Baillon’s Crake"
m.loc[f1, "inaturalist_id"] = 508920
m.loc[f1, "inaturalist_scientific_name"] = "Zapornia pusilla"
m.loc[f1, "inaturalist_common_name"] = "Baillon's Crake"


In [14]:
f1 = m["common_name"] == "Pallas’s Grasshopper Warbler"
m.loc[f1, "inaturalist_id"] = 116936
m.loc[f1, "inaturalist_scientific_name"] = "Locustella certhiola"
m.loc[f1, "inaturalist_common_name"] = "Pallas's Grasshopper-Warbler"


In [None]:
m