In [25]:
import pandas as pd

In [26]:
file = "../data/eBird/ebd_AFR_relJul-2024/ebd_AFR_relJul-2024.txt.gz"
ebd0 = pd.read_csv(
    file,
    delimiter="\t",
    usecols=[
        "SCIENTIFIC NAME",
    ],
)

In [27]:
ebd = ebd0.value_counts().reset_index()

In [28]:
# Add Avibase taxon concept id and species code from Clements taxonomy
species_ebird = pd.read_excel("../data/species_list/ebird_taxonomy_v2023.xlsx").rename(
    columns={
        "SCI_NAME": "SCIENTIFIC NAME",
        "TAXON_CONCEPT_ID": "TAXON CONCEPT ID",
        "SPECIES_CODE": "SPECIES CODE",
    }
)
ebd = pd.merge(
    ebd,  # .loc[:,['OBSERVER ID', 'PENTAD', "SAMPLING EVENT IDENTIFIER", "OBSERVATION DATE"]],
    species_ebird[["SCIENTIFIC NAME", "TAXON CONCEPT ID", "SPECIES CODE"]],
    how="left",
)

In [29]:
# Read species_match data
species_abap = pd.read_csv(
    "../data/species_list/ABAP_taxonomy_v20240911.csv", encoding="latin1"
)
species_abap["SCIENTIFIC NAME"] = species_abap["Genus"] + " " + species_abap["Species"]
species_abap = species_abap.rename(
    columns={"avibase_id": "TAXON CONCEPT ID", "SAFRING_No": "ADU"}
)[["ADU", "TAXON CONCEPT ID", "SCIENTIFIC NAME"]]

print(
    f"ADU without an avibaseid/taxon concept: {sum(species_abap['TAXON CONCEPT ID'].isna())}"
)

matched_concepts = pd.merge(
    species_abap[species_abap["TAXON CONCEPT ID"].isna()][["SCIENTIFIC NAME"]],
    species_ebird[["SCIENTIFIC NAME", "TAXON CONCEPT ID"]],
    on="SCIENTIFIC NAME",
    how="left",
)
species_abap.loc[species_abap["TAXON CONCEPT ID"].isna(), "TAXON CONCEPT ID"] = (
    matched_concepts["TAXON CONCEPT ID"]
)

print(
    f"ADU without an avibaseid/taxon concept: {sum(species_abap['TAXON CONCEPT ID'].isna())}"
)

ADU without an avibaseid/taxon concept: 336
ADU without an avibaseid/taxon concept: 332


In [30]:
unmatched_entries = ebd[~ebd["TAXON CONCEPT ID"].isin(species_abap["TAXON CONCEPT ID"])]
print(
    f"We have {len(unmatched_entries)} unmatched entries out of {len(ebd)} taxon in ebd, corresponding to {round(unmatched_entries['count'].sum()/ebd['count'].sum()*100)}% of the data"
)

We have 1037 unmatched entries out of 3127 taxon in ebd, corresponding to 11% of the data


In [31]:
unmatched_entries

Unnamed: 0,SCIENTIFIC NAME,count,TAXON CONCEPT ID,SPECIES CODE
0,Pycnonotus barbatus,303142,avibase-6ABDB635,combul2
9,Milvus migrans,147189,avibase-06D9A2C8,blakit1
16,Camaroptera brachyura,107092,avibase-DC456636,grbcam1
23,Egretta garzetta,93863,avibase-F2858F9F,litegr
26,Icthyophaga vocifer,90635,avibase-A19B0CF4,affeag1
...,...,...,...,...
3122,Merops leschenaulti,1,avibase-F8652F4F,chbeat1
3123,Pyrrholaemus sagittatus,1,avibase-A68B1787,spewar3
3124,Pyrilia haematotis,1,avibase-B197D2D1,brhpar1
3125,Buceros bicornis,1,avibase-A9D587B9,grehor1


In [32]:
matched_concepts = pd.merge(
    unmatched_entries[["SCIENTIFIC NAME"]],
    species_abap[["SCIENTIFIC NAME", "TAXON CONCEPT ID", "ADU"]].drop_duplicates(
        subset="SCIENTIFIC NAME"
    ),
    on="SCIENTIFIC NAME",
    how="left",
)

In [33]:
unmatched_entries = pd.merge(
    unmatched_entries,
    matched_concepts[["SCIENTIFIC NAME", "ADU"]],
    on="SCIENTIFIC NAME",
    how="left",
)

In [34]:
unmatched_entries

Unnamed: 0,SCIENTIFIC NAME,count,TAXON CONCEPT ID,SPECIES CODE,ADU
0,Pycnonotus barbatus,303142,avibase-6ABDB635,combul2,11491.0
1,Milvus migrans,147189,avibase-06D9A2C8,blakit1,128.0
2,Camaroptera brachyura,107092,avibase-DC456636,grbcam1,627.0
3,Egretta garzetta,93863,avibase-F2858F9F,litegr,59.0
4,Icthyophaga vocifer,90635,avibase-A19B0CF4,affeag1,
...,...,...,...,...,...
1032,Merops leschenaulti,1,avibase-F8652F4F,chbeat1,
1033,Pyrrholaemus sagittatus,1,avibase-A68B1787,spewar3,
1034,Pyrilia haematotis,1,avibase-B197D2D1,brhpar1,
1035,Buceros bicornis,1,avibase-A9D587B9,grehor1,


In [35]:
species_abap = pd.concat(
    [
        species_abap,
        unmatched_entries[~unmatched_entries["ADU"].isna()][
            ["ADU", "TAXON CONCEPT ID", "SCIENTIFIC NAME"]
        ],
    ]
)

In [36]:
unmatched_entries = ebd[~ebd["TAXON CONCEPT ID"].isin(species_abap["TAXON CONCEPT ID"])]
print(
    f"We have {len(unmatched_entries)} unmatched entries out of {len(ebd)} taxon in ebd, corresponding to {round(unmatched_entries['count'].sum()/ebd['count'].sum()*100)}% of the data"
)

We have 941 unmatched entries out of 3127 taxon in ebd, corresponding to 4% of the data


In [37]:
unmatched_entries.to_csv("../data/species_list/unmatched_species.csv")
unmatched_entries

Unnamed: 0,SCIENTIFIC NAME,count,TAXON CONCEPT ID,SPECIES CODE
26,Icthyophaga vocifer,90635,avibase-A19B0CF4,affeag1
30,Spermestes cucullata,82830,avibase-142E4CB7,broman1
91,Zapornia flavirostra,51197,avibase-148F0B01,blacra1
156,Lanius melanoleucus,34528,avibase-56A7F384,magshr1
173,Ardea brachyrhyncha,31084,avibase-C3ABF863,integr3
...,...,...,...,...
3122,Merops leschenaulti,1,avibase-F8652F4F,chbeat1
3123,Pyrrholaemus sagittatus,1,avibase-A68B1787,spewar3
3124,Pyrilia haematotis,1,avibase-B197D2D1,brhpar1
3125,Buceros bicornis,1,avibase-A9D587B9,grehor1


In [38]:
species_abap.to_csv("../data/species_list/matched_species.csv")