# Convert XC metadata to BirdCLEF metadata

Convert metadata scraped from Xeno-Canto (XC) into the `train_metadata.csv` format used for the BirdCLEF competition.

In [79]:
from pathlib import Path
import ast
from collections import defaultdict
from collections.abc import Mapping
from typing import Final

import pandas as pd

# XC quality letter -> integer rating ("A" -> 5 down to "E" -> 1).
# Looking up any other key yields 0 through the defaultdict factory.
RATING_MAP: Final[defaultdict[str, int]] = defaultdict(int, zip("ABCDE", range(5, 0, -1)))

# eBird taxonomy table used to translate scientific names into species codes
EBIRD_TAXONOMY_PATH: Final[Path] = Path("../data/raw/eBird_Taxonomy_v2021.csv")

# Active input/output pair: re-scraped 2024 metadata
XC_METADATA_PATH: Final[Path] = Path("./dataset/train_metadata_xc.csv")
OUT_METADATA_PATH: Final[Path] = Path("../data/raw/2024/train_metadata.csv")

# Alternative input/output pair: additional 2024 metadata
# (swap with the pair above to convert the additional dataset instead)
# XC_METADATA_PATH: Final[Path] = Path("../data/raw/BirdClef2024_additional.csv")
# OUT_METADATA_PATH: Final[Path] = Path("../data/raw/2024_additional/train_metadata.csv")

In [80]:
# Load only the two taxonomy columns needed to go from scientific name to birdcode
ebird_taxonomy = pd.read_csv(EBIRD_TAXONOMY_PATH, usecols=["SPECIES_CODE", "SCI_NAME"])

# Index the species codes by scientific name, then flatten that Series into a plain dict
species_code_by_sci_name = ebird_taxonomy.set_index("SCI_NAME")["SPECIES_CODE"]
scientific_name_to_birdcode: Mapping[str, str] = species_code_by_sci_name.to_dict()

In [81]:
def _to_secondary_labels(raw_also: object) -> list[str]:
    """Translate one raw ``also`` cell into a list of eBird species codes.

    Args:
        raw_also: Cell value from the ``also`` column. Expected to be the string
            form of a Python list of scientific names; anything that is not a
            non-blank string (e.g. the NaN pandas produces for an empty CSV cell)
            is treated as "no secondary species".

    Returns:
        Species codes for the listed names, in their original order. Names that
        are missing from ``scientific_name_to_birdcode`` are skipped.

    Raises:
        ValueError, SyntaxError: If a non-blank string is not a valid Python literal.
    """
    # ast.literal_eval only accepts strings/AST nodes, so guard against NaN and blanks.
    if not isinstance(raw_also, str) or not raw_also.strip():
        return []
    return [
        scientific_name_to_birdcode[name]
        for name in ast.literal_eval(raw_also)
        if name in scientific_name_to_birdcode
    ]


# Read XC metadata and rename columns that map one-to-one to columns in train_metadata
metadata = pd.read_csv(XC_METADATA_PATH, usecols=["id", "gen", "sp", "en", "rec", "lat", "lng", "type", "url", "lic", "q", "also"]).rename(columns={
    "en": "common_name",
    "rec": "author",
    "lat": "latitude",
    "lng": "longitude",
    "lic": "license",
})

# Add all other columns
# Scientific name is "<genus> <species>"; missing parts become empty strings before joining
metadata["scientific_name"] = (metadata["gen"].fillna("") + " " + metadata["sp"].fillna("")).str.strip()
metadata["type"] = metadata["type"].fillna("").str.split(", ")
# Names absent from the taxonomy map to NaN here; those rows are removed by the dropna below
metadata["primary_label"] = metadata["scientific_name"].map(scientific_name_to_birdcode)
metadata["secondary_labels"] = metadata["also"].map(_to_secondary_labels)
# The url column is stored without a scheme, so prefix one
metadata["url"] = "https:" + metadata["url"]
metadata["filename"] = (metadata["primary_label"] + "/XC" + metadata["id"].astype(str) + ".mp3")
metadata["rating"] = metadata["q"].map(RATING_MAP)
# reindex keeps only the listed columns (in this order), so no separate drop is needed
metadata = metadata.sort_values("filename").reindex(columns=[
    "primary_label", "secondary_labels", "type", "latitude", "longitude", "scientific_name", "common_name", "author", "license", "rating", "url",  "filename"
]).dropna(subset=["primary_label"])

metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
918,asbfly,[],[call],39.2297,118.1987,Muscicapa dauurica,Asian Brown Flycatcher,Matt Slaymaker,//creativecommons.org/licenses/by-nc-sa/3.0/,5,https://xeno-canto.org/134896,asbfly/XC134896.mp3
1774,asbfly,"[gybthr1, brebul1]",[song],36.3319,127.3555,Muscicapa dauurica,Asian Brown Flycatcher,Stuart Fisher,//creativecommons.org/licenses/by-nc-sa/4.0/,3,https://xeno-canto.org/175797,asbfly/XC175797.mp3
2903,asbfly,[],[call],21.1697,70.6005,Muscicapa dauurica,Asian Brown Flycatcher,vir joshi,//creativecommons.org/licenses/by-nc-sa/4.0/,1,https://xeno-canto.org/207738,asbfly/XC207738.mp3
3094,asbfly,[],[call],15.5442,73.7733,Muscicapa dauurica,Asian Brown Flycatcher,Albert Lastukhin & Sergei Karpeev,//creativecommons.org/licenses/by-nc-sa/4.0/,4,https://xeno-canto.org/209218,asbfly/XC209218.mp3
3095,asbfly,[],[call],15.5442,73.7733,Muscicapa dauurica,Asian Brown Flycatcher,Albert Lastukhin & Sergei Karpeev,//creativecommons.org/licenses/by-nc-sa/4.0/,4,https://xeno-canto.org/209219,asbfly/XC209219.mp3


In [82]:
# DataFrame.to_csv does not create missing parent folders, so make sure the target directory exists
OUT_METADATA_PATH.parent.mkdir(parents=True, exist_ok=True)
metadata.to_csv(OUT_METADATA_PATH, index=False)