# Convert XC metadata to BirdCLEF metadata

Convert metadata scraped from Xeno-Canto (XC) into the `train_metadata.csv` format used for the BirdCLEF competition.

In [46]:
from pathlib import Path
import ast
from collections import defaultdict
from collections.abc import Mapping
from typing import Final

import pandas as pd

# Letter grade -> numeric rating. Any key that is not listed resolves to 0,
# because `int()` (the default factory) returns 0.
RATING_MAP: Final[defaultdict[str, int]] = defaultdict(
    int,
    zip("ABCDE", (5, 4, 3, 2, 1)),
)

# eBird taxonomy table used to translate scientific names into species codes.
EBIRD_TAXONOMY_PATH: Final[Path] = Path("../data/raw/eBird_Taxonomy_v2021.csv")

# Exactly one of the two configurations below should be active at a time.

# --- Configuration 1: re-scraped 2024 metadata ---
# XC_METADATA_PATH: Final[Path] = Path("./dataset/train_metadata_xc.csv")
# OUT_METADATA_PATH: Final[Path] = Path("../data/raw/2024/train_metadata.csv")
# EXTENSION: Final[str] = ".ogg"

# --- Configuration 2: additional 2024 metadata ---
XC_METADATA_PATH: Final[Path] = Path("../data/raw/BirdClef2024_additional.csv")
OUT_METADATA_PATH: Final[Path] = Path("../data/raw/2024-add/train_metadata.csv")
EXTENSION: Final[str] = ".mp3"  # suffix appended to every generated filename

In [47]:
# Build the lookup from scientific name to birdcode (eBird species code).
ebird_taxonomy = pd.read_csv(EBIRD_TAXONOMY_PATH, usecols=["SPECIES_CODE", "SCI_NAME"])

# Hand-maintained entries; they take precedence over the taxonomy table.
# NOTE(review): presumably these names are absent from (or spelled differently
# in) the eBird taxonomy file - confirm.
manual_birdcodes = {
    "Spilopelia senegalensis": "laudov1",
    "Spilopelia chinensis": "spodov",
    "Acritillas indica": "yebbul3",
}

scientific_name_to_birdcode: Mapping[str, str] = {
    **ebird_taxonomy.set_index("SCI_NAME")["SPECIES_CODE"].to_dict(),
    **manual_birdcodes,
}

In [48]:
def _birdcodes_from_also(raw_also: str) -> list[str]:
    """Translate the `also` cell of one recording into a list of birdcodes.

    The cell is parsed with `ast.literal_eval` and iterated; every element
    that is a key of `scientific_name_to_birdcode` is replaced by its
    birdcode, all other elements are dropped.
    """
    return [
        scientific_name_to_birdcode[name]
        for name in ast.literal_eval(raw_also)
        if name in scientific_name_to_birdcode
    ]


# Columns read from the XC export, and the ones whose content is taken over
# unchanged under a new name.
xc_columns = ["id", "gen", "sp", "en", "rec", "lat", "lng", "type", "url", "lic", "q", "also"]
xc_renames = {
    "en": "common_name",
    "rec": "author",
    "lat": "latitude",
    "lng": "longitude",
    "lic": "license",
}
# Final column layout of the converted metadata.
output_columns = [
    "primary_label", "secondary_labels", "type", "latitude", "longitude", "scientific_name",
    "common_name", "author", "license", "rating", "url", "filename",
]

metadata = (
    pd.read_csv(XC_METADATA_PATH, usecols=xc_columns)
    .rename(columns=xc_renames)
    # Derived columns; later entries may use columns created by earlier ones.
    .assign(
        # "<genus> <species>", with missing parts treated as empty strings
        scientific_name=lambda df: (df["gen"].fillna("") + " " + df["sp"].fillna("")).str.strip(),
        # comma-separated text -> list of strings
        type=lambda df: df["type"].fillna("").str.split(", "),
        # names without an entry in the lookup become NaN
        primary_label=lambda df: df["scientific_name"].map(scientific_name_to_birdcode),
        secondary_labels=lambda df: df["also"].map(_birdcodes_from_also),
        url=lambda df: "https:" + df["url"],
        filename=lambda df: df["primary_label"] + "/XC" + df["id"].astype(str) + EXTENSION,
        # RATING_MAP is a defaultdict, so unlisted grades map to its default
        rating=lambda df: df["q"].map(RATING_MAP),
    )
    .sort_values("filename")
    # Selecting the output columns also discards the helper columns
    # (id, gen, sp, q, also).
    .reindex(columns=output_columns)
)

metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
6382,asbfly,[salwar1],"[call, song]",43.6524,142.7978,Muscicapa dauurica,Asian Brown Flycatcher,Frank Lambert,//creativecommons.org/licenses/by-nc-nd/4.0/,2,https://xeno-canto.org/155673,asbfly/XC155673.mp3
6361,asbfly,[putbab1],[call],10.5567,98.5664,Muscicapa dauurica,Asian Brown Flycatcher,Martjan Lammertink,//creativecommons.org/licenses/by-nc-nd/2.5/,4,https://xeno-canto.org/26046,asbfly/XC26046.mp3
6383,asbfly,[],[call],14.7501,106.0667,Muscicapa dauurica,Asian Brown Flycatcher,Wouter Halfwerk,//creativecommons.org/licenses/by-nc-nd/2.5/,1,https://xeno-canto.org/27245,asbfly/XC27245.mp3
6360,asbfly,[],[song],43.725637,142.805428,Muscicapa dauurica,Asian Brown Flycatcher,Peter Boesman,//creativecommons.org/licenses/by-nc-nd/4.0/,4,https://xeno-canto.org/286199,asbfly/XC286199.mp3
6338,asbfly,[],[song],43.615421,143.348494,Muscicapa dauurica,Asian Brown Flycatcher,Peter Boesman,//creativecommons.org/licenses/by-nc-nd/4.0/,5,https://xeno-canto.org/286212,asbfly/XC286212.mp3


# Merge re-scraped metadata with metadata from the competition
Since 542 audio files have been removed from Xeno-Canto, but are still present in our dataset, we have to merge the missing data from the BirdCLEF 2024 data into the new dataframe. 

**Skip the next cell if you are converting any dataset other than the re-scraped BirdCLEF 2024 metadata.**

In [36]:
BIRDCLEF_2024_METADATA_PATH: Final[Path] = Path("../data/raw/train_metadata.csv")

birdclef_2024_metadata = pd.read_csv(BIRDCLEF_2024_METADATA_PATH)

# `combine_first` aligns the two frames on their index. The default integer
# row labels of the two frames are unrelated, so aligning on them would pair
# up different recordings and append arbitrary competition rows. Key both
# frames on the recording's filename instead, which requires unique keys.
assert metadata["filename"].is_unique, "duplicate or repeated missing filenames in the XC metadata"
assert birdclef_2024_metadata["filename"].is_unique, "duplicate filenames in the BirdCLEF metadata"

column_order = list(metadata.columns)
merged = (
    metadata.set_index("filename")
    # Values from `metadata` win; the competition metadata only fills cells
    # that are null and contributes the recordings that are absent.
    .combine_first(birdclef_2024_metadata.set_index("filename"))
    .reset_index()
)
# Keep the established column layout; columns that exist only in the
# competition metadata (if any) are appended at the end.
column_order += [column for column in merged.columns if column not in column_order]
metadata = merged[column_order].sort_values("filename")

# Sanity check: species present in the competition metadata but missing after
# the merge (expected to be the empty set).
species_birdclef = set(birdclef_2024_metadata["primary_label"])
species_meta = set(metadata["primary_label"])
species_birdclef - species_meta

set()

In [49]:
# `to_csv` does not create missing folders, so make sure the destination
# directory exists before writing.
OUT_METADATA_PATH.parent.mkdir(parents=True, exist_ok=True)

# Write the converted metadata without the row index, then show
# (rows, columns) as a quick check.
metadata.to_csv(OUT_METADATA_PATH, index=False)
metadata.shape

(24279, 12)