# Scraping Xeno Canto

This notebook scrapes the metadata for the bird recordings in our dataset.
We use the Xeno Canto API to retrieve this metadata.

In [1]:
from pathlib import Path

# Relative location of the raw training metadata CSV that the cells below read.
RAW_TRAIN_METADATA_PATH = Path("..") / "data" / "raw" / "train_metadata.csv"

## Attempt 1: Retrieving all the ids one by one

According to the [Xeno Canto API documentation](https://xeno-canto.org/help/search), we can retrieve the metadata for a single id.
However, there seems to be no way to retrieve the metadata for multiple ids at once.
This means that we will have to retrieve the metadata for each id one by one.

This approach turned out not to be feasible: there seems to be a hard server-side limit on the number of requests that can be made to the Xeno Canto API.

In [None]:
# NOTE: apart from the urllib import, every statement in this cell is commented out.
# It is kept as a record of the one-request-per-recording approach described above.

# Get the ids of all recordings in the dataset
# (the id is the last "/"-separated segment of each value in the "url" column)

## Using polars
# import polars as pl
# ids = pl.read_csv(RAW_TRAIN_METADATA_PATH)["url"].str.split("/").list.last().cast(int)

## Using pandas
## (unlike the polars variant, this one does not cast the ids to int)
# import pandas as pd
# ids = pd.read_csv(RAW_TRAIN_METADATA_PATH)["url"].str.split("/").str[-1]

# ids[:10]


# Retrieve the metadata for the recordings

from urllib import request, error

# Retrieves metadata for requested recordings in the form of a JSON file
# Behaviour of the disabled helper below:
#   - queries the API for the single recording number `i`,
#   - writes the full JSON response to ../data/download/metadata/{i}.json,
#   - returns the first entry of the response's "recordings" list, or {} when that
#     list is empty or an HTTPError is raised (the error is printed, not re-raised).
# NOTE(review): the helper calls json.loads / json.dump, but no `import json` appears
# before this cell — add one before re-enabling it.
# def get_metadata(i: int) -> dict:
#     url = f"https://xeno-canto.org/api/2/recordings?query=nr:{i}"
#     try:
#         response = request.urlopen(url)
#         response_json = json.loads(response.read().decode('UTF-8'))
#         file_path = Path(f"../data/download/metadata/{i}.json")
#         with open(file_path, "w") as f:
#             json.dump(response_json, f)
#         recordings = response_json["recordings"]
#         if not recordings:
#             return {}
#         return recordings[0]
#     except error.HTTPError as e:
#         print(f"Error retrieving metadata for recording {i}: {e}")
#         return {}

# Get metadata for the recordings
# (this issues one HTTP request per id, sequentially)
# TODO(Jeffrey): Use a more efficient way to get the metadata
# metadata = [get_metadata(i) for i in ids]
# metadata[:5]

KeyboardInterrupt: 

## Attempt 2: Retrieving all the metadata per species

According to the [Xeno Canto API documentation](https://xeno-canto.org/help/search), we can retrieve the metadata for a single species.
This means that we will retrieve more metadata than we need, but it requires fewer requests to the Xeno Canto API.
We can filter the metadata later on.

In [22]:
import polars as pl

# Distinct scientific names found in the training metadata; the download loop
# further down issues one metadata query per name.
species = pl.read_csv(RAW_TRAIN_METADATA_PATH).get_column("scientific_name").unique()
len(species)

182

In [32]:
# Retrieves metadata for requested recordings in the form of a JSON file
import xenocanto

# Download the metadata species by species: each call receives a one-element
# filter list holding a single scientific name.
for name in species.to_list():
    xenocanto.metadata([name])

Retrieving metadata...
Downloading metadata page 1...
Downloading metadata page 2...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Metadata retrieval complete.
Retrieving metadata...
Downloading metadata page 1...
Me

In [5]:
# Define types for storing the metadata
from typing import TypeAlias, TypedDict
from collections.abc import Mapping, Sequence

JSON: TypeAlias = Mapping[str, "JSON"] | Sequence["JSON"] | str | int | float | bool | None

# class XenoCantoAPIRecording(TypedDict, total=False):
#     id: int
#     gen: str
#     sp: str
#     ssp: str
#     group: str
#     en: str
#     rec: str
#     cnt: str
#     loc: str
#     lat: float
#     lng: float
#     alt: int
#     type: str
#     sex: str
#     stage: str
#     method: str
#     url: str
#     file: str
#     file_name: str
#     sono: Mapping[str, str]
#     osci: Mapping[str, str]
#     lic: str
#     q: float
#     length: str
#     time: str
#     date: str
#     uploaded: str
#     also: Sequence[str]
#     rmk: str
#     bird_seen: bool
#     animal_seen: bool
#     playback_used: bool
#     temp: str
#     regnr: str
#     auto: str
#     dvc: str
#     mic: str
#     smp: int

# class XenoCantoAPIResponse(TypedDict):
#     numRecordings: int
#     numSpecies: int
#     page: int
#     numPages: int
#     recordings: Sequence[XenoCantoAPIRecording]

In [41]:
# Once all the metadata has been retrieved, we merge all pages within a species and store all the recordings metadata in a dataframe
import itertools
import json

def merge_pages(name: str) -> list[JSON]:
    files = list((Path("dataset/metadata/") / name.replace(' ', '')).glob("*.json"))
    pages = [{}] * (len(files))
    for i, file in enumerate(files):
        with open(file, "r") as f:
            page: JSON = json.load(f)
            pages[i] = page
    return list(itertools.chain.from_iterable([page["recordings"] for page in pages]))

# Flatten the per-species recording lists into one table, cast the ids to
# integers and order the rows by id.
meta_dataframe = (
    pl.DataFrame([recording for name in species for recording in merge_pages(name)])
    .cast(dtypes={"id": int})
    .sort(by="id")
)
# The CSV is written through a pandas conversion of the polars frame.
meta_dataframe.to_pandas().to_csv("dataset/metadata.csv", index=False)
meta_dataframe.head(5)

id,gen,sp,ssp,group,en,rec,cnt,loc,lat,lng,alt,type,sex,stage,method,url,file,file-name,sono,osci,lic,q,length,time,date,uploaded,also,rmk,bird-seen,animal-seen,playback-used,temp,regnr,auto,dvc,mic,smp
i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,struct[4],struct[3],str,str,str,str,str,str,list[str],str,str,str,str,str,str,str,str,str,str
1135,"""Nycticorax""","""nycticorax""","""""","""birds""","""Black-crowned Night Heron""","""Don Jones""","""United States""","""Jakes Landing Road, Cape May C…","""39.192751""","""-74.853544""","""?""","""song""","""""","""""","""field recording""","""//xeno-canto.org/1135""","""https://xeno-canto.org/1135/do…","""bird228.mp3""","{""//xeno-canto.org/sounds/uploaded/BCWZQTGMSO/ffts/XC1135-small.png"",""//xeno-canto.org/sounds/uploaded/BCWZQTGMSO/ffts/XC1135-med.png"",""//xeno-canto.org/sounds/uploaded/BCWZQTGMSO/ffts/XC1135-large.png"",""//xeno-canto.org/sounds/uploaded/BCWZQTGMSO/ffts/XC1135-full.png""}","{""//xeno-canto.org/sounds/uploaded/BCWZQTGMSO/wave/XC1135-small.png"",""//xeno-canto.org/sounds/uploaded/BCWZQTGMSO/wave/XC1135-med.png"",""//xeno-canto.org/sounds/uploaded/BCWZQTGMSO/wave/XC1135-large.png""}","""//creativecommons.org/licenses…","""C""","""0:05""","""?""","""1993-05-01""","""2008-11-20""",[],"""""","""unknown""","""unknown""","""unknown""","""""","""""","""no""","""""","""""","""22050"""
2778,"""Ardea""","""alba""","""""","""birds""","""Great Egret""","""Sjoerd Mayer""","""Bolivia""","""Close to Trinidad, along road …","""-14.8001""","""-64.9001""","""125""","""call""","""""","""""","""field recording""","""//xeno-canto.org/2778""","""https://xeno-canto.org/2778/do…","""90/a20.mp3""","{""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/ffts/XC2778-small.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/ffts/XC2778-med.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/ffts/XC2778-large.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/ffts/XC2778-full.png""}","{""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/wave/XC2778-small.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/wave/XC2778-med.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/wave/XC2778-large.png""}","""//creativecommons.org/licenses…","""A""","""0:13""","""19:00""","""1997-02-19""","""2008-11-20""",[],"""At the roost. cd:http://www.bi…","""unknown""","""unknown""","""unknown""","""""","""""","""no""","""""","""""","""44100"""
2797,"""Nycticorax""","""nycticorax""","""""","""birds""","""Black-crowned Night Heron""","""Sjoerd Mayer""","""Bolivia""","""Laguna Alalay, Cochabamba""","""-17.4084""","""-66.1376""","""2600""","""flight call""","""""","""""","""field recording""","""//xeno-canto.org/2797""","""https://xeno-canto.org/2797/do…","""92/a07d.mp3""","{""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/ffts/XC2797-small.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/ffts/XC2797-med.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/ffts/XC2797-large.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/ffts/XC2797-full.png""}","{""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/wave/XC2797-small.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/wave/XC2797-med.png"",""//xeno-canto.org/sounds/uploaded/KOIEAHGHNX/wave/XC2797-large.png""}","""//creativecommons.org/licenses…","""A""","""0:15""","""6:00""","""1997-03-17""","""2008-11-20""","[""Phleocryptes melanops""]",""" cd:http://www.birdsongs.com/B…","""unknown""","""unknown""","""unknown""","""""","""""","""no""","""""","""""","""44100"""
4415,"""Hirundo""","""rustica""","""""","""birds""","""Barn Swallow""","""Glauco Alves Pereira""","""Brazil""","""Engenho Santa Fé, Nazaré da Ma…","""-7.731915""","""-35.21307""","""110""","""call""","""""","""""","""field recording""","""//xeno-canto.org/4415""","""https://xeno-canto.org/4415/do…","""Hirundo hustica.mp3""","{""//xeno-canto.org/sounds/uploaded/ZWEROGGYVP/ffts/XC4415-small.png"",""//xeno-canto.org/sounds/uploaded/ZWEROGGYVP/ffts/XC4415-med.png"",""//xeno-canto.org/sounds/uploaded/ZWEROGGYVP/ffts/XC4415-large.png"",""//xeno-canto.org/sounds/uploaded/ZWEROGGYVP/ffts/XC4415-full.png""}","{""//xeno-canto.org/sounds/uploaded/ZWEROGGYVP/wave/XC4415-small.png"",""//xeno-canto.org/sounds/uploaded/ZWEROGGYVP/wave/XC4415-med.png"",""//xeno-canto.org/sounds/uploaded/ZWEROGGYVP/wave/XC4415-large.png""}","""//creativecommons.org/licenses…","""C""","""0:18""","""07:00""","""2004-10-31""","""2008-11-20""","[""Tachycineta albiventer""]","""small group landed in an elect…","""unknown""","""unknown""","""unknown""","""""","""""","""no""","""""","""""","""22050"""
5954,"""Passer""","""domesticus""","""""","""birds""","""House Sparrow""","""Manuel Grosselet""","""Mexico""","""san Augustin Etla""",,,"""1650""","""song""","""""","""""","""field recording""","""//xeno-canto.org/5954""","""https://xeno-canto.org/5954/do…","""House Sparrow.mp3""","{""//xeno-canto.org/sounds/uploaded/ILUHRFXDNU/ffts/XC5954-small.png"",""//xeno-canto.org/sounds/uploaded/ILUHRFXDNU/ffts/XC5954-med.png"",""//xeno-canto.org/sounds/uploaded/ILUHRFXDNU/ffts/XC5954-large.png"",""//xeno-canto.org/sounds/uploaded/ILUHRFXDNU/ffts/XC5954-full.png""}","{""//xeno-canto.org/sounds/uploaded/ILUHRFXDNU/wave/XC5954-small.png"",""//xeno-canto.org/sounds/uploaded/ILUHRFXDNU/wave/XC5954-med.png"",""//xeno-canto.org/sounds/uploaded/ILUHRFXDNU/wave/XC5954-large.png""}","""//creativecommons.org/licenses…","""A""","""0:42""","""8:00""","""2006-05-21""","""2008-11-20""",[],"""""","""unknown""","""unknown""","""unknown""","""""","""""","""no""","""""","""""","""44100"""


In [None]:
# First step of matching the metadata with the recordings in our dataset:
# take the last "/"-separated segment of every `url` as the integer recording id.
ids = (
    pl.read_csv(RAW_TRAIN_METADATA_PATH)
    .get_column("url")
    .str.split("/")
    .list.last()
    .cast(int)
)

