# Scraping Xeno Canto

This notebook retrieves the metadata for every bird recording in our dataset.
The recording ids are read from the raw training metadata, and each one is looked up through the Xeno Canto API.

In [19]:
from pathlib import Path

# Raw training metadata CSV, located relative to this notebook's directory
RAW_TRAIN_METADATA_PATH = Path("..", "data", "raw", "train_metadata.csv")

In [20]:
# Derive the recording ids for the whole dataset: each id is the last
# path segment of the row's "url" value.
import polars as pl

ids = (
    pl.read_csv(RAW_TRAIN_METADATA_PATH)
    .get_column("url")
    .str.split("/")  # one list of path segments per URL
    .list.last()     # the trailing segment is the recording id
    .cast(pl.Int64)
)

# pandas alternative (no integer cast, so the ids stay strings):
# import pandas as pd
# ids = pd.read_csv(RAW_TRAIN_METADATA_PATH)["url"].str.split("/").str[-1]

ids.head(10)

url
i64
134896
164848
175797
207738
209218
209219
267679
267680
267681
267682


In [18]:
from typing import TypedDict
# Retrieve the metadata for the recordings

from urllib import request, error
import json

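# Reference schema for the API response (commented out, not used at runtime).
# NOTE: the sample output of this cell shows `id`, `lat`, `lng` and `alt`
# arriving as strings and the file-name key spelled 'file-name', so these
# annotations need adjusting before the classes are enabled.
# class XenoCantoAPIResponseRecording(TypedDict, total=False):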
#     id: int
#     gen: str
#     sp: str
#     ssp: str
#     group: str
#     en: str
#     rec: str
#     cnt: str
#     loc: str
#     lat: float
#     lng: float
#     alt: int
#     type: str
#     sex: str
#     stage: str
#     method: str
#     url: str
#     file: str
#     file_name: str
#     sono: dict[str, str]
#     osci: dict[str, str]
#     lic: str
#     q: float
#     length: str
#     time: str | None
#     date: str
#     uploaded: str
#     also: list[str]
#     rmk: str
#     bird_seen: bool | None
#     animal_seen: bool | None
#     playback_used: bool | None
#     temp: str
#     regnr: str
#     auto: str
#     dvc: str
#     mic: str
#     smp: int
# 
# class XenoCantoAPIResponse(TypedDict):
#     numRecordings: int
#     numSpecies: int
#     page: int
#     numPages: int
#     recordings: list[XenoCantoAPIResponseRecording]

# Inspired by https://github.com/ntivirikin/xeno-canto-py/blob/master/xenocanto.py#L30-L75
def get_metadata(i: int, timeout: float = 30.0) -> dict:
    """Fetch the Xeno Canto metadata of a single recording.

    Args:
        i: Number of the recording, used as the `nr:` term of the API query.
        timeout: Seconds to wait on the connection before giving up, so one
            stalled request cannot hang a long scraping loop.

    Returns:
        The first entry of the response's "recordings" list, or an empty dict
        when the API returns no recordings or the lookup fails. Failures are
        reported on stdout rather than raised, so a bulk run keeps going.
    """
    url = f"https://xeno-canto.org/api/2/recordings?query=nr:{i}"
    try:
        # The context manager closes the connection even when reading or
        # parsing the body fails.
        with request.urlopen(url, timeout=timeout) as response:
            recordings = json.loads(response.read())["recordings"]
    except (OSError, ValueError, KeyError) as e:
        # OSError covers HTTPError, URLError and socket timeouts; ValueError
        # covers a malformed JSON body; KeyError covers a payload that has no
        # "recordings" entry.
        print(f"Error retrieving metadata for recording {i}: {e}")
        return {}
    return recordings[0] if recordings else {}

# Look up every recording id, one API request per id
# TODO(Jeffrey): Use a more efficient way to get the metadata
metadata = list(map(get_metadata, ids))
metadata[:5]

[{'id': '134896',
  'gen': 'Muscicapa',
  'sp': 'dauurica',
  'ssp': '',
  'group': 'birds',
  'en': 'Asian Brown Flycatcher',
  'rec': 'Matt Slaymaker',
  'cnt': 'China',
  'loc': 'Nanpu, Tangshan, Hebei',
  'lat': '39.2297',
  'lng': '118.1987',
  'alt': '0',
  'type': 'call',
  'sex': '',
  'stage': '',
  'method': 'field recording',
  'url': '//xeno-canto.org/134896',
  'file': 'https://xeno-canto.org/134896/download',
  'file-name': 'XC134896-AsianBrownFlycatcher_ Muscicapa_dauuricaCcall_NanpuPark,China_03May13.mp3',
  'sono': {'small': '//xeno-canto.org/sounds/uploaded/MZVKQWWXBQ/ffts/XC134896-small.png',
   'med': '//xeno-canto.org/sounds/uploaded/MZVKQWWXBQ/ffts/XC134896-med.png',
   'large': '//xeno-canto.org/sounds/uploaded/MZVKQWWXBQ/ffts/XC134896-large.png',
   'full': '//xeno-canto.org/sounds/uploaded/MZVKQWWXBQ/ffts/XC134896-full.png'},
  'osci': {'small': '//xeno-canto.org/sounds/uploaded/MZVKQWWXBQ/wave/XC134896-small.png',
   'med': '//xeno-canto.org/sounds/uploaded/MZ

In [None]:
# TODO(Jeffrey): Save the metadata to a csv file