In [1]:
import pandas as pd
from tqdm import tqdm

from src.dataset_utils import (
    read_satellite_image,
    get_bioclimatic_time_series_cube,
    get_satellite_time_series_landsat_cube,
    read_environmental_values,
    get_environmental_values_tensor
)

# Load the training metadata and cast speciesId to an integer dtype in one pass.
# Columns in the CSV:
#   'lon', 'lat', 'year', 'geoUncertaintyInM', 'areaInM2', 'region', 'country', 'speciesId', 'surveyId'
train_data = pd.read_csv("data/GLC25_PA_metadata_train.csv").astype({"speciesId": int})

# Environmental values table, passed later to get_environmental_values_tensor.
environmental_train_data = read_environmental_values()

train_data.head()

Unnamed: 0,lon,lat,year,geoUncertaintyInM,areaInM2,region,country,speciesId,surveyId
0,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,6874,212
1,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,476,212
2,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,11157,212
3,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,8784,212
4,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,4530,212


In [2]:
# Map every surveyId to the list of speciesIds recorded for that survey.
species_by_survey = train_data.groupby('surveyId')['speciesId']
species_lists = species_by_survey.apply(list)
survey_id2species_ids = species_lists.to_dict()

In [3]:
# Two-way lookup between raw speciesIds and contiguous class labels 0..n_classes-1.
# Labels follow the order in which Series.unique() returns the ids.
species_ids = train_data.speciesId.unique()
label2species_id = dict(enumerate(species_ids))
species_id2label = {
    species_id: label for label, species_id in label2species_id.items()
}

n_classes = len(species_ids)
n_classes  # 5016

5016

In [4]:
# Task: multi-label prediction
# Input:  (4, 64, 64) + (4, 19, 12) + (6, 4, 21) + (65,)
# Output: (n_classes,)
# Loss: BCEWithLogitsLoss

# TEST MODE: only the first MAX_SURVEYS surveys are processed.
# Set to None to iterate over every survey (slicing with [:None] keeps all).
MAX_SURVEYS = 100

# X[i] is the tuple of input tensors for the i-th survey; y[i] is the list of
# speciesIds observed in that survey. The two lists stay index-aligned.
# NOTE(review): y holds raw speciesId lists, not (n_classes,) multi-hot
# vectors -- that conversion (e.g. via species_id2label) is not done here.
X = []
y = []

for survey_id in tqdm(train_data.surveyId.unique()[:MAX_SURVEYS]):
    satellite_image_data = read_satellite_image(survey_id)                                       # shape: (4, 64, 64)  [torch.int16]
    bioclimatic_time_series_cube = get_bioclimatic_time_series_cube(survey_id)                   # shape: (4, 19, 12)  [torch.float32]
    satellite_time_series_landsat_cube = get_satellite_time_series_landsat_cube(survey_id)       # shape: (6, 4, 21)   [torch.float32]
    environmental_values = get_environmental_values_tensor(survey_id, environmental_train_data)  # shape: (1, 65)      [torch.float64]

    # Append instead of rebinding X / y: rebinding discarded every sample
    # except the one from the final iteration.
    X.append(
        (
            satellite_image_data.squeeze(),
            bioclimatic_time_series_cube.squeeze(),
            satellite_time_series_landsat_cube.squeeze(),
            environmental_values.squeeze(),  # drops the leading singleton dim -> (65,)
        )
    )
    y.append(survey_id2species_ids[survey_id])

100%|██████████| 100/100 [00:00<00:00, 217.09it/s]
