In [1]:
# Step up one directory so the `src.*` import and the relative `./data/...` paths in the next cell resolve
# (presumably the notebook lives in a sub-folder of the project root — confirm).
%cd ..

s:\Sasha\project\HSE\MMM_project


In [2]:
import pandas as pd
from tqdm import tqdm

from src.dataset_utils import (
    read_satellite_image,
    get_bioclimatic_time_series_cube,
    get_satellite_time_series_landsat_cube,
    read_environmental_values,
    get_environmental_values_tensor
)

# Training metadata; the head() preview shows one row per (surveyId, speciesId) observation.
# Columns:
#   'lon', 'lat', 'year', 'geoUncertaintyInM', 'areaInM2', 'region', 'country', 'speciesId', 'surveyId'
# speciesId is cast to int while loading so it can later serve as a dict key / index.
train_data = pd.read_csv(
    "./data/geoplant-at-paiss/GLC25_PA_metadata_train.csv"
).astype({"speciesId": int})

# Environmental values table, later passed to get_environmental_values_tensor for each survey.
environmental_train_data = read_environmental_values()

train_data.head()

Unnamed: 0,lon,lat,year,geoUncertaintyInM,areaInM2,region,country,speciesId,surveyId
0,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,6874,212
1,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,476,212
2,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,11157,212
3,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,8784,212
4,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,4530,212


In [6]:
# Distinct species ids found in the training metadata
train_data.speciesId.unique()

array([ 6874,   476, 11157, ...,  3326,  6586,  8119], shape=(5016,))

In [3]:
# Lookup table: surveyId -> list of all speciesIds recorded for that survey
survey_id2species_ids = (
    train_data
    .groupby('surveyId')['speciesId']
    .agg(list)
    .to_dict()
)

In [4]:
# Map the raw species ids onto contiguous class labels 0..n_classes-1
# (label = position of the id in the array returned by unique()).
species_ids = train_data["speciesId"].unique()
n_classes = len(species_ids)

label2species_id = dict(enumerate(species_ids))
species_id2label = {
    species_id: label for label, species_id in label2species_id.items()
}

n_classes  # 5016

5016

In [39]:
import torch

def indexes_to_tensor(indexes_list, max_index):
    """Encode 1-based indexes as a multi-hot float32 vector of length ``max_index``.

    Index ``k`` sets position ``k - 1`` of the result to 1.0; every other
    position stays 0.0. Duplicate indexes are harmless.

    Args:
        indexes_list: Iterable of values convertible with ``int()``, each
            expected to lie in ``[1, max_index]``. May be empty.
        max_index: Length of the returned vector, i.e. the largest valid index.

    Returns:
        1-D ``torch.float32`` tensor of shape ``(max_index,)``.

    Raises:
        IndexError: If any index lies outside ``[1, max_index]``. Without this
            check, index 0 (or a negative one) would wrap around and silently
            mark a position counted from the end of the vector.
    """
    positions = [int(idx) - 1 for idx in indexes_list]

    out_of_range = [pos + 1 for pos in positions if not 0 <= pos < max_index]
    if out_of_range:
        raise IndexError(
            f"indexes must be within [1, {max_index}], got {out_of_range}"
        )

    tensor = torch.zeros(max_index, dtype=torch.float32)
    if positions:  # nothing to mark for an empty list
        tensor[positions] = 1.0

    # NOTE(review): kept from the original. This shadows the tensor's `_values`
    # attribute with its data, presumably as a workaround for storing tensors
    # inside pandas objects — confirm whether it is still required.
    tensor._values = tensor.data
    return tensor

# Largest species id in the data; used as the length of every multi-hot label vector.
max_species_id = int(train_data['speciesId'].max().item())

# All metadata columns, i.e. everything except the label column and the grouping key.
columns = [
    col for col in train_data.columns.tolist()
    if col not in ("speciesId", "surveyId")
]

# Per survey: keep the first value of each metadata column and
# collect all species ids of the survey into one list.
aggregations = dict.fromkeys(columns, "first")
aggregations["speciesId"] = lambda ids: ids.tolist()

train_data_g = (
    train_data
    .groupby("surveyId")
    .agg(aggregations)
    .reset_index()
)
# Replace each list of species ids with its multi-hot tensor encoding.
train_data_g = train_data_g.assign(
    speciesId=train_data_g['speciesId'].apply(
        lambda ids: indexes_to_tensor(ids, max_species_id)
    )
)

In [None]:
# Inspect the grouped frame: one row per surveyId, with `speciesId` holding the multi-hot tensor built above.
train_data_g

Unnamed: 0,surveyId,lon,lat,year,geoUncertaintyInM,areaInM2,region,country,speciesId
0,212,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
1,222,9.884560,56.912140,2017,10.0,79.0,CONTINENTAL,Denmark,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
2,243,8.256020,55.637050,2019,10.0,79.0,ATLANTIC,Denmark,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
3,324,-0.402590,43.505630,2018,1.0,,ATLANTIC,France,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
4,333,-0.517360,45.806430,2017,1.0,,ATLANTIC,France,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
...,...,...,...,...,...,...,...,...,...
88982,3919553,10.327990,57.305850,2018,10.0,79.0,CONTINENTAL,Denmark,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
88983,3919592,15.003900,55.090170,2017,10.0,707.0,CONTINENTAL,Denmark,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
88984,3919620,8.935060,55.461500,2018,10.0,79.0,ATLANTIC,Denmark,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
88985,3919640,17.252948,53.901434,2021,5.0,25.0,CONTINENTAL,Poland,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."


In [5]:
# Task: multi-label prediction
# Input:  (4, 64, 64) + (4, 19, 12) + (6, 4, 21) + (65,)
# Output: (n_classes,)
# Loss: BCEWithLogitsLoss + asymmetric loss

# CLIP   -> v1 (512, )  -| cross_attention(CLIP, LSTM) -> (512, )  Q=CLIP, K=LSTM, V=LSTM
# LSTM 1 -> v2 (512, )  -| cross attention(CLIP, LSTM) -> (512, )  Q=LSTM, K=CLIP, V=CLIP
# LSTM 2 -> v3 (512, )
# MLP    -> v4 (512, )

# v1 | v2 | v3 | v4 -> (2048, )
# MLP 2048 -> (num_classes,)


# Parallel lists: X_train[i] is the tuple of input tensors of one survey,
# y_train[i] is the list of species ids observed in that same survey.
X_train = []
y_train = []

# survey_id -> geo -> [labels]

for survey_id in tqdm(train_data.surveyId.unique()): # TEST MODE

    # IMAGE --> CNN / ViT on the first 3 channels + custom CNN on the last channel (NIR)
    satellite_image_data = read_satellite_image(survey_id)                                       # shape: (4, 64, 64)  [torch.int16]

    # TIME SERIES --> LSTM
    # 4 variables
    # 19 years, 12 months
    # 4 times series: 19 * 12 = 228
    bioclimatic_time_series_cube = get_bioclimatic_time_series_cube(survey_id)                   # shape: (4, 19, 12)  [torch.float32]

    # TIME SERIES --> LSTM  (but there are some RGB channels involved -- maybe it should be treated as a picture)
    # 6 variables
    # 4 = four times per year (after each season)
    # 21 years
    # 6 time series: 4 * 21 = 84
    satellite_time_series_landsat_cube = get_satellite_time_series_landsat_cube(survey_id)       # shape: (6, 4, 21)   [torch.float32]

    # MLP ???
    # 65 variables
    # 1 variable
    # 65 time series: 1 * 65 = 65
    environmental_values = get_environmental_values_tensor(survey_id, environmental_train_data)  # shape: (1, 65)      [torch.float64]

    # Skip surveys for which any input could not be loaded: calling
    # .squeeze() on None below would raise AttributeError and abort the loop.
    missing_inputs = [
        name
        for name, value in (
            ("satellite_image_data", satellite_image_data),
            ("bioclimatic_time_series_cube", bioclimatic_time_series_cube),
            ("satellite_time_series_landsat_cube", satellite_time_series_landsat_cube),
            ("environmental_values", environmental_values),
        )
        if value is None
    ]
    if missing_inputs:
        print(f"Skipping survey {survey_id}: missing {', '.join(missing_inputs)}")
        continue

    # Combine all inputs into a single tuple (singleton dimensions removed)
    X = (
        satellite_image_data.squeeze(),
        bioclimatic_time_series_cube.squeeze(),
        satellite_time_series_landsat_cube.squeeze(),
        environmental_values.squeeze()
    )
    y = survey_id2species_ids[survey_id]
    X_train.append(X)
    y_train.append(y)

  2%|▏         | 1481/88987 [00:28<28:00, 52.07it/s]  


KeyboardInterrupt: 

In [None]:
# One column per input, in the same order as the tuples stored in X_train.
columns = [
    'satellite_image_data',
    'bioclimatic_time_series_cube',
    'satellite_time_series_landsat_cube',
    'environmental_values',
]

# Tabular view of the collected samples; `labels` holds the species-id list of each survey.
df = pd.DataFrame(X_train, columns=columns).assign(labels=y_train)
df.head()

Unnamed: 0,satellite_image_data,bioclimatic_time_series_cube,satellite_time_series_landsat_cube,environmental_values,labels
0,"[[[tensor(298, dtype=torch.int16), tensor(555,...","[[[tensor(5872.), tensor(1229.), tensor(1184.)...","[[[tensor(9.), tensor(11.), tensor(9.), tensor...","[tensor(212., dtype=torch.float64), tensor(288...","[6874, 476, 11157, 8784, 4530, 10520, 9458, 98..."
1,"[[[tensor(655, dtype=torch.int16), tensor(670,...","[[[tensor(4998.), tensor(6591.), tensor(6000.)...","[[[tensor(11.), tensor(11.), tensor(11.), tens...","[tensor(222., dtype=torch.float64), tensor(281...","[9816, 540, 4499, 433, 254]"
2,"[[[tensor(910, dtype=torch.int16), tensor(774,...","[[[tensor(5387.), tensor(8123.), tensor(5518.)...","[[[tensor(10.), tensor(10.), tensor(10.), tens...","[tensor(243., dtype=torch.float64), tensor(282...","[5386, 3227, 6079, 254, 6964, 1387, 4638, 5384..."
3,"[[[tensor(1228, dtype=torch.int16), tensor(126...","[[[tensor(1567.), tensor(9315.), tensor(10298....","[[[tensor(27.), tensor(25.), tensor(25.), tens...","[tensor(324., dtype=torch.float64), tensor(287...","[9028, 2386, 694, 9388, 146, 6788]"
4,"[[[tensor(1210, dtype=torch.int16), tensor(112...","[[[tensor(2085.), tensor(10213.), tensor(3199....","[[[tensor(24.), tensor(24.), tensor(18.), tens...","[tensor(333., dtype=torch.float64), tensor(285...","[2474, 981, 3935, 8151, 10247, 5189, 1888, 507..."


In [None]:
# Summarise the assembled frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   satellite_image_data                100 non-null    object
 1   bioclimatic_time_series_cube        100 non-null    object
 2   satellite_time_series_landsat_cube  100 non-null    object
 3   environmental_values                100 non-null    object
 4   labels                              100 non-null    object
dtypes: object(5)
memory usage: 4.0+ KB
