In [None]:
import pandas as pd
import numpy as np
import geobench
from pathlib import Path

## Merge country datasets together

In [None]:
# Load the per-country training observations (semicolon-delimited CSV files).
observations_dir = f'{geobench.GEO_BENCH_DIR}/source/geolifeclef-2022/observations'
df_us = pd.read_csv(f'{observations_dir}/observations_us_train.csv', delimiter=";")
df_fr = pd.read_csv(f'{observations_dir}/observations_fr_train.csv', delimiter=";")
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# As with append, each frame keeps its own row index (no re-numbering).
df = pd.concat([df_us, df_fr])
df

## Do we have overlapping species in the two countries?

In [None]:
# Count the species ids that occur in both the US and the French observations.
shared_species = set(df_us['species_id']).intersection(df_fr['species_id'])
len(shared_species)

## Calculate `n` most common species in train set

In [None]:
n = 100  # most common in train set
# Restrict to rows flagged as 'train', then keep the n most frequent species ids.
# value_counts() sorts by descending frequency, so the first n rows are the top n.
is_train = df['subset'] == 'train'
df_train = df.loc[is_train]
species_counts = df_train['species_id'].value_counts().head(n)
species_counts

In [None]:
# The index of the counts Series holds the selected species ids.
species = species_counts.index

## Filter for species

In [None]:
# Show every observation (any subset) whose species id is among the selected ones.
in_selected_species = df['species_id'].isin(species)
df.loc[in_selected_species]

## Check if all species are represented in the train and val subsets

In [None]:
# Per-species observation counts among the selected species in the 'train' subset.
selected_train_rows = df['species_id'].isin(species) & df['subset'].eq('train')
df.loc[selected_train_rows, 'species_id'].value_counts()

In [None]:
# Per-species observation counts among the selected species in the 'val' subset.
selected_val_rows = df['species_id'].isin(species) & df['subset'].eq('val')
df.loc[selected_val_rows, 'species_id'].value_counts()

## Subsample 10% from training dataset

In [None]:
# Training rows restricted to the selected species.
keep_for_train = df['subset'].eq('train') & df['species_id'].isin(species)
df_train_species = df.loc[keep_for_train]

In [None]:
# Draw a 10% random sample of the filtered training rows; the fixed seed makes
# the draw repeatable across runs.
sampling_options = dict(frac=0.1, random_state=1)
df_train_sample = df_train_species.sample(**sampling_options)
df_train_sample

In [None]:
# Per-species observation counts after subsampling.
sampled_species_ids = df_train_sample['species_id']
sampled_species_ids.value_counts()

## Create new dataset and change label enumeration

In [None]:
# Validation rows restricted to the selected species (not subsampled).
keep_for_val = df['subset'].eq('val') & df['species_id'].isin(species)
df_val_species = df.loc[keep_for_val]

In [None]:
# Stack the subsampled training rows on top of the validation rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and, like append, keeps the original row index of each part.
df_new = pd.concat([df_train_sample, df_val_species])
df_new

In [None]:
# Persist the sampled dataset while it still carries the original species ids.
df_new.to_csv('observations_sample_original_labels.csv', sep=";", index=False)

In [None]:
# Sorted unique species ids of the sampled dataset; the position of an id in
# this list is the value return_index() yields for it.
lookup = sorted(set(df_new['species_id']))

# Precomputed id -> position table so each lookup is a constant-time dict access
# instead of a linear scan through `lookup`.
_position_by_species_id = {species_id: position for position, species_id in enumerate(lookup)}


def return_index(a):
    """Return the position of species id ``a`` in ``lookup``.

    Raises:
        ValueError: if ``a`` does not occur in ``lookup`` (the same exception
            type ``list.index`` raises).
    """
    try:
        return _position_by_species_id[a]
    except KeyError:
        raise ValueError(f"{a!r} is not in list") from None


lookup

In [None]:
# Replace each species id by the value return_index() gives for it.
# NOTE: this overwrites the column in place, so the cell must run exactly once
# per freshly built df_new.
relabelled_species_ids = df_new['species_id'].map(return_index)
df_new['species_id'] = relabelled_species_ids

In [None]:
# Write df_new in its current state to the final sample file.
df_new.to_csv('observations_sample.csv', sep=";", index=False)

In [None]:
# Load the semicolon-delimited 'observations_sample_original_labels.csv' file.
df_labels = pd.read_csv('observations_sample_original_labels.csv', delimiter=";")
df_labels

## Create names for species

In [None]:
# Species metadata table; later cells read its 'species_id' and 'GBIF_species_name' columns.
species_details_csv = f'{geobench.GEO_BENCH_DIR}/source/geolifeclef-2022/metadata/species_details.csv'
df_species_names = pd.read_csv(species_details_csv, sep=";")

In [None]:
# Attach the species metadata to every observation, matching on the species id.
species_details_by_id = df_species_names.set_index('species_id')
merged_df = df_labels.join(species_details_by_id, on='species_id')
merged_df

In [None]:
# Build one name per species id, ordered by ascending id, so that position i
# holds the name of the i-th smallest id.
# De-duplicating on the id (rather than on the name) guarantees exactly one
# entry per id even when two ids share a name or a name is missing; the previous
# name-based np.unique would drop an entry in the first case and fail to sort
# mixed str/NaN values in the second.
sorted_species = (
    merged_df
    .drop_duplicates(subset='species_id')
    .sort_values(by=['species_id'])['GBIF_species_name']
)
names = sorted_species.to_numpy()

In [None]:
# Wrap the ordered names in a single-column frame for export.
df_names = pd.DataFrame({'GBIF_species_name': names})

In [None]:
# Write the names, one per row, without the row index.
df_names.to_csv('names.csv', sep=";", index=False)

In [None]:
# Load the semicolon-delimited 'names.csv' file and show its name column as a plain list.
df_read_names = pd.read_csv('names.csv', delimiter=";")
df_read_names['GBIF_species_name'].tolist()

## Investigate issues on altitude

In [None]:
from PIL import Image

# The sub-path must be relative: joining an absolute path ("/source/...") onto
# GEO_BENCH_DIR discards GEO_BENCH_DIR and resolves from the filesystem root.
# Wrapping in Path() makes the join work whether GEO_BENCH_DIR is a str or a Path.
path = Path(geobench.GEO_BENCH_DIR) / "source/geolifeclef-2022/patches_sample/patches-us/00/00"
file = "20780000_altitude.tif"

im = Image.open(path / file)
im

In [None]:
# Convert the opened image into a numpy array so the raw values can be inspected.
# (numpy is already imported as np in the first cell.)
imarray = np.array(im)
imarray