In [1]:
import pandas as pd

## Merge country datasets together

In [6]:
# Load the per-country observation tables; both files are semicolon-delimited.
# NOTE(review): this is an absolute path on a local volume — adjust it when
# running the notebook on another machine.
OBS_DIR = '/Volumes/Washington/data/source/geolifeclef-2022/data/observations'
df_us = pd.read_csv(f'{OBS_DIR}/observations_us_train.csv', delimiter=";")
df_fr = pd.read_csv(f'{OBS_DIR}/observations_fr_train.csv', delimiter=";")

# Stack the US rows followed by the FR rows. `DataFrame.append` was deprecated
# in pandas 1.4 and removed in 2.0; `pd.concat` is the supported replacement
# and, like `append`, keeps each frame's original row index.
df = pd.concat([df_us, df_fr])
df

Unnamed: 0,observation_id,latitude,longitude,species_id,subset
0,20000173,33.197660,-116.180680,4911,train
1,20000175,34.037968,-118.876755,4912,train
2,20000176,27.620740,-97.222690,4913,train
3,20000177,29.155582,-95.653930,4914,train
4,20000179,36.605740,-121.959510,4915,train
...,...,...,...,...,...
671239,10543820,43.542500,4.967778,1041,train
671240,10304005,43.829823,4.450699,19,train
671241,10433186,43.329414,5.599397,906,train
671242,10352176,43.556637,7.016971,516,train


## Do we have overlapping species in the two countries?

In [7]:
# Number of distinct species IDs that occur in both the US and the FR tables.
len(set(df_us['species_id']).intersection(df_fr['species_id']))

1956

## Calculate `n` most common species in train set

In [8]:
# Rank species by number of observations in the train subset and keep the
# first n entries (value_counts sorts by descending frequency).
n = 100  # how many of the most common train-set species to keep
df_train = df[df['subset'].eq('train')]
species_counts = df_train['species_id'].value_counts().head(n)
species_counts

5045    6456
3072    5805
2902    4722
5053    3934
3034    3857
        ... 
5247    1370
418     1365
139     1359
2225    1359
869     1358
Name: species_id, Length: 100, dtype: int64

In [9]:
# The index of the counts Series holds the IDs of the selected species.
species = species_counts.index

## Filter for the `n` most common species

In [10]:
# Show all observations of the selected species, regardless of subset.
df[df['species_id'].isin(species)]

Unnamed: 0,observation_id,latitude,longitude,species_id,subset
6,20000182,43.889668,-73.009250,2950,train
8,20000185,31.935776,-108.941570,4918,train
22,20000204,30.416836,-98.045460,2524,train
28,20000211,33.789425,-84.373790,2992,train
29,20000212,46.867200,-96.449640,4936,train
...,...,...,...,...,...
671174,10612529,48.562592,3.013792,886,train
671184,10629793,44.970047,5.149254,185,train
671196,10721898,48.851585,2.363979,129,train
671202,10661298,43.370243,-1.781419,687,train


## Check that all selected species are represented in the val subset

In [11]:
# Observation count per selected species within the train subset.
df.loc[
    df['species_id'].isin(species) & df['subset'].eq('train'),
    'species_id',
].value_counts()

5045    6456
3072    5805
2902    4722
5053    3934
3034    3857
        ... 
5247    1370
418     1365
2225    1359
139     1359
869     1358
Name: species_id, Length: 100, dtype: int64

In [12]:
# Observation count per selected species within the val subset; a result of
# length n means every selected species has at least one val observation.
df.loc[
    df['species_id'].isin(species) & df['subset'].eq('val'),
    'species_id',
].value_counts()

5045    245
720     142
5612    129
3072    129
2902    124
       ... 
5189     23
340      21
5855     19
4991     16
5200     16
Name: species_id, Length: 100, dtype: int64

## Subsample 10% from training dataset

In [13]:
# Train-subset observations restricted to the selected species.
df_train_species = df[df['species_id'].isin(species) & df['subset'].eq('train')]

In [14]:
# Draw a random 10% of the filtered training rows; the fixed seed makes the
# draw reproducible. The sample is taken over all rows, not per species.
df_train_sample = df_train_species.sample(
    frac=0.1,
    random_state=1,
)
df_train_sample

Unnamed: 0,observation_id,latitude,longitude,species_id,subset
38767,20083678,42.447910,-71.268210,5359,train
742982,21605199,39.492954,-88.176710,553,train
591000,21276352,40.899520,-73.894010,720,train
128676,10421962,47.487385,-0.561821,389,train
69315,10710717,43.235447,1.352631,33,train
...,...,...,...,...,...
526496,21136957,44.065010,-121.310050,5095,train
77972,20168673,38.720116,-77.799034,816,train
656664,21418525,34.128128,-118.212230,5045,train
809667,21750275,37.887920,-122.139885,5143,train


In [15]:
# Observation count per species in the 10% training sample.
df_train_sample.loc[:, 'species_id'].value_counts()

5045    640
3072    594
2902    466
5053    390
3495    384
       ... 
5020    133
418     132
4936    132
5247    116
869     114
Name: species_id, Length: 100, dtype: int64

## Create new dataset and change label enumeration

In [35]:
# Val-subset observations restricted to the selected species (not subsampled).
df_val_species = df[df['species_id'].isin(species) & df['subset'].eq('val')]

In [36]:
# Build the new dataset: the subsampled training rows followed by the
# validation rows of the selected species. `DataFrame.append` was deprecated in
# pandas 1.4 and removed in 2.0; `pd.concat` is the supported replacement and
# keeps the original row index of both frames.
df_new = pd.concat([df_train_sample, df_val_species])
df_new

Unnamed: 0,observation_id,latitude,longitude,species_id,subset
38767,20083678,42.447910,-71.268210,5359,train
742982,21605199,39.492954,-88.176710,553,train
591000,21276352,40.899520,-73.894010,720,train
128676,10421962,47.487385,-0.561821,389,train
69315,10710717,43.235447,1.352631,33,train
...,...,...,...,...,...
669034,10519854,43.582371,-1.410010,389,val
669254,10127916,43.457775,5.882048,246,val
670006,10426388,44.108685,0.583843,33,val
670034,10543033,43.142288,6.230122,139,val


In [37]:
# Write the combined sample, still carrying the original species IDs, as a
# semicolon-separated file without the row index.
df_new.to_csv('observations_sample_original_labels.csv', sep=";", index=False)

In [38]:
# Sorted list of the distinct species IDs in the new dataset; a species'
# position in this list is its new, contiguous label.
lookup = sorted(set(df_new['species_id']))

# Precomputed ID -> position table so each lookup is a constant-time dict
# access instead of a linear `list.index` scan for every row.
_lookup_position = {species_id: position for position, species_id in enumerate(lookup)}


def return_index(a):
    """Return the position of species ID `a` in the sorted `lookup` list.

    Raises:
        ValueError: if `a` is not one of the IDs in `lookup`.
    """
    try:
        return _lookup_position[a]
    except KeyError:
        # Keep the exception type that `list.index` raises for a missing value.
        raise ValueError(f"{a!r} is not in list") from None

In [39]:
# Replace each original species ID with the value returned by `return_index`.
# NOTE: this cell is not idempotent — re-running it without first rebuilding
# `df_new` would pass the already-relabelled values through `return_index` again.
df_new = df_new.assign(species_id=df_new['species_id'].map(return_index))

In [40]:
# Write the current contents of df_new as a semicolon-separated file without
# the row index.
df_new.to_csv('observations_sample.csv', sep=";", index=False)