In [54]:
import numpy as np
import pandas as pd

from os import walk

### Import original dataset

In [55]:
# Directory holding the processed CSVs; later cells also write their output here.
file_path = '../data/processed/'

# Load both taxon tables, using the first CSV column as the index
# (that index is what gets compared against the test ids further down).
proboscidia_df, felids_df = [
    pd.read_csv(file_path + csv_name, index_col=0)
    for csv_name in ('proboscidia_final.csv', 'felids_final.csv')
]

In [56]:
# Row counts of the two source tables, one per line.
print(len(proboscidia_df), len(felids_df), sep='\n')

11292
68427


### Get all Taxon Test File Names

In [57]:
dir_path = '../data/taxon_test/'

# Collect the bare file names found anywhere under dir_path.
# The tuple yielded by walk() is unpacked into throwaway names so that
# dir_path is NOT overwritten with the last directory visited; otherwise
# re-running this cell (or any later use of dir_path) would start from the
# wrong folder whenever sub-directories exist.
files = []
for _root, _subdirs, file_names in walk(dir_path):
    files.extend(file_names)

In [58]:
# Turn each file name into an integer id by dropping its last six characters
# and converting what remains.
# NOTE(review): this presumes every name ends in a fixed 6-character suffix
# (something like "_0.jpg") — confirm against the files in dir_path; any other
# name (longer suffix, hidden file) makes int() raise ValueError.
# Several files can yield the same id, so `ids` may contain repeats.
ids = [int(file_name[:-6]) for file_name in files]

In [70]:
# Show the parsed ids for a visual check; repeated values appear whenever
# more than one file maps to the same id.
print(ids)

[141408556, 141408556, 144023496, 133198365, 141408556, 8035648, 38670750, 3779201, 18834506, 6468560, 62384184, 135658954, 20304831, 6468544, 3779203, 149475181, 91956532, 34245509, 3918407, 25577825, 146386308, 9761327, 8269912, 18817973, 6484304, 103635046, 19433004, 73019401, 139654324, 41976629, 132407170, 21201939, 21202507, 110565817, 91229332, 85951365, 23102042, 44156249, 58311001, 141973674, 55769486, 26439648, 112159539, 86161385, 67701843, 139447656, 59774942, 61603580, 128619035, 52142288, 144137598, 138502014, 41976629, 136330507, 144983757, 67293799, 3779174, 8707301, 131063247, 3779171, 7103901, 16991344, 6454623, 4604725, 4604723, 86830963, 82339865, 110565241, 108253502, 104174999, 103099632, 67012454, 82905392, 86444052, 56264848, 80550716, 104569232, 63464625, 136134858, 56304285, 83031005, 56304288, 149388043, 148977889, 139108823, 107119028, 74361866, 6468619, 103650317, 103650296, 73862414, 137556432, 37692480, 100510660, 150336140, 37662236, 55602221, 18756301, 

### Remove all matching observations from the original data

In [59]:
# Flag the rows whose index is one of the held-out test ids ...
proboscidia_is_test = proboscidia_df.index.isin(ids)
felids_is_test = felids_df.index.isin(ids)

# ... and keep everything else as the training portion.
proboscidia_train = proboscidia_df.loc[~proboscidia_is_test]
felids_train = felids_df.loc[~felids_is_test]

In [60]:
# Row counts of the two training tables, one per line.
print(len(proboscidia_train), len(felids_train), sep='\n')

9891
64536


In [61]:
# Write each training table next to the source CSVs in file_path.
for train_frame, csv_name in (
    (proboscidia_train, 'proboscidia_train.csv'),
    (felids_train, 'felids_train.csv'),
):
    train_frame.to_csv(file_path + csv_name)

### Create the test set

Concatenate both sets of observations and exclude `Felis catus`.

In [62]:
# Stack both taxon tables, then drop every row labelled 'Felis catus'.
# NOTE(review): this exclusion is applied only here; the *_train.csv files
# written above are not filtered for 'Felis catus' in this notebook — confirm
# that is intended.
df = (
    pd.concat([proboscidia_df, felids_df])
    .loc[lambda obs: obs['taxon_species_name'] != 'Felis catus']
)

Read in and concatenate both sets of weather data.

In [63]:
# Load the per-taxon meta tables; the first CSV column becomes the index.
proboscidia_meta, felids_meta = [
    pd.read_csv(file_path + csv_name, index_col=0)
    for csv_name in ('proboscidia_meta.csv', 'felids_meta.csv')
]

In [64]:
# Stack the two meta tables row-wise into a single frame.
meta = pd.concat(objs=[proboscidia_meta, felids_meta], axis=0)

Merge observations and meta-data on their shared index (inner join), then keep only the rows whose id is in the test set.

In [65]:
# Inner-join observations with meta on the index of both frames; columns
# present in both get pandas' default '_x' / '_y' suffixes (e.g. observed_on_x).
# NOTE: df is rebound here, so re-running this cell without first re-running
# the concat cell would merge meta a second time.
df = df.merge(meta, how='inner', left_index=True, right_index=True)

In [66]:
# Restrict the merged frame to rows whose index is one of the test ids.
is_test_row = df.index.isin(ids)
df = df.loc[is_test_row]

In [67]:
# Preview the first five rows of the test frame.
df.head(n=5)

Unnamed: 0_level_0,observed_on_x,local_time_observed_at,latitude,longitude,positional_accuracy,public_positional_accuracy,image_url,license,geoprivacy,taxon_geoprivacy,...,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration_daily
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4126,2009-07-25,2009-07-25 10:58:00+02:00,-19.502735,23.298196,,30562.0,https://inaturalist-open-data.s3.amazonaws.com...,CC-BY-NC-ND,,obscured,...,0.0,0.0,0.0,2009-07-25T06:57,2009-07-25T18:09,17.0,37.4,132.0,19.05,4.18
16346,2007-11-29,2007-11-30 00:03:00+02:00,-20.253259,25.194044,,30490.0,https://inaturalist-open-data.s3.amazonaws.com...,CC-BY,,obscured,...,0.0,0.0,0.0,2007-11-30T05:29,2007-11-30T18:46,18.0,41.0,29.0,26.25,6.35
17083,2007-11-26,2007-11-27 00:58:00+02:00,-19.595253,23.228511,,30562.0,https://inaturalist-open-data.s3.amazonaws.com...,CC-BY,,obscured,...,2.6,0.0,2.0,2007-11-27T05:37,2007-11-27T18:51,11.8,29.2,23.0,25.99,6.4
17460,2010-04-03,2010-04-03 16:58:00+02:00,-19.271202,14.153619,,30580.0,http://static.inaturalist.org/photos/32395/med...,CC-BY-NC-SA,,obscured,...,3.2,0.0,2.0,2010-04-03T07:09,2010-04-03T19:03,18.4,35.6,44.0,20.65,4.33
56825,2011-07-22,2011-07-22 09:43:00+02:00,-18.752899,24.488977,62469.0,62469.0,http://static.inaturalist.org/photos/88420/med...,CC-BY,,obscured,...,0.0,0.0,0.0,2011-07-22T06:52,2011-07-22T18:04,12.6,32.8,67.0,18.84,4.83


In [68]:
# Number of rows in the test frame.
print(df.shape[0])

5277


In [69]:
# Save the test frame alongside the other processed CSVs.
df.to_csv(path_or_buf=file_path + 'final_test_observations.csv')