In [3]:
import joblib
import mlflow
import multiprocessing

import pandas as pd

from memoized_property import memoized_property
from mlflow.tracking import MlflowClient
from psutil import virtual_memory
from termcolor import colored
from time import time

from biodiversipy.params import MLFLOW_EXPERIMENT_BASE, MLFLOW_URI
from biodiversipy.utils import simple_time_tracker

In [10]:
%%time
tmp = pd.read_csv('../raw_data/gbif/occurrences_100k/occurrences_100k_encoded.csv')
display(type(tmp))

pandas.core.frame.DataFrame

CPU times: user 16.6 s, sys: 3.41 s, total: 20 s
Wall time: 24.1 s


In [14]:
import sys

pandas.core.frame.DataFrame

In [15]:
sys.getsizeof(tmp)

2263684224

In [21]:
(sdf['10009934'] + sdf['10009934']).max()

2.0

In [16]:
sys.getsizeof(sdf)

3078720

In [13]:
sdf = tmp.astype(pd.SparseDtype("float", 0))
sdf.dtypes

latitude     Sparse[float64, 0]
longitude    Sparse[float64, 0]
10009934     Sparse[float64, 0]
10010617     Sparse[float64, 0]
10055902     Sparse[float64, 0]
                    ...        
9689880      Sparse[float64, 0]
9823072      Sparse[float64, 0]
9823314      Sparse[float64, 0]
9823570      Sparse[float64, 0]
9868510      Sparse[float64, 0]
Length: 3615, dtype: object

In [2]:
# Create custom metric
def compute_average(y_true, y_pred, t):
    """Return the average number of observed species predicted at threshold ``t``.

    A (sample, species) cell counts as a hit when the species is observed
    (``y_true == 1``) and its predicted score is at least ``t``. The number of
    hits is divided by the number of samples, i.e. the rows of ``y_pred``.

    Args:
        y_true: 2-D array-like of labels with the same shape as ``y_pred``.
        y_pred: 2-D array-like of predicted scores, shape (N, C).
        t: Decision threshold; must lie in [0, 1].

    Returns:
        float: number of hits divided by N.

    Raises:
        AssertionError: if ``t`` is outside [0, 1].
    """
    assert t <= 1
    assert t >= 0
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    N, _ = y_pred.shape
    # Vectorised thresholding: the previous per-element map() produced a lazy
    # iterator that could not be summed by an array reduction.
    hits = int(np.count_nonzero(y_pred[y_true == 1] >= t))
    return hits / N

def find_t_min(y_true, y_pred, K, rate, t):
    """Search downwards from ``t`` for the lowest threshold with average <= K.

    Starting at ``t``, the threshold is repeatedly multiplied by ``rate`` for
    as long as ``compute_average`` at the lowered threshold stays <= K. The
    last threshold that satisfied the condition is returned together with its
    average.

    If the starting threshold already gives an average > K, it is returned
    unchanged with that average (no lower threshold can do better).

    Args:
        y_true: Labels, forwarded to ``compute_average``.
        y_pred: Predicted scores, forwarded to ``compute_average``.
        K: Upper bound on the average; must be > 0.
        rate: Multiplicative step in [0, 1] applied to the threshold.
        t: Starting threshold.

    Returns:
        tuple: ``(t_min, average)``.

    Raises:
        AssertionError: if ``rate`` is outside [0, 1] or ``K`` is not positive.
    """
    assert rate <= 1
    assert rate >= 0
    assert K > 0
    t_min = t
    average = compute_average(y_true, y_pred, t_min)
    while average <= K:
        candidate = rate * t_min
        # No progress is possible when rate == 1 or the threshold has reached
        # 0; stop instead of looping forever.
        if candidate >= t_min:
            break
        candidate_average = compute_average(y_true, y_pred, candidate)
        if candidate_average > K:
            break
        # Keep the accepted threshold itself rather than recovering it later
        # with t / rate, which divides by zero for rate == 0.
        t_min, average = candidate, candidate_average
    return t_min, average

def compute_accuracy(y_true, y_pred, t_min):
    """Return the share of all (sample, species) cells that are hits at ``t_min``.

    A cell is a hit when the species is observed (``y_true == 1``) and its
    predicted score is at least ``t_min``. The hit count is divided by the
    total number of cells ``N * C`` of ``y_pred``.

    Args:
        y_true: 2-D array-like of labels with the same shape as ``y_pred``.
        y_pred: 2-D array-like of predicted scores, shape (N, C).
        t_min: Decision threshold.

    Returns:
        float: number of hits divided by ``N * C``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    N, C = y_pred.shape
    # Vectorised thresholding replaces the per-element lambda.
    hits = int(np.count_nonzero(y_pred[y_true == 1] >= t_min))
    return hits / (N * C)

def custom_metric(y_true, y_pred, K=2, rate=0.99, t=1):
    """Return ``(t_min, average, accuracy)`` for the given predictions.

    The threshold and its average come from ``find_t_min``; the accuracy is
    ``compute_accuracy`` evaluated at that threshold.
    """
    threshold, mean_hits = find_t_min(y_true, y_pred, K, rate, t)
    return threshold, mean_hits, compute_accuracy(y_true, y_pred, threshold)

# Trying dummy model

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

pd.set_option('display.max_rows', None)

In [2]:
!pwd

/Users/mikkelvaldemarkoch/code/MikkelValdemar/biodiversipy/notebooks


In [2]:
# Load the feature table and the encoded targets; the coordinate columns are
# removed from the targets straight away.
X = pd.read_csv('../raw_data/output/occurrences/coordinates_100k_top100/coordinates_100k_top100_features.csv')
y = pd.read_csv(
    '../raw_data/gbif/occurrences_100k_top100/occurrences_100k_top100_encoded.csv'
).drop(columns=['latitude', 'longitude'])

In [3]:
X.head()

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
0,49.450288,6.89255,8.845834,8.341666,33.909214,620.71606,23.0,-1.6,24.6,2.333333,...,463,442,474,424,510,43,201,105,313,37
1,49.890565,6.734293,8.725,7.166666,30.62678,610.97723,22.1,-1.3,23.4,2.433333,...,500,487,513,481,796,52,188,89,470,60
2,52.366974,12.998404,9.208333,8.066667,31.02564,687.5147,23.6,-2.4,26.0,17.766666,...,152,148,159,150,727,100,202,115,366,101
3,54.312887,10.576389,8.3,6.283333,28.82263,591.92676,20.6,-1.2,21.800001,5.516667,...,0,0,0,0,0,0,0,0,0,0
4,49.322896,7.082453,8.8625,8.175,32.96371,631.1754,23.1,-1.7,24.800001,2.283333,...,471,466,487,447,887,62,281,145,376,75


In [4]:
y.head()

Unnamed: 0,2875818,2875979,2876213,2878688,2882316,2888443,2888605,2888728,2889011,2890668,...,7874883,7960979,7972010,8208358,8324121,8351737,9151957,9172281,9206251,9220780
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Scale every feature, then fit one logistic regression per target column.
# LogisticRegression on its own only accepts a 1-D target, whereas y holds one
# column per species, so it is wrapped in MultiOutputClassifier.
preproc_pipeline = make_pipeline(StandardScaler())
pipe_baseline = make_pipeline(preproc_pipeline, MultiOutputClassifier(LogisticRegression()))

In [5]:
X.describe()

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
count,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,...,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0
mean,50.858406,9.648334,-1.244649e+36,-1.244649e+36,-1.244649e+36,-1.244649e+36,-1.244649e+36,-1.244649e+36,-1.244649e+36,-1.244649e+36,...,247.489632,243.332587,255.612398,239.984473,433.19154,74.457087,164.177104,101.931364,227.64912,80.257869
std,1.7294,2.125983,2.053379e+37,2.053379e+37,2.053379e+37,2.053379e+37,2.053379e+37,2.053379e+37,2.053379e+37,2.053379e+37,...,216.872734,213.636848,225.499921,210.445256,385.158498,128.117123,179.121854,146.052371,221.194616,131.806642
min,47.320938,5.855626,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49.471708,7.96411,8.525,7.633333,31.37255,600.1906,22.2,-2.8,23.6,14.81667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50.979486,9.346275,9.1,8.066667,32.48299,645.2214,23.0,-1.8,25.1,16.33333,...,277.0,270.0,276.0,269.0,487.0,36.0,155.0,68.0,240.0,42.0
75%,52.160216,11.225239,9.620833,8.483334,33.40081,676.0037,23.7,-1.0,26.1,17.06667,...,454.0,448.0,471.0,440.0,730.0,93.0,248.0,139.0,357.0,102.0
max,55.024316,15.016563,11.03333,10.38333,37.76933,755.8695,26.0,1.0,28.8,19.11667,...,755.0,730.0,737.0,716.0,1726.0,2114.0,1987.0,2182.0,1734.0,2090.0


In [6]:
y.describe()

Unnamed: 0,2875818,2875979,2876213,2878688,2882316,2888443,2888605,2888728,2889011,2890668,...,7874883,7960979,7972010,8208358,8324121,8351737,9151957,9172281,9206251,9220780
count,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,...,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0,87141.0
mean,0.012543,0.016743,0.00933,0.008021,0.028024,0.00809,0.013002,0.018315,0.007299,0.017282,...,0.007895,0.01555,0.010523,0.013174,0.011372,0.018969,0.008021,0.008721,0.006977,0.011074
std,0.111291,0.128308,0.096139,0.089203,0.165041,0.089582,0.113283,0.134089,0.08512,0.130322,...,0.088504,0.123725,0.102042,0.11402,0.106034,0.136417,0.089203,0.092981,0.083238,0.104649
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
y.sum(axis=0).min()

520

In [6]:
X[X['bio_1'] < -1000]

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
75,54.260010,8.852663,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
384,54.398598,10.220761,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
588,54.338844,13.680669,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
788,54.402565,13.615970,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
1063,53.790783,7.901619,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77582,54.415115,10.232071,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
77708,54.134101,13.830972,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
77817,53.705471,7.137551,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
77854,54.763778,8.287171,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,251,245,247,250,956,587,613,720,560,642


In [None]:

# NOTE(review): `df` is not defined in any visible cell; presumably this was
# meant to mask the -3.4e38 sentinel values in X (everything below -273) — confirm.
df_masked = np.ma.masked_where((-273 > df), df)

In [30]:
# Fit the scaler on X and show summary statistics of the scaled features.
# NOTE(review): X.describe() reports bio_* minima of -3.4e38 (presumably a
# no-data sentinel). Those rows dominate the fitted mean/std, which is why the
# scaled bio_* quartiles below collapse to a single value. Presumably the
# sentinel should be masked or dropped before scaling — confirm.
X_scaled = preproc_pipeline.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled.describe()

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
count,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,...,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0
mean,1.269961e-15,-2.556262e-16,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,...,6.626673e-17,-4.883767e-17,-3.1953270000000003e-17,8.351423e-18,-7.879386e-17,2.015235e-17,1.8790700000000003e-17,-3.485812e-17,-1.3434900000000001e-17,-1.8246040000000002e-17
std,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,...,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006
min,-2.019126,-1.77921,-13.47059,-13.47059,-13.47059,-13.47059,-13.47059,-13.47059,-13.47059,-13.47059,...,-1.19086,-1.188719,-1.183837,-1.19016,-1.164274,-0.5952486,-0.9435448,-0.7160415,-1.054974,-0.6238768
25%,-0.77557,-0.78473,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,...,-1.19086,-1.188719,-1.183837,-1.19016,-1.164274,-0.5952486,-0.9435448,-0.7160415,-1.054974,-0.6238768
50%,0.061153,-0.1336538,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,...,0.2279539,0.2180067,0.2005337,0.2261485,0.1410321,-0.3037139,-0.06988361,-0.2325922,0.0391311,-0.2893896
75%,0.7354255,0.7307068,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,...,0.9304288,0.9354366,0.93605,0.9235914,0.7565008,0.1412601,0.4576099,0.250857,0.5686781,0.1640263
max,2.411486,2.525576,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,...,2.349243,2.182733,2.109321,2.223263,3.328647,17.39551,12.18335,15.21764,6.691292,16.56876


In [None]:
%%time
pipe_baseline.fit(X, y)

In [None]:
# 5-fold cross-validation of the baseline pipeline.
# NOTE(review): `rmsle` is not defined in any visible cell — define or import
# the scorer before running this cell.
cv_results = cross_validate(pipe_baseline, X=X, y=y, cv=5, scoring=rmsle)

In [None]:
def model(X, y, metrics):
    """Build and compile a small dense network sized to ``X`` and ``y``.

    The network is a Normalization layer adapted on ``X``, one regularised
    10-unit ReLU layer, dropout, and an output layer with ``y.shape[1]`` units.

    Args:
        X: Feature matrix; used to adapt the normalization layer and to read
            the number of input columns.
        y: Target matrix; ``y.shape[1]`` sets the width of the output layer.
        metrics: Metrics passed to ``compile``. Falls back to ``['accuracy']``
            when empty or ``None``.

    Returns:
        The compiled (untrained) model.
    """
    normalization_layer = Normalization()
    normalization_layer.adapt(X)

    # Named `network` so the local does not shadow this function's own name.
    network = Sequential([
        normalization_layer,
        Dense(10,
              input_dim=X.shape[1],
              activation='relu',
              activity_regularizer=l1_l2(l1=0.005, l2=0.0005)),
        Dropout(0.4),
        # NOTE(review): softmax + categorical_crossentropy assumes exactly one
        # positive class per row; if several species can be present at one
        # location, sigmoid + binary_crossentropy is presumably wanted — confirm.
        Dense(y.shape[1], activation='softmax'),
    ])

    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        # Honour the caller's metrics instead of silently ignoring the argument.
        metrics=metrics if metrics else ['accuracy'])

    return network


In [7]:
y

Unnamed: 0,latitude,longitude,10009934,10010617,10055902,10068154,10096320,10220564,10243708,10356062,...,9573476,9606396,9624496,9643454,9677963,9689880,9823072,9823314,9823570,9868510
0,48.870487,10.335366,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,54.651142,9.772167,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,53.936638,11.247078,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,49.667747,8.651412,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,48.826295,10.062125,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78269,50.498398,6.624358,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78270,53.321644,10.501041,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78271,53.089754,10.565015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78272,52.254784,12.928824,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
from biodiversipy.utils import encode_taxonKey, get_suffix
from os import path
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
# Settings shared by the library call and the manual re-implementation below
raw_data_path = '../raw_data'
n = 100000

merged, coordinates = encode_taxonKey(raw_data_path, n=n, from_csv=True, to_csv=False)
display((merged.shape, coordinates.shape))

filename = 'occurrences' + get_suffix(n) + '.csv'
source_path = path.join(raw_data_path, 'gbif', 'occurrences' + get_suffix(n), filename)

coordinates = pd.read_csv(source_path)
display(coordinates.shape)

# Convert taxonKey to string for later vectorizing
coordinates['taxonKey'] = coordinates['taxonKey'].astype('string')

# Group by coordinates and list the taxonKey's
encoded_targets = coordinates.groupby(['latitude', 'longitude'])['taxonKey'].apply(list)
encoded_targets = pd.DataFrame(encoded_targets)
# display() is required here: a bare expression in the middle of a cell is discarded
display(encoded_targets.shape)

idx = encoded_targets.index

# Format taxonKey Pandas Series for vectorizing (one space-separated string per row)
encoded_targets['taxonKey'] = encoded_targets['taxonKey'].map(' '.join)

# Initialize CountVectorizer and apply it to the taxonKey's
# NOTE: .toarray() materialises the full dense matrix in memory
vectorizer = CountVectorizer(tokenizer=lambda txt: txt.split())
encoded_targets = vectorizer.fit_transform(encoded_targets['taxonKey']).toarray()

# Get feature names out and restore latitude/longitude as ordinary columns
encoded_targets = pd.DataFrame(encoded_targets, index=idx, columns=vectorizer.get_feature_names_out())
encoded_targets = encoded_targets.reset_index()
encoded_targets.head()

((78274, 3615), (78274, 2))

(100000, 4)

(78274, 1)

In [80]:
tmp2 = pd.read_csv('../raw_data/gbif/occurrences_100k_top100/occurrences_100k_top100_encoded.csv')
tmp2.shape

(87141, 102)

In [42]:
tmp = pd.read_csv('../raw_data/gbif/occurrences_100k/coordinates_100K.csv')
tmp.shape

(78274, 2)

In [61]:
import numpy as np

In [72]:
# Rows per chunk when splitting into 5 parts: ceiling division, so a length
# that is an exact multiple of 5 does not get one extra row per chunk.
length = -(-len(tmp) // 5)

In [73]:
start = 0
end = length
(start,end)

(0, 15655)

In [77]:
start += length
end += length
(start,end)

(62620, 78275)

In [71]:
tmp[start:end]

Unnamed: 0,latitude,longitude
62620,52.394432,9.699992
62621,52.394440,7.900830
62622,52.394440,13.045523
62623,52.394447,13.078628
62624,52.394452,13.720291
...,...,...
78269,55.015335,8.434173
78270,55.022861,8.425530
78271,55.043995,8.391613
78272,55.044971,8.457426


In [48]:
np.ceil(len(tmp) / 5)

NameError: name 'np' is not defined

In [79]:
tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78274 entries, 0 to 78273
Columns: 3615 entries, latitude to 9868510
dtypes: float64(2), int64(3613)
memory usage: 2.1 GB


In [52]:
encoded_targets.shape

(78274, 3615)

In [70]:
merged.sum()[:2].sum()

4732633.325635

In [71]:
merged.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            78264, 78265, 78266, 78267, 78268, 78269, 78270, 78271, 78272,
            78273],
           dtype='int64', length=78274)

In [69]:
encoded_targets.sum()[:2].sum()

4732633.325635

In [None]:


# Merging output of CountVectorizer with latitude and longitude data
coordinates = coordinates.drop(columns=['gbifID', 'taxonKey']).drop_duplicates()
# errors='ignore': the tuple-valued 'coordinates' column only exists when the
# optional step that builds it has been run; without it a plain drop raises KeyError.
merged = coordinates.merge(encoded_targets).drop(columns='coordinates', errors='ignore')
coordinates = coordinates.drop(columns='coordinates', errors='ignore')

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
print('Reading')

coordinates = pd.read_csv('../raw_data/gbif/occurrences/occurrences.csv')

# taxonKey must be text so the keys can be joined and vectorized later
coordinates['taxonKey'] = coordinates['taxonKey'].astype('string')

print('Grouping')
# One row per (latitude, longitude) pair holding the list of taxonKeys seen there
encoded_targets = pd.DataFrame(
    coordinates.groupby(['latitude', 'longitude'])['taxonKey'].apply(list)
)
idx = encoded_targets.index

# Collapse each list into one space-separated string for the vectorizer
encoded_targets['taxonKey'] = encoded_targets['taxonKey'].map(' '.join)

In [26]:
print('Vectorizing')
# Whitespace-tokenize each row of taxonKeys and count them; the result stays sparse
vectorizer = CountVectorizer(tokenizer=str.split)
encoded_targets = vectorizer.fit_transform(encoded_targets['taxonKey'])
encoded_targets

Vectorizing


<1362717x8824 sparse matrix of type '<class 'numpy.int64'>'
	with 2078704 stored elements in Compressed Sparse Row format>

In [31]:
from scipy.sparse import save_npz, load_npz

In [30]:
%%time
save_npz('../raw_data/gbif/occurrences/occurrences_encoded', encoded_targets)

CPU times: user 1.07 s, sys: 13.4 ms, total: 1.08 s
Wall time: 1.11 s


In [32]:
tmp = load_npz('../raw_data/gbif/occurrences/occurrences_encoded.npz')
tmp

<1362717x8824 sparse matrix of type '<class 'numpy.int64'>'
	with 2078704 stored elements in Compressed Sparse Row format>

In [34]:
tmp.toarray().shape

(1362717, 8824)

In [35]:
!pip freeze

absl-py==1.0.0
affine==2.3.1
aiohttp==3.8.1
aiosignal==1.2.0
alembic==1.4.1
altair==4.2.0
anyio==3.6.1
appnope==0.1.3
APScheduler==3.9.1
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
astroid==2.11.5
asttokens==2.0.5
astunparse==1.6.3
async-timeout==4.0.2
attrs==21.4.0
Babel==2.10.1
backcall==0.2.0
backports.zoneinfo==0.2.1
beautifulsoup4==4.11.1
-e git+ssh://git@github.com/TmtStss/biodiversipy.git@042ef95eb5e84bbc4fc75a01f42c5363cb1fb0bc#egg=biodiversipy
black==21.12b0
bleach==5.0.0
blinker==1.4
cachetools==5.0.0
certifi==2022.5.18.1
cffi==1.15.0
charset-normalizer==2.0.12
click==8.1.3
click-plugins==1.1.1
cligj==0.7.2
cloudpickle==2.1.0
commonmark==0.9.1
coverage==6.2
cycler==0.11.0
databricks-cli==0.16.6
debugpy==1.6.0
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.5.1
docker==5.0.3
docutils==0.18.1
entrypoints==0.4
executing==0.8.3
fastjsonschema==2.15.3
Fiona==1.8.21
flake8==4.0.1
Flask==2.1.2
flatbuffers==2.0
fonttools==4.33.3
fro

In [38]:
N = 1

In [39]:
assert not N

AssertionError: 

In [40]:
tmp

<1362717x8824 sparse matrix of type '<class 'numpy.int64'>'
	with 2078704 stored elements in Compressed Sparse Row format>

In [98]:
tmp1 = pd.read_csv('../raw_data/gbif/occurrences/splits/coordinates_1.csv')
tmp1.shape

(272544, 2)

In [99]:
tmp2 = pd.read_csv('../raw_data/gbif/occurrences/splits/coordinates_2.csv')
tmp2.shape

(272544, 2)

In [100]:
tmp3 = pd.read_csv('../raw_data/gbif/occurrences/splits/coordinates_3.csv')
tmp3.shape

(272544, 2)

In [101]:
tmp4 = pd.read_csv('../raw_data/gbif/occurrences/splits/coordinates_4.csv')
tmp4.shape

(272544, 2)

In [102]:
tmp5 = pd.read_csv('../raw_data/gbif/occurrences/splits/coordinates_5.csv')
tmp5.shape

(272541, 2)

In [85]:
tmp1.shape[0] + tmp2.shape[0] + tmp3.shape[0] + tmp4.shape[0] + tmp5.shape[0]

1362717

In [86]:
tmp_all = pd.read_csv('../raw_data/gbif/occurrences/coordinates.csv')

In [87]:
tmp_all.shape

(1362717, 2)

In [90]:
tmp1

Unnamed: 0,latitude,longitude
0,47.272701,10.187670
1,47.272867,10.176992
2,47.273238,10.176238
3,47.274021,10.177095
4,47.275488,10.180092
...,...,...
272539,49.112795,8.252875
272540,49.112796,9.075545
272541,49.112796,9.075683
272542,49.112810,10.759762


In [105]:
tmp_join = pd.concat((tmp1, tmp2, tmp3, tmp4, tmp5), ignore_index=True)

In [106]:
tmp_join.equals(tmp_all)

True

In [119]:
from os import path, mkdir, listdir

In [115]:
path.isdir('../raw_data/output/occurrences/coordinates/splits')

False

In [117]:
# Create the splits directory only when it is missing, so re-running the cell
# does not raise FileExistsError.
splits_dir = '../raw_data/output/occurrences/coordinates/splits'
if not path.isdir(splits_dir):
    mkdir(splits_dir)

In [118]:
import re

In [132]:
# Collect the split files whose name contains the source tag, in name order
out_path = path.join('../raw_data', 'output', 'occurrences', 'coordinates', 'splits')
string = 'worldclim'
files = sorted(name for name in listdir(out_path) if string in name)
files

['coordinates_1_worldclim_germany.csv',
 'coordinates_2_worldclim_germany.csv',
 'coordinates_3_worldclim_germany.csv',
 'coordinates_4_worldclim_germany.csv']

In [128]:
data

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,bio_10,bio_11,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19
0,52.382184,13.114806,9.329166e+00,8.158334e+00,3.113868e+01,6.916038e+02,2.390000e+01,-2.300000e+00,2.620000e+01,1.796667e+01,...,1.796667e+01,1.000000e+00,5.460000e+02,6.500000e+01,3.400000e+01,2.037085e+01,1.720000e+02,1.110000e+02,1.720000e+02,1.260000e+02
1,52.382188,10.537088,8.841667e+00,7.750000e+00,3.189300e+01,6.237418e+02,2.260000e+01,-1.700000e+00,2.430000e+01,1.656667e+01,...,1.656667e+01,1.350000e+00,6.280000e+02,7.000000e+01,3.800000e+01,1.611614e+01,1.910000e+02,1.330000e+02,1.910000e+02,1.450000e+02
2,52.382195,9.773645,9.645833e+00,8.041667e+00,3.268970e+01,6.266776e+02,2.350000e+01,-1.100000e+00,2.460000e+01,1.755000e+01,...,1.755000e+01,2.200000e+00,6.620000e+02,7.200000e+01,4.000000e+01,1.531258e+01,1.970000e+02,1.390000e+02,1.970000e+02,1.560000e+02
3,52.382195,12.517698,9.474999e+00,8.300000e+00,3.155894e+01,6.897546e+02,2.430000e+01,-2.000000e+00,2.630000e+01,1.660000e+01,...,1.808333e+01,1.166667e+00,5.550000e+02,6.200000e+01,3.600000e+01,1.726758e+01,1.710000e+02,1.150000e+02,1.700000e+02,1.240000e+02
4,52.382198,9.782183,9.608334e+00,8.016666e+00,3.272109e+01,6.275560e+02,2.340000e+01,-1.100000e+00,2.450000e+01,1.750000e+01,...,1.750000e+01,2.133333e+00,6.630000e+02,7.200000e+01,4.000000e+01,1.523891e+01,1.970000e+02,1.400000e+02,1.970000e+02,1.560000e+02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272536,55.082249,8.486324,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,2.070000e+01,9.800000e+00,...,1.571667e+01,1.766667e+00,7.750000e+02,9.700000e+01,3.900000e+01,3.052644e+01,2.750000e+02,1.300000e+02,1.890000e+02,1.790000e+02
272537,55.082630,8.486102,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,2.070000e+01,9.800000e+00,...,1.571667e+01,1.766667e+00,7.750000e+02,9.700000e+01,3.900000e+01,3.052644e+01,2.750000e+02,1.300000e+02,1.890000e+02,1.790000e+02
272538,55.085609,8.473420,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38
272539,55.086426,8.473474,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38


In [130]:
# Read every split once and concatenate in a single call; growing `data` with
# pd.concat inside the loop re-copies all previously loaded rows on each pass.
frames = []
for i, file in enumerate(files):
    print(i, file)
    frames.append(pd.read_csv(path.join(out_path, file)))
data = pd.concat(frames, ignore_index=True)
del frames  # release the per-file frames once they are combined
data.shape

0 coordinates_4_worldclim_germany.csv
1 coordinates_2_worldclim_germany.csv
2 coordinates_3_worldclim_germany.csv
3 coordinates_1_worldclim_germany.csv


(1090172, 21)

In [131]:
# Build the output file name for the combined data.
# NOTE(review): `occurrences_file` and `source` are not defined in any visible
# cell (the recorded run raised NameError) — define them before this cell.
filename = f"{occurrences_file}_{source['name']}_germany.csv"
filename
#data.to_csv(path.join(raw_data_path, 'output', 'occurrences', 'coordinates', filename), index=False)

NameError: name 'occurrences_file' is not defined

In [150]:
# Load coordinates_worldclim_germany.csv into `tmp`; the displayed output has
# latitude/longitude plus the bio_1..bio_19 columns.
# NOTE(review): `out_path` holds a file path here but a directory in other
# cells, and `tmp` is reused for different tables -- mind the execution order.
out_path = path.join(
    '../raw_data',
    'output',
    'occurrences',
    'coordinates',
    'coordinates_worldclim_germany.csv',
)
tmp = pd.read_csv(out_path)
tmp

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,bio_10,bio_11,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19
0,49.112831,8.014243,9.758333e+00,8.666667e+00,3.385417e+01,6.522299e+02,2.430000e+01,-1.300000e+00,2.560000e+01,1.635000e+01,...,1.793333e+01,1.900000e+00,7.480000e+02,7.700000e+01,4.900000e+01,1.540194e+01,2.130000e+02,1.570000e+02,1.920000e+02,1.860000e+02
1,49.112831,9.751282,8.766666e+00,9.100000e+00,3.408240e+01,6.713024e+02,2.370000e+01,-3.000000e+00,2.670000e+01,1.563333e+01,...,1.720000e+01,7.166667e-01,9.740000e+02,9.600000e+01,6.700000e+01,1.135573e+01,2.660000e+02,2.220000e+02,2.530000e+02,2.540000e+02
2,49.112832,9.750753,8.766666e+00,9.100000e+00,3.408240e+01,6.713024e+02,2.370000e+01,-3.000000e+00,2.670000e+01,1.563333e+01,...,1.720000e+01,7.166667e-01,9.740000e+02,9.600000e+01,6.700000e+01,1.135573e+01,2.660000e+02,2.220000e+02,2.530000e+02,2.540000e+02
3,49.112839,10.758035,8.712500e+00,9.208333e+00,3.373016e+01,6.954564e+02,2.390000e+01,-3.400000e+00,2.730000e+01,1.576667e+01,...,1.736667e+01,2.833333e-01,7.050000e+02,8.100000e+01,4.500000e+01,2.069717e+01,2.310000e+02,1.400000e+02,2.240000e+02,1.490000e+02
4,49.112846,9.199395,1.009167e+01,8.966667e+00,3.462033e+01,6.551434e+02,2.470000e+01,-1.200000e+00,2.590000e+01,1.681667e+01,...,1.830000e+01,2.183333e+00,7.480000e+02,8.100000e+01,4.800000e+01,1.805724e+01,2.370000e+02,1.500000e+02,2.170000e+02,1.670000e+02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362711,55.082249,8.486324,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,2.070000e+01,9.800000e+00,...,1.571667e+01,1.766667e+00,7.750000e+02,9.700000e+01,3.900000e+01,3.052644e+01,2.750000e+02,1.300000e+02,1.890000e+02,1.790000e+02
1362712,55.082630,8.486102,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,2.070000e+01,9.800000e+00,...,1.571667e+01,1.766667e+00,7.750000e+02,9.700000e+01,3.900000e+01,3.052644e+01,2.750000e+02,1.300000e+02,1.890000e+02,1.790000e+02
1362713,55.085609,8.473420,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38
1362714,55.086426,8.473474,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38


In [134]:
# Shape of the subset of `tmp` whose bio_1 equals the -3.4e38 value that fills
# whole rows in the displayed tables (presumably the raster's no-data marker
# -- TODO confirm).
tmp.loc[tmp['bio_1'] == -3.4e+38].shape

(6430, 21)

In [141]:
# List the files in the splits directory whose name contains "_worldclim_" and
# order them by the number embedded in the name.
out_path = path.join('../raw_data', 'gbif', 'occurrences', 'coordinates', 'splits')
# Plain string literal: the previous f-string had no placeholders.
files = [file for file in listdir(out_path) if "_worldclim_" in file]
# Sort key: all digit characters of the name joined and parsed as one integer,
# so the order is numeric rather than lexicographic.
# NOTE(review): a matching name without any digit makes int('') raise
# ValueError, and names with several digit groups have them fused together.
files.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))
files

['coordinates_1_worldclim_germany.csv',
 'coordinates_2_worldclim_germany.csv',
 'coordinates_3_worldclim_germany.csv',
 'coordinates_4_worldclim_germany.csv',
 'coordinates_5_worldclim_germany.csv']

In [145]:
# Rebind `files` to a new list with a two-digit split name appended
# (presumably to exercise the sorting cells that follow -- TODO confirm).
# Re-running this cell appends the name again.
files = [*files, 'coordinates_10_worldclim_germany.csv']

In [146]:
# Default string ordering, returned as a new list (`files` itself is unchanged).
# In the displayed output 'coordinates_10_...' comes before 'coordinates_1_...',
# i.e. plain string sorting does not give the numeric split order.
sorted(files)

['coordinates_10_worldclim_germany.csv',
 'coordinates_1_worldclim_germany.csv',
 'coordinates_2_worldclim_germany.csv',
 'coordinates_3_worldclim_germany.csv',
 'coordinates_4_worldclim_germany.csv',
 'coordinates_5_worldclim_germany.csv']

In [147]:
def _split_number(name):
    """Return every digit character of ``name``, joined and parsed as one int."""
    digits = [ch for ch in name if ch.isdigit()]
    return int(''.join(digits))


# In-place numeric ordering of the file names; the displayed output ends with
# the '_10_' split after '_5_'.
files.sort(key=_split_number)
files

['coordinates_1_worldclim_germany.csv',
 'coordinates_2_worldclim_germany.csv',
 'coordinates_3_worldclim_germany.csv',
 'coordinates_4_worldclim_germany.csv',
 'coordinates_5_worldclim_germany.csv',
 'coordinates_10_worldclim_germany.csv']

In [148]:
# Load coordinates_soilgrids_germany.csv into `tmp`; the displayed output has
# latitude/longitude plus depth-suffixed bdod_*, cec_*, ..., silt_*, soc_* columns.
# NOTE(review): `tmp` is also assigned by the WorldClim load cell (In [150]),
# so which table `tmp` holds depends on which cell ran last.
out_path = path.join(
    '../raw_data',
    'output',
    'occurrences',
    'coordinates',
    'coordinates_soilgrids_germany.csv',
)
tmp = pd.read_csv(out_path)
tmp

Unnamed: 0,latitude,longitude,bdod_0-5cm,bdod_100-200cm,bdod_15-30cm,bdod_30-60cm,bdod_5-15cm,bdod_60-100cm,cec_0-5cm,cec_100-200cm,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
0,49.112831,8.014243,131,152,141,151,139,152,243,131,...,579,566,596,560,402,61,166,87,181,68
1,49.112831,9.751282,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,49.112832,9.750753,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,49.112839,10.758035,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.112846,9.199395,133,158,142,152,137,156,259,138,...,513,492,532,490,499,30,126,67,177,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362712,55.082249,8.486324,100,132,120,129,114,132,319,213,...,303,292,303,288,821,807,665,785,598,778
1362713,55.082630,8.486102,100,132,120,129,114,132,319,213,...,303,292,303,288,821,807,665,785,598,778
1362714,55.085609,8.473420,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1362715,55.086426,8.473474,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [151]:
# Load coordinates_gee_germany.csv into `tmp1`; the displayed output has
# latitude, longitude, elevation and slope columns.
out_path = path.join(
    '../raw_data',
    'output',
    'occurrences',
    'coordinates',
    'coordinates_gee_germany.csv',
)
tmp1 = pd.read_csv(out_path)
tmp1

Unnamed: 0,latitude,longitude,elevation,slope
0,47.272701,10.187670,1859,36.430573
1,47.272867,10.176992,1846,17.113897
2,47.273238,10.176238,1856,18.371220
3,47.274021,10.177095,1872,28.319456
4,47.275488,10.180092,1852,27.529000
...,...,...,...,...
1362712,55.082249,8.486324,0,0.810120
1362713,55.082630,8.486102,-1,0.933447
1362714,55.085609,8.473420,0,0.000000
1362715,55.086426,8.473474,0,0.000000


In [152]:
# Inner join of `tmp` with `tmp1` on the coordinate pair. The keys are spelled
# out instead of relying on merge's default (every column name the two frames
# share); per the displayed column lists latitude and longitude are the only
# shared columns, so the result is the same but no longer changes silently if
# either frame gains another common column.
# NOTE(review): the keys are floats and must match exactly.
tmp.merge(tmp1, on=['latitude', 'longitude'])

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19,elevation,slope
0,49.112831,8.014243,9.758333e+00,8.666667e+00,3.385417e+01,6.522299e+02,2.430000e+01,-1.300000e+00,2.560000e+01,1.635000e+01,...,7.480000e+02,7.700000e+01,4.900000e+01,1.540194e+01,2.130000e+02,1.570000e+02,1.920000e+02,1.860000e+02,197,0.708441
1,49.112831,9.751282,8.766666e+00,9.100000e+00,3.408240e+01,6.713024e+02,2.370000e+01,-3.000000e+00,2.670000e+01,1.563333e+01,...,9.740000e+02,9.600000e+01,6.700000e+01,1.135573e+01,2.660000e+02,2.220000e+02,2.530000e+02,2.540000e+02,378,3.656657
2,49.112832,9.750753,8.766666e+00,9.100000e+00,3.408240e+01,6.713024e+02,2.370000e+01,-3.000000e+00,2.670000e+01,1.563333e+01,...,9.740000e+02,9.600000e+01,6.700000e+01,1.135573e+01,2.660000e+02,2.220000e+02,2.530000e+02,2.540000e+02,378,3.656657
3,49.112839,10.758035,8.712500e+00,9.208333e+00,3.373016e+01,6.954564e+02,2.390000e+01,-3.400000e+00,2.730000e+01,1.576667e+01,...,7.050000e+02,8.100000e+01,4.500000e+01,2.069717e+01,2.310000e+02,1.400000e+02,2.240000e+02,1.490000e+02,421,1.984979
4,49.112846,9.199395,1.009167e+01,8.966667e+00,3.462033e+01,6.551434e+02,2.470000e+01,-1.200000e+00,2.590000e+01,1.681667e+01,...,7.480000e+02,8.100000e+01,4.800000e+01,1.805724e+01,2.370000e+02,1.500000e+02,2.170000e+02,1.670000e+02,181,4.301075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362711,55.082249,8.486324,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,2.070000e+01,9.800000e+00,...,7.750000e+02,9.700000e+01,3.900000e+01,3.052644e+01,2.750000e+02,1.300000e+02,1.890000e+02,1.790000e+02,0,0.810120
1362712,55.082630,8.486102,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,2.070000e+01,9.800000e+00,...,7.750000e+02,9.700000e+01,3.900000e+01,3.052644e+01,2.750000e+02,1.300000e+02,1.890000e+02,1.790000e+02,-1,0.933447
1362713,55.085609,8.473420,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,0,0.000000
1362714,55.086426,8.473474,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,0,0.000000


In [155]:
# Left join: keep every row of `tmp1` and attach the matching row of `tmp`.
# - drop_duplicates() removes fully identical rows of `tmp` before joining.
# - on=[...] names the join keys explicitly instead of relying on the default
#   (all shared column names); per the displayed column lists these two are
#   the only shared columns.
# - indicator=True adds a `_merge` column ('both' / 'left_only') that shows
#   which `tmp1` rows found no partner.
tmp_all = tmp1.merge(
    tmp.drop_duplicates(),
    on=['latitude', 'longitude'],
    how='left',
    indicator=True,
)

tmp_all

Unnamed: 0,latitude,longitude,elevation,slope,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,...,bio_11,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19,_merge
0,47.272701,10.187670,1859,36.430573,2.895833e+00,7.991667e+00,3.275273e+01,6.166975e+02,1.580000e+01,-8.600000e+00,...,-4.333334e+00,1.370000e+03,1.690000e+02,8.700000e+01,2.540671e+01,4.790000e+02,2.720000e+02,4.790000e+02,2.770000e+02,both
1,47.272867,10.176992,1846,17.113897,1.695833e+00,7.491667e+00,3.187943e+01,6.003047e+02,1.400000e+01,-9.500000e+00,...,-5.250000e+00,1.464000e+03,1.730000e+02,9.600000e+01,2.161607e+01,4.910000e+02,3.040000e+02,4.910000e+02,3.110000e+02,both
2,47.273238,10.176238,1856,18.371220,1.695833e+00,7.491667e+00,3.187943e+01,6.003047e+02,1.400000e+01,-9.500000e+00,...,-5.250000e+00,1.464000e+03,1.730000e+02,9.600000e+01,2.161607e+01,4.910000e+02,3.040000e+02,4.910000e+02,3.110000e+02,both
3,47.274021,10.177095,1872,28.319456,1.695833e+00,7.491667e+00,3.187943e+01,6.003047e+02,1.400000e+01,-9.500000e+00,...,-5.250000e+00,1.464000e+03,1.730000e+02,9.600000e+01,2.161607e+01,4.910000e+02,3.040000e+02,4.910000e+02,3.110000e+02,both
4,47.275488,10.180092,1852,27.529000,1.741667e+00,7.516667e+00,3.198581e+01,6.011801e+02,1.410000e+01,-9.400000e+00,...,-5.216667e+00,1.456000e+03,1.730000e+02,9.500000e+01,2.205386e+01,4.910000e+02,3.010000e+02,4.910000e+02,3.070000e+02,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362712,55.082249,8.486324,0,0.810120,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,...,1.766667e+00,7.750000e+02,9.700000e+01,3.900000e+01,3.052644e+01,2.750000e+02,1.300000e+02,1.890000e+02,1.790000e+02,both
1362713,55.082630,8.486102,-1,0.933447,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,...,1.766667e+00,7.750000e+02,9.700000e+01,3.900000e+01,3.052644e+01,2.750000e+02,1.300000e+02,1.890000e+02,1.790000e+02,both
1362714,55.085609,8.473420,0,0.000000,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,both
1362715,55.086426,8.473474,0,0.000000,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,both


In [157]:
# Rows of the left join that found no partner in the right-hand frame.
# The displayed output is a single row (index 892997) with NaN bio_* values.
unmatched = tmp_all['_merge'] == 'left_only'
tmp_all.loc[unmatched]

Unnamed: 0,latitude,longitude,elevation,slope,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,...,bio_11,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19,_merge
892997,51.532429,5.703378,16,0.463735,,,,,,,...,,,,,,,,,,left_only


In [158]:
# Load wc2.1_30s_bio_germany.csv into `worldclim`; per the displayed output
# each row is a lon/lat box (lon_lower, lon_upper, lat_lower, lat_upper) with
# bio_1..bio_19 values.
out_path = path.join(
    '../raw_data',
    'output',
    'features',
    'wc2.1_30s_bio_germany.csv',
)
worldclim = pd.read_csv(out_path)
worldclim

Unnamed: 0,lon_lower,lon_upper,lat_lower,lat_upper,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,...,bio_10,bio_11,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19
0,5.704167,5.712500,55.087500,55.095833,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38
1,5.704167,5.712500,55.079167,55.087500,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38
2,5.704167,5.712500,55.070833,55.079167,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38
3,5.704167,5.712500,55.062500,55.070833,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38
4,5.704167,5.712500,55.054167,55.062500,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115312,15.387500,15.395833,47.137500,47.145833,8.300000e+00,9.500000e+00,3.298611e+01,7.289843e+02,2.330000e+01,-5.500000e+00,...,1.723333e+01,-6.999999e-01,9.110000e+02,1.300000e+02,3.200000e+01,4.433024e+01,3.730000e+02,1.150000e+02,3.730000e+02,1.150000e+02
1115313,15.387500,15.395833,47.129167,47.137500,8.595834e+00,9.425000e+00,3.261246e+01,7.358745e+02,2.360000e+01,-5.300000e+00,...,1.758333e+01,-5.166667e-01,8.800000e+02,1.280000e+02,2.900000e+01,4.638049e+01,3.670000e+02,1.060000e+02,3.670000e+02,1.060000e+02
1115314,15.387500,15.395833,47.120833,47.129167,8.937500e+00,9.291667e+00,3.204023e+01,7.464952e+02,2.400000e+01,-5.000000e+00,...,1.803333e+01,-3.333333e-01,8.430000e+02,1.240000e+02,2.600000e+01,4.839793e+01,3.580000e+02,9.700000e+01,3.580000e+02,9.700000e+01
1115315,15.387500,15.395833,47.112500,47.120833,8.904166e+00,9.358334e+00,3.204909e+01,7.485181e+02,2.410000e+01,-5.100000e+00,...,1.803333e+01,-3.833333e-01,8.500000e+02,1.250000e+02,2.700000e+01,4.795243e+01,3.600000e+02,9.900000e+01,3.600000e+02,9.900000e+01


In [162]:
# Select the grid box(es) of `worldclim` that contain one point, using
# half-open intervals: lower <= value < upper on both axes.
# The displayed result is empty. The first displayed grid rows start at
# lon_lower 5.704167, which is already above the point's longitude --
# presumably the point lies just west of the grid (TODO confirm with
# worldclim['lon_lower'].min()).
point_lon = 5.703378
point_lat = 51.532429
inside_lon = (worldclim['lon_lower'] <= point_lon) & (worldclim['lon_upper'] > point_lon)
inside_lat = (worldclim['lat_lower'] <= point_lat) & (worldclim['lat_upper'] > point_lat)
mask = inside_lon & inside_lat
worldclim[mask]

Unnamed: 0,lon_lower,lon_upper,lat_lower,lat_upper,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,...,bio_10,bio_11,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19


In [164]:
# Load coordinates.csv into `coords` and look up one latitude/longitude pair
# by exact float equality. The displayed output shows it at index 892997.
out_path = path.join(
    '../raw_data',
    'gbif',
    'occurrences',
    'coordinates.csv',
)
coords = pd.read_csv(out_path)
same_latitude = coords['latitude'] == 51.532429
same_longitude = coords['longitude'] == 5.703378
coords[same_latitude & same_longitude]

Unnamed: 0,latitude,longitude
892997,51.532429,5.703378


In [170]:
# Load coordinates_features.csv into `coords_feat`; the displayed output has
# latitude/longitude followed by bio_* columns and depth-suffixed soil columns.
out_path = path.join(
    '../raw_data',
    'output',
    'occurrences',
    'coordinates',
    'coordinates_features.csv',
)
coords_feat = pd.read_csv(out_path)
coords_feat

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
0,49.112831,8.014243,9.758333e+00,8.666667e+00,3.385417e+01,6.522299e+02,2.430000e+01,-1.300000e+00,2.560000e+01,1.635000e+01,...,579,566,596,560,402,61,166,87,181,68
1,49.112831,9.751282,8.766666e+00,9.100000e+00,3.408240e+01,6.713024e+02,2.370000e+01,-3.000000e+00,2.670000e+01,1.563333e+01,...,0,0,0,0,0,0,0,0,0,0
2,49.112832,9.750753,8.766666e+00,9.100000e+00,3.408240e+01,6.713024e+02,2.370000e+01,-3.000000e+00,2.670000e+01,1.563333e+01,...,0,0,0,0,0,0,0,0,0,0
3,49.112839,10.758035,8.712500e+00,9.208333e+00,3.373016e+01,6.954564e+02,2.390000e+01,-3.400000e+00,2.730000e+01,1.576667e+01,...,0,0,0,0,0,0,0,0,0,0
4,49.112846,9.199395,1.009167e+01,8.966667e+00,3.462033e+01,6.551434e+02,2.470000e+01,-1.200000e+00,2.590000e+01,1.681667e+01,...,513,492,532,490,499,30,126,67,177,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362712,55.082249,8.486324,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,2.070000e+01,9.800000e+00,...,303,292,303,288,821,807,665,785,598,778
1362713,55.082630,8.486102,8.575000e+00,5.266667e+00,2.544283e+01,5.833543e+02,1.990000e+01,-8.000000e-01,2.070000e+01,9.800000e+00,...,303,292,303,288,821,807,665,785,598,778
1362714,55.085609,8.473420,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
1362715,55.086426,8.473474,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0


In [171]:
# Look up one latitude/longitude pair in `coords_feat` by exact float equality.
# The displayed row (index 892997) carries -3.4e38 in its bio_* columns.
same_latitude = coords_feat['latitude'] == 51.532429
same_longitude = coords_feat['longitude'] == 5.703378
coords_feat[same_latitude & same_longitude]

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
892997,51.532429,5.703378,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,...,239,235,247,226,643,155,247,238,312,194


In [168]:
# Replace every NaN in `coords_feat` (all columns) with -3.4e38, the same value
# that fills whole bio_* rows in the displayed tables (presumably the raster's
# no-data marker -- TODO confirm).
# Rebinding the name instead of using inplace=True keeps the cell free of
# hidden in-place mutation: any other reference to the old frame is untouched.
# NOTE(review): the execution counts show this cell ran before the cell that
# loads coordinates_features.csv (In [168] vs In [170]); check the intended order.
coords_feat = coords_feat.fillna(-3.400000e+38)

In [169]:
# Look up the same latitude/longitude pair again to inspect the row after the
# fillna cell. The displayed row (index 892997) shows -3.4e38 in its bio_* columns.
point_rows = (coords_feat['latitude'] == 51.532429) & (coords_feat['longitude'] == 5.703378)
coords_feat.loc[point_rows]

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
892997,51.532429,5.703378,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,...,239,235,247,226,643,155,247,238,312,194
