In [1]:
import joblib
import mlflow
import multiprocessing

from memoized_property import memoized_property
from mlflow.tracking import MlflowClient
from psutil import virtual_memory
from termcolor import colored
from time import time

from biodiversipy.params import MLFLOW_EXPERIMENT_BASE, MLFLOW_URI
from biodiversipy.utils import simple_time_tracker

In [2]:
# Create custom metric
def compute_average(y_true, y_pred, t):
    """Average number of observed species per sample predicted at threshold t.

    A "hit" is an entry where ``y_true == 1`` and the matching ``y_pred``
    value is ``>= t``. The total number of hits is divided by the number of
    rows of ``y_pred``.

    Parameters
    ----------
    y_true : array-like of shape (N, C)
        Ground-truth indicator matrix (1 marks an observed species).
    y_pred : array-like of shape (N, C)
        Predicted scores, compared element-wise against ``t``.
    t : float
        Decision threshold; must lie in [0, 1].

    Returns
    -------
    float
        Hits per sample, or 0.0 when ``y_pred`` has no rows.

    Raises
    ------
    AssertionError
        If ``t`` is outside [0, 1].
    """
    # Local import: this cell is defined before the notebook's numpy import cell.
    import numpy as np

    assert t <= 1
    assert t >= 0

    # Accept DataFrames / nested lists as well as ndarrays.
    truth = np.asarray(y_true)
    scores = np.asarray(y_pred)
    N, C = scores.shape
    if N == 0:
        # No samples: avoid a division by zero.
        return 0.0

    # Vectorised count of true-label scores that reach the threshold.
    hits = np.count_nonzero(scores[truth == 1] >= t)
    return hits / N

def find_t_min(y_true, y_pred, K, rate, t):
    """Find the smallest visited threshold whose average stays at or below K.

    Starting from ``t``, the threshold is repeatedly multiplied by ``rate``
    for as long as ``compute_average`` at the new threshold is still ``<= K``.
    The last threshold that satisfied the condition is returned together
    with its average.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (N, C)
        Passed through unchanged to ``compute_average``.
    K : float
        Upper bound on the average; must be > 0.
    rate : float
        Multiplicative decay applied to the threshold; must lie in [0, 1].
    t : float
        Starting threshold.

    Returns
    -------
    (t_min, average) : tuple
        The selected threshold and ``compute_average`` evaluated at it. If the
        average at the starting ``t`` already exceeds K, no visited threshold
        satisfies the condition and the starting ``t`` and its average are
        returned as-is.

    Raises
    ------
    AssertionError
        If ``rate`` is outside [0, 1] or ``K`` is not positive.
    """
    assert rate <= 1
    assert rate >= 0
    assert K > 0

    t_min = t
    average = compute_average(y_true, y_pred, t_min)
    while average <= K:
        candidate = rate * t_min
        if candidate == t_min:
            # No further progress is possible (rate == 1, or the threshold has
            # decayed to 0): stop instead of looping forever.
            break
        candidate_average = compute_average(y_true, y_pred, candidate)
        if candidate_average > K:
            # The next step would violate the bound; keep the previous one.
            break
        t_min, average = candidate, candidate_average
    return t_min, average

def compute_accuracy(y_true, y_pred, t_min):
    """Fraction of all (sample, species) cells that are hits at threshold t_min.

    A "hit" is an entry where ``y_true == 1`` and the matching ``y_pred``
    value is ``>= t_min``. The number of hits is divided by ``N * C``, the
    total number of cells in ``y_pred``.

    Parameters
    ----------
    y_true : array-like of shape (N, C)
        Ground-truth indicator matrix (1 marks an observed species).
    y_pred : array-like of shape (N, C)
        Predicted scores, compared element-wise against ``t_min``.
    t_min : float
        Decision threshold.

    Returns
    -------
    float
        Hits divided by ``N * C``, or 0.0 when ``y_pred`` has no cells.
    """
    # Local import: this cell is defined before the notebook's numpy import cell.
    import numpy as np

    # Accept DataFrames / nested lists as well as ndarrays.
    truth = np.asarray(y_true)
    scores = np.asarray(y_pred)
    N, C = scores.shape
    if N * C == 0:
        # Empty input: avoid a division by zero.
        return 0.0

    # Vectorised count of true-label scores that reach the threshold.
    hits = np.count_nonzero(scores[truth == 1] >= t_min)
    return hits / (N * C)

def custom_metric(y_true, y_pred, K=2, rate=0.99, t=1):
    """Run the threshold search, then score the predictions at that threshold.

    Delegates to ``find_t_min`` (with ``K``, ``rate`` and the starting
    threshold ``t``) and evaluates ``compute_accuracy`` at the threshold it
    returns.

    Returns
    -------
    (t_min, average, accuracy) : tuple
        The threshold and average reported by ``find_t_min``, followed by the
        value of ``compute_accuracy`` at that threshold.
    """
    threshold, mean_hits = find_t_min(y_true, y_pred, K, rate, t)
    return threshold, mean_hits, compute_accuracy(y_true, y_pred, threshold)

# Trying dummy model

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Lift pandas' row-display cap: frames are rendered without row truncation.
pd.options.display.max_rows = None

In [2]:
!pwd

/Users/mikkelvaldemarkoch/code/MikkelValdemar/biodiversipy/notebooks


In [3]:
# Feature table: one row per coordinate (see the X.head() preview below).
X = pd.read_csv(
    '../raw_data/output/occurrences/coordinates_100k/coordinates_100k_features.csv'
)

# Label table: the coordinate columns are dropped right after loading so that
# only the per-species columns remain.
y = pd.read_csv(
    '../raw_data/gbif/occurrences_100k/occurrences_100k_encoded.csv'
).drop(columns=['latitude', 'longitude'])

In [12]:
X.head()

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
0,48.870487,10.335366,8.025001,8.516667,32.382763,683.21265,22.7,-3.6,26.300001,14.95,...,408,400,404,397,400,26,141,50,223,31
1,54.651142,9.772167,8.304167,6.225,28.555044,589.65497,20.6,-1.2,21.800001,15.333334,...,209,206,218,200,727,278,461,385,545,283
2,53.936638,11.247078,8.629167,6.741666,29.311594,617.0769,21.5,-1.5,23.0,16.366667,...,328,321,346,313,502,122,251,166,287,155
3,49.667747,8.651412,10.070833,8.091667,31.982872,660.0601,24.2,-1.1,25.300001,16.733334,...,532,516,556,515,480,29,158,71,246,32
4,48.826295,10.062125,8.4125,8.775,33.492367,670.89197,23.0,-3.2,26.2,15.25,...,0,0,0,0,0,0,0,0,0,0


In [13]:
y.head()

Unnamed: 0,10009934,10010617,10055902,10068154,10096320,10220564,10243708,10356062,10448252,10461857,...,9573476,9606396,9624496,9643454,9677963,9689880,9823072,9823314,9823570,9868510
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Preprocessing: standardise every feature column.
preproc_pipeline = make_pipeline(StandardScaler())

# Baseline classifier. LogisticRegression on its own only accepts a 1-D target,
# while y holds one indicator column per species, so it is wrapped in
# OneVsRestClassifier, which fits one binary model per label column.
pipe_baseline = make_pipeline(
    preproc_pipeline,
    OneVsRestClassifier(LogisticRegression()),
)

In [4]:
X.describe()

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
count,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,...,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0
mean,50.821125,9.641268,-1.863454e+36,-1.863454e+36,-1.863454e+36,-1.863454e+36,-1.863454e+36,-1.863454e+36,-1.863454e+36,-1.863454e+36,...,257.675818,253.507614,266.377507,249.996819,454.005008,77.5875,171.71831,106.639919,241.058653,83.932838
std,1.754502,2.121061,2.5101989999999997e+37,2.5101989999999997e+37,2.5101989999999997e+37,2.5101989999999997e+37,2.5101989999999997e+37,2.5101989999999997e+37,2.5101989999999997e+37,2.5101989999999997e+37,...,216.379231,213.26263,225.013437,210.054376,389.94921,130.345536,181.993919,148.930762,228.498647,134.535184
min,47.278587,5.867479,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49.460394,7.976818,8.475,7.616667,31.35593,601.2202,22.1,-2.8,23.6,14.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50.928417,9.357782,9.075,8.066667,32.44613,645.1185,23.0,-1.8,25.1,16.3,...,307.0,300.0,311.5,297.5,509.0,38.0,159.0,72.0,250.0,45.0
75%,52.111422,11.191132,9.616667,8.475,33.36454,674.1543,23.8,-1.0,26.0,17.03333,...,459.0,453.0,477.0,444.0,749.0,96.0,255.0,144.0,371.0,106.0
max,55.052055,14.998135,11.03333,10.38333,37.86438,753.814,26.0,1.1,28.9,19.11667,...,766.0,719.0,741.0,717.0,1752.0,2345.0,2389.0,2373.0,1770.0,2313.0


In [6]:
# Show the rows with an implausibly low bio_1; the output below shows these
# rows carry about -3.4e38 in the bio_* columns.
# NOTE(review): -1000 looks like an arbitrary cut-off for that placeholder — confirm.
X[X['bio_1'] < -1000]

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
75,54.260010,8.852663,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
384,54.398598,10.220761,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
588,54.338844,13.680669,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
788,54.402565,13.615970,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
1063,53.790783,7.901619,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77582,54.415115,10.232071,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
77708,54.134101,13.830972,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
77817,53.705471,7.137551,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,0,0,0,0,0,0,0,0,0,0
77854,54.763778,8.287171,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,...,251,245,247,250,956,587,613,720,560,642


In [None]:

# Mask every feature value below -273 so the ~-3.4e38 placeholder entries seen
# in X are excluded. The original referenced `df`, which no visible cell
# defines; X is the frame being inspected here.
# NOTE(review): the result is a numpy MaskedArray, so column labels are lost — confirm
# that is acceptable for downstream use.
df_masked = np.ma.masked_where((-273 > X), X)

In [30]:
# Fit the preprocessing pipeline on X and re-wrap the scaled array as a frame
# with the original column labels.
# NOTE(review): X still contains the ~-3.4e38 placeholder rows at this point, which
# dominate the mean/std of the affected columns (see the summary below).
X_scaled = pd.DataFrame(
    preproc_pipeline.fit_transform(X),
    columns=X.columns,
)
X_scaled.describe()

Unnamed: 0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
count,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,...,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0,78274.0
mean,1.269961e-15,-2.556262e-16,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,2.677902e-18,...,6.626673e-17,-4.883767e-17,-3.1953270000000003e-17,8.351423e-18,-7.879386e-17,2.015235e-17,1.8790700000000003e-17,-3.485812e-17,-1.3434900000000001e-17,-1.8246040000000002e-17
std,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,...,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006
min,-2.019126,-1.77921,-13.47059,-13.47059,-13.47059,-13.47059,-13.47059,-13.47059,-13.47059,-13.47059,...,-1.19086,-1.188719,-1.183837,-1.19016,-1.164274,-0.5952486,-0.9435448,-0.7160415,-1.054974,-0.6238768
25%,-0.77557,-0.78473,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,...,-1.19086,-1.188719,-1.183837,-1.19016,-1.164274,-0.5952486,-0.9435448,-0.7160415,-1.054974,-0.6238768
50%,0.061153,-0.1336538,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,...,0.2279539,0.2180067,0.2005337,0.2261485,0.1410321,-0.3037139,-0.06988361,-0.2325922,0.0391311,-0.2893896
75%,0.7354255,0.7307068,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,...,0.9304288,0.9354366,0.93605,0.9235914,0.7565008,0.1412601,0.4576099,0.250857,0.5686781,0.1640263
max,2.411486,2.525576,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,0.07423578,...,2.349243,2.182733,2.109321,2.223263,3.328647,17.39551,12.18335,15.21764,6.691292,16.56876


In [None]:
%%time
# Fit the baseline pipeline on the full feature and label frames (no hold-out split).
pipe_baseline.fit(X, y)

In [None]:
# 5-fold cross-validation of the baseline pipeline on the full data set.
# NOTE(review): `rmsle` is neither defined nor imported in the visible cells — confirm
# where this scorer comes from and that it suits a classification target.
cv_results = cross_validate(pipe_baseline, X=X, y=y, cv=5, scoring=rmsle)

In [None]:
def model(X, y, metrics):
    """Build and compile a small dense network for the given data.

    Architecture: a Normalization layer adapted on ``X``, one hidden Dense
    layer of 10 ReLU units with an L1/L2 activity regulariser, Dropout(0.4),
    and a softmax output layer with ``y.shape[1]`` units.

    Parameters
    ----------
    X : array-like
        Feature matrix; used to adapt the normalisation layer and to size the
        hidden layer's input (``X.shape[1]``).
    y : array-like
        Target matrix; only ``y.shape[1]`` is read, to size the output layer.
    metrics : list or None
        Metrics handed to ``compile``. ``None`` falls back to ``['accuracy']``,
        the value that was previously hard-coded.

    Returns
    -------
    The compiled (untrained) Sequential model.

    Notes
    -----
    NOTE(review): Normalization, Sequential, Dense, Dropout and l1_l2 are not
    imported in the visible cells — confirm the Keras imports.
    NOTE(review): softmax with categorical cross-entropy assumes a single positive
    class per row — confirm that matches the label encoding.
    """
    normalization_layer = Normalization()
    # Learn the normalisation statistics from the features before building the stack.
    normalization_layer.adapt(X)

    # Named `network` so the local does not shadow this function's own name.
    network = Sequential([
        normalization_layer,
        Dense(10,
              input_dim=X.shape[1],
              activation='relu',
              activity_regularizer=l1_l2(l1=0.005, l2=0.0005)),
        Dropout(0.4),
        Dense(y.shape[1], activation='softmax'),
    ])

    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        # Honour the caller-supplied metrics instead of silently ignoring them.
        metrics=['accuracy'] if metrics is None else metrics)

    return network


In [7]:
y

Unnamed: 0,latitude,longitude,10009934,10010617,10055902,10068154,10096320,10220564,10243708,10356062,...,9573476,9606396,9624496,9643454,9677963,9689880,9823072,9823314,9823570,9868510
0,48.870487,10.335366,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,54.651142,9.772167,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,53.936638,11.247078,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,49.667747,8.651412,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,48.826295,10.062125,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78269,50.498398,6.624358,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78270,53.321644,10.501041,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78271,53.089754,10.565015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78272,52.254784,12.928824,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
