In [None]:
import tensorflow_decision_forests as tfdf
import tensorflow_probability as tfp
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Input, Concatenate
tfd = tfp.distributions
tfpl = tfp.layers
from scipy.stats import pearsonr

def ubrmse(ground,pred):
    bias = np.mean(ground-pred)
    rmse = np.sqrt(np.mean((ground-pred)**2))
    ubrmse = np.sqrt(rmse**2-bias**2)
    return round(ubrmse,4)

def nll(y_true, y_pred):
    return -y_pred.log_prob(y_true)

# Training

### RF

For training the RF ensemble, please use 2 - TFDF_Train.ipynb

### Non-RF

In [None]:
all_df = pd.read_pickle('dataframes/all_df_final.pkl')
all_df = normalize_df(all_df)

Helper Functions

In [None]:
def prior(kernel_size,bias_size, dtype=None):
    n = kernel_size + bias_size
    prior_model = Sequential([
        tfpl.DistributionLambda(
            lambda t: tfd.MultivariateNormalDiag(loc=tf.zeros(n),scale_diag=tf.ones(n))
        )
    ])
    return prior_model

def posterior(kernel_size,bias_size, dtype=None):
    n = kernel_size + bias_size
    posterior_model = Sequential([
        tfpl.VariableLayer(tfpl.MultivariateNormalTriL.params_size(n),dtype=dtype),
        tfpl.MultivariateNormalTriL(n)
    ])
    return posterior_model
def prior_trainable(kernel_size, bias_size=0, dtype=None):
    n = kernel_size + bias_size
    return tf.keras.Sequential([
      tfp.layers.VariableLayer(n, dtype=dtype,name='prior_var'),
      tfp.layers.DistributionLambda(lambda t: tfd.Independent(
          tfd.Normal(loc=t, scale=.5),
          reinterpreted_batch_ndims=1),name='prior_lam'),
    ])


def posterior_mean_field(kernel_size, bias_size=0, dtype=None):
    n = kernel_size + bias_size
    #c = np.log(np.expm1(1.))
    return tf.keras.Sequential([
      tfp.layers.VariableLayer(2 * n, dtype=dtype,name='post_var'),
      tfp.layers.DistributionLambda(lambda t: tfd.Independent(
          tfd.Normal(loc=t[..., :n],
                     scale=tf.nn.softplus(t[..., n:])),
          reinterpreted_batch_ndims=1),name='post_lam'),
    ])

def gen_model(inputs=7,train_len=100):
    model = keras.Sequential([keras.layers.Input(inputs,name='Input'),
                          keras.layers.Dense(inputs,'sigmoid',name='Dense_Layer'),
                          tfpl.DenseVariational(
                          units=tfpl.IndependentNormal.params_size(1),
                          make_prior_fn=prior_trainable,
                          make_posterior_fn=posterior_mean_field,
                          kl_weight=1/train_len,
                          name='Dense_Variational'),  
                          tfpl.IndependentNormal((1,),name='Output_Normal')])
    return model


def dense_model(inputs=7):
    model = keras.Sequential([keras.layers.Input(inputs,name='Input'),  
                          keras.layers.Dense(inputs,activation='sigmoid'),
                          keras.layers.Dense(6,activation='sigmoid'),  
                          keras.layers.Dense(2,activation=None),
                          tfpl.IndependentNormal((1,),name='Output_Normal')])
    
    return model

def create_weights(criteria,df):
    new_df = df.copy()
    class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=criteria.unique(),
                                                 y=criteria)
    weight_dict = {}
    for idx,i in enumerate(criteria.unique()):
        weight_dict[i] = class_weights[idx]
    for text in criteria.unique():
        new_df.loc[criteria == text,'weight'] = weight_dict[text]
        
    return new_df.pop('weight')

### Prob Ensembles

Cross Train Loop

In [None]:
def cross_train_prob(k):
    test_df = pd.read_csv(f'dataframes/crossfold/{k}.csv').set_index('Unnamed: 0')
    train_df = all_df.loc[lambda d: ~d.index.isin(test_df.index)]
    variables = ['sand','clay','ph','dem','ET','NDVI_500','LST_11','precip','smap']
    x = train_df.loc[:,variables]
    y = train_df['in_situ']

    val_x = test_df.loc[:,variables]
    val_y = test_df.in_situ
    histories = {}
    save_dir = f'Models/crossfold/prob/{k}/'

    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
                                        monitor='val_loss',
                                        factor=0.5,
                                        patience=15,
                                        verbose=0,
                                        mode='auto',
                                        min_delta=0.01,#0.0005
                                        cooldown=0,
                                        min_lr=0.0005)

    early_stopping = tf.keras.callbacks.EarlyStopping(
                                        monitor='val_loss',
                                        patience=31,
                                        min_delta=0.0005)#0.0002

    for att in ['texture','sand','clay','koep','mcd12','ph']:
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
                    '{0}{1}'.format(save_dir,att),
                    monitor='val_loss',
                    verbose=0,
                    save_best_only=True,
                    save_weights_only=False,
                    mode='min',
                    save_freq='epoch',
                )
        callbacks = [checkpoint,reduce_lr,early_stopping]
        weights = create_weights(train_df[att],train_df)
        val_weights = create_weights(test_df[att],test_df)
        val_data = (val_x,val_y.values,val_weights)
        model = gen_model(inputs=x.shape[-1],train_len=x.shape[0])
        model.compile(loss=nll, optimizer=keras.optimizers.Adam(learning_rate=0.1),weighted_metrics=[])
        history = model.fit(x=x,y=y,epochs=500,batch_size=8192*2,sample_weight=weights,validation_data=val_data,callbacks=callbacks)
        histories[att] = history

    checkpoint = tf.keras.callbacks.ModelCheckpoint(
                '{}free'.format(save_dir),
                monitor='val_loss',
                verbose=0,
                save_best_only=True,
                save_weights_only=False,
                mode='min',
                save_freq='epoch',
            )

    callbacks = [checkpoint,reduce_lr,early_stopping]
    val_data = (val_x,val_y.values)
    model = gen_model(inputs=x.shape[-1],train_len=x.shape[0])
    model.compile(loss=nll, optimizer=keras.optimizers.Adam(learning_rate=0.1))
    history = model.fit(x=x,y=y,epochs=500,batch_size=8192*2,validation_data=val_data,callbacks=callbacks)

In [None]:
## Dense Train

def cross_train_dense(k):
    test_df = pd.read_csv(f'dataframes/crossfold/{k}.csv').set_index('Unnamed: 0')
    train_df = all_df.loc[lambda d: ~d.index.isin(test_df.index)]
    variables = ['sand','clay','ph','dem','ET','NDVI_500','LST_11','precip','smap']
    x = train_df.loc[:,variables]
    y = train_df['in_situ']

    val_x = test_df.loc[:,variables]
    val_y = test_df.in_situ
    histories = {}
    save_dir = f'Models/crossfold/dense/{k}/'

    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
                                        monitor='val_loss',
                                        factor=0.5,
                                        patience=15,
                                        verbose=0,
                                        mode='auto',
                                        min_delta=0.01,#0.0005
                                        cooldown=0,
                                        min_lr=0.0005)

    early_stopping = tf.keras.callbacks.EarlyStopping(
                                        monitor='val_loss',
                                        patience=31,
                                        min_delta=0.0005)#0.0002

    for att in ['texture','sand','clay','koep','mcd12','ph']:
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
                    '{0}{1}'.format(save_dir,att),
                    monitor='val_loss',
                    verbose=0,
                    save_best_only=True,
                    save_weights_only=False,
                    mode='min',
                    save_freq='epoch',
                )
        callbacks = [checkpoint,reduce_lr,early_stopping]
        weights = create_weights(train_df[att],train_df)
        val_weights = create_weights(test_df[att],test_df)
        val_data = (val_x,val_y.values,val_weights)
        model = dense_model(inputs=x.shape[-1])
        model.compile(loss=nll, optimizer=keras.optimizers.Adam(learning_rate=0.1),weighted_metrics=[])
        history = model.fit(x=x,y=y,epochs=500,batch_size=8192*2,sample_weight=weights,validation_data=val_data,callbacks=callbacks)
        histories[att] = history

    checkpoint = tf.keras.callbacks.ModelCheckpoint(
                '{}free'.format(save_dir),
                monitor='val_loss',
                verbose=0,
                save_best_only=True,
                save_weights_only=False,
                mode='min',
                save_freq='epoch',
            )

    callbacks = [checkpoint,reduce_lr,early_stopping]
    val_data = (val_x,val_y.values)
    model = gen_model(inputs=x.shape[-1],train_len=x.shape[0])
    model.compile(loss=nll, optimizer=keras.optimizers.Adam(learning_rate=0.1))
    history = model.fit(x=x,y=y,epochs=500,batch_size=8192*2,validation_data=val_data,callbacks=callbacks)

In [None]:
for k in range(10):
    cross_train(k)
    cross_train_dense(k)

### WDL Ensemble

In [None]:
cat_dict = {'texture':13,'mcd12':18,'koep':32}

def wide_inputs(df,spatial=False):
    if spatial:
        dnn_in = df.loc[:,['sand','clay','ph','dem','NDVI_250','LST_11','ET','precip','smap']]
    else:
        dnn_in = df.loc[:,['sand','clay','ph','dem','NDVI_500','LST_11','ET','precip','smap']]
    wide_in = dnn_in
    cats = df.loc[:,['texture','mcd12','koep']]
    cat_embeds = []
    for cat in cats.columns:
        embedding = np.eye(cat_dict[cat])[cats[cat].astype(int)]
        cat_embeds.append(np.asarray(embedding))

    embeddings = np.concatenate(cat_embeds,axis=1)
    dnn_in = np.concatenate([dnn_in,embeddings],axis=1)
    return wide_in.values.astype(float),dnn_in.astype(float)

from tensorflow.keras.experimental import LinearModel, WideDeepModel
def initialize_models():
    linear_model = LinearModel()
    dnn_model = keras.Sequential([keras.layers.Dense(units=128,activation='sigmoid',name='dnn_1'),
                                 keras.layers.Dense(units=64,activation='sigmoid',name='dnn_2'),
                                 keras.layers.Dense(units=1,activation='sigmoid',name='dnn_3')])
    return linear_model,dnn_model

linear_model, dnn_model = initialize_models()

model = WideDeepModel(linear_model,dnn_model)
model.compile(optimizer=['sgd',keras.optimizers.Adam(learning_rate=0.001)],loss='mse')

In [None]:
## WDL Train
def cross_train_wdl(k):
    test_df = pd.read_csv(f'dataframes/crossfold/{k}.csv').set_index('Unnamed: 0')
    train_df = all_df.loc[lambda d: ~d.index.isin(test_df.index)]
    variables = ['sand','clay','ph','dem','ET','NDVI_500','LST_11','precip','smap']
    wide_x,dnn_x = wide_inputs(train_df)
    wide_y,dnn_y = wide_inputs(test_df)
    y = train_df.in_situ
    val_y = test_df.in_situ
    val_data = [[wide_y,dnn_y],val_y]
    save_dir = f'Models/crossfold/wdl/{k}/'

    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
                                        monitor='val_loss',
                                        factor=0.5,
                                        patience=15,
                                        verbose=0,
                                        mode='auto',
                                        min_delta=0.01,#0.0005
                                        cooldown=0,
                                        min_lr=0.0005)

    early_stopping = tf.keras.callbacks.EarlyStopping(
                                        monitor='val_loss',
                                        patience=31,
                                        min_delta=0.0005)#0.0002

    for att in ['texture','sand','clay','koep','mcd12','ph']:
        linear_model,dnn_model = initialize_models()
        
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
                    '{0}{1}'.format(save_dir,att),
                    monitor='val_loss',
                    verbose=0,
                    save_best_only=True,
                    save_weights_only=False,
                    mode='min',
                    save_freq='epoch',
                )
        callbacks = [checkpoint,reduce_lr,early_stopping]
        weights = create_weights(train_df[att],train_df)
        val_weights = create_weights(test_df[att],test_df)
        val_data =  [[wide_y,dnn_y],val_y,val_weights]
        model = WideDeepModel(linear_model,dnn_model)
        for lr in [0.001,0.0005,0.0001]:
            model.compile(optimizer=['sgd',keras.optimizers.Adam(learning_rate=lr)][1],loss='mse',weighted_metrics=[])
            history = model.fit([wide_x,dnn_x],y,epochs=500,batch_size=8192*2,sample_weight=weights,validation_data=val_data,callbacks=callbacks)


    checkpoint = tf.keras.callbacks.ModelCheckpoint(
                '{}free'.format(save_dir),
                monitor='val_loss',
                verbose=0,
                save_best_only=True,
                save_weights_only=False,
                mode='min',
                save_freq='epoch',
            )

    linear_model,dnn_model = initialize_models()
    callbacks = [checkpoint,reduce_lr,early_stopping]
    val_data = [[wide_y,dnn_y],val_y]
    model = WideDeepModel(linear_model,dnn_model)
    for lr in [0.001,0.0005,0.0001]:
        model.compile(optimizer=['sgd',keras.optimizers.Adam(learning_rate=lr)][1],loss='mse')
        history = model.fit([wide_x,dnn_x],y,epochs=500,batch_size=8192*2,validation_data=val_data,callbacks=callbacks)

In [None]:
for k in range(10):
    cross_train_wdl(k)

# Predictions

Helper Functions

In [None]:
def ubrmse(ground,pred):
    bias = np.mean(ground-pred)
    rmse = np.sqrt(np.mean((ground-pred)**2))
    ubrmse = np.sqrt(rmse**2-bias**2)
    return round(ubrmse,4)
from scipy.stats import pearsonr

def to_rf(df):
    df = df.fillna(-1)
    for col in df.columns:
        if col in ['dem','smap','precip','in_situ']:
            df[col] = df[col].astype('float32')
        elif col =='date':
            continue
        else:
            df[col] = df[col].astype('int16')
    return df

def batch_dict(df,spatial):
    test_dict = {}
    lst,ndvi = spatial
    for var in ['sand','clay','ph','dem','ET',ndvi,lst,'smap', 'precip']:
        if var == ndvi:
            test_dict['NDVI_500'] = df[var].values
        elif var == lst:
            test_dict['LST_11'] = df[var].values
        else:
            test_dict[var] = df[var].values
    return test_dict

def rf_predict(df,model_dict,spatial):
    batched_dict = batch_dict(df,spatial)
    preds = []
    for i in model_dict.values():
        preds.append(i.predict_on_batch(batched_dict))
    preds = np.asarray(preds)
    predictions = np.mean(preds,axis=0)
    return predictions.squeeze()

def ens_pred(df,model_dict):
    preds = []
    for i in model_dict.values():
        preds.append(i(df.values).mean().numpy().squeeze())
    
    prediction = np.mean(np.asarray(preds),axis=0).squeeze()
    return prediction

def wdl_ens(df,model_dict,spatial=False):
    wide_in,dnn_in = wide_inputs(df)
    preds = []
    for i in model_dict.values():
        preds.append(i((wide_in,dnn_in)).numpy().squeeze())
    return np.mean(np.asarray(preds),axis=0)


In [None]:
from tqdm import tqdm

cross_eval_df = pd.DataFrame()
variables = ['sand','clay','ph','dem','ET','NDVI_500','LST_11','precip','smap']

for k in tqdm(range(10)):
    prob_dict = {}
    dense_dict = {}
    wdl_dict = {}
    for att in ['sand','clay','koep','mcd12','free','ph','texture']:
        prob_dict[att] = tf.keras.models.load_model(f'Models/crossfold/prob/{k}/{att}_run',custom_objects={'nll':nll})
        dense_dict[att] = tf.keras.models.load_model(f'Models/crossfold/dense/{k}/{att}_run',custom_objects={'nll':nll})
        wdl_dict[att] = tf.keras.models.load_model(f'Models/crossfold/wdl/{k}/{att}_run',custom_objects={'nll':nll})
    cross_df = pd.read_csv(f'Datasets/Crossfold_Datasets/{k}.csv').set_index('Unnamed: 0')
    val_df = cross_df.loc[:,variables]
    wdl_pred = wdl_ens(cross_df,wdl_dict)
    prob_pred = ens_pred(val_df,prob_dict)
    dense_pred = ens_pred(val_df,dense_dict)
    val_pred_df = cross_df[['in_situ','smap']]
    val_pred_df.loc[:,'d_pred'] = dense_pred
    val_pred_df.loc[:,'p_pred'] = prob_pred
    val_pred_df.loc[:,'wdl_pred'] = wdl_pred
    cross_eval_df = pd.concat([cross_eval_df,val_pred_df])
    

In [None]:
import os
all_df = pd.read_pickle('Datasets/all_df_final.pkl')
rf_eval_df = pd.DataFrame()
for k in tqdm(range(10)):
    rf_dict = {}
    for i in os.listdir(f'Models/crossfold/rf/{k}/'):
        if i in rf_dict.keys():
            continue
        try:
            int(i)
        except:
            continue
        rf_dict[i] = tf.keras.models.load_model(f'Models/crossfold/rf/{k}/{i}/')
    cross_df = pd.read_csv(f'Datasets/Crossfold_Datasets/{k}.csv').set_index('Unnamed: 0')
    cross_df = all_df.loc[cross_df.index.unique()]
    cross_rf = to_rf(cross_df.drop('date',axis=1))
    rf_pred = rf_predict(cross_rf,rf_dict,['NDVI_500','LST_11'])
    val_pred_df = cross_rf[['in_situ','smap']]
    val_pred_df['rf_pred'] = rf_pred
    rf_eval_df = pd.concat([rf_eval_df,val_pred_df])
cross_eval_df.loc[:,'rf_pred'] = rf_eval_df.rf_pred

In [None]:
def station_metrics(df,pred,situ='in_situ'):
    let = pred.split('_')[0]
    stat_dicts = {}
    for stat in df.index.unique():
        stat_df = df.loc[stat]
        if type(stat_df) == pd.core.series.Series:
            continue
        stat_r = pearsonr(stat_df[situ],stat_df[pred])[0]
        stat_ub = ubrmse(stat_df[situ],stat_df[pred])
        stat_b = round(np.mean(stat_df[pred]-stat_df[situ]),3) 
        stat_dicts[stat] = {'{}_r'.format(let):stat_r,
                            '{}_ub'.format(let):stat_ub,
                            '{}_b'.format(let):stat_b}
    return pd.DataFrame().from_dict(stat_dicts,orient='index')

In [None]:
prob_mets = station_metrics(cross_eval_df,'p_pred')
dense_mets = station_metrics(cross_eval_df,'d_pred')
smap_mets = station_metrics(cross_eval_df,'smap')
wdl_mets = station_metrics(cross_eval_df,'wdl_pred')
rf_mets = station_metrics(cross_eval_df,'rf_pred')

In [None]:
mets = pd.concat([smap_mets,prob_mets,dense_mets,wdl_mets,rf_mets],axis=1)

In [None]:
mets.to_pickle('Datasets/cross_eval_mets.csv')

In [62]:
series = []
for var in ['_r','_ub','_b']:
    sub = mets[[i for i in mets.columns if var in i]]
    avg = sub.mean()
    avg.name = var
    avg = avg.to_frame().T
    avg.columns = ['SMAP','Prob','Dense','Wens','RF']
    series.append(avg)
avgs = pd.concat(series)
avgs.index = ['R','ubRMSE','Bias']
avgs.round(3)

Unnamed: 0,SMAP,Prob,Dense,Wens,RF
R,0.562,0.621,0.639,0.611,0.572
ubRMSE,0.065,0.06,0.058,0.06,0.065
Bias,0.023,-0.017,0.0,-0.003,0.004
