In [1]:
import pandas as pd
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow.keras.metrics
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from MLscores import calc_metrics, metrics_dict, cmvals, recall, hybridrecall
import tensorflow.keras.backend as K
import tensorflow as tf
from functools import partial
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
#import eli5
#from eli5.sklearn import PermutationImportance

In [2]:
def create_NN_model(params, X):
    # define model
    model = Sequential()
    n_features = X.shape[1]
    intlayers = int(params['n_internal_layers'][0])
    model.add(Dense(params['n_internal_layers'][1]['layer_1_' + str(intlayers) + '_nodes'], activation='relu'))#, input_shape=(n_features,))) 
    if not params['dropout'] is None:
        model.add(Dropout(params['dropout']))
    for i in range(2, intlayers + 2):
        model.add(Dense(int(params['n_internal_layers'][1]['layer_' + str(i) + '_' + str(intlayers) + '_nodes']),
                        activation='relu', )) #kernel_initializer=initializer))
        if not params['dropout'] is None:
            model.add(Dropout(params['dropout']))
    model.add(Dense(2, activation='softmax'))

    # compile the model
    if params['optimizer']['name']=='Adam':
        if params['optimizer']['adam_params'] is None:
            opt = Adam()
        else:
            opt = Adam(learning_rate=params['optimizer']['adam_params']['learning_rate_adam'], beta_1=params['optimizer']['adam_params']['beta_1'],
                       beta_2=params['optimizer']['adam_params']['beta_2'],amsgrad=params['optimizer']['adam_params']['amsgrad'])
    elif params['optimizer']['name']=='SGD':
        opt = SGD(learning_rate=params['optimizer']['learning_rate_SGD'])

    if params['metric'] == 'accuracy':
        metrics = ['accuracy']
    elif params['metric'] == 'sparse':
        metrics = [tensorflow.metrics.SparseCategoricalAccuracy()]
    elif params['metric'] == 'tn':
        metrics = [tensorflow.metrics.TrueNegatives(),tensorflow.metrics.TruePositives()]
    if 'loss' in params and params['loss'] == 'unbalanced':
        lossf=unbalanced_loss
    else:
        lossf='sparse_categorical_crossentropy'
    model.compile(optimizer=opt, loss=lossf, metrics=metrics)  # , AUC(multi_label=False)])
    return model

In [3]:
def load_dataset(trfiles, featuredrop=[], debug=True, returnid=False):
    # dsfile = 'dataset_ndvi_lu.csv'
    domdircheck = 'dom_dir'
    dirmaxcheck = 'dir_max'
    corinecheck = 'Corine'
    monthcheck = 'month'
    wkdcheck = 'wkd'
    firedatecheck = 'firedate'
    X_columns = ['max_temp', 'min_temp', 'mean_temp', 'res_max', dirmaxcheck, 'dom_vel', domdircheck,
                 'rain_7days', corinecheck, 'Slope', 'DEM', 'Curvature', 'Aspect', 'ndvi', 'evi', 'lst_day',
                 'lst_night', monthcheck, wkdcheck,
                 'mean_dew_temp', 'max_dew_temp', 'min_dew_temp','frequency', 'f81', 'x', 'y']
    y_columns = ['fire']
    # if not os.path.exists(os.path.join(dsetfolder, dsready)):
    if isinstance(trfiles, list):
        if debug:
            print("Loading full dataset ...")
        dflist=[]
        for dsfile in trfiles:
            if debug:
                print("Loading dataset file %s" % dsfile)
            dflist.append(pd.read_csv(dsfile))
        df = pd.concat(dflist)
    else:
        dsfile = trfiles
    df = pd.read_csv(dsfile)
    X_columns_upper = [c.upper() for c in X_columns]
    newcols = [c for c in df.columns if
               c.upper() in X_columns_upper or any([cX in c.upper() for cX in X_columns_upper])]
    X_columns = newcols
    #corine_col, newcols = check_categorical(df, corinecheck, newcols)
    #dirmax_col, newcols = check_categorical(df, dirmaxcheck, newcols)
    #domdir_col, newcols = check_categorical(df, domdircheck, newcols)
    #month_col, newcols = check_categorical(df, monthcheck, newcols)
    #wkd_col, newcols = check_categorical(df, wkdcheck, newcols)

    firedate_col = [c for c in df.columns if firedatecheck.upper() in c.upper()][0]
    X, y, groupspd = prepare_dataset(df, X_columns, y_columns, firedate_col)
    print("Ignored columns from csv %s"%([c for c in df.columns if c not in X.columns]))
    idpd = df['id']
    df = None
    X_columns = X.columns
    if len(featuredrop) > 0:
        X = X.drop(columns=[c for c in X.columns if any([fd in c for fd in featuredrop])])
    print("Dropped columns %s"%(list(set(X_columns)-set(X.columns))))

    #if debug:
    #    print("X helth check %s"%X.describe())
    #    print("y helth check %s"%y.describe())
    if returnid:
        return X, y, groupspd, idpd
    else:
        return X, y, groupspd

In [4]:
def prepare_dataset(df, X_columns, y_columns, firedate_col):
    df = df[X_columns+y_columns+[firedate_col]]
    print('before nan drop: %d' % len(df.index))
    df = df.dropna()
    print('after nan drop: %d' % len(df.index))
    df = df.drop_duplicates(keep='first')
    df.reset_index(inplace=True, drop=True)
    print('after dup. drop: %d' % len(df.index))
    print('renaming "x": "xpos", "y": "ypos"')
    X_unnorm, y_int = df[X_columns], df[y_columns]
    X_unnorm = X_unnorm.rename(columns={'x': 'xpos', 'y': 'ypos'})
    # X = normdataset.normalize_dataset(X_unnorm, aggrfile='stats/featurestats.json')
    X = X_unnorm
    y = y_int
    groupspd = df[firedate_col]
    return X, y, groupspd

In [5]:
#params={'ES_mindelta': 0.001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': {0: 1, 1: 10}, 'dropout': 0.1, 'feature_drop': ('dir_max', 'dom_dir', 'month', 'wkd'), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 70}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
#params={'ES_mindelta': 0.0001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': {0: 1, 1: 5}, 'dropout': 0.3, 'feature_drop': ('dir_max', 'dom_dir', 'month', 'wkd'), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 200.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
params = {}
params['nn_nh5'] = {'ES_mindelta': 0.0001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': {0: 1, 1: 10}, 'dropout': None, 'feature_drop': ('dir_max', 'dom_dir', 'month', 'wkd'), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 50.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
params['nnd_nh5'] = {'ES_mindelta': 0.0001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': {0: 1, 1: 10}, 'dropout': 0.3, 'feature_drop': ('dir_max', 'dom_dir', 'month', 'wkd'), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 400.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
params['nn_rh5'] = {'ES_mindelta': 0.0001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': {0: 1, 1: 5}, 'dropout': None, 'feature_drop': ('dir_max', 'dom_dir', 'month', 'wkd'), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 50.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
params['nnd_nh2'] = {'ES_mindelta': 0.0001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': {0: 1, 1: 5}, 'dropout': 0.3, 'feature_drop': ('dir_max', 'dom_dir', 'month', 'wkd'), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 200.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}

In [6]:
#X, y, g=load_dataset('/home/aapostolakis/Documents/ffpdata/newcrossval/datasets/randomnofire/old_random_new_features_norm.csv',\
#                    featuredrop=params['feature_drop'])
X, y, g=load_dataset('/home/aapostolakis/Documents/ffpdata/newcrossval/datasets/randomnofire/oldrandomnewfeat.csv',)
                      #featuredrop=['corine','dir_max','dom_dir','wkd','month'])

before nan drop: 25793
after nan drop: 25757
after dup. drop: 23813
renaming "x": "xpos", "y": "ypos"
Ignored columns from csv ['id', 'firedate', 'Unnamed: 0', 'Unnamed: 0.1', 'fire', 'x', 'y', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1']
Dropped columns []


In [7]:
class MakeModel(object):

    def __init__(self, X=None, y=None):
        pass

    def predict(self, X):
        y_pred = self.model.predict(X)
        return y_pred
        #return np.argmax(y_pred, axis=1)
    
    def fit(self, X, y):
        skwrapped_model = KerasClassifier(build_fn=creatennmodel,
                                          train_input=X,
                                          epochs=2,
                                          batch_size=512,
                                          #validation_split=1-TRAIN_TEST_SPLIT,
                                          verbose=1)
        self.model = skwrapped_model
        self.model.fit(X, y)
        return self.model

In [8]:
def create_sklearn_model(params):
    if params['algo']=='RF':
        model = RandomForestClassifier(max_depth=params['max_depth'], n_estimators=params['n_estimators'], min_samples_split=params['min_samples_split'],
                                       min_samples_leaf=params['min_samples_leaf'],criterion=params['criterion'],max_features=params['max_features'],
                                       bootstrap=params['bootstrap'], class_weight=params['class_weight'], n_jobs = 8
                                       )
    if params['algo']=='ET':
        model = ExtraTreesClassifier(max_depth=params['max_depth'], n_estimators=params['n_estimators'], min_samples_split=params['min_samples_split'],
                                       min_samples_leaf=params['min_samples_leaf'],criterion=params['criterion'],max_features=params['max_features'],
                                       bootstrap=params['bootstrap'], class_weight=params['class_weight'], n_jobs = 8
                                       )
    if params['algo']=='XGB':
        model = XGBClassifier(max_depth=int(params['max_depth']), n_estimators=params['n_estimators'], subsample=params['subsample'],\
                              reg_alpha=params['alpha'], gamma=params['gamma'], reg_lambda=params['lambda'],\
                              scale_pos_weight=params['scale_pos_weight'], n_jobs=8)

    return model

In [9]:
models={}
for paramset in params:
    creatennmodel = partial(create_NN_model, params[paramset], X)
    models[paramset] = KerasClassifier(build_fn=creatennmodel, batch_size=params[paramset]['batch_size'], epochs=300, verbose=0,)

In [10]:
paramssk={}
#RF best
paramssk['RF']={'n_estimators': 250, 'min_samples_split': 180, 'min_samples_leaf': 40, 'max_features': 41, 'max_depth': 20, 'criterion': 'entropy', 'class_weight': {0: 1, 1: 9}, 'bootstrap': True}
#ET best
paramssk['ET']={'n_estimators': 600, 'min_samples_split': 100, 'min_samples_leaf': 10, 'max_features': 53, 'max_depth': 22, 'criterion': 'gini', 'class_weight': {0: 1, 1: 10}, 'bootstrap': True}
#XG
paramssk['XGB']={'subsample': 0.5, 'scale_pos_weight': 1000, 'n_estimators': 800, 'max_depth': 4, 'lambda': 17, 'gamma': 10, 'alpha': 100}
for skalgo in paramssk:
    paramssk[skalgo]['algo']=skalgo
    models[skalgo] = create_sklearn_model(paramssk[skalgo])

In [11]:
models

{'nn_nh5': <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x7f9a666bed60>,
 'nnd_nh5': <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x7f9a666be310>,
 'nn_rh5': <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x7f9a666be3a0>,
 'nnd_nh2': <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x7f9a666be430>,
 'RF': RandomForestClassifier(class_weight={0: 1, 1: 9}, criterion='entropy',
                        max_depth=20, max_features=41, min_samples_leaf=40,
                        min_samples_split=180, n_estimators=250, n_jobs=8),
 'ET': ExtraTreesClassifier(bootstrap=True, class_weight={0: 1, 1: 10}, max_depth=22,
                      max_features=53, min_samples_leaf=10,
                      min_samples_split=100, n_estimators=600, n_jobs=8),
 'XGB': XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=None, colsample_bytree=None, gamma=10,
               gpu_id=No

In [12]:
#models1={'RF':models['RF'], 'ET':models['ET']}
for modname in models:
    print(modname)
    models[modname].fit(X,y)

nn_nh5
nnd_nh5
nn_rh5
nnd_nh2
RF


  models[modname].fit(X,y)


ET


  models[modname].fit(X,y)


XGB


  return f(*args, **kwargs)


In [13]:
results_lists={}
for modname in models:
    print(modname)
    #models[modname].fit(X,y)
    r=permutation_importance(models[modname], X, y, n_repeats=3,random_state=0, scoring='roc_auc')
    results_lists[modname]=[]
    for i in r.importances_mean.argsort()[::-1]:
        #if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        if not any(c in X.columns[i] for c in ['corine','dir_max','dom_dir','wkd','month']):
            results_dict={}
            results_dict['rank '+modname]=X.columns[i]
            results_dict['perm. imp.'+modname]=r.importances_mean[i]
            results_lists[modname]+=[results_dict]

nn_nh5






















nnd_nh5
























nn_rh5






















nnd_nh2
























RF
ET
XGB


In [14]:
#r=permutation_importance(sknnmodel, X, y, n_repeats=3,random_state=0)

In [15]:
#for i in r.importances_mean.argsort()[::-1]:
    #if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
#    print(f"{X.columns[i]:<17}"
#        f"{r.importances_mean[i]:.3f}"
#        f" +/- {r.importances_std[i]:.3f}")

In [16]:
results_pd=[]
for mod in results_lists:
    results_pd+=[pd.DataFrame(results_lists[mod])]
all_res_pd=pd.concat(results_pd, axis=1)

In [17]:
all_res_pd.to_csv('results/permimpall.csv', index=False)