In [1]:
from functools import partial

import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.metrics
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor

from MLscores import calc_metrics, metrics_dict, cmvals, recall, hybridrecall
#import eli5
#from eli5.sklearn import PermutationImportance

In [2]:
def create_NN_model(params, X):
    """Build and compile a dense feed-forward classifier with a 2-way softmax output.

    Parameters
    ----------
    params : dict
        Hyper-parameter dictionary. Keys read here:
        - 'n_internal_layers': pair ``(k, nodes)`` where ``k`` is the number of
          layers added after the first one and ``nodes`` maps
          ``'layer_<i>_<k>_nodes'`` to the width of layer ``i`` (i = 1..k+1).
        - 'dropout': dropout rate applied after every hidden layer, or None
          to add no Dropout layers.
        - 'optimizer': dict with 'name' ('Adam' or 'SGD'); for Adam an
          'adam_params' entry (None for Keras defaults, otherwise a dict with
          'learning_rate_adam', 'beta_1', 'beta_2', 'amsgrad'); for SGD a
          'learning_rate_SGD' entry.
        - 'metric': 'accuracy', 'sparse' or 'tn'.
        - 'loss' (optional): 'unbalanced' selects ``unbalanced_loss``;
          anything else uses sparse categorical cross-entropy.
    X : array-like
        Not used when building the model (no explicit input shape is set, so
        Keras infers it on first use). Kept so existing callers such as
        ``partial(create_NN_model, params, X)`` keep working.

    Returns
    -------
    tensorflow.keras.Sequential
        The compiled model.

    Raises
    ------
    ValueError
        If the optimizer name or the metric name is not one of the supported
        values. Previously this surfaced as an unbound-variable error at
        compile time.
    """
    # Validate the selectors first so a typo fails fast with a clear message
    # instead of leaving `opt` / `metrics` undefined further down.
    optimizer_cfg = params['optimizer']
    optimizer_name = optimizer_cfg['name']
    if optimizer_name not in ('Adam', 'SGD'):
        raise ValueError("Unsupported optimizer name: %r (expected 'Adam' or 'SGD')"
                         % (optimizer_name,))
    metric_name = params['metric']
    if metric_name not in ('accuracy', 'sparse', 'tn'):
        raise ValueError("Unsupported metric: %r (expected 'accuracy', 'sparse' or 'tn')"
                         % (metric_name,))

    # define model
    model = Sequential()
    intlayers = int(params['n_internal_layers'][0])
    layer_nodes = params['n_internal_layers'][1]
    dropout_rate = params['dropout']

    # Layer widths may arrive as floats (e.g. 200.0 from a hyper-parameter
    # search), so every width is cast to int, the first layer included.
    for i in range(1, intlayers + 2):
        width = int(layer_nodes['layer_' + str(i) + '_' + str(intlayers) + '_nodes'])
        model.add(Dense(width, activation='relu'))
        if dropout_rate is not None:
            model.add(Dropout(dropout_rate))
    model.add(Dense(2, activation='softmax'))

    # compile the model
    if optimizer_name == 'Adam':
        adam_params = optimizer_cfg['adam_params']
        if adam_params is None:
            opt = Adam()
        else:
            opt = Adam(learning_rate=adam_params['learning_rate_adam'],
                       beta_1=adam_params['beta_1'],
                       beta_2=adam_params['beta_2'],
                       amsgrad=adam_params['amsgrad'])
    else:  # 'SGD' (validated above)
        opt = SGD(learning_rate=optimizer_cfg['learning_rate_SGD'])

    if metric_name == 'accuracy':
        metrics = ['accuracy']
    elif metric_name == 'sparse':
        metrics = [tensorflow.keras.metrics.SparseCategoricalAccuracy()]
    else:  # 'tn' (validated above)
        metrics = [tensorflow.keras.metrics.TrueNegatives(),
                   tensorflow.keras.metrics.TruePositives()]

    if 'loss' in params and params['loss'] == 'unbalanced':
        # NOTE(review): `unbalanced_loss` is not defined or imported in the
        # visible part of this notebook; confirm it exists before using
        # params['loss'] == 'unbalanced'.
        lossf = unbalanced_loss
    else:
        lossf = 'sparse_categorical_crossentropy'
    model.compile(optimizer=opt, loss=lossf, metrics=metrics)
    return model

In [3]:
def load_dataset(trfiles, featuredrop=[], debug=True, returnid=False):
    """Read one or several CSV files and split them into features, labels and groups.

    Parameters
    ----------
    trfiles : str or list of str
        Path of a single CSV file, or a list of CSV paths that are
        concatenated row-wise.
    featuredrop : sequence of str
        Substrings; every feature column whose name contains one of them is
        dropped from the returned ``X``.
    debug : bool
        When True, print progress messages while loading a list of files.
    returnid : bool
        When True, also return the 'id' column of the loaded CSV data.

    Returns
    -------
    (X, y, groupspd) or (X, y, groupspd, idpd)
        ``X``, ``y`` and ``groupspd`` are whatever ``prepare_dataset`` returns
        (with ``featuredrop`` applied to ``X``). ``idpd`` is the raw 'id'
        column of the loaded frame.

    Notes
    -----
    NOTE(review): ``idpd`` is taken from the frame *before* ``prepare_dataset``
    removes NaN/duplicate rows, so it is not row-aligned with ``X`` whenever
    rows were dropped - confirm with callers that use ``returnid=True``.
    """
    # dsfile = 'dataset_ndvi_lu.csv'
    domdircheck = 'dom_dir'
    dirmaxcheck = 'dir_max'
    corinecheck = 'Corine'
    monthcheck = 'month'
    wkdcheck = 'wkd'
    firedatecheck = 'firedate'
    X_columns = ['max_temp', 'min_temp', 'mean_temp', 'res_max', dirmaxcheck, 'dom_vel', domdircheck,
                 'rain_7days', corinecheck, 'Slope', 'DEM', 'Curvature', 'Aspect', 'ndvi', 'evi', 'lst_day',
                 'lst_night', monthcheck, wkdcheck,
                 'mean_dew_temp', 'max_dew_temp', 'min_dew_temp','frequency', 'f81', 'x', 'y']
    y_columns = ['fire']
    if isinstance(trfiles, list):
        if debug:
            print("Loading full dataset ...")
        dflist = []
        for dsfile in trfiles:
            if debug:
                print("Loading dataset file %s" % dsfile)
            dflist.append(pd.read_csv(dsfile))
        df = pd.concat(dflist)
    else:
        # Fix: the single-file read used to sit outside this `else`, so for a
        # list input it replaced the concatenated frame with the last file only.
        df = pd.read_csv(trfiles)

    # Keep every CSV column whose upper-cased name equals, or contains, one of
    # the upper-cased feature names (this also picks up suffixed columns such
    # as 'month_8').
    # NOTE(review): 'x' and 'y' are in the list, so any column whose name
    # contains the letter X or Y is matched as well - confirm this is intended.
    X_columns_upper = [c.upper() for c in X_columns]
    newcols = [c for c in df.columns if
               c.upper() in X_columns_upper or any([cX in c.upper() for cX in X_columns_upper])]
    X_columns = newcols

    # First column whose name contains 'firedate' (case-insensitive).
    firedate_col = [c for c in df.columns if firedatecheck.upper() in c.upper()][0]
    X, y, groupspd = prepare_dataset(df, X_columns, y_columns, firedate_col)
    print("Ignored columns from csv %s"%([c for c in df.columns if c not in X.columns]))
    # Only touch the 'id' column when it is requested, so CSVs without an
    # 'id' column still load with the default returnid=False.
    idpd = df['id'] if returnid else None
    df = None  # drop the reference to the raw frame
    X_columns = X.columns
    if len(featuredrop) > 0:
        X = X.drop(columns=[c for c in X.columns if any([fd in c for fd in featuredrop])])
    print("Dropped columns %s"%(list(set(X_columns)-set(X.columns))))

    if returnid:
        return X, y, groupspd, idpd
    else:
        return X, y, groupspd

In [4]:
def prepare_dataset(df, X_columns, y_columns, firedate_col):
    """Clean the raw frame and split it into features, labels and groups.

    Only the feature, label and fire-date columns are kept. Rows containing
    NaN are dropped, then exact duplicate rows (first occurrence kept), and
    the index is renumbered from 0. Row counts are printed after each step.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing all requested columns.
    X_columns, y_columns : list of str
        Feature and label column names (lists, since they are concatenated).
    firedate_col : str
        Name of the column returned as the grouping series.

    Returns
    -------
    (X, y, groupspd)
        Feature frame (with 'x'/'y' renamed to 'xpos'/'ypos'), label frame and
        the fire-date series, all sharing the renumbered index.
    """
    wanted = X_columns + y_columns + [firedate_col]
    subset = df[wanted]
    print('before nan drop: %d' % len(subset.index))

    without_nan = subset.dropna()
    print('after nan drop: %d' % len(without_nan.index))

    cleaned = without_nan.drop_duplicates(keep='first').reset_index(drop=True)
    print('after dup. drop: %d' % len(cleaned.index))

    print('renaming "x": "xpos", "y": "ypos"')
    # X = normdataset.normalize_dataset(X_unnorm, aggrfile='stats/featurestats.json')
    features = cleaned[X_columns].rename(columns={'x': 'xpos', 'y': 'ypos'})
    labels = cleaned[y_columns]
    groupspd = cleaned[firedate_col]
    return features, labels, groupspd

In [5]:
#params={'ES_mindelta': 0.001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': {0: 1, 1: 10}, 'dropout': 0.1, 'feature_drop': ('dir_max', 'dom_dir', 'month', 'wkd'), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 70}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
# Hyper-parameters used for the model built in this notebook.
params = {
    # early stopping
    'ES_mindelta': 0.0001,
    'ES_monitor': 'loss',
    'ES_patience': 10,
    # training
    'batch_size': 512,
    'class_weights': {0: 1, 1: 5},
    'max_epochs': 2000,
    'metric': 'accuracy',
    'optimizer': {'adam_params': None, 'name': 'Adam'},
    # architecture
    'dropout': 0.3,
    'n_internal_layers': (0, {'layer_1_0_nodes': 200.0}),
    # feature selection
    'feature_drop': ('dir_max', 'dom_dir', 'month', 'wkd'),
}

In [6]:
#X, y, g=load_dataset('/home/aapostolakis/Documents/ffpdata/newcrossval/datasets/randomnofire/old_random_new_features_norm.csv',\
#                    featuredrop=params['feature_drop'])
# Load features (X), labels (y) and the fire-date grouping series (g) from a single CSV.
# The featuredrop argument is commented out, so no feature columns are dropped here.
# NOTE(review): the path is an absolute location on one machine; the cell fails elsewhere.
X, y, g=load_dataset('/home/aapostolakis/Documents/ffpdata/newcrossval/datasets/randomnofire/oldrandomnewfeat.csv')#,\
                    #featuredrop=params['feature_drop'])

before nan drop: 25793
after nan drop: 25757
after dup. drop: 23813
renaming "x": "xpos", "y": "ypos"
Ignored columns from csv ['id', 'firedate', 'Unnamed: 0', 'Unnamed: 0.1', 'fire', 'x', 'y', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1']
Dropped columns []


In [7]:
class MakeModel(object):
    """Small estimator-style wrapper that trains a KerasClassifier on demand.

    The network itself is produced by the module-level ``creatennmodel``
    callable, which must be defined before ``fit`` is called.
    """

    def __init__(self, X=None, y=None, epochs=2, batch_size=512, verbose=1):
        """Store training options.

        ``X`` and ``y`` are accepted for call compatibility but are not used.
        ``epochs``, ``batch_size`` and ``verbose`` are forwarded to
        ``KerasClassifier``; their defaults are the values that used to be
        hard-coded in ``fit``.
        """
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        # Set by fit(); None means "not fitted yet".
        self.model = None

    def predict(self, X):
        """Return the wrapped classifier's predictions for ``X`` unchanged.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet. The same exception type as
            before, but with an explanatory message.
        """
        if getattr(self, 'model', None) is None:
            raise AttributeError("MakeModel is not fitted yet; call fit(X, y) before predict(X)")
        y_pred = self.model.predict(X)
        return y_pred

    def fit(self, X, y):
        """Build a fresh KerasClassifier, train it on ``(X, y)`` and return it.

        Returns the fitted ``KerasClassifier`` (not ``self``), as before.
        """
        # The former `train_input=X` keyword was removed: it is not a
        # parameter of the build function or of the Keras fit/predict
        # methods, and the wrapper is expected to reject such keywords.
        skwrapped_model = KerasClassifier(build_fn=creatennmodel,
                                          epochs=self.epochs,
                                          batch_size=self.batch_size,
                                          verbose=self.verbose)
        self.model = skwrapped_model
        self.model.fit(X, y)
        return self.model

In [8]:
# Bind the hyper-parameters and the feature frame so the scikit-learn wrapper
# can call the model builder with no arguments.
creatennmodel = partial(create_NN_model, params, X)
# 80/20 hold-out split; the seed is fixed so the split is identical after a
# kernel restart (train_test_split is imported in the top import cell).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
#es = EarlyStopping(monitor=params['ES_monitor'], patience=params['ES_patience'], min_delta=params['ES_mindelta'])
#sknnmodel = KerasClassifier(build_fn=creatennmodel, batch_size=params['batch_size'], epochs=params['max_epochs'], verbose=1, callbacks=[es],
#                            class_weight=params['class_weights'])
# Early-stopping callback configured from the hyper-parameter dict.
# NOTE(review): `es` is not passed to the classifier below, so it currently has
# no effect on training - confirm whether callbacks=[es] was intended.
es = EarlyStopping(
    monitor=params['ES_monitor'],
    patience=params['ES_patience'],
    min_delta=params['ES_mindelta'],
)

# scikit-learn compatible wrapper around the Keras model builder.
sknnmodel = KerasClassifier(
    build_fn=creatennmodel,
    batch_size=params['batch_size'],
    epochs=200,
    verbose=0,
)
#validation_data=(X_test, y_test)) #class_weight=params['class_weights'])

In [10]:
# Train on the full dataset (X, y); the X_train/X_test split is not used here.
sknnmodel.fit(X, y)

<tensorflow.python.keras.callbacks.History at 0x7f2be2f92ee0>

In [11]:
# Predict on the same rows the model was fitted on.
sknnmodel.predict(X)



array([0, 0, 0, ..., 0, 0, 0])

In [12]:
#sfs = SequentialFeatureSelector(sknnmodel, n_features_to_select=2)

In [13]:
#sfs.fit(X, y)

In [14]:
#bestfeatmask=sfs.get_support()

In [15]:
'''
fn=0
for i in range(0,len(bestfeatmask)):
    if bestfeatmask[i]:
        fn+=1  
        print('feature %d : %s'%(fn, X.columns[i]))
'''

"\nfn=0\nfor i in range(0,len(bestfeatmask)):\n    if bestfeatmask[i]:\n        fn+=1  \n        print('feature %d : %s'%(fn, X.columns[i]))\n"

In [16]:
# Permutation importance of each column of X for the fitted classifier:
# 3 shuffles per feature with a fixed seed, and no explicit `scoring` argument.
# X and y are the same data the model was fitted on.
r=permutation_importance(sknnmodel, X, y, n_repeats=3,random_state=0)

In [17]:
# List features from most to least important, keeping only those whose mean
# importance exceeds zero by more than two standard deviations.
ranked_indices = r.importances_mean.argsort()[::-1]
for feat_idx in ranked_indices:
    mean_imp = r.importances_mean[feat_idx]
    std_imp = r.importances_std[feat_idx]
    if mean_imp - 2 * std_imp > 0:
        print(f"{X.columns[feat_idx]:<17}{mean_imp:.3f} +/- {std_imp:.3f}")

dom_vel          0.111 +/- 0.002
evi              0.050 +/- 0.000
f81              0.045 +/- 0.001
xpos             0.030 +/- 0.001
ypos             0.027 +/- 0.001
month_8          0.023 +/- 0.001
dem              0.022 +/- 0.001
dom_dir_2        0.020 +/- 0.000
dom_dir_1        0.020 +/- 0.001
dir_max_1        0.019 +/- 0.001
month_7          0.019 +/- 0.000
wkd_3            0.018 +/- 0.001
dir_max_7        0.018 +/- 0.000
dom_dir_8        0.017 +/- 0.000
dir_max_8        0.017 +/- 0.001
dir_max_2        0.016 +/- 0.001
wkd_6            0.016 +/- 0.000
dir_max_6        0.016 +/- 0.001
dir_max_3        0.016 +/- 0.001
wkd_5            0.016 +/- 0.000
wkd_2            0.016 +/- 0.001
dom_dir_7        0.014 +/- 0.001
wkd_1            0.014 +/- 0.001
wkd_0            0.013 +/- 0.001
month_9          0.013 +/- 0.000
wkd_4            0.012 +/- 0.000
max_temp         0.012 +/- 0.001
dir_max_4        0.010 +/- 0.000
month_6          0.010 +/- 0.000
res_max          0.010 +/- 0.001
corine_334