# Load Data

In [1]:
from pyearth import Earth
from DLtools.Data_preprocess import series_to_supervised,load_data
from DLtools.Data import load_data,instant_data,check_specific_col

import numpy as np
import pandas as pd

In [2]:
# Disabled alternative: obtain the same two datasets through `load_data`.
# loaddata = load_data()

# df_d = loaddata.daily()
# df_h = loaddata.hourly()
###################################
# Active path: daily and hourly data come from `instant_data`
# (presumably a pre-built / faster variant of the loader above — TODO confirm in DLtools.Data).
loading = instant_data()
df_d = loading.daily_instant()
# NOTE(review): df_h is not referenced by any later cell visible in this notebook.
df_h = loading.hourly_instant()



In [3]:
TARGET = 'CPY015_wl'

## Period 2013-2017 ##
# Interpolate at most 360 consecutive missing values per gap; anything still
# missing afterwards is set to 0.
data = df_d['2013-01-01':'2017-12-31'].interpolate(limit=360).fillna(0)

## Shift target station to future
# After the shift, row t holds the target observed n_out rows later, so the
# model learns to predict n_out steps ahead from the features at t.
n_out = 7
data[TARGET] = data[TARGET].shift(-n_out)
# The shift leaves NaN in the last n_out target rows; dropna() removes them.
data = data.astype('float32').dropna()

X = data.drop([TARGET], axis=1)
xlabels = list(X.columns)


# REPLACE NAN WITH 0
# (safety net only: dropna() above has already removed every row with a NaN)
X = X.fillna(0).values

# Log-transform the target. np.log gives -inf for 0 and NaN for negative
# inputs; zeros can exist because of the fillna(0) above. Both kinds of
# non-finite value are replaced by 0 so that nothing non-finite reaches the
# model (checking np.isnan alone would let -inf through).
with np.errstate(divide='ignore', invalid='ignore'):
    y = np.log(data[TARGET].fillna(0).values)
# Name kept for backward compatibility; the mask now covers NaN *and* +/-inf.
where_are_NaNs = ~np.isfinite(y)
y[where_are_NaNs] = 0


# MARS feature selection (from all stations)

In [4]:
# Fit a MARS (Earth) model on every candidate feature and have it score
# feature importance under all three criteria at once.
criteria = ('rss', 'gcv', 'nb_subsets')

# Disabled options left for reference: max_degree=3, max_terms=10
model = Earth(
    enable_pruning=True,
    minspan_alpha=.5,
    feature_importance_type=criteria,
    verbose=True,
)

# xlabels attaches the column names so the summaries report features by name.
model.fit(X, y, xlabels=xlabels)


Beginning forward pass
---------------------------------------------------------------
iter  parent  var  knot  mse       terms  gcv    rsq    grsq   
---------------------------------------------------------------
0     -       -    -     0.327770  1      0.328  0.000  0.000  
1     0       173  1645  0.204758  3      0.206  0.375  0.372  
2     0       137  1137  0.192988  5      0.195  0.411  0.405  
3     0       251  1281  0.185759  7      0.189  0.433  0.424  
4     0       146  573   0.182915  9      0.187  0.442  0.429  
5     0       41   1136  0.180249  11     0.186  0.450  0.435  
6     0       173  190   0.177701  13     0.184  0.458  0.440  
7     0       34   -1    0.175435  14     0.182  0.465  0.445  
8     0       216  828   0.173251  16     0.181  0.471  0.449  
9     0       121  1233  0.171413  18     0.180  0.477  0.452  
10    0       140  175   0.169971  20     0.179  0.481  0.453  
11    0       281  1086  0.168679  22     0.179  0.485  0.454  
12    0       165

Earth(feature_importance_type=('rss', 'gcv', 'nb_subsets'), minspan_alpha=0.5,
      verbose=True)

In [5]:
# Optional diagnostics (disabled): uncomment to print the fitting trace and
# the fitted model summary.
# print(model.trace())
# print(model.summary())

## Feature Importance

In [6]:
# Show the feature-importance table ordered by the 'rss' criterion.
# The summary is a multi-line string, hence print(); [:2000] caps the
# displayed text at 2000 characters.
print(model.summary_feature_importances(sort_by='rss')[:2000])

                                   rss    gcv    nb_subsets
Dam_SK_Useable_WaterVol            0.45   0.62   0.01          
CPY001_wl                          0.14   0.18   0.04          
DNP033_temp                        0.09   0.12   0.01          
PAS005_wl                          0.03   0.01   0.05          
DIV006_wl                          0.02   0.02   0.01          
NAN010_wl                          0.02   0.00   0.05          
CPY009_temp                        0.02   0.00   0.05          
PAS007_wl                          0.01   0.00   0.04          
NAN011_wl                          0.01   0.00   0.02          
CPY009_wl                          0.01   0.00   0.02          
GLF001_rain                        0.01   0.00   0.04          
DIV004_rain                        0.01   0.01   0.01          
PAS001_wl                          0.01   0.01   0.01          
VLGE13_rain                        0.01   0.00   0.02          
DNP015_temp                        0.01   0.

In [49]:
def toDF(rank):
    """Turn a flat token sequence into a feature-importance DataFrame.

    `rank` is read as consecutive groups of four items:
    (feature name, rss, gcv, nb_subsets). Items are stored unchanged, so
    tokens produced by ``str.split()`` stay strings.

    Parameters
    ----------
    rank : sequence
        Indexable sequence whose length is a multiple of 4.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'rss', 'gcv', 'nb_subset', one row per group.
    """
    columns = ('feature', 'rss', 'gcv', 'nb_subset')
    total = len(rank)
    # Column k collects every 4th item starting at offset k.
    table = {
        column: [rank[pos] for pos in range(offset, total, 4)]
        for offset, column in enumerate(columns)
    }
    return pd.DataFrame(table)



In [55]:
# `summary_feature_importances` returns a plain-text table. Split on
# whitespace it yields 3 header tokens (rss, gcv, nb_subsets) followed by
# 4 tokens per feature row (name + three scores), so the top-N rows are one
# contiguous token slice.
# NOTE: this relies on feature names containing no whitespace.
HEADER_TOKENS = 3
TOKENS_PER_ROW = 4
TOP_N = 20
top_rows = slice(HEADER_TOKENS, HEADER_TOKENS + TOKENS_PER_ROW * TOP_N)

# The whole summary is tokenized (no fixed 2000-character cut), so long
# feature names cannot cause a row to be chopped off before it is parsed.
nbsub = model.summary_feature_importances(sort_by='nb_subsets').split()[top_rows]
gcv = model.summary_feature_importances(sort_by='gcv').split()[top_rows]
rss = model.summary_feature_importances(sort_by='rss').split()[top_rows]


In [56]:
# Parse each token list into a DataFrame, in the order rss, gcv, nbsub.
# NOTE: the names are re-bound from token lists to DataFrames, so this cell
# cannot be re-run on its own; re-run the tokenizing cell first.
rss, gcv, nbsub = (toDF(tokens) for tokens in (rss, gcv, nbsub))

In [66]:
# Union of the three top-20 rankings (rss, gcv, nbsub): stack them and keep
# the first row seen for each feature. The result may hold more than 20 rows,
# and its index is not renumbered after de-duplication.
top20 = (
    pd.concat([rss, gcv, nbsub], ignore_index=True)
    .drop_duplicates('feature')
)
top20

Unnamed: 0,feature,rss,gcv,nb_subset
0,Dam_SK_Useable_WaterVol,0.45,0.62,0.01
1,CPY001_wl,0.14,0.18,0.04
2,DNP033_temp,0.09,0.12,0.01
3,PAS005_wl,0.03,0.01,0.05
4,DIV006_wl,0.02,0.02,0.01
5,NAN010_wl,0.02,0.0,0.05
6,CPY009_temp,0.02,0.0,0.05
7,PAS007_wl,0.01,0.0,0.04
8,NAN011_wl,0.01,0.0,0.02
9,CPY009_wl,0.01,0.0,0.02


In [68]:
# Save the selected features. The forecast horizon in the file name is taken
# from n_out so the name cannot drift from the shift applied to the target
# (n_out = 7 gives 'featurelist_MAR_ahead7d.csv').
top20.to_csv('featurelist_MAR_ahead{}d.csv'.format(n_out), index=False)