# Load Data

In [1]:
from pyearth import Earth
from DLtools.Data_preprocess import series_to_supervised,load_data
from DLtools.Data import load_data,instant_data,check_specific_col

import numpy as np
import pandas as pd

In [2]:
# Alternative loading path, currently disabled (kept for reference):
# loaddata = load_data()
# df_d = loaddata.daily()
# df_h = loaddata.hourly()
###################################
# Active path: build the project's `instant_data` loader and fetch the hourly
# frame from it. NOTE(review): presumably this reads a pre-built/cached copy of
# what `load_data().hourly()` would produce -- confirm in DLtools.Data.
loading = instant_data()
# `df_h` is sliced by date strings in the next cell, so it is expected to
# carry a datetime-like index -- TODO confirm.
df_h = loading.hourly_instant()


# Missing data handling

In [3]:
# Restrict to the 2013-2017 window and interpolate gaps, filling at most
# 24 consecutive missing rows per gap.
data = df_h['2013-01-01':'2017-12-31'].interpolate(limit=24)

# Collect every column that is still less than 80% populated after the
# interpolation, then drop them all in one go.
sparse_cols = [
    name for name in data.columns
    if data[name].count() < len(data[name]) * .8
]
data = data.drop(sparse_cols, axis=1)

# Whatever is still missing is replaced by the mean of its own column.
data = data.apply(lambda series: series.fillna(series.mean()), axis=0)

In [4]:
# Column to forecast.
TARGET = 'CPY015_wl'

# Forecast horizon expressed in rows: 7 * 24 rows ahead. This equals 7 days
# only if the index is strictly hourly with no gaps -- TODO confirm.
n_out = 7 * 24

# Pull the target "back in time": after the shift, row t holds the target
# value found n_out rows later, so the predictors at t are paired with the
# future target.
# NOTE: this cell rebinds `data`. Re-running it without re-running the
# missing-data cell first shifts the target a second time.
data[TARGET] = data[TARGET].shift(periods=-n_out)

# The last n_out rows have no future target (NaN after the shift) and are
# removed here together with any other incomplete row.
data = data.astype('float64').dropna()

# Predictors are every column except the target; keep their names for MARS.
X = data.drop(columns=[TARGET])
xlabels = X.columns.tolist()

X = X.to_numpy()
y = data[TARGET].to_numpy()

# MARS feature selection (from all stations)

In [5]:
# # Scale the data, since MARS generally cannot be run on it unscaled
# from sklearn.preprocessing import MinMaxScaler
# # SCALE
# scaler = MinMaxScaler()
# data[data.columns] = scaler.fit_transform(data[data.columns])


In [6]:
# Fit a MARS (py-earth) model on all predictors.
# The three importance measures requested here are the ones reported by
# `summary_feature_importances` further down.
criteria = ('rss', 'gcv', 'nb_subsets')
model = Earth(
    feature_importance_type=criteria,
    minspan_alpha=.5,
    enable_pruning=True,
    # Disabled experiments, kept for reference:
    #   max_degree=3,
    #   max_terms=20,
    verbose=True,   # prints the forward-pass trace shown in the cell output
)
# xlabels attaches the column names so the importance report is readable.
model.fit(X, y, xlabels=xlabels)


Beginning forward pass
---------------------------------------------------------------
iter  parent  var  knot  mse       terms  gcv    rsq    grsq   
---------------------------------------------------------------
0     -       -    -     0.386767  1      0.387  0.000  0.000  
1     0       11   31269  0.354590  3      0.355  0.083  0.083  
2     0       72   37361  0.332809  5      0.333  0.140  0.139  
3     0       94   31325  0.326307  7      0.327  0.156  0.156  
4     0       23   38710  0.321806  9      0.322  0.168  0.167  
5     0       67   39565  0.317401  11     0.318  0.179  0.178  
6     0       64   26983  0.313958  13     0.314  0.188  0.187  
7     0       39   12210  0.310348  15     0.311  0.198  0.196  
8     0       76   29959  0.307666  17     0.308  0.205  0.203  
9     0       66   36085  0.303649  19     0.304  0.215  0.213  
10    0       92   34735  0.300840  21     0.302  0.222  0.220  
11    0       75   12498  0.298676  23     0.299  0.228  0.226  
12    

Earth(feature_importance_type=('rss', 'gcv', 'nb_subsets'), minspan_alpha=0.5,
      verbose=True)

In [7]:
# #Print the model
# print(model.trace())
# print(model.summary())

* Feature Importance

In [8]:
# Show the beginning (first 500 characters) of the importance report,
# ordered by the 'rss' criterion.
rss_report = model.summary_feature_importances(sort_by='rss')
print(rss_report[:500])

                  rss    gcv    nb_subsets
CPY013_wl         0.28   0.29   0.02          
DNP007_temp       0.19   0.20   0.04          
DNP033_temp       0.06   0.06   0.02          
DNP006_humid      0.06   0.06   0.02          
DNP006_temp       0.05   0.05   0.06          
GLF001_wl         0.04   0.04   0.02          
PAS005_wl         0.03   0.03   0.02          
DNP025_temp       0.03   0.03   0.06          
DNP007_humid      0.02   0.02   0.02          
DNP027_temp       0.02   0.02   0.


In [9]:
def toDF(rank):
    """Turn a flat token list into a feature-importance DataFrame.

    `rank` is read as consecutive records of four tokens each:
    feature name, rss score, gcv score, nb_subsets score.

    Parameters
    ----------
    rank : sequence
        Flat, indexable sequence of tokens whose length is a multiple of 4.
        An empty sequence is allowed and yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'rss', 'gcv', 'nb_subset' (in that order), one row
        per record. Values are stored exactly as given; no numeric conversion
        is performed.

    Raises
    ------
    ValueError
        If ``len(rank)`` is not a multiple of 4, i.e. the last record is
        incomplete (for example because the report was cut mid-row).
    """
    fields = ('feature', 'rss', 'gcv', 'nb_subset')
    width = len(fields)
    total = len(rank)

    # Fail early with an explicit message instead of letting pandas complain
    # about columns of unequal length.
    if total % width != 0:
        raise ValueError(
            f"expected a multiple of {width} tokens "
            f"(feature, rss, gcv, nb_subsets), got {total}"
        )

    # Field k of every record sits at positions k, k + 4, k + 8, ...
    columns = {
        field: [rank[pos] for pos in range(offset, total, width)]
        for offset, field in enumerate(fields)
    }
    return pd.DataFrame(columns)

In [10]:
def _importance_tokens(sort_by):
    """Return the whitespace-split tokens of the top rows of the importance
    report ordered by `sort_by`.

    Only the first 2000 characters of the report are considered. Tokens 0-2
    are the header (rss, gcv, nb_subsets) and are skipped; the next 80 tokens
    are kept, i.e. up to 20 rows of 4 fields each. Assumes feature names
    contain no whitespace.
    """
    report = model.summary_feature_importances(sort_by=sort_by)
    return report[:2000].split()[3:83]


nbsub = _importance_tokens('nb_subsets')
gcv = _importance_tokens('gcv')
rss = _importance_tokens('rss')


In [11]:
# Replace each token list by its DataFrame form (feature, rss, gcv, nb_subset).
# NOTE: the names are rebound from lists to DataFrames, so this cell must be
# preceded by a fresh run of the token-extraction cell.
rss, gcv, nbsub = map(toDF, (rss, gcv, nbsub))

In [12]:
# Union of the features found in the top-20 lists by rss, gcv and nb_subsets.
# The frames are stacked in that order and only the first occurrence of each
# feature is kept, so the result can hold more than 20 rows.
top20 = (
    pd.concat((rss, gcv, nbsub), ignore_index=True)
    .drop_duplicates(subset='feature')
)
top20

Unnamed: 0,feature,rss,gcv,nb_subset
0,CPY013_wl,0.28,0.29,0.02
1,DNP007_temp,0.19,0.2,0.04
2,DNP033_temp,0.06,0.06,0.02
3,DNP006_humid,0.06,0.06,0.02
4,DNP006_temp,0.05,0.05,0.06
5,GLF001_wl,0.04,0.04,0.02
6,PAS005_wl,0.03,0.03,0.02
7,DNP025_temp,0.03,0.03,0.06
8,DNP007_humid,0.02,0.02,0.02
9,DNP027_temp,0.02,0.02,0.04


In [13]:
# Persist the selected features (without the row index) to the working
# directory.
top20.to_csv(path_or_buf='featurelist_MAR_hourly_7d.csv', index=False)