In [1]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)
    
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    # Regression
    return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

In [3]:
path = "./data/"

In [5]:
filename_read = os.path.join(path,"Train_Solar.csv")
filename_write = os.path.join(path,"Train_Solar_R5_model.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

In [6]:
# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

In [8]:
df.head(1)

Unnamed: 0,datetime,date,time,I,T,UV,WS,RH,P
0,22/2/2018 6:27,22/2/2018,6:27:00,0.0,27.1,0.0,6.1,69.0,0.0


In [9]:
# Preprocess
datetime_sr = df['datetime']
date_sr = df['date']
time_sr = df['time']
df.drop('datetime',1,inplace=True)
df.drop('date',1,inplace=True)
df.drop('time',1,inplace=True)
missing_median(df, 'I')
missing_median(df, 'T')
missing_median(df, 'UV')
missing_median(df, 'WS')
missing_median(df, 'RH')
missing_median(df, 'P')

In [11]:
# Encode to a 2D matrix for training
x,y = to_xy(df,'P')



In [12]:
print(x[0:5])

[[  0.        27.1        0.         6.1       69.      ]
 [401.        31.1        3.6        1.8       65.8     ]
 [309.        30.3        1.7        0.        67.46667 ]
 [288.        32.9        3.1        8.3       66.3     ]
 [299.        30.8        3.         1.        70.503334]]


In [13]:
print(y[0:5])

[[0.     ]
 [2.37   ]
 [1.785  ]
 [1.665  ]
 [1.80372]]


In [14]:
# Cross-Validate
kf = KFold(5)

In [15]:
oos_y = []
oos_pred = []
fold = 0

##### for train, test in kf.split(x):
for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    model = Sequential()
    model.add(Dense(50, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(25,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    
    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure this fold's RMSE
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))

Fold #1
Epoch 00012: early stopping
Fold score (RMSE): 0.556136965751648
Fold #2
Epoch 00019: early stopping
Fold score (RMSE): 0.4930118918418884
Fold #3
Epoch 00026: early stopping
Fold score (RMSE): 0.484989732503891
Fold #4
Epoch 00011: early stopping
Fold score (RMSE): 0.5178372263908386
Fold #5
Epoch 00047: early stopping
Fold score (RMSE): 0.4991866946220398


In [16]:
# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print("Final, out of sample score (RMSE): {}".format(score))   

Final, out of sample score (RMSE): 0.510863721370697


In [17]:
# Write the cross-validated prediction
oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oosDF = pd.concat( [df, oos_y, oos_pred],axis=1 )
oosDF.to_csv(filename_write,index=False)

In [18]:
# save entire network to HDF5 (save everything, suggested)
model.save(os.path.join(path,"R5_model.h5"))

In [19]:
# save neural network structure to JSON (no weights)
model_json = model.to_json()
with open(os.path.join(path,"R5_model.json"), "w") as json_file:
    json_file.write(model_json)

In [20]:
# save neural network structure to YAML (no weights)
model_yaml = model.to_yaml()
with open(os.path.join(path,"R5_model.yaml"), "w") as yaml_file:
    yaml_file.write(model_yaml)