In [1]:
import pandas as pd
import io
import requests
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler

Using TensorFlow backend.


In [2]:
# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)
    
# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)
    
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    # Regression
    return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

In [3]:
path = "./data/"

In [4]:
filename_read = os.path.join(path,"Train_Solar.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

In [5]:
df.head(1)

Unnamed: 0,datetime,date,time,I,T,UV,WS,RH,P
0,1/1/2017 6:00,1/1/2017,6:00:00,0.0,24.6,0.0,1.2,59.8,0.0


In [6]:
# create feature vector
missing_median(df, 'I')
missing_median(df, 'T')
missing_median(df, 'UV')
missing_median(df, 'WS')
missing_median(df, 'P')
df.drop('datetime',1,inplace=True)
df.drop('date',1,inplace=True)
df.drop('time',1,inplace=True)

In [7]:
df.head(1)

Unnamed: 0,I,T,UV,WS,RH,P
0,0.0,24.6,0.0,1.2,59.8,0.0


In [8]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(max_samples = 100, random_state = 42)
clf.fit(df)
y_noano = clf.predict(df)
y_noano = pd.DataFrame(y_noano, columns = ['Top'])
y_noano[y_noano['Top'] == 1].index.values

df = df.iloc[y_noano[y_noano['Top'] == 1].index.values]
df.reset_index(drop = True, inplace = True)
print("Number of Outliers:", y_noano[y_noano['Top'] == -1].shape[0])
print("Number of rows without outliers:", df.shape[0])

Number of Outliers: 13159
Number of rows without outliers: 118427


In [9]:
import warnings
warnings.filterwarnings('ignore')

col_train = list(df.columns)

mat_train = np.matrix(df)

prepro = MinMaxScaler()
prepro.fit(mat_train)

df = pd.DataFrame(prepro.transform(mat_train),columns = col_train)

In [10]:
x,y = to_xy(df,"P")

In [11]:
print(x[0:5])

[[0.         0.2510288  0.         0.03314917 0.6599099 ]
 [0.         0.24691358 0.         0.03867403 0.6644144 ]
 [0.         0.24691358 0.         0.05248619 0.6689189 ]
 [0.         0.24691358 0.         0.0359116  0.6711712 ]
 [0.         0.2510288  0.         0.03867403 0.6672297 ]]


In [12]:
print(y[0:5])

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [13]:
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(    
    x, y, test_size=0.25, random_state=42)

In [14]:
model = Sequential()
model.add(Dense(50, input_dim=x.shape[1], activation='relu'))
model.add(Dense(25,activation='relu'))
model.add(Dense(1)) # Output

In [15]:
model.compile(loss='mean_squared_error', optimizer='adam')

In [16]:
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

In [17]:
checkpointer = ModelCheckpoint(filepath="R3_model.hdf5", verbose=0, save_best_only=True) # save best model

In [18]:
model.fit(x,y,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=1000)

Train on 118427 samples, validate on 29607 samples
Epoch 1/1000
 - 4s - loss: 0.0044 - val_loss: 0.0036
Epoch 2/1000
 - 4s - loss: 0.0036 - val_loss: 0.0037
Epoch 3/1000
 - 4s - loss: 0.0036 - val_loss: 0.0034
Epoch 4/1000
 - 4s - loss: 0.0035 - val_loss: 0.0033
Epoch 5/1000
 - 4s - loss: 0.0034 - val_loss: 0.0033
Epoch 6/1000
 - 4s - loss: 0.0034 - val_loss: 0.0033
Epoch 00006: early stopping


<keras.callbacks.History at 0x7fc7cf160588>

In [19]:
model.load_weights('R3_model.hdf5') # load weights from best model

In [20]:
from sklearn import metrics
import tensorflow as tf

pred = model.predict(x_test)
print(pred[0:5]) # print first five predictions

[[0.3597149 ]
 [0.67056906]
 [0.6021825 ]
 [0.01184219]
 [0.10416352]]


In [21]:
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print(f"Before save score (RMSE): {score}")

Before save score (RMSE): 0.05767328664660454


In [22]:
# save entire network to HDF5 (save everything, suggested)
model.save(os.path.join(path,"R3_model.h5"))

In [23]:
# save neural network structure to JSON (no weights)
model_json = model.to_json()
with open(os.path.join(path,"R3_model.json"), "w") as json_file:
    json_file.write(model_json)

In [24]:
# save neural network structure to YAML (no weights)
model_yaml = model.to_yaml()
with open(os.path.join(path,"R3_model.yaml"), "w") as yaml_file:
    yaml_file.write(model_yaml)