In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler

Using TensorFlow backend.


In [2]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
        
# This function submits an assignment.  You can submit an assignment as much as you like, only the final
# submission counts.  The paramaters are as follows:

In [3]:
path = "./data/"

In [4]:
filename_train = os.path.join(path,"Train_Solar.csv")
filename_test = os.path.join(path,"Test_Solar.csv")
filename_submit = os.path.join(path,"R9_model_submit.csv")

In [5]:
df = pd.read_csv(filename_train,na_values=['NA','?'])

In [6]:
df.head(1)

Unnamed: 0,datetime,date,time,I,T,UV,WS,RH,P
0,1/1/2017 6:00,1/1/2017,6:00:00,0.0,24.6,0.0,1.2,59.8,0.0


In [7]:
# Preprocess
datetime_sr = df['datetime']
date_sr = df['date']
time_sr = df['time']
df.drop('datetime',1,inplace=True)
df.drop('date',1,inplace=True)
df.drop('time',1,inplace=True)
missing_median(df, 'I')
missing_median(df, 'T')
missing_median(df, 'UV')
missing_median(df, 'WS')
missing_median(df, 'RH')
missing_median(df, 'P')

In [8]:
df.head(1)

Unnamed: 0,I,T,UV,WS,RH,P
0,0.0,24.6,0.0,1.2,59.8,0.0


In [9]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(max_samples = 100, random_state = 42)
clf.fit(df)
y_noano = clf.predict(df)
y_noano = pd.DataFrame(y_noano, columns = ['Top'])
y_noano[y_noano['Top'] == 1].index.values

df = df.iloc[y_noano[y_noano['Top'] == 1].index.values]
df.reset_index(drop = True, inplace = True)
print("Number of Outliers:", y_noano[y_noano['Top'] == -1].shape[0])
print("Number of rows without outliers:", df.shape[0])

Number of Outliers: 13159
Number of rows without outliers: 118427


In [10]:
import warnings
warnings.filterwarnings('ignore')

col_train = list(df.columns)

mat_train = np.matrix(df)

prepro = MinMaxScaler()
prepro.fit(mat_train)

df = pd.DataFrame(prepro.transform(mat_train),columns = col_train)

In [11]:
# Create the x-side (feature vectors) of the training
x, y = to_xy(df,'P')

In [12]:
print(x[:5])

[[0.         0.2510288  0.         0.03314917 0.6599099 ]
 [0.         0.24691358 0.         0.03867403 0.6644144 ]
 [0.         0.24691358 0.         0.05248619 0.6689189 ]
 [0.         0.24691358 0.         0.0359116  0.6711712 ]
 [0.         0.2510288  0.         0.03867403 0.6672297 ]]


In [13]:
print(y[:5])

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [14]:
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(    
    x, y,
    test_size=0.25,
    random_state=45)

In [15]:
model = Sequential()
model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(1))



model.compile(loss='mean_squared_error',
              optimizer='adam')


monitor = EarlyStopping(monitor='val_loss',
                        min_delta=1e-3,
                        patience=5,
                        verbose=1,
                        mode='auto')

In [16]:
model.fit(x_train,
          y_train,
          validation_data=(x_test,y_test),
          callbacks=[monitor],
          verbose=0,
          epochs=1000)

Epoch 00006: early stopping


<keras.callbacks.History at 0x7f5434c1a710>

In [17]:
from sklearn import metrics
import tensorflow as tf

pred = model.predict(x_test)
print(pred[0:5]) # print first five predictions

[[0.38237602]
 [0.62212354]
 [0.00352091]
 [0.19598332]
 [0.24476399]]


In [18]:
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print(f"Before save score (RMSE): {score}")

Before save score (RMSE): 0.06048131734132767


In [19]:
from sklearn import metrics

# Calculate multi log loss error
pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Log loss score: {}".format(score))

Log loss score: 0.06048131734132767


In [20]:
# save entire network to HDF5 (save everything, suggested)
model.save(os.path.join(path,"R9_model.h5"))

In [21]:
# save neural network structure to JSON (no weights)
model_json = model.to_json()
with open(os.path.join(path,"R9_model.json"), "w") as json_file:
    json_file.write(model_json)

In [22]:
# save neural network structure to YAML (no weights)
model_yaml = model.to_yaml()
with open(os.path.join(path,"R9_model.yaml"), "w") as yaml_file:
    yaml_file.write(model_yaml)

In [23]:
# Generate Kaggle submit file

# Encode feature vector
df_test = pd.read_csv(filename_test,na_values=['NA','?'])

encode_numeric_zscore(df_test,'I')
encode_numeric_zscore(df_test,'T')
encode_numeric_zscore(df_test,'UV')
encode_numeric_zscore(df_test,'WS')
encode_numeric_zscore(df_test,'RH')

# Preprocess
datetime_sr = df_test['datetime']
date_sr = df_test['date']
time_sr = df_test['time']
df_test.drop('datetime',1,inplace=True)
df_test.drop('date',1,inplace=True)
df_test.drop('time',1,inplace=True)


x = df_test.as_matrix().astype(np.float32)

# Generate predictions
pred = model.predict(x)
#pred

# Create submission data set

df_submit = pd.DataFrame(pred)
df_submit.insert(0,'datetime',datetime_sr)
df_submit.columns = ['datetime','P']

df_submit.to_csv(filename_submit, index=False)

print(df_submit)

              datetime         P
0       25/4/2018 6:00 -0.055448
1       25/4/2018 6:03 -0.096754
2       25/4/2018 6:06 -0.099869
3       25/4/2018 6:09 -0.068698
4       25/4/2018 6:12 -0.099388
5       25/4/2018 6:15 -0.066572
6       25/4/2018 6:18 -0.071885
7       25/4/2018 6:21 -0.066679
8       25/4/2018 6:24 -0.057857
9       25/4/2018 6:27 -0.068584
10      25/4/2018 6:30 -0.143639
11      25/4/2018 6:33 -0.131926
12      25/4/2018 6:36 -0.116571
13      25/4/2018 6:39 -0.099475
14      25/4/2018 6:42 -0.104660
15      25/4/2018 6:45 -0.099107
16      25/4/2018 6:48 -0.109840
17      25/4/2018 6:51 -0.157423
18      25/4/2018 6:54 -0.113624
19      25/4/2018 6:57 -0.096453
20      25/4/2018 7:00 -0.094394
21      25/4/2018 7:03 -0.069175
22      25/4/2018 7:06 -0.057193
23      25/4/2018 7:09 -0.061970
24      25/4/2018 7:12 -0.041822
25      25/4/2018 7:15 -0.061086
26      25/4/2018 7:18 -0.081455
27      25/4/2018 7:21 -0.116167
28      25/4/2018 7:24 -0.108946
29      25