# KERAS LSTM HURRICANE PREDICTION DATA PREPARATION #



In [1]:
import pandas as pd
import numpy as np
from pandas import read_csv
from datetime import datetime

After importing the needed modules, the CSV file can be parsed using pandas. The `date_parser` option is used to convert the `datetime` column into datetime objects, so that we can later ensure that training takes place on consecutive observations spaced six hours apart.

In [2]:
def parse_date(raw):
    """Convert a compact ``YYYYMMDDHH`` timestamp string into a ``datetime``.

    Raises ``ValueError`` (from ``datetime.strptime``) when `raw` does not
    match that layout.
    """
    timestamp_layout = "%Y%m%d%H"
    parsed = datetime.strptime(raw, timestamp_layout)
    return parsed
# Load the cleaned observations; the 'datetime' column is converted with parse_date.
data = read_csv('data/cleaned.csv', parse_dates = ['datetime'], date_parser=parse_date)

The following code iterates row by row over the CSV file and collects consecutive observations, separated by exactly six hours, from a particular weather event. This is possible because the CSV is sorted in chronological order and grouped by weather event (*the 'id' field*). The location features are intended to be normalized to a value in the range 0-1, which has been observed to improve the performance of the network. Note that `get_normalized_row`, as written, returns the stored values unchanged, so any scaling must already be present in the CSV.

In [3]:
gs = 5 #group size target
nrows = len(data)
test_and_train = []

# Required spacing between consecutive observations: six hours, in seconds.
OBSERVATION_GAP_SECONDS = 6 * 60 * 60


def get_normalized_row(index):
    """Return the feature triple for the row at position `index` of `data`.

    The result is the list ``[latitude, longitude, maximumwind]``.

    NOTE(review): despite the name, the values are returned exactly as they
    are stored in `data`; no scaling is applied here.
    """
    row = data.iloc[index]
    return [row['latitude'], row['longitude'], row['maximumwind']]


# Walk the rows once, building non-overlapping groups of `gs` observations
# that share the same 'id' and are spaced exactly six hours apart.
#
# Fixes relative to the previous version of this cell:
#   * iteration starts at row 0 with an empty group, so the first row is no
#     longer skipped and the second row can no longer be added twice;
#   * the gap is measured with total_seconds(); `.seconds` ignores whole days,
#     so a gap of e.g. 1 day 6 hours used to be accepted as "six hours";
#   * a group completed on the very last row is appended instead of dropped.
# NOTE: because of these fixes the number of groups may differ slightly from
# counts recorded in earlier runs.
cg = [] #current group
prev_row = None
for ri in range(nrows):
    row = data.iloc[ri]
    extends_group = (
        bool(cg)  # an empty group (start of data or just after a full group) always restarts
        and row['id'] == prev_row['id']
        and (row['datetime'] - prev_row['datetime']).total_seconds() == OBSERVATION_GAP_SECONDS
    )
    if extends_group:
        cg.append(get_normalized_row(ri))
    else:
        # Different event or wrong spacing: discard the partial group and restart here.
        cg = [get_normalized_row(ri)]
    prev_row = row
    if len(cg) == gs:
        test_and_train.append(cg)
        cg = []

We can see in the next cell that we successfully created 13673 data groups.

In [4]:
# Turn the collected groups into an ndarray, then report how many groups there are.
test_and_train = np.array(test_and_train)
num_groups = len(test_and_train)
num_groups

13673

In [5]:
#Use validation_split in keras fit function for automatic splitting
#train = np.array(test_and_train[:-ttsplit])
#test = np.array(test_and_train[-ttsplit:])

Now that we have our data set (contained in the test_and_train variable), we can import the keras modules to build the network.

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LearningRateScheduler

Using TensorFlow backend.


The following function is a callback that is invoked for each epoch of the fit function. It allows us to start with a higher learning rate that decays over time, reducing overall training time without sacrificing accuracy.

In [66]:
def lr_decay(epoch):
    """Return the learning rate to use for the given epoch index.

    The rate steps down as training progresses: 0.01 for epochs below 2,
    0.001 for epochs below 6, and 0.0001 from then on.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((2, 0.01), (6, 0.001))
    for upper_bound, rate in schedule:
        if epoch < upper_bound:
            return rate
    return 0.0001

In [116]:
def lr_min(epoch):
    """Return a constant learning rate of 0.0001, whatever the epoch index."""
    constant_rate = 0.0001
    return constant_rate

We originally attempted to use the Adam optimizer and tried a variety of different network configurations.

In [105]:
# Network: an LSTM layer with 100 units reading sequences of 4 time steps x 3
# features, followed by two Dense layers (16 units, then 3 outputs).
model = Sequential([
    LSTM(100, input_shape=(4,3), return_sequences=False),
    Dense(16),
    Dense(3),
])

# Alternatives kept for reference:
#optimizer = RMSprop(0.0001)
#model.compile(loss="mse", optimizer='adam', metrics=['acc']) #adam optimizer performed worse than RMSprop
# NOTE(review): 'acc' is reported as a metric although the loss is a regression loss (mae).
model.compile(loss="mae", optimizer='rmsprop', metrics=['acc'])

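The dataset needs to be reshaped before fitting: the first four observations of each group form the input sequence, and the last observation is the target.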

In [10]:
# Inputs: the first four observations of every group, as (num_groups, 4, 3).
X = test_and_train[:,:4].reshape(num_groups,4,3)
# Targets: the last observation of every group, as (num_groups, 3).
y = test_and_train[:,-1].reshape(num_groups,3)

In [120]:
# Commented-out alternative configuration that uses the stepped lr_decay schedule:
#model.fit(X, y, epochs=20, batch_size=50, validation_split=0.01, verbose=2, callbacks=[LearningRateScheduler(lr_decay)])

# Train with the constant learning rate supplied by lr_min; the last 1% of the
# samples is held out for validation by Keras (validation_split).
model.fit(
    X,
    y,
    epochs=200,
    batch_size=10,
    validation_split=0.01,
    verbose=2,
    callbacks=[LearningRateScheduler(lr_min)],
)

Train on 13536 samples, validate on 137 samples
Epoch 1/200
 - 16s - loss: 1.4240 - acc: 0.9792 - val_loss: 1.2411 - val_acc: 0.9781
Epoch 2/200
 - 16s - loss: 1.4228 - acc: 0.9789 - val_loss: 1.2282 - val_acc: 0.9781
Epoch 3/200
 - 16s - loss: 1.4214 - acc: 0.9793 - val_loss: 1.2355 - val_acc: 0.9781
Epoch 4/200
 - 17s - loss: 1.4193 - acc: 0.9793 - val_loss: 1.2579 - val_acc: 0.9781
Epoch 5/200
 - 16s - loss: 1.4172 - acc: 0.9793 - val_loss: 1.2408 - val_acc: 0.9781
Epoch 6/200
 - 17s - loss: 1.4166 - acc: 0.9792 - val_loss: 1.2311 - val_acc: 0.9781
Epoch 7/200
 - 16s - loss: 1.4157 - acc: 0.9787 - val_loss: 1.2424 - val_acc: 0.9781
Epoch 8/200
 - 16s - loss: 1.4134 - acc: 0.9792 - val_loss: 1.2384 - val_acc: 0.9781
Epoch 9/200
 - 16s - loss: 1.4130 - acc: 0.9793 - val_loss: 1.2384 - val_acc: 0.9781
Epoch 10/200
 - 16s - loss: 1.4117 - acc: 0.9798 - val_loss: 1.2427 - val_acc: 0.9781
Epoch 11/200
 - 16s - loss: 1.4109 - acc: 0.9794 - val_loss: 1.2342 - val_acc: 0.9781
Epoch 12/200
 -

Epoch 96/200
 - 16s - loss: 1.3536 - acc: 0.9798 - val_loss: 1.2076 - val_acc: 0.9781
Epoch 97/200
 - 16s - loss: 1.3518 - acc: 0.9798 - val_loss: 1.2165 - val_acc: 0.9708
Epoch 98/200
 - 17s - loss: 1.3525 - acc: 0.9790 - val_loss: 1.2330 - val_acc: 0.9781
Epoch 99/200
 - 17s - loss: 1.3522 - acc: 0.9798 - val_loss: 1.2191 - val_acc: 0.9708
Epoch 100/200
 - 16s - loss: 1.3522 - acc: 0.9795 - val_loss: 1.2125 - val_acc: 0.9854
Epoch 101/200
 - 16s - loss: 1.3521 - acc: 0.9794 - val_loss: 1.2067 - val_acc: 0.9708
Epoch 102/200
 - 17s - loss: 1.3516 - acc: 0.9797 - val_loss: 1.2240 - val_acc: 0.9781
Epoch 103/200
 - 16s - loss: 1.3513 - acc: 0.9795 - val_loss: 1.2207 - val_acc: 0.9708
Epoch 104/200
 - 16s - loss: 1.3507 - acc: 0.9798 - val_loss: 1.2164 - val_acc: 0.9708
Epoch 105/200
 - 16s - loss: 1.3495 - acc: 0.9795 - val_loss: 1.2521 - val_acc: 0.9708
Epoch 106/200
 - 16s - loss: 1.3505 - acc: 0.9792 - val_loss: 1.2313 - val_acc: 0.9708
Epoch 107/200
 - 17s - loss: 1.3498 - acc: 0.97

Epoch 191/200
 - 16s - loss: 1.3178 - acc: 0.9795 - val_loss: 1.1814 - val_acc: 0.9708
Epoch 192/200
 - 16s - loss: 1.3180 - acc: 0.9795 - val_loss: 1.1856 - val_acc: 0.9708
Epoch 193/200
 - 16s - loss: 1.3182 - acc: 0.9801 - val_loss: 1.1680 - val_acc: 0.9708
Epoch 194/200
 - 16s - loss: 1.3177 - acc: 0.9789 - val_loss: 1.1886 - val_acc: 0.9708
Epoch 195/200
 - 16s - loss: 1.3171 - acc: 0.9798 - val_loss: 1.1930 - val_acc: 0.9708
Epoch 196/200
 - 16s - loss: 1.3188 - acc: 0.9797 - val_loss: 1.1566 - val_acc: 0.9708
Epoch 197/200
 - 16s - loss: 1.3170 - acc: 0.9792 - val_loss: 1.1691 - val_acc: 0.9708
Epoch 198/200
 - 16s - loss: 1.3158 - acc: 0.9794 - val_loss: 1.1757 - val_acc: 0.9708
Epoch 199/200
 - 16s - loss: 1.3170 - acc: 0.9798 - val_loss: 1.1763 - val_acc: 0.9781
Epoch 200/200
 - 16s - loss: 1.3162 - acc: 0.9793 - val_loss: 1.1635 - val_acc: 0.9708


<keras.callbacks.History at 0x2238cbc2978>

In [121]:
# Evaluate the model on every group: count predictions that fall within a
# per-feature tolerance of the target and accumulate the absolute errors.
#
# Results are exposed under the same names as before:
#   lat_s / lon_s / ws_s       number of predictions within tolerance
#   lat_f / lon_f / ws_f       number of predictions outside tolerance
#   lat_err / lon_err / ws_err summed absolute error (averaged in a later cell)
#
# NOTE(review): X and y are the same samples that were passed to model.fit, so
# these figures are not a held-out estimate.

# Maximum absolute error for a prediction to count as a success.
LAT_TOLERANCE = 0.1
LON_TOLERANCE = 0.2
WS_TOLERANCE = 1

# One batched predict call replaces one model.predict call per sample, which
# is far slower and yields the same per-sample predictions.
predictions = model.predict(X)           # shape (num_groups, 3)
abs_errors = np.abs(predictions - y)     # columns: latitude, longitude, maximumwind

within_tolerance = abs_errors <= np.array([LAT_TOLERANCE, LON_TOLERANCE, WS_TOLERANCE])

# Plain Python ints so that later arithmetic and printing behave as before.
lat_s, lon_s, ws_s = (int(count) for count in within_tolerance.sum(axis=0))
lat_f, lon_f, ws_f = (num_groups - hits for hits in (lat_s, lon_s, ws_s))

lat_err, lon_err, ws_err = abs_errors.sum(axis=0)

In [122]:
# Success rates, as percentages of all groups.
lat_acc = (lat_s / num_groups) * 100
lon_acc = (lon_s / num_groups) * 100
ws_acc = (ws_s / num_groups) * 100

# Turn the summed absolute errors into per-group averages.
# NOTE: this rescales the accumulators in place, so re-running this cell
# without re-running the evaluation cell divides them a second time.
lat_err = lat_err / num_groups
lon_err = lon_err / num_groups
ws_err = ws_err / num_groups

report_lines = (
    ("Lattitude prediction threshold success rate = ", lat_acc),
    ("Lattitude average error = ", lat_err),
    ("Longitude prediction threshold success rate = ", lon_acc),
    ("Longitude average error = ", lon_err),
    ("Wind Speed prediction threshold success rate = ", ws_acc),
    ("Wind Speed average error = ", ws_err),
)
for label, value in report_lines:
    print(label, value)

Lattitude prediction threshold success rate =  30.395670299129673
Lattitude average error =  0.266207645751
Longitude prediction threshold success rate =  39.34030571198713
Longitude average error =  0.687404977967
Wind Speed prediction threshold success rate =  41.41739194032034
Wind Speed average error =  2.96868602724
