# KERAS LSTM HURRICANE PREDICTION DATA PREPARATION #



In [1]:
import pandas as pd
import numpy as np
from pandas import read_csv
from datetime import datetime

After importing the needed modules, the CSV file can be parsed using pandas. The `date_parser` option is used so that the `datetime` column is parsed into real timestamps, which later lets us ensure that training takes place on consecutive observations spaced six hours apart.

In [2]:
def parse_date(raw):
    """Turn a compact ``YYYYMMDDHH`` string into a ``datetime``.

    Used as the ``date_parser`` callback when the CSV is loaded.
    """
    timestamp_format = "%Y%m%d%H"  # year, month, day, hour - no separators
    parsed = datetime.strptime(raw, timestamp_format)
    return parsed
# Load the cleaned observations; the 'datetime' column is converted with
# parse_date so that time differences between rows can be computed later.
data = read_csv(
    'data/cleaned.csv',
    date_parser=parse_date,
    parse_dates=['datetime'],
)

The following code iterates row by row over the CSV file and collects consecutive observations, separated by exactly six hours, from a particular weather event. This is possible because the CSV is sorted in chronological order and grouped according to the weather event (*the 'id' field*). The intent is also to normalize the location features to a value 0-1, which has been observed to improve the performance of the network; note, however, that `get_normalized_row` as written below returns the raw latitude, longitude and wind-speed values without applying any scaling.

In [3]:
gs = 5  # group size target: consecutive observations per sample
SIX_HOURS_SECONDS = 21600  # required spacing between consecutive observations
nrows = len(data)
test_and_train = []


def get_normalized_row(index):
    """Return ``[latitude, longitude, maximumwind]`` for the row at ``index``.

    NOTE: despite the name, no scaling is applied here - the raw column
    values are returned unchanged.
    """
    row = data.iloc[index]
    return [row['latitude'], row['longitude'], row['maximumwind']]


def _continues_group(prev_row, row):
    """True when ``row`` is the same event as ``prev_row`` and exactly six hours later."""
    # total_seconds() is used instead of .seconds: .seconds drops the days
    # component, so a gap of e.g. 30 hours (or a negative gap) would
    # otherwise be mistaken for a six hour step.
    gap = (row['datetime'] - prev_row['datetime']).total_seconds()
    return row['id'] == prev_row['id'] and gap == SIX_HOURS_SECONDS


cg = []  # current group, seeded by the first row the loop visits (row 0)
prev_row = None
for ri in range(nrows):
    row = data.iloc[ri]
    if cg and _continues_group(prev_row, row):
        cg.append(get_normalized_row(ri))
    else:
        # Either the previous group was just completed, or the chain of
        # six-hourly observations was broken: start over from this row.
        cg = [get_normalized_row(ri)]
    if len(cg) == gs:
        # Store the group as soon as it is complete so that a group ending
        # on the very last row of the file is not lost.
        test_and_train.append(cg)
        cg = []
    prev_row = row

We can see in the next cell that we successfully created 13673 data groups.

In [4]:
# Convert the list of groups to an ndarray and record how many groups it holds
# (the size of the leading axis).
test_and_train = np.array(test_and_train)
num_groups = test_and_train.shape[0]
num_groups

13673

In [5]:
#Use validation_split in keras fit function for automatic splitting
#train = np.array(test_and_train[:-ttsplit])
#test = np.array(test_and_train[-ttsplit:])

Now that we have our data set (contained in the test_and_train variable), we can import the keras modules to build the network.

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LearningRateScheduler

Using TensorFlow backend.


The following function is a callback called at each epoch of the fit function. This allows us to start with a higher learning rate that decays over time, to reduce overall training time without sacrificing accuracy.

In [66]:
def lr_decay(epoch):
    """Stepped learning-rate schedule keyed on the epoch number.

    Returns 0.01 for epochs below 2, 0.001 for epochs below 6, and 0.0001
    from then on.
    """
    # (exclusive epoch cutoff, learning rate) pairs, checked in order.
    steps = ((2, 0.01), (6, 0.001))
    for cutoff, rate in steps:
        if epoch < cutoff:
            return rate
    return 0.0001

In [116]:
def lr_min(epoch):
    """Flat learning-rate schedule: 0.0001 regardless of ``epoch``."""
    floor_rate = 0.0001
    return floor_rate

We originally attempted to use the Adam optimizer and tried a variety of different network configurations.

In [105]:
# Network: one LSTM layer with 100 units reading sequences of 4 timesteps x 3
# features (only the final LSTM output is passed on), followed by two Dense
# layers; the last one emits the 3 predicted values.
model = Sequential([
    LSTM(100, input_shape=(4, 3), return_sequences=False),
    Dense(16),
    Dense(3),
])

# Mean absolute error loss with the stock 'rmsprop' optimizer.
# Alternatives that were tried: an explicit RMSprop(0.0001), and
# loss="mse" with optimizer='adam' (adam performed worse than RMSprop).
model.compile(optimizer='rmsprop', loss="mae", metrics=['acc'])

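Before fitting, each group is split into its first four observations (the input sequence `X`) and its final observation (the target `y`), and both are reshaped into the layout the network expects.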

In [10]:
# Inputs: the first four observations of every group, as (groups, 4, 3).
X = test_and_train[:, :4].reshape(num_groups, 4, 3)
# Targets: the last observation of every group, as (groups, 3).
y = test_and_train[:, -1].reshape(num_groups, 3)

In [117]:
# Earlier run, kept for reference (stepped schedule, larger batches):
#model.fit(X, y, epochs=20, batch_size=50, validation_split=0.01, verbose=2, callbacks=[LearningRateScheduler(lr_decay)])

# Current run: the scheduler is fed lr_min, which returns the same rate every epoch.
fit_callbacks = [LearningRateScheduler(lr_min)]
model.fit(
    X,
    y,
    epochs=10,
    batch_size=10,
    validation_split=0.01,
    verbose=2,
    callbacks=fit_callbacks,
)

Train on 13536 samples, validate on 137 samples
Epoch 1/10
 - 17s - loss: 1.4773 - acc: 0.9787 - val_loss: 1.2974 - val_acc: 0.9781
Epoch 2/10
 - 16s - loss: 1.4508 - acc: 0.9787 - val_loss: 1.2703 - val_acc: 0.9781
Epoch 3/10
 - 16s - loss: 1.4428 - acc: 0.9789 - val_loss: 1.2915 - val_acc: 0.9781
Epoch 4/10
 - 17s - loss: 1.4408 - acc: 0.9792 - val_loss: 1.2688 - val_acc: 0.9781
Epoch 5/10
 - 17s - loss: 1.4382 - acc: 0.9792 - val_loss: 1.2492 - val_acc: 0.9781
Epoch 6/10
 - 17s - loss: 1.4342 - acc: 0.9793 - val_loss: 1.2590 - val_acc: 0.9854
Epoch 7/10
 - 17s - loss: 1.4321 - acc: 0.9795 - val_loss: 1.2553 - val_acc: 0.9781
Epoch 8/10
 - 17s - loss: 1.4320 - acc: 0.9789 - val_loss: 1.2317 - val_acc: 0.9781
Epoch 9/10
 - 19s - loss: 1.4287 - acc: 0.9791 - val_loss: 1.2380 - val_acc: 0.9781
Epoch 10/10
 - 19s - loss: 1.4256 - acc: 0.9788 - val_loss: 1.2427 - val_acc: 0.9854


<keras.callbacks.History at 0x2238cb96470>

In [118]:
# Absolute-error tolerances within which a prediction counts as a success.
LAT_TOLERANCE = 0.1
LON_TOLERANCE = 0.2
WS_TOLERANCE = 1

# Predict every group in a single batched call instead of one predict() call
# per sample, then compare against the targets column by column.
# Columns are ordered latitude, longitude, wind speed (see how X and y are built).
predictions = model.predict(X)
abs_errors = np.abs(predictions - y)  # one row per group, one column per target

lat_abs_err = abs_errors[:, 0]
lon_abs_err = abs_errors[:, 1]
ws_abs_err = abs_errors[:, 2]

# *_s: number of groups within tolerance; *_f: number of groups outside it.
lat_s = int(np.count_nonzero(lat_abs_err <= LAT_TOLERANCE))
lon_s = int(np.count_nonzero(lon_abs_err <= LON_TOLERANCE))
ws_s = int(np.count_nonzero(ws_abs_err <= WS_TOLERANCE))
lat_f = num_groups - lat_s
lon_f = num_groups - lon_s
ws_f = num_groups - ws_s

# *_err: summed absolute error over all groups (averaged in the next cell).
lat_err = lat_abs_err.sum()
lon_err = lon_abs_err.sum()
ws_err = ws_abs_err.sum()

In [119]:
# Convert success tallies to percentages and summed errors to per-group means.
# NOTE: the *_err totals are overwritten by their averages, so running this
# cell a second time without re-running the evaluation divides them again.
lat_acc, lon_acc, ws_acc = [
    (hits / num_groups) * 100 for hits in (lat_s, lon_s, ws_s)
]
lat_err, lon_err, ws_err = [
    total / num_groups for total in (lat_err, lon_err, ws_err)
]

report_lines = [
    ("Lattitude prediction threshold success rate = ", lat_acc),
    ("Lattitude average error = ", lat_err),
    ("Longitude prediction threshold success rate = ", lon_acc),
    ("Longitude average error = ", lon_err),
    ("Wind Speed prediction threshold success rate = ", ws_acc),
    ("Wind Speed average error = ", ws_err),
]
for label, value in report_lines:
    print(label, value)

Lattitude prediction threshold success rate =  22.7382432531266
Lattitude average error =  0.355981987295
Longitude prediction threshold success rate =  28.296643019088712
Longitude average error =  0.84332578814
Wind Speed prediction threshold success rate =  38.78446573539092
Wind Speed average error =  3.05020879996
