# KERAS LSTM HURRICANE PREDICTION DATA PREPARATION #



In [None]:
import pandas as pd
import numpy as np
from pandas import read_csv
from datetime import datetime

After importing the needed modules, the CSV file can be parsed using pandas. The `date_parser` option is used in order to later ensure that training takes place on consecutive, observations spaced six hours apart.

In [None]:
def parse_date(raw):
    return datetime.strptime(raw, "%Y%m%d%H")
data = read_csv('data/cleaned.csv', parse_dates = ['datetime'], date_parser=parse_date)

The following code iterates row by row over the csv file and collects consecutive observations separated by exactly six hours from a particular weather event. This is possible because the csv is sorted in chronological order grouped according to the weather event (*the 'id' field*). It also normalizes the location features to a value 0-1. This has been observed to improve the performance of the network.

In [None]:
gc = 1 #group size counter
gs = 5 #group size target
nrows = len(data)
ri = 1 #row index
test_and_train = []


prev_row = data.iloc[0]

def get_normalized_row(index):
    return [(data.iloc[index]['latitude']),(data.iloc[index]['longitude']),(data.iloc[index]['maximumwind'])]

cg = [get_normalized_row(ri)] #current group
while ri < nrows:
    if gc < gs:
        if data.iloc[ri]['id'] == prev_row['id'] and (data.iloc[ri]['datetime'] - prev_row['datetime']).seconds == 21600:
            cg.append(get_normalized_row(ri))
            gc += 1
        else:
            cg = [get_normalized_row(ri)]
            gc = 1
        prev_row = data.iloc[ri]
        ri += 1
    else:
        test_and_train.append(cg)
        cg = [get_normalized_row(ri)]
        gc = 1
        prev_row = data.iloc[ri]
        ri += 1

We can see in the next cell that we successfully created 13673 data groups.

In [None]:
num_groups = len(test_and_train)
test_and_train = np.array(test_and_train)
num_groups

In [None]:
train = np.array(test_and_train[:-ttsplit])
test = np.array(test_and_train[-ttsplit:])

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LearningRateScheduler

In [None]:
def lr_decay(epoch):
    if epoch < 3: return 0.1
    if epoch < 10: return 0.01
    return 0.001

In [None]:
model = Sequential()
model.add(LSTM(48, input_shape=(4,3)))
model.add(Dense(32))
model.add(Dense(16))
model.add(Dense(3))

optimizer = RMSprop(lr=0.0001)
model.compile(loss="mae", optimizer=optimizer, metrics=['acc'])
#model.compile(loss="mae", optimizer='adam', metrics=['acc'])

In [None]:
X, y = test_and_train[:,:4].reshape(num_groups,4,3), test_and_train[:,-1].reshape(num_groups,3)

In [None]:
#model.fit(X, y, epochs=75, batch_size=50, validation_split=0.33, verbose=2, callbacks=[LearningRateScheduler(lr_decay)])
model.fit(X, y, epochs=50, batch_size=5, validation_split=0.2, verbose=2)

In [113]:
model.predict(X[796].reshape(1,4,3))

array([[  12.58266926, -101.92967224,   25.24845314]], dtype=float32)

In [112]:
X[796]

array([[  11.9,  -98.2,   25. ],
       [  12. ,  -99.2,   25. ],
       [  12.1, -100.1,   25. ],
       [  12.3, -101. ,   25. ]])

In [111]:
y[796]

array([  12.5, -101.8,   25. ])