# KERAS LSTM HURRICANE PREDICTION DATA PREPARATION #



In [94]:
import pandas as pd
import numpy as np
from pandas import read_csv
from datetime import datetime

After importing the needed modules, the CSV file can be parsed using pandas. The `date_parser` option is used in order to later ensure that training takes place on consecutive, observations spaced six hours apart.

In [11]:
def parse_date(raw):
    return datetime.strptime(raw, "%Y%m%d%H")
data = read_csv('cleaned.csv', parse_dates = ['datetime'], date_parser=parse_date)

The following code iterates row by row over the csv file and collects consecutive observations separated by exactly six hours from a particular weather event. This is possible because the csv is sorted in chronological order grouped according to the weather event (*the 'id' field*). It also normalizes the location features to a value 0-1. This has been observed to improve the performance of the network.

In [53]:
gc = 1 #group size counter
gs = 5 #group size target
nrows = len(data)
ri = 1 #row index
test_and_train = []


prev_row = data.iloc[0]

def get_normalized_row(index):
    return [(data.iloc[index]['latitude']/90),(data.iloc[index]['longitude']/180),(data.iloc[index]['maximumwind'])]

cg = [get_normalized_row(ri)] #current group
while ri < nrows:
    if gc < gs:
        if data.iloc[ri]['id'] == prev_row['id'] and (data.iloc[ri]['datetime'] - prev_row['datetime']).seconds == 21600:
            cg.append(get_normalized_row(ri))
            gc += 1
        else:
            cg = [get_normalized_row(ri)]
            gc = 1
        prev_row = data.iloc[ri]
        ri += 1
    else:
        test_and_train.append(cg)
        cg = [get_normalized_row(ri)]
        gc = 1
        prev_row = data.iloc[ri]
        ri += 1

We can see in the next cell that we successfully created 13673 data groups.

In [129]:
num_groups = len(test_and_train)
ttsplit = int(num_groups/5)
train_dim = num_groups - ttsplit
num_groups

13673

In [125]:
train = np.array(test_and_train[:-ttsplit])
test = np.array(test_and_train[-ttsplit:])

In [78]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop

In [143]:
model = Sequential()
model.add(LSTM(32, input_shape=(4,3)))
model.add(Dense(32))
model.add(Dense(32))
model.add(Dense(12))
model.add(Dense(3))
model.add(Activation('sigmoid'))

optimizer = RMSprop(lr=0.1)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [144]:
X, y = train[:,:4].reshape(train_dim,4,3), train[:,-1].reshape(train_dim,3)
X_test, y_test = test[:,:4].reshape(ttsplit,4,3), test[:,-1].reshape(ttsplit,3)

In [145]:
model.fit(X, y, epochs=4, validation_data=(X_test, y_test), verbose=2)

Train on 10939 samples, validate on 2734 samples
Epoch 1/4
 - 6s - loss: 41.0938 - val_loss: 38.9764
Epoch 2/4
 - 5s - loss: 40.6925 - val_loss: 38.9764
Epoch 3/4
 - 5s - loss: 40.6925 - val_loss: 38.9764
Epoch 4/4
 - 5s - loss: 40.6925 - val_loss: 38.9764


<keras.callbacks.History at 0x25839ec0be0>