# RNN
## Long Short Term Memory

In [124]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from tensorflow.keras import models, layers, callbacks

In [117]:
DATA_FILE = '../data/export_IA.csv'

df = pd.read_csv(DATA_FILE, parse_dates=[2])
df = df.iloc[: , 1:] # remove the id column
df

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass
0,636017833,2023-05-29 00:00:06,29.07019,-89.29958,13.4,227.6,227.000000,STOLT LOTUS,IMO9617648,D5MX5,83,0,183.000000,28.0,10.000000,83,A
1,367387350,2023-05-29 00:00:01,25.76775,-80.14296,5.6,176.0,145.087443,ISLAND QUEEN,,WDE6981,60,0,20.000000,7.0,1.007000,60,B
2,367477280,2023-05-29 00:00:06,29.25456,-89.97235,0.0,267.4,267.191108,HERO,IMO8964147,WDH2114,60,0,47.000000,9.0,2.300000,60,A
3,367037610,2023-05-29 00:00:06,29.88270,-89.95578,0.4,269.0,222.227120,FRANK H,,WDC5212,60,0,15.000000,5.0,0.391000,60,A
4,368112000,2023-05-29 00:00:10,29.75368,-92.20613,7.1,51.4,96.944237,ADRI LAB,IMO8739023,WDF2836,60,0,49.000000,9.0,1.553000,60,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419080,563326000,2023-05-31 23:56:30,29.19883,-94.40605,4.2,228.8,225.000000,EAGLE KINABALU,IMO9422196,9V8779,80,0,243.000000,42.0,14.600000,80,A
419081,367761740,2023-05-31 23:55:55,29.80404,-93.95511,0.0,360.0,184.815365,GENERAL PATTON,,WDJ2788,33,3,106.670417,0.0,5.239151,33,A
419082,367740670,2023-05-31 23:57:44,27.84062,-97.06931,4.1,168.8,170.653324,CHARLES W. HEALD,,WDI8644,60,0,91.105000,0.0,2.338000,60,A
419083,564329000,2023-05-31 23:58:52,28.60688,-94.12511,13.0,245.6,245.000000,EAGLE KLANG,IMO9417892,9V8640,80,0,243.000000,42.0,14.600000,80,A


In [118]:
df = df.sort_values(by=['MMSI', 'BaseDateTime'])
df.reset_index(drop=True, inplace=True)

df["NextLAT"] = None
df["NextLON"] = None
df["NextTime"] = None

In [119]:
# Super slow and super bad, but idk how to do better
total = len(pd.unique(df["MMSI"]))
for i in range(len(df)-1):

    # If new boat (df is sorted)
    if(df.at[i, "MMSI"] != df.at[i+1, "MMSI"]):
        # print(f"Skipped {total}")
        total -= 1
        continue

    if (df.at[i, "SOG"] < 0.3):
        continue

    df.at[i, "NextLAT"] = df.at[i+1, "LAT"]
    df.at[i, "NextLON"] = df.at[i+1, "LON"]
    df.at[i, "NextTime"] = df.at[i+1, "BaseDateTime"] - df.at[i, "BaseDateTime"]

# Remove columns where we can't find a next location
df = df[df["NextTime"].notna()]

# Fix some stuff
df["NextTime"] = df["NextTime"].apply(pd.Timedelta.total_seconds)
df["VesselType"] = df["VesselType"].astype(str)

df.shape

(149359, 20)

In [120]:
scaler = make_pipeline(
    MinMaxScaler(feature_range=(0, 1)),
)

df[["LAT", "LON", "SOG", "COG", "NextTime", "NextLAT", "NextLON"]] = scaler.fit_transform(df[["LAT", "LON", "SOG", "COG", "NextTime", "NextLAT", "NextLON"]])

In [121]:
x_ = pd.get_dummies(df[["LAT", "LON", "SOG", "COG", "Heading", "VesselType", "NextTime"]])
y_ = df[["NextLAT", "NextLON"]]

x_train, x_test, y_train, y_test = train_test_split(x_, y_)

In [126]:
model = models.Sequential([
    layers.Input((x_.shape[1],1)),
    layers.LSTM(4),
    layers.Dense(1),
])

model.compile(
    loss='mean_squared_error',
    optimizer='adam'
)

early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience = 3, restore_best_weights=True)

model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=1,
    callbacks=[early_stopping],
    verbose=2
)

Epoch 1/100
112019/112019 - 469s - 4ms/step - loss: 0.0316
Epoch 2/100


  current = self.get_monitor_value(logs)


112019/112019 - 469s - 4ms/step - loss: 0.0280
Epoch 3/100
112019/112019 - 469s - 4ms/step - loss: 0.0277
Epoch 4/100
112019/112019 - 464s - 4ms/step - loss: 0.0277
Epoch 5/100
112019/112019 - 464s - 4ms/step - loss: 0.0276
Epoch 6/100
112019/112019 - 3236s - 29ms/step - loss: 0.0276
Epoch 7/100
112019/112019 - 150s - 1ms/step - loss: 0.0276
Epoch 8/100
112019/112019 - 190s - 2ms/step - loss: 0.0276
Epoch 9/100
112019/112019 - 169s - 2ms/step - loss: 0.0276
Epoch 10/100
112019/112019 - 170s - 2ms/step - loss: 0.0276
Epoch 11/100
112019/112019 - 164s - 1ms/step - loss: 0.0276
Epoch 12/100
112019/112019 - 167s - 1ms/step - loss: 0.0276
Epoch 13/100
112019/112019 - 167s - 1ms/step - loss: 0.0276
Epoch 14/100
112019/112019 - 166s - 1ms/step - loss: 0.0276
Epoch 15/100
112019/112019 - 167s - 1ms/step - loss: 0.0276
Epoch 16/100
112019/112019 - 167s - 1ms/step - loss: 0.0276
Epoch 17/100
112019/112019 - 166s - 1ms/step - loss: 0.0276
Epoch 18/100
112019/112019 - 274s - 2ms/step - loss: 0.027

<keras.src.callbacks.history.History at 0x7ffebab7dd60>

In [129]:
model.save('v1.keras')

In [128]:
# make predictions
pred_train = model.predict(x_train)
pred_test = model.predict(x_test)

# invert predictions
# pred_train = scaler.inverse_transform(pred_train)
# y_train = scaler.inverse_transform([y_train])
# pred_test = scaler.inverse_transform(pred_test)
# y_test = scaler.inverse_transform([y_train])

[1m3501/3501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 782us/step
[1m1167/1167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 796us/step


In [59]:
# https://www.geeksforgeeks.org/dsa/haversine-formula-to-find-distance-between-two-points-on-a-sphere/
def haversine(lat1, lon1, lat2, lon2):
    
    # distance between latitudes
    # and longitudes
    dLat = (lat2 - lat1) * math.pi / 180.0
    dLon = (lon2 - lon1) * math.pi / 180.0

    # convert to radians
    lat1 = (lat1) * math.pi / 180.0
    lat2 = (lat2) * math.pi / 180.0

    # apply formulae
    a = (pow(math.sin(dLat / 2), 2) + 
         pow(math.sin(dLon / 2), 2) * 
             math.cos(lat1) * math.cos(lat2));
    rad = 6371
    c = 2 * math.asin(math.sqrt(a))
    return rad * c * 1000

In [63]:
# calculate root mean squared error
print(pred_train)
score = haversine(y_test["NextLAT"], y_test["NextLON"], pred_train[:, 0], pred_train[:, 1])
print(f'{score}')
# testScore = np.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
# print('Test Score: %.2f RMSE' % (testScore))

[[0.28735685]
 [0.37679905]
 [0.6522338 ]
 ...
 [0.3735891 ]
 [0.4711529 ]
 [0.40209985]]


IndexError: index 1 is out of bounds for axis 1 with size 1