In [None]:
!pip install basemap
!pip install basemap-data-hires

!pip install ipympl
!pip install nodejs-bin
!jupyter labextension install @jupyter-widgets/jupyterlab-manager
!jupyter labextension install jupyter-matplotlib
!pip install graphviz
!pip install shapely

In [62]:
import pandas as pd
import numpy as np
import importlib
from datetime import datetime
from datetime import timedelta
import time
import copy

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

import math
import IPython
import os

In [63]:
base_lat = [32, 36]
base_lon = [125, 129]

# 탱커와 벌커의 평균 속도: 15~16노트(28~30km/h)
avg_speed = 25
pivot_time = 5
gridmap_pivot_distance_km = 5

In [64]:
TRAINING_DIR = 'training_data/'
TRAINING_CSV_LIST = ['Dynamic_20200204_13_0_0.csv',
                     'Dynamic_20200204_14_0_0.csv',
                     'Dynamic_20200204_15_0_0.csv',
                     'Dynamic_20200204_16_0_0.csv']
data_set = pd.DataFrame()

In [65]:
for csv in TRAINING_CSV_LIST:
    file = TRAINING_DIR + csv
    print('Read file:%s'%file)
    if os.path.exists(file):
        tdf = pd.read_csv(file)
        data_set = pd.concat([data_set, tdf])
    else:
        print('%s is not exist.'%file)

Read file:training_data/Dynamic_20200204_13_0_0.csv
Read file:training_data/Dynamic_20200204_14_0_0.csv
Read file:training_data/Dynamic_20200204_15_0_0.csv
Read file:training_data/Dynamic_20200204_16_0_0.csv


In [66]:
data_set = data_set.sample(frac=1).reset_index(drop=True)

In [67]:
data_set

Unnamed: 0,Measurement_time,Predict_time_time,N_of_ships,Grid1_N_of_ships,Grid1_pos_weight,Grid1_avg_SOG,Grid2_N_of_ships,Grid2_pos_weight,Grid2_avg_SOG,Grid3_N_of_ships,...,Grid86_avg_SOG,Grid87_N_of_ships,Grid87_pos_weight,Grid87_avg_SOG,Grid88_N_of_ships,Grid88_pos_weight,Grid88_avg_SOG,Grid89_N_of_ships,Grid89_pos_weight,Grid89_avg_SOG
0,2020-02-04 14:00:00,2020-02-04 16:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,1.0,0.0,0.0,1.0,0.0,0.4,0.0,1.0
1,2020-02-04 16:00:00,2020-02-04 20:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-02-04 14:00:00,2020-02-04 16:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
3,2020-02-04 13:00:00,2020-02-04 14:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.00,0.0,0.0,1.7,10.0,2.0,0.0,0.0,0.0
4,2020-02-04 16:00:00,2020-02-04 20:00:00,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.16,5.0,5.0,0.0,24.0,0.0,6.8,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28699,2020-02-04 14:00:00,2020-02-04 16:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,109.0,0.0,0.0,53.0,0.0,0.0,0.0,0.0
28700,2020-02-04 16:00:00,2020-02-04 20:00:00,0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,8.10,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
28701,2020-02-04 13:00:00,2020-02-04 14:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28702,2020-02-04 13:00:00,2020-02-04 14:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 우선 선박의 수로 예측을 진행하기로 함.

In [68]:
nmin = min(data_set['N_of_ships'].values)
nmax = max(data_set['N_of_ships'].values)

print('min of N : %d'%nmin)
print('max of N : %d'%nmax)

min of N : 0
max of N : 73


In [69]:
# space_itv = [0, 1, 2, 3, 5, 8, 13, 21, 34]
space_itv = [0, 5, 10, 15, 20, 30, 40, 50]
space = []

for i in range(0, len(space_itv)-1):
    tspace = [x for x in data_set['N_of_ships'].values if x >= space_itv[i] and x < space_itv[i+1]]
    space.append(tspace)
space.append([x for x in data_set['N_of_ships'].values if x >= space_itv[len(space_itv)-1]])

In [70]:
for s in space:
    print(len(s))

28167
322
78
51
44
20
10
12


In [71]:
### training에 사용되는 column들 ###
label = data_set['N_of_ships'].values
cols = list(data_set)[3:]
column_set = data_set[cols].astype(float)

In [72]:
label

array([0, 0, 0, ..., 0, 0, 0])

In [74]:
column_set

Unnamed: 0,Grid1_N_of_ships,Grid1_pos_weight,Grid1_avg_SOG,Grid2_N_of_ships,Grid2_pos_weight,Grid2_avg_SOG,Grid3_N_of_ships,Grid3_pos_weight,Grid3_avg_SOG,Grid4_N_of_ships,...,Grid86_avg_SOG,Grid87_N_of_ships,Grid87_pos_weight,Grid87_avg_SOG,Grid88_N_of_ships,Grid88_pos_weight,Grid88_avg_SOG,Grid89_N_of_ships,Grid89_pos_weight,Grid89_avg_SOG
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,1.0,0.0,0.0,1.0,0.0,0.4,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.00,0.0,0.0,1.7,10.0,2.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.16,5.0,5.0,0.0,24.0,0.0,6.8,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,109.0,0.0,0.0,53.0,0.0,0.0,0.0,0.0
28700,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.10,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
28701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
### 변수들의 scale이 너무 다른 경우를 대비하여, scaling을 진행해준다 ###
# normalize the dataset
scaler = StandardScaler()
scaler = scaler.fit(column_set)
data_set_scaled = scaler.transform(column_set)
data_set_scaled

array([[-0.05902755, -0.12356861, -0.0481361 , ..., -0.19663786,
         0.        ,  0.19885675],
       [-0.05902755, -0.12356861, -0.0481361 , ..., -0.32574262,
         0.        , -0.21875989],
       [-0.05902755, -0.12356861, -0.0481361 , ..., -0.32574262,
         0.        , -0.21875989],
       ...,
       [-0.05902755, -0.12356861, -0.0481361 , ..., -0.32574262,
         0.        , -0.21875989],
       [-0.05902755, -0.12356861, -0.0481361 , ..., -0.32574262,
         0.        , -0.21875989],
       [-0.05902755, -0.12356861, -0.0481361 , ..., -0.32574262,
         0.        , -0.21875989]])

In [76]:
# split to train data and test data
n_train = int(0.9*data_set_scaled.shape[0]) # 트레이닝 데이터와 테스트 데이터의 수를 9:1 비율로 분할. 
train_data_scaled = data_set_scaled[0: n_train] 
train_dates = column_set[0: n_train]

test_data_scaled = data_set_scaled[n_train:]
test_dates = column_set[n_train:]
# print(test_dates.head(5))

In [77]:
print('Shape of train set:' + str(train_dates.shape))
print('Shape of test set:' + str(test_dates.shape))

Shape of train set:(25833, 267)
Shape of test set:(2871, 267)


In [83]:
# 하이퍼파라미터들
# data reformatting for LSTM
input_dim = 3  # input_dimension: training column 길이
# sequence length: 예측에 사용되는 data의 row길이. 이 경우에는 detected grid의 숫자로 train_dates.shape[1] / input_dim과 동일.
seq_len = int(train_dates.shape[1] / input_dim)

trainX = []
trainY = []
testX = []
testY = []

In [None]:


for i in range(seq_len, n_train-pred_days +1):
    # 2차원 튜플 형태의 데이터 셋을 training 데이터로 저장. (seq_len길이 만큼의 데이터를 저장. 뒤의 길이는 input_dim과 동일함)
    trainX.append(train_data_scaled[i - seq_len:i, 0:train_data_scaled.shape[1]])
    # 예측 대상. 
    trainY.append(train_data_scaled[i + pred_days - 1:i + pred_days, 0])

for i in range(seq_len, len(test_data_scaled)-pred_days +1):
    testX.append(test_data_scaled[i - seq_len:i, 0:test_data_scaled.shape[1]])
    testY.append(test_data_scaled[i + pred_days - 1:i + pred_days, 0])

trainX, trainY = np.array(trainX), np.array(trainY)
testX, testY = np.array(testX), np.array(testY)