#  $\color{red}{\mathbf{\text{TRAINING DATA PORCESSING}}}$

In [1]:
import pandas as pd 
import xgboost as xgb
import numpy as np
import geopandas as gpd
from calendar import monthrange
pd.set_option('display.max_rows', None)

In [2]:
df = pd.read_csv('ais_train.csv', sep='|')
df.head(10)

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3
5,2024-01-01 00:05:13,186.9,0.0,0,187,5,12-20 02:40,24.27431,-110.32727,61e9f468b937134a3c4c028f,61d37ac11366c3998241da0a
6,2024-01-01 00:05:40,123.4,0.0,128,511,5,12-16 01:00,40.71466,29.46603,61e9f46bb937134a3c4c02b3,61d38259b7b7526e1adf3a41
7,2024-01-01 00:05:49,151.2,0.0,0,20,5,12-31 18:30,-19.25026,146.83507,61e9f3bfb937134a3c4bfe9f,61d36f6e0a1807568ff9a115
8,2024-01-01 00:06:18,265.0,0.1,0,122,1,12-30 19:00,-26.73068,153.29194,61e9f45bb937134a3c4c0221,61d36f640a1807568ff9a103
9,2024-01-01 00:06:29,36.0,0.0,0,70,5,12-30 19:55,35.46922,139.68343,61e9f3e6b937134a3c4bff6d,61d379f61366c3998241d8d2


## Clean AIS data

In [3]:
# Replacing default with Nan bacause too close to valid values, eliminate non valid values
df['cog'] = df['cog'].replace(360, np.nan)
df = df[df['cog'] <= 360]

# Replacing default with Nan bacause too close to valid values
df['sog'] = df['sog'].replace(1023, np.nan)

# Replacing default with Nan bacause too close to valid values
# Changing uncertain values to bigger number to be further away from sample pool
# Adding uncertainty flag
df['rot'] = df['rot'].replace(128, np.nan)
df['rot'] = df['rot'].replace({127: 200, -127: -200})
df['uncertain_rot'] = np.where(df['rot'].isin([200, -200]), 1, 0)

# Replacing default value with NaN to not get taken in consideration by regression
df['heading'] = df['heading'].replace(511, np.nan)

In [4]:
# One hot encoding for the navigation status
df = pd.get_dummies(df, columns=['navstat'], drop_first = True)

### Date-Time handling

In [5]:
df['time'] = pd.to_datetime(df['time'], errors='coerce')
df['etaRaw'] = df['etaRaw'].fillna(0)
df['etaRaw'] = df['etaRaw'].apply(lambda x: f"{2024}-{x}")
df['etaRaw'] = pd.to_datetime(df['etaRaw'], errors='coerce')
df.rename(columns={'etaRaw': 'etaStd'}, inplace=True)

# Handle first month of the years ETA year to be 2023
df['etaStd'] = df.apply(lambda row: row['etaStd'].replace(year=row['etaStd'].year - 1)
                        if row['etaStd'].month in [11, 12] and row['time'].month in [1, 2] 
                        else row['etaStd'], axis=1) 
df['etaStd'].head(10)

0   2024-01-09 23:00:00
1   2023-12-29 20:00:00
2   2024-01-02 09:00:00
3   2023-12-31 20:00:00
4   2024-01-25 12:00:00
5   2023-12-20 02:40:00
6   2023-12-16 01:00:00
7   2023-12-31 18:30:00
8   2023-12-30 19:00:00
9   2023-12-30 19:55:00
Name: etaStd, dtype: datetime64[ns]

In [6]:
# Separate Date-Time in single attributes
df['year_rec'] = df['time'].dt.year
df['month_rec'] = df['time'].dt.month
df['day_rec'] = df['time'].dt.day
df['hour_rec'] = df['time'].dt.hour
df['minute_rec'] = df['time'].dt.minute



df['year_eta'] = df['etaStd'].dt.year.fillna(0).astype('int32')
df['month_eta'] = df['etaStd'].dt.month.fillna(0).astype('int32')
df['day_eta'] = df['etaStd'].dt.day.fillna(0).astype('int32')
df['hour_eta'] = df['etaStd'].dt.hour.fillna(0).astype('int32')
df['minute_eta'] = df['etaStd'].dt.minute.fillna(0).astype('int32')



In [7]:
df[df['month_eta']==0].shape[0]

3759

In [8]:
df.dtypes

time             datetime64[ns]
cog                     float64
sog                     float64
rot                     float64
heading                 float64
etaStd           datetime64[ns]
latitude                float64
longitude               float64
vesselId                 object
portId                   object
uncertain_rot             int64
navstat_1                  bool
navstat_2                  bool
navstat_3                  bool
navstat_4                  bool
navstat_5                  bool
navstat_6                  bool
navstat_7                  bool
navstat_8                  bool
navstat_9                  bool
navstat_11                 bool
navstat_12                 bool
navstat_13                 bool
navstat_14                 bool
navstat_15                 bool
year_rec                  int32
month_rec                 int32
day_rec                   int32
hour_rec                  int32
minute_rec                int32
year_eta                  int32
month_et

In [9]:
def get_month_progress(row, time_type = '_rec'):
    if row[f'month{time_type}'] == 0:
        return np.nan
    days_in_month = monthrange(row[f'year{time_type}'], row[f'month{time_type}'])[1]
    day_progress = (row[f'day{time_type}'] - 1 + row[f'hour{time_type}']/24 + row[f'minute{time_type}']/(24*60)) / days_in_month
    return day_progress

In [10]:
# Recordings time
df['month_sin_rec'] = np.sin(2 * np.pi * df['month_rec'] / 12)
df['month_cos_rec'] = np.cos(2 * np.pi * df['month_rec'] / 12)

df['day_progress_rec'] = df.apply(lambda row: get_month_progress(row, '_rec'), axis=1)
df['day_sin_rec'] = np.sin(2 * np.pi * df['day_progress_rec'])
df['day_cos_rec'] = np.cos(2 * np.pi * df['day_progress_rec'])

hour_progress = df['hour_rec'] + df['minute_rec']/60
df['hour_sin_rec'] = np.sin(2 * np.pi * hour_progress / 24)
df['hour_cos_rec'] = np.cos(2 * np.pi * hour_progress / 24)

df['minute_sin_rec'] = np.sin(2 * np.pi * df['minute_rec'] / 60)
df['minute_cos_rec'] = np.cos(2 * np.pi * df['minute_rec'] / 60)



# ETA
df['month_sin_eta'] = np.sin(2 * np.pi * df['month_eta'] / 12)
df['month_cos_eta'] = np.cos(2 * np.pi * df['month_eta'] / 12)

df['day_progress_eta'] = df.apply(lambda row: get_month_progress(row, '_eta'), axis=1)
df['day_sin_eta'] = np.sin(2 * np.pi * df['day_progress_eta'])
df['day_cos_eta'] = np.cos(2 * np.pi * df['day_progress_eta'])

hour_progress = df['hour_eta'] + df['minute_eta']/60
df['hour_sin_eta'] = np.sin(2 * np.pi * hour_progress / 24)
df['hour_cos_eta'] = np.cos(2 * np.pi * hour_progress / 24)


In [11]:
df.dtypes

time                datetime64[ns]
cog                        float64
sog                        float64
rot                        float64
heading                    float64
etaStd              datetime64[ns]
latitude                   float64
longitude                  float64
vesselId                    object
portId                      object
uncertain_rot                int64
navstat_1                     bool
navstat_2                     bool
navstat_3                     bool
navstat_4                     bool
navstat_5                     bool
navstat_6                     bool
navstat_7                     bool
navstat_8                     bool
navstat_9                     bool
navstat_11                    bool
navstat_12                    bool
navstat_13                    bool
navstat_14                    bool
navstat_15                    bool
year_rec                     int32
month_rec                    int32
day_rec                      int32
hour_rec            

In [12]:
df.head(10)

Unnamed: 0,time,cog,sog,rot,heading,etaStd,latitude,longitude,vesselId,portId,...,hour_cos_rec,minute_sin_rec,minute_cos_rec,month_sin_eta,month_cos_eta,day_progress_eta,day_sin_eta,day_cos_eta,hour_sin_eta,hour_cos_eta
0,2024-01-01 00:00:25,284.0,0.7,0.0,88.0,2024-01-09 23:00:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,...,1.0,0.0,1.0,0.5,0.866025,0.288978,0.970159,-0.242468,-0.258819,0.965926
1,2024-01-01 00:00:36,109.6,0.0,-6.0,347.0,2023-12-29 20:00:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,...,1.0,0.0,1.0,-2.449294e-16,1.0,0.930108,-0.425168,0.905115,-0.8660254,0.5
2,2024-01-01 00:01:45,111.0,11.0,0.0,112.0,2024-01-02 09:00:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,...,0.99999,0.104528,0.994522,0.5,0.866025,0.044355,0.275096,0.961417,0.7071068,-0.707107
3,2024-01-01 00:03:11,96.4,0.0,0.0,142.0,2023-12-31 20:00:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,...,0.999914,0.309017,0.951057,-2.449294e-16,1.0,0.994624,-0.033774,0.999429,-0.8660254,0.5
4,2024-01-01 00:03:51,214.0,19.7,0.0,215.0,2024-01-25 12:00:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,...,0.999914,0.309017,0.951057,0.5,0.866025,0.790323,-0.968077,0.250653,1.224647e-16,-1.0
5,2024-01-01 00:05:13,186.9,0.0,0.0,187.0,2023-12-20 02:40:00,24.27431,-110.32727,61e9f468b937134a3c4c028f,61d37ac11366c3998241da0a,...,0.999762,0.5,0.866025,-2.449294e-16,1.0,0.616487,-0.668293,-0.743898,0.6427876,0.766044
6,2024-01-01 00:05:40,123.4,0.0,,,2023-12-16 01:00:00,40.71466,29.46603,61e9f46bb937134a3c4c02b3,61d38259b7b7526e1adf3a41,...,0.999762,0.5,0.866025,-2.449294e-16,1.0,0.485215,0.092763,-0.995688,0.258819,0.965926
7,2024-01-01 00:05:49,151.2,0.0,0.0,20.0,2023-12-31 18:30:00,-19.25026,146.83507,61e9f3bfb937134a3c4bfe9f,61d36f6e0a1807568ff9a115,...,0.999762,0.5,0.866025,-2.449294e-16,1.0,0.992608,-0.046432,0.998921,-0.9914449,0.130526
8,2024-01-01 00:06:18,265.0,0.1,0.0,122.0,2023-12-30 19:00:00,-26.73068,153.29194,61e9f45bb937134a3c4c0221,61d36f640a1807568ff9a103,...,0.999657,0.587785,0.809017,-2.449294e-16,1.0,0.961022,-0.242468,0.970159,-0.9659258,0.258819
9,2024-01-01 00:06:29,36.0,0.0,0.0,70.0,2023-12-30 19:55:00,35.46922,139.68343,61e9f3e6b937134a3c4bff6d,61d379f61366c3998241d8d2,...,0.999657,0.587785,0.809017,-2.449294e-16,1.0,0.962254,-0.234951,0.972007,-0.8767268,0.480989


In [13]:
df.drop(['month_rec', 'day_rec', 'hour_rec', 'minute_rec', 
         'month_eta', 'day_eta', 'hour_eta', 'minute_eta',
         'day_progress_rec', 'day_progress_eta'], axis=1, inplace=True)

In [14]:
df_ais = df #checkpoint the ais df
df_ais.head(10)

Unnamed: 0,time,cog,sog,rot,heading,etaStd,latitude,longitude,vesselId,portId,...,hour_sin_rec,hour_cos_rec,minute_sin_rec,minute_cos_rec,month_sin_eta,month_cos_eta,day_sin_eta,day_cos_eta,hour_sin_eta,hour_cos_eta
0,2024-01-01 00:00:25,284.0,0.7,0.0,88.0,2024-01-09 23:00:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,...,0.0,1.0,0.0,1.0,0.5,0.866025,0.970159,-0.242468,-0.258819,0.965926
1,2024-01-01 00:00:36,109.6,0.0,-6.0,347.0,2023-12-29 20:00:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,...,0.0,1.0,0.0,1.0,-2.449294e-16,1.0,-0.425168,0.905115,-0.8660254,0.5
2,2024-01-01 00:01:45,111.0,11.0,0.0,112.0,2024-01-02 09:00:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,...,0.004363,0.99999,0.104528,0.994522,0.5,0.866025,0.275096,0.961417,0.7071068,-0.707107
3,2024-01-01 00:03:11,96.4,0.0,0.0,142.0,2023-12-31 20:00:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,...,0.01309,0.999914,0.309017,0.951057,-2.449294e-16,1.0,-0.033774,0.999429,-0.8660254,0.5
4,2024-01-01 00:03:51,214.0,19.7,0.0,215.0,2024-01-25 12:00:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,...,0.01309,0.999914,0.309017,0.951057,0.5,0.866025,-0.968077,0.250653,1.224647e-16,-1.0
5,2024-01-01 00:05:13,186.9,0.0,0.0,187.0,2023-12-20 02:40:00,24.27431,-110.32727,61e9f468b937134a3c4c028f,61d37ac11366c3998241da0a,...,0.021815,0.999762,0.5,0.866025,-2.449294e-16,1.0,-0.668293,-0.743898,0.6427876,0.766044
6,2024-01-01 00:05:40,123.4,0.0,,,2023-12-16 01:00:00,40.71466,29.46603,61e9f46bb937134a3c4c02b3,61d38259b7b7526e1adf3a41,...,0.021815,0.999762,0.5,0.866025,-2.449294e-16,1.0,0.092763,-0.995688,0.258819,0.965926
7,2024-01-01 00:05:49,151.2,0.0,0.0,20.0,2023-12-31 18:30:00,-19.25026,146.83507,61e9f3bfb937134a3c4bfe9f,61d36f6e0a1807568ff9a115,...,0.021815,0.999762,0.5,0.866025,-2.449294e-16,1.0,-0.046432,0.998921,-0.9914449,0.130526
8,2024-01-01 00:06:18,265.0,0.1,0.0,122.0,2023-12-30 19:00:00,-26.73068,153.29194,61e9f45bb937134a3c4c0221,61d36f640a1807568ff9a103,...,0.026177,0.999657,0.587785,0.809017,-2.449294e-16,1.0,-0.242468,0.970159,-0.9659258,0.258819
9,2024-01-01 00:06:29,36.0,0.0,0.0,70.0,2023-12-30 19:55:00,35.46922,139.68343,61e9f3e6b937134a3c4bff6d,61d379f61366c3998241d8d2,...,0.026177,0.999657,0.587785,0.809017,-2.449294e-16,1.0,-0.234951,0.972007,-0.8767268,0.480989


## Clean vessels data

In [15]:
df_vessels = pd.read_csv('vessels.csv', sep='|')
# Drop useless data
df_vessels.drop('GT', axis=1, inplace=True)
df_vessels.drop('NT', axis=1, inplace=True)
df_vessels.drop('depth', axis=1, inplace=True)
df_vessels.drop('draft', axis=1, inplace=True)
df_vessels.drop('homePort', axis=1, inplace=True)
df_vessels.drop('maxHeight', axis=1, inplace=True)
df_vessels.drop('maxWidth', axis=1, inplace=True)
# One hot encoding for vessels types
df_vessels = pd.get_dummies(df_vessels, columns=['vesselType']) #no drop_first to handle the NaN

In [16]:
df = pd.merge(df_ais, df_vessels, on='vesselId', how='left')
df.head(10)

# Added: shippingLineId | CEU | DWT | vesselType(1 Hot) | breath | lenght 

Unnamed: 0,time,cog,sog,rot,heading,etaStd,latitude,longitude,vesselId,portId,...,enginePower,freshWater,fuel,length,maxSpeed,rampCapacity,yearBuilt,vesselType_14.0,vesselType_21.0,vesselType_83.0
0,2024-01-01 00:00:25,284.0,0.7,0.0,88.0,2024-01-09 23:00:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,...,0.0,,,199.0,,,2004,False,False,True
1,2024-01-01 00:00:36,109.6,0.0,-6.0,347.0,2023-12-29 20:00:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,...,15100.0,663.0,2742.0,199.97,22.5,,2012,False,False,True
2,2024-01-01 00:01:45,111.0,11.0,0.0,112.0,2024-01-02 09:00:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,...,0.0,,,199.0,,,2005,False,False,True
3,2024-01-01 00:03:11,96.4,0.0,0.0,142.0,2023-12-31 20:00:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,...,0.0,,,199.0,,,1995,False,False,True
4,2024-01-01 00:03:51,214.0,19.7,0.0,215.0,2024-01-25 12:00:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,...,15130.0,331.0,2951.0,199.95,22.4,,2010,False,False,True
5,2024-01-01 00:05:13,186.9,0.0,0.0,187.0,2023-12-20 02:40:00,24.27431,-110.32727,61e9f468b937134a3c4c028f,61d37ac11366c3998241da0a,...,16200.0,,,193.0,,,2009,False,True,False
6,2024-01-01 00:05:40,123.4,0.0,,,2023-12-16 01:00:00,40.71466,29.46603,61e9f46bb937134a3c4c02b3,61d38259b7b7526e1adf3a41,...,0.0,,,200.0,,,2011,False,True,False
7,2024-01-01 00:05:49,151.2,0.0,0.0,20.0,2023-12-31 18:30:00,-19.25026,146.83507,61e9f3bfb937134a3c4bfe9f,61d36f6e0a1807568ff9a115,...,14121.0,420.0,3288.0,199.94,21.0,,1998,False,False,True
8,2024-01-01 00:06:18,265.0,0.1,0.0,122.0,2023-12-30 19:00:00,-26.73068,153.29194,61e9f45bb937134a3c4c0221,61d36f640a1807568ff9a103,...,12210.0,346.0,2408.0,183.0,22.2,,2011,False,False,True
9,2024-01-01 00:06:29,36.0,0.0,0.0,70.0,2023-12-30 19:55:00,35.46922,139.68343,61e9f3e6b937134a3c4bff6d,61d379f61366c3998241d8d2,...,14160.0,405.0,3101.0,199.54,21.9,,2003,False,False,True


## Clean ports data

In [17]:
df_ports = pd.read_csv('ports.csv', sep='|')
# Drop useless
df_ports.drop('name', axis=1, inplace=True)
df_ports.drop('portLocation', axis=1, inplace=True)
df_ports.drop('countryName', axis=1, inplace=True)
df_ports.drop('UN_LOCODE', axis=1, inplace=True)
df_ports.drop('ISO', axis=1, inplace=True)
# Rename to differenciate
df_ports.rename(columns={'longitude': 'portLongitude', 'latitude': 'portLatitude'}, inplace=True)

In [18]:
df_uni = pd.merge(df, df_ports, on='portId', how='left')
df_uni.head(10)

Unnamed: 0,time,cog,sog,rot,heading,etaStd,latitude,longitude,vesselId,portId,...,fuel,length,maxSpeed,rampCapacity,yearBuilt,vesselType_14.0,vesselType_21.0,vesselType_83.0,portLongitude,portLatitude
0,2024-01-01 00:00:25,284.0,0.7,0.0,88.0,2024-01-09 23:00:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,...,,199.0,,,2004,False,False,True,-71.618889,-33.5875
1,2024-01-01 00:00:36,109.6,0.0,-6.0,347.0,2023-12-29 20:00:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,...,2742.0,199.97,22.5,,2012,False,False,True,-79.533,8.967
2,2024-01-01 00:01:45,111.0,11.0,0.0,112.0,2024-01-02 09:00:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,...,,199.0,,,2005,False,False,True,-76.558889,39.2325
3,2024-01-01 00:03:11,96.4,0.0,0.0,142.0,2023-12-31 20:00:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,...,,199.0,,,1995,False,False,True,150.899444,-34.4625
4,2024-01-01 00:03:51,214.0,19.7,0.0,215.0,2024-01-25 12:00:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,...,2951.0,199.95,22.4,,2010,False,False,True,-5.817,35.783
5,2024-01-01 00:05:13,186.9,0.0,0.0,187.0,2023-12-20 02:40:00,24.27431,-110.32727,61e9f468b937134a3c4c028f,61d37ac11366c3998241da0a,...,,193.0,,,2009,False,True,False,-109.054444,25.594167
6,2024-01-01 00:05:40,123.4,0.0,,,2023-12-16 01:00:00,40.71466,29.46603,61e9f46bb937134a3c4c02b3,61d38259b7b7526e1adf3a41,...,,200.0,,,2011,False,True,False,29.841944,40.751111
7,2024-01-01 00:05:49,151.2,0.0,0.0,20.0,2023-12-31 18:30:00,-19.25026,146.83507,61e9f3bfb937134a3c4bfe9f,61d36f6e0a1807568ff9a115,...,3288.0,199.94,21.0,,1998,False,False,True,151.252778,-23.83
8,2024-01-01 00:06:18,265.0,0.1,0.0,122.0,2023-12-30 19:00:00,-26.73068,153.29194,61e9f45bb937134a3c4c0221,61d36f640a1807568ff9a103,...,2408.0,183.0,22.2,,2011,False,False,True,153.169444,-27.3825
9,2024-01-01 00:06:29,36.0,0.0,0.0,70.0,2023-12-30 19:55:00,35.46922,139.68343,61e9f3e6b937134a3c4bff6d,61d379f61366c3998241d8d2,...,3101.0,199.54,21.9,,2003,False,False,True,139.667778,35.436389


In [19]:
df_uni.duplicated().sum()

0

## $\color{red}{\mathbf{\text{Feature engineering}}}$

In [20]:
# Sort train data on vesselId and by time
df_uni = df_uni.sort_values(by=['vesselId', 'time'])

### Time since last data collection

In [21]:
df_uni['time_diff_minutes'] = df_uni.groupby('vesselId')['time'].diff().dt.total_seconds() / 60
# Make integer changing numbers 0<x<1 to one so that the 0 is only a separator between ships
df_uni['time_diff_minutes'] = df_uni['time_diff_minutes'].fillna(0).apply(lambda x: 1 if 0 < x < 1 else x).astype(int)

### Distance since last data collection

In [22]:
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    slightly modified version: of http://stackoverflow.com/a/29546836/2901002

    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees or in radians)

    All (lat, lon) coordinates must have numeric dtypes and be of equal length.

    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    a = np.sin((lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2

    return earth_radius * 2 * np.arcsin(np.sqrt(a))


In [23]:
def calculate_distances(df):
    # Create new columns for previous latitude and longitude
    df['prev_latitude'] = df.groupby('vesselId')['latitude'].shift(1)
    df['prev_longitude'] = df.groupby('vesselId')['longitude'].shift(1)

    # Calculate distance between current and previous position
    df['distance_km'] = df.apply(
        lambda row: haversine(row['latitude'], row['longitude'], 
                               row['prev_latitude'], row['prev_longitude']) 
                     if pd.notna(row['prev_latitude']) and pd.notna(row['prev_longitude']) 
                     else np.nan, axis=1)
    
    # Drop the helper columns if not needed
    df.drop(columns=['prev_latitude', 'prev_longitude'], inplace=True)

    return df

In [24]:
df_uni = calculate_distances(df_uni)

### Calculate average speed in previous stretch

In [25]:
df_uni['mph_last_stretch'] = (df_uni['distance_km'] *  0.621371) / (df_uni['time_diff_minutes'] / 60)

In [26]:
df_uni[df_uni['mph_last_stretch']>60].shape[0] 

260

In [27]:
# Remove unrealist values over the datadase
df_uni = df_uni[df_uni['mph_last_stretch']<60] 

### Bettering data forecasting capabilities of the model with differencing and lag features

In [28]:
# Differencing with previous row
df_uni['cog_change'] = df_uni.groupby('vesselId')['cog'].diff().fillna(0)
df_uni['sog_change'] = df_uni.groupby('vesselId')['sog'].diff().fillna(0)
df_uni['rot_change'] = df_uni.groupby('vesselId')['rot'].diff().fillna(0)
df_uni['heading_change'] = df_uni.groupby('vesselId')['heading'].diff().fillna(0)

# 1 passage lag features
df_uni['cog_lag'] = df_uni.groupby('vesselId')['cog'].shift(1)
df_uni['sog_lag'] = df_uni.groupby('vesselId')['sog'].shift(1)
df_uni['rot_lag'] = df_uni.groupby('vesselId')['rot'].shift(1)
df_uni['heading_lag'] = df_uni.groupby('vesselId')['heading'].shift(1)
df_uni['mph_last_stretch'] = df_uni.groupby('vesselId')['mph_last_stretch'].shift(1)


# 2 passages lag features
df_uni['cog_lag_2'] = df_uni.groupby('vesselId')['cog'].shift(2)
df_uni['sog_lag_2'] = df_uni.groupby('vesselId')['sog'].shift(2)
df_uni['rot_lag_2'] = df_uni.groupby('vesselId')['rot'].shift(2)
df_uni['heading_lag_2'] = df_uni.groupby('vesselId')['heading'].shift(2)
df_uni['mph_last_stretch_2'] = df_uni.groupby('vesselId')['mph_last_stretch'].shift(2)

### Adding rolling mean

In [29]:
window_size = 5

df_uni['cog_rolling_mean'] = df_uni.groupby('vesselId')['cog'].rolling(window=window_size).mean().reset_index(level=0, drop=True)
df_uni['sog_rolling_mean'] = df_uni.groupby('vesselId')['sog'].rolling(window=window_size).mean().reset_index(level=0, drop=True)
df_uni['rot_rolling_mean'] = df_uni.groupby('vesselId')['rot'].rolling(window=window_size).mean().reset_index(level=0, drop=True)
df_uni['heading_rolling_mean'] = df_uni.groupby('vesselId')['heading'].rolling(window=window_size).mean().reset_index(level=0, drop=True)
df_uni['mph_l_s_rolling_mean'] = df_uni.groupby('vesselId')['mph_last_stretch'].rolling(window=window_size).mean().reset_index(level=0, drop=True)

### Adding a flag that tells if there's a non-zero possibility of the row beeing the beginning of a new trip

In [30]:
# Selecting a threshold where we tell the model to consider the posibility of a new trip starting
# Current: 8h20m, 200km traveled (15mph avg speed)
# Not realistic but reasonable to consider shorter path for training
# Maybe change later
df_uni['new_trip'] = np.where(
    (df_uni['time_diff_minutes'] == 0) | (df_uni['time_diff_minutes'] >= 500) |
    (df_uni['distance_km'].isna()) | (df_uni['distance_km'] >= 200),
    1, 0  # Set to 1 if conditions are met, else 0
)
# Might add differerence in eta as an indicator 

In [31]:
df_uni.head(10)

Unnamed: 0,time,cog,sog,rot,heading,etaStd,latitude,longitude,vesselId,portId,...,sog_lag_2,rot_lag_2,heading_lag_2,mph_last_stretch_2,cog_rolling_mean,sog_rolling_mean,rot_rolling_mean,heading_rolling_mean,mph_l_s_rolling_mean,new_trip
130795,2024-01-12 14:31:00,307.6,17.3,5.0,313.0,2024-01-14 23:30:00,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,,,,,,,,,,0
131027,2024-01-12 14:57:23,306.8,16.9,5.0,312.0,2024-01-14 23:30:00,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,,,,,,,,,,0
131207,2024-01-12 15:18:48,307.9,16.9,6.0,313.0,2024-01-14 23:30:00,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,17.3,5.0,313.0,,,,,,,0
131394,2024-01-12 15:39:47,307.0,16.3,7.0,313.0,2024-01-14 23:30:00,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,16.9,5.0,312.0,20.143059,,,,,,0
131545,2024-01-12 15:54:48,307.6,16.1,5.0,313.0,2024-01-14 23:30:00,7.81285,77.18147,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,16.9,6.0,313.0,20.178789,307.38,16.7,5.6,312.8,,0
131742,2024-01-12 16:14:59,309.5,16.1,-6.0,313.0,2024-01-14 23:30:00,7.86929,77.11032,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,16.3,7.0,313.0,19.922864,307.76,16.46,3.4,312.8,19.827169,0
131897,2024-01-12 16:35:24,308.7,16.0,2.0,311.0,2024-01-14 23:30:00,7.92585,77.03811,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,16.1,5.0,313.0,20.189616,308.14,16.28,2.8,312.6,19.541914,0
132039,2024-01-12 16:55:24,310.4,16.0,-1.0,311.0,2024-01-14 23:30:00,7.98258,76.9688,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,16.1,-6.0,313.0,18.701516,308.64,16.1,1.4,312.2,19.286359,0
132173,2024-01-12 17:14:36,307.5,16.1,6.0,307.0,2024-01-14 23:30:00,8.03598,76.90095,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,16.0,2.0,311.0,18.716786,308.74,16.06,1.2,311.0,18.993505,0
132385,2024-01-12 17:36:36,322.2,16.2,-4.0,319.0,2024-01-14 23:30:00,8.10476,76.83078,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,...,16.0,-1.0,311.0,18.901012,311.66,16.08,-0.6,312.2,18.700771,0


In [32]:
# Columns      , Number of vessel IDs
df_uni.shape[1], df_uni['vesselId'].nunique()

(79, 687)

In [34]:
df_uni.dtypes

time                    datetime64[ns]
cog                            float64
sog                            float64
rot                            float64
heading                        float64
etaStd                  datetime64[ns]
latitude                       float64
longitude                      float64
vesselId                        object
portId                          object
uncertain_rot                    int64
navstat_1                         bool
navstat_2                         bool
navstat_3                         bool
navstat_4                         bool
navstat_5                         bool
navstat_6                         bool
navstat_7                         bool
navstat_8                         bool
navstat_9                         bool
navstat_11                        bool
navstat_12                        bool
navstat_13                        bool
navstat_14                        bool
navstat_15                        bool
year_rec                 

### TRAIN SET MODIFICATIONS
- Data cleaning on cog, sog, rot and heading 
- One hot encoding of NavStat
- sin/cos transformation of dates of time and ETA
- Vessels info, one hot encoding vessel types
- Arrival port coordinates
- Time difference from last data collection
- Distance form last data collection
- Avg speed over last stretch
- Change in cog, sog, rot and heading
- Lag features 
- Rolling means
- Probable new trip flag

In [35]:
df_uni.to_csv('proc_test_0.1.csv', index=False)