# DATA PROCESSING FOR XGB TRAINED ON EVERY DAY DISTANCE

In [1]:
import pandas as pd 
import xgboost as xgb
from xgboost import plot_importance
import numpy as np
import geopandas as gpd 
from calendar import monthrange
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
ports = pd.read_csv('ports.csv', sep ='|')
vessels = pd.read_csv('vessels.csv', sep ='|')
# Clean port data
ports = ports.drop(columns=['name', 'portLocation', 'UN_LOCODE', 'countryName', 'ISO'], errors='ignore')
# Rename latitude and longitude to distinguish them
ports = ports.rename(columns={
    'latitude': 'port_latitude',
    'longitude': 'port_longitude'
})
# Clean vessels data
vessels = vessels[['shippingLineId', 'vesselId', 'CEU', 'vesselType', 'enginePower', 'freshWater', 'fuel', 'maxSpeed', 'rampCapacity']]
# One hot encoding for the vessel type
vessels = pd.get_dummies(vessels, columns=['vesselType']) # No drop_first to handle the NaN

def preprocess(df):
    # Replacing default with Nan bacause too close to valid values, eliminate non valid values
    df['cog'] = df['cog'].replace(360, np.nan)
    df = df[(df['cog'] <= 360) | (df['cog'].isna())]

    # Replacing default with Nan bacause too close to valid values
    df['sog'] = df['sog'].replace(1023, np.nan)

    # Replacing default with Nan bacause too close to valid values
    # Changing uncertain values to bigger number to be further away from sample pool
    # Adding uncertainty flag
    df['rot'] = df['rot'].replace(128, np.nan)
    df['rot'] = df['rot'].replace({127: 200, -127: -200})
    df['uncertain_rot'] = np.where(df['rot'].isin([200, -200]), 1, 0)

    # Replacing default value with NaN to not get taken in consideration by regression
    df['heading'] = df['heading'].replace(511, np.nan)

    # Adding a "is moored?" flag
    df['isMoored'] = (df['navstat'] == 5).astype(int)   

    # Time Handling 
    df['time'] = pd.to_datetime(df['time'], errors='coerce').dt.tz_localize('UTC')
    # Standardize eta
    df['etaRaw'] = df['etaRaw'].fillna(0)
    df['etaRaw'] = df['etaRaw'].apply(lambda x: f"{2024}-{x}")
    df['etaRaw'] = pd.to_datetime(df['etaRaw'], errors='coerce').dt.tz_localize('UTC')
    df.rename(columns={'etaRaw': 'etaStd'}, inplace=True)
    # Handle first month of the years ETA year to be 2023
    df['etaStd'] = df.apply(lambda row: row['etaStd'].replace(year=row['etaStd'].year - 1)
                            if row['etaStd'].month in [11, 12] and row['time'].month in [1, 2] 
                            else row['etaStd'], axis=1) 
    

    # FEATURE ENGINEERING
    # Get day of the week 
    df['dayofweek'] = df['time'].dt.dayofweek
    df['eta_dayoftheweek'] = df['etaStd'].dt.dayofweek 
    # Converts time and eta to seconds and add difference between the two
    df['time_seq'] = df['time'].astype(int) / 10**9  
    df['eta_seq'] = df['etaStd'].astype(int) / 10**9 
    df['estimated_time_left'] = df['time_seq'] - df['eta_seq']

    # Add port coordinates
    df = pd.merge(df, ports, on='portId', how='left')
    df = pd.merge(df, vessels, on='vesselId', how='left')

    # Add a three days rolling average for the AIS data 
    df['sog_mean'] = df.groupby('vesselId', group_keys=False).apply(
    lambda x: x.sort_values('time').rolling('3D', on='time')['sog'].mean())
    df['cog_mean'] = df.groupby('vesselId', group_keys=False).apply(
    lambda x: x.sort_values('time').rolling('3D', on='time')['cog'].mean())
    df['rot_mean'] = df.groupby('vesselId', group_keys=False).apply(
    lambda x: x.sort_values('time').rolling('3D', on='time')['rot'].mean())
    df['heading_mean'] = df.groupby('vesselId', group_keys=False).apply(
    lambda x: x.sort_values('time').rolling('3D', on='time')['heading'].mean())
    
    return df

In [3]:
def make_training_set(df, steps):
    df_copy = df.copy()
    df_copy.sort_values(by=['vesselId', 'time'], inplace=True)
    
    # FEATURE ENGINEERING
    # Vessels last collenction data
    df_copy['latitude_lag'] = df_copy.groupby('vesselId')['latitude'].shift(steps)
    df_copy['longitude_lag'] = df_copy.groupby('vesselId')['longitude'].shift(steps)
    df_copy['port_longitude_lag'] = df_copy.groupby('vesselId')['port_longitude'].shift(steps)
    df_copy['port_latitude_lag'] = df_copy.groupby('vesselId')['port_latitude'].shift(steps)
    df_copy['isMoored_lag'] = df_copy.groupby('vesselId')['isMoored'].shift(steps)
    df_copy['sog_lag'] = df_copy.groupby('vesselId')['sog'].shift(steps)
    df_copy['sog_mean_lag'] = df_copy.groupby('vesselId')['sog_mean'].shift(steps)
    df_copy['cog_lag'] = df_copy.groupby('vesselId')['cog'].shift(steps)
    df_copy['cog_mean_lag'] = df_copy.groupby('vesselId')['cog_mean'].shift(steps)     
    df_copy['rot_lag'] = df_copy.groupby('vesselId')['rot'].shift(steps) 
    df_copy['rot_mean_lag'] = df_copy.groupby('vesselId')['rot_mean'].shift(steps)
    df_copy['uncertain_rot_lag'] = df_copy.groupby('vesselId')['uncertain_rot'].shift(steps) 
    df_copy['heading_lag'] = df_copy.groupby('vesselId')['heading'].shift(steps)
    df_copy['heading_mean_lag'] = df_copy.groupby('vesselId')['heading_mean'].shift(steps)  
    df_copy['dayofweek_lag'] = df_copy.groupby('vesselId')['dayofweek'].shift(steps)
    # Vessels stuff
    df_copy['shippingLineId_lag'] = df_copy.groupby('vesselId')['shippingLineId'].shift(steps)
    df_copy['CEU_lag'] = df_copy.groupby('vesselId')['CEU'].shift(steps)
    df_copy['vesselType_14.0_lag'] = df_copy.groupby('vesselId')['vesselType_14.0'].shift(steps)
    df_copy['vesselType_21.0_lag'] = df_copy.groupby('vesselId')['vesselType_21.0'].shift(steps)
    df_copy['vesselType_83.0_lag'] = df_copy.groupby('vesselId')['vesselType_83.0'].shift(steps)
    df_copy['enginePower_lag'] = df_copy.groupby('vesselId')['enginePower'].shift(steps)
    df_copy['freshWater_lag'] = df_copy.groupby('vesselId')['freshWater'].shift(steps)
    df_copy['fuel_lag'] = df_copy.groupby('vesselId')['fuel'].shift(steps)
    df_copy['maxSpeed_lag'] = df_copy.groupby('vesselId')['maxSpeed'].shift(steps)
    df_copy['rampCapacity_lag'] = df_copy.groupby('vesselId')['rampCapacity'].shift(steps)
    # Time since last data collection
    df_copy['time_diff'] = df_copy.groupby('vesselId')['time'].diff(steps)
    df_copy['time_diff_seconds'] = df_copy['time_diff'].dt.total_seconds()
    # Time to eta
    df_copy['estimated_time_left_lag'] = df_copy.groupby('vesselId')['estimated_time_left'].shift(steps)
    df_copy.dropna(subset=['time_diff'], inplace=True)
    return df_copy

# TRAIN ON DIFFERENT TIME HORIZONS

## Process train

In [4]:
# Load train data
known_positions = pd.read_csv('ais_train.csv', sep ='|')  # Replace with your dataset
test = pd.read_csv('ais_test.csv', sep =',')
# Preprocess train
known_positions = preprocess(known_positions)
train = known_positions.copy()
# Create training sets with variable time differences
train1 = make_training_set(train, 1)
train2 = make_training_set(train, 2)
train3 = make_training_set(train, 3)
train4 = make_training_set(train, 4)
train5 = make_training_set(train, 5)
train6 = make_training_set(train, 6)
train7 = make_training_set(train, 7)
train8 = make_training_set(train, 8)
train9 = make_training_set(train, 9)
train10 = make_training_set(train, 10)
train11 = make_training_set(train, 11)
train12 = make_training_set(train, 12)
train13 = make_training_set(train, 13)
train14 = make_training_set(train, 14)
train15 = make_training_set(train, 15)
train16 = make_training_set(train, 16)
train17 = make_training_set(train, 17)
train18 = make_training_set(train, 18)
train19 = make_training_set(train, 19) # 24 hours
train20 = make_training_set(train, 20) 
train22 = make_training_set(train, 22) 
train24 = make_training_set(train, 24) 
train26 = make_training_set(train, 26) 
train28 = make_training_set(train, 28) 
train30 = make_training_set(train, 30) 
train32 = make_training_set(train, 32) 
train34 = make_training_set(train, 34)
train36 = make_training_set(train, 36)
train38 = make_training_set(train, 38) # 48 hours
train40 = make_training_set(train, 40) 
train44 = make_training_set(train, 44) 
train48 = make_training_set(train, 48) 
train52 = make_training_set(train, 52) 
train56 = make_training_set(train, 56) 
train60 = make_training_set(train, 60) # 74 hours
train64 = make_training_set(train, 64) 
train68 = make_training_set(train, 68) 
train72 = make_training_set(train, 72)


In [5]:
train = pd.concat([
    train1, train2, train3, train4, train5, train6, train7, train8, train9, train10,
    train11, train12, train13, train14, train15, train16, train17, train18, train19, train20,
    train22, train24, train26, train28, train30, train32, train34, train36, train38, train40,
    train44, train48, train52, train56, train60, train64, train68, train72,
], ignore_index=True)

In [6]:
train['navstat'] = pd.Categorical(train['navstat']).codes
train['portId'] = pd.Categorical(train['portId']).codes
train['shippingLineId'] = pd.Categorical(train['shippingLineId']).codes
# Encoding test and train vesselID with the same encoder 
unique_vessel_ids = pd.concat([known_positions['vesselId'], test['vesselId']]).unique()
vessel_encoder = LabelEncoder()
vessel_encoder.fit(unique_vessel_ids)
# Transform the vesselId column in train
train['vesselId'] = vessel_encoder.transform(train['vesselId'])

# Clean missing data
train = train.dropna(subset=['latitude', 'longitude', 'time'])

In [None]:

X = train[[
    'vesselId', #try
    'latitude_lag',
    'longitude_lag',
    'port_latitude_lag',
    'port_longitude_lag',
    'isMoored_lag',
    'sog_lag',
    'sog_mean_lag',
    'cog_lag',
    'cog_mean_lag',
    'rot_lag',
    'rot_mean_lag',
    'uncertain_rot_lag',
    'heading_lag',
    'heading_mean_lag',
    'dayofweek_lag',
    'time_diff_seconds',
    'estimated_time_left_lag',
    # Vessels stuff
    'vesselType_14.0_lag',
    'vesselType_21.0_lag',
    'vesselType_83.0_lag',
    'CEU_lag',
    'shippingLineId_lag',
    'enginePower_lag',
    'freshWater_lag',
    'fuel_lag',
    'maxSpeed_lag',
    'rampCapacity_lag'
]]
y = train[['latitude', 'longitude']]


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

X_train = X_train.apply(pd.to_numeric, errors='coerce') # Not needed

: 

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

base_model = xgb.XGBRegressor(
    learning_rate=0.24196263261990883,
    max_depth=9,
    min_child_weight=1,
    n_estimators=185,
    subsample=0.9136916108674354,
    random_state=42
)

model = MultiOutputRegressor(base_model)

# Fit
model.fit(X_train, y_train)

# Predict
y_pred_val = model.predict(X_val)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:
mae = mean_absolute_error(y_val, y_pred_val)
mae

0.7712327947696873

In [None]:
test['time'] = pd.to_datetime(test['time'], errors='coerce').dt.tz_localize('UTC')
test['vesselId'] = vessel_encoder.transform(test['vesselId'])
known_positions['vesselId'] = vessel_encoder.transform(known_positions['vesselId'])

In [None]:
def prepare_prediction_data(test_df, known_positions):
    # Get last known position for each vessel before the test time
    predictions_data = []
    
    # Group by vessel ID to avoid repeated processing
    vessel_histories = dict(tuple(known_positions.groupby('vesselId')))
    
    for _, row in test_df.iterrows():
        vessel_history = vessel_histories[row['vesselId']]
        # Get last position before test time
        last_detection = vessel_history[vessel_history['time'] < row['time']].iloc[-1]
        
        predictions_data.append({
            'vesselId': last_detection['vesselId'],
            'latitude_lag': last_detection['latitude'],
            'longitude_lag': last_detection['longitude'],
            'port_latitude_lag': last_detection['port_latitude'],
            'port_longitude_lag': last_detection['port_longitude'],
            'isMoored_lag': last_detection['isMoored'],
            'sog_lag': last_detection['sog'],
            'sog_mean_lag': last_detection['sog_mean'],
            'cog_lag': last_detection['cog'],
            'cog_mean_lag': last_detection['cog_mean'],
            'rot_lag': last_detection['rot'],
            'rot_mean_lag': last_detection['rot_mean'],
            'uncertain_rot_lag': last_detection['uncertain_rot'],
            'heading_lag': last_detection['heading'],
            'heading_mean_lag': last_detection['heading_mean'],
            'dayofweek_lag': last_detection['dayofweek'],
            'time_diff_seconds': (pd.to_datetime(row['time']) - last_detection['time']).total_seconds(),
            'estimated_time_left_lag': last_detection['estimated_time_left'],

            # Adding one-hot encoded vesselType columns
            'vesselType_14.0_lag': last_detection.get('vesselType_14.0', 0),
            'vesselType_21.0_lag': last_detection.get('vesselType_21.0', 0),
            'vesselType_83.0_lag': last_detection.get('vesselType_83.0', 0),
            # Vessels stuff 
            'CEU_lag': last_detection['CEU'],
            'shippingLineId_lag': last_detection['shippingLineId'],
            'enginePower_lag': last_detection['enginePower'],
            'freshWater_lag': last_detection['freshWater'],
            'fuel_lag': last_detection['fuel'],
            'maxSpeed_lag': last_detection['maxSpeed'],
            'rampCapacity_lag': last_detection['rampCapacity']
        })
    
    return pd.DataFrame(predictions_data)

# Prepare all prediction data at once
input_df = prepare_prediction_data(test, known_positions)

# Make predictions for all rows at once
predictions = model.predict(input_df)

# Create results DataFrame
results = pd.DataFrame({
    'ID': test.index,
    'longitude_predicted': predictions[:, 1],
    'latitude_predicted': predictions[:, 0]
})

In [None]:
results.head(1000)

Unnamed: 0,ID,longitude_predicted,latitude_predicted
0,0,-80.941093,31.264187
1,1,119.937454,15.943081
2,2,10.746855,38.86776
3,3,172.998428,-43.347828
4,4,-5.669545,48.539047
5,5,3.142569,51.447361
6,6,-8.481432,43.321041
7,7,3.363414,51.401638
8,8,29.565992,40.947762
9,9,15.748086,38.398216


In [None]:
results.to_csv('results_2.4.csv', index=False)