# DATA PROCESSING FOR XGB TRAINED ON EVERY DAY DISTANCE

In [10]:
import pandas as pd 
import xgboost as xgb
from xgboost import plot_importance
import numpy as np
import geopandas as gpd 
import matplotlib.pyplot as plt 
from calendar import monthrange
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
# Lift pandas' display truncation so full frames are shown when inspected
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [11]:
def preprocess(df):
    """Clean raw position records and derive time and speed features.

    The input frame is left untouched; a cleaned copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``vesselId``, ``time``, ``cog``, ``sog``,
        ``rot`` and ``heading``. ``time`` may hold strings or datetimes;
        naive values are interpreted as UTC.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without rows whose ``cog`` exceeds 360, with the
        placeholder values replaced by NaN and the extra columns
        ``uncertain_rot``, ``year_rec``, ``month_rec``, ``day_rec``,
        ``hour_rec``, ``minute_rec``, ``dayofweek_rec``, ``time_seq``
        (seconds since the Unix epoch) and ``sog_mean`` (3-day rolling mean
        of ``sog`` per vessel).
    """
    # Replace the default value with NaN because it is too close to valid
    # values, then drop rows whose course is out of range. The filter is
    # applied to a copy so the caller's frame is never modified.
    cog = df['cog'].replace(360, np.nan)
    keep = (cog <= 360) | cog.isna()
    df = df.loc[keep].copy()
    df['cog'] = cog[keep]

    # Replace the default value with NaN because it is too close to valid values
    df['sog'] = df['sog'].replace(1023, np.nan)

    # Replace the default value with NaN, push the two "uncertain" codes
    # further away from the sample pool and flag them explicitly
    df['rot'] = df['rot'].replace(128, np.nan)
    df['rot'] = df['rot'].replace({127: 200, -127: -200})
    df['uncertain_rot'] = np.where(df['rot'].isin([200, -200]), 1, 0)

    # Replace the default value with NaN so the regression ignores it
    df['heading'] = df['heading'].replace(511, np.nan)

    # Time handling: utc=True localizes naive timestamps to UTC and also
    # accepts input that is already timezone-aware
    df['time'] = pd.to_datetime(df['time'], errors='coerce', utc=True)

    # FEATURE ENGINEERING
    # Split the timestamp into individual calendar attributes
    df['year_rec'] = df['time'].dt.year
    df['month_rec'] = df['time'].dt.month
    df['day_rec'] = df['time'].dt.day
    df['hour_rec'] = df['time'].dt.hour
    df['minute_rec'] = df['time'].dt.minute
    df['dayofweek_rec'] = df['time'].dt.dayofweek

    # Seconds since the Unix epoch. Going through a timedelta keeps the result
    # independent of the datetime resolution and yields NaN for missing times.
    epoch = pd.Timestamp('1970-01-01', tz='UTC')
    df['time_seq'] = (df['time'] - epoch).dt.total_seconds()

    # 3-day rolling mean of the speed, computed per vessel in time order.
    # Rows without a valid timestamp are skipped (a time-based window cannot
    # handle NaT) and end up with NaN. Each piece keeps the original row
    # labels, so the assignment below realigns the values with `df`.
    timed = df[df['time'].notna()]
    rolling_means = [
        history.sort_values('time').rolling('3D', on='time')['sog'].mean()
        for _, history in timed.groupby('vesselId')
    ]
    df['sog_mean'] = pd.concat(rolling_means) if rolling_means else np.nan

    return df

In [12]:
def make_training_set(df, steps):
    """Pair every record with the record `steps` positions earlier for the same vessel.

    Works on a copy of `df` sorted by vessel and time. For each row the
    position and motion values of the same vessel's row `steps` places
    earlier are stored in `*_lag` columns, together with the elapsed time
    between the two rows (`time_diff` and `time_diff_seconds`). Rows that
    have no such earlier row are dropped.
    """
    ordered = df.copy()
    ordered.sort_values(by=['vesselId', 'time'], inplace=True)
    per_vessel = ordered.groupby('vesselId')

    # FEATURE ENGINEERING
    # Values of the vessel's earlier record, shifted within each vessel only
    lagged_sources = ('latitude', 'longitude', 'sog', 'sog_mean', 'cog', 'rot', 'heading')
    for source in lagged_sources:
        ordered[f'{source}_lag'] = per_vessel[source].shift(steps)

    # Time elapsed since that earlier record
    elapsed = per_vessel['time'].diff(steps)
    ordered['time_diff'] = elapsed
    ordered['time_diff_seconds'] = elapsed.dt.total_seconds()

    # The first `steps` rows of every vessel have no earlier record to pair with
    return ordered.dropna(subset=['time_diff'])

# TRAIN ON DIFFERENT TIME HORIZONS

## Process the training data

In [13]:
# Read the raw files: the training data is pipe-separated, the test data comma-separated
known_positions = pd.read_csv('ais_train.csv', sep='|')  # Replace with your dataset
test = pd.read_csv('ais_test.csv', sep=',')

# Clean the training records; `known_positions` is kept as-is for later lookups
known_positions = preprocess(known_positions)
train = known_positions.copy()

# One training set per lag (1 to 18 records back), so the model sees a range
# of time differences between the known position and the target position
(
    train1, train2, train3, train4, train5, train6,
    train7, train8, train9, train10, train11, train12,
    train13, train14, train15, train16, train17, train18,
) = (make_training_set(train, lag) for lag in range(1, 19))


In [14]:
# Stack the per-lag training sets into one frame with a fresh 0..n-1 index
lagged_training_sets = [
    train1, train2, train3, train4, train5, train6,
    train7, train8, train9, train10, train11, train12,
    train13, train14, train15, train16, train17, train18,
]
train = pd.concat(lagged_training_sets, ignore_index=True)

In [15]:
# Integer-encode the categorical columns
for categorical_column in ('navstat', 'portId'):
    train[categorical_column] = pd.Categorical(train[categorical_column]).codes

# Fit one encoder on the vessel ids of both train and test so that the same
# vessel gets the same code in either set
unique_vessel_ids = pd.concat([known_positions['vesselId'], test['vesselId']]).unique()
vessel_encoder = LabelEncoder()
vessel_encoder.fit(unique_vessel_ids)
train['vesselId'] = vessel_encoder.transform(train['vesselId'])

# Drop rows that lack a target position or a timestamp
required_columns = ['latitude', 'longitude', 'time']
train = train.dropna(subset=required_columns)

In [16]:

# Model inputs: the vessel id plus the lagged state and the elapsed time
feature_columns = [
    'vesselId',  # try
    'latitude_lag',
    'longitude_lag',
    'sog_lag',
    'sog_mean_lag',
    'cog_lag',
    'rot_lag',
    'heading_lag',
    'time_diff_seconds',
]
target_columns = ['latitude', 'longitude']

X = train[feature_columns]
y = train[target_columns]

# Hold out 10% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Force numeric dtypes on the training features; non-numeric entries become NaN
X_train = X_train.apply(pd.to_numeric, errors='coerce')  # Not needed

In [18]:
# Per-target estimator: scale the features, then fit a gradient-boosted regressor
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', xgb.XGBRegressor(random_state=42)),
])

# One pipeline per output column (latitude, longitude), each trained on a
# standardized target
target_scaled_regressor = TransformedTargetRegressor(
    regressor=xgb_pipeline,
    transformer=StandardScaler(),
)
model = MultiOutputRegressor(target_scaled_regressor)

# Hyper-parameter grid. Every key is routed through the nested wrappers:
# MultiOutputRegressor (`estimator__`) -> TransformedTargetRegressor
# (`regressor__`) -> pipeline step (`xgb__`).
xgb_param_prefix = 'estimator__regressor__xgb__'
xgb_search_space = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'min_child_weight': [1, 3],
    'subsample': [0.8, 1.0],
}
param_grid = {
    xgb_param_prefix + param_name: candidates
    for param_name, candidates in xgb_search_space.items()
}

# Exhaustive search with 3-fold cross-validation, scored by (negated) MAE
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# Evaluate the refitted best estimator on the held-out validation rows
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_val)

# NOTE: the search fits clones, so `model` itself stays unfitted. To skip the
# search, fit and evaluate `model` directly instead:
# model.fit(X_train, y_train)
# y_pred_val = model.predict(X_val)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


KeyboardInterrupt: 

In [None]:
# Mean absolute error on the validation split, averaged over both target columns
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred_val)
mae

0.6509145839610831

In [None]:
# Give the test timestamps the same UTC-aware dtype as the preprocessed training data
parsed_test_times = pd.to_datetime(test['time'], errors='coerce')
test['time'] = parsed_test_times.dt.tz_localize('UTC')

# Encode the vessel ids of both frames with the encoder fitted on train + test
for frame in (test, known_positions):
    frame['vesselId'] = vessel_encoder.transform(frame['vesselId'])

In [None]:
def prepare_prediction_data(test_df, known_positions):
    """Build the model input row for every requested prediction.

    For each row of `test_df` the most recent record of the same vessel that
    lies strictly before the requested time is looked up in `known_positions`
    and turned into the lag features used for training.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Requests with the columns ``vesselId`` and ``time``.
    known_positions : pandas.DataFrame
        Historical records with the columns ``vesselId``, ``time``,
        ``latitude``, ``longitude``, ``sog``, ``sog_mean``, ``cog``, ``rot``
        and ``heading``. Rows do not need to be sorted.

    Returns
    -------
    pandas.DataFrame
        One row per row of `test_df`, in the same order, with the columns
        ``vesselId``, ``latitude_lag``, ``longitude_lag``, ``sog_lag``,
        ``sog_mean_lag``, ``cog_lag``, ``rot_lag``, ``heading_lag`` and
        ``time_diff_seconds``. When a vessel has no record before the
        requested time (or no history at all) every feature except
        ``vesselId`` is NaN.
    """
    # Output feature -> column of the historical record it is copied from
    lag_sources = {
        'latitude_lag': 'latitude',
        'longitude_lag': 'longitude',
        'sog_lag': 'sog',
        'sog_mean_lag': 'sog_mean',
        'cog_lag': 'cog',
        'rot_lag': 'rot',
        'heading_lag': 'heading',
    }
    output_columns = ['vesselId', *lag_sources, 'time_diff_seconds']
    missing = float('nan')

    # Split the history per vessel once and sort each piece by time, so the
    # last row of a filtered history really is the most recent record
    vessel_histories = {
        vessel_id: history.sort_values('time')
        for vessel_id, history in known_positions.groupby('vesselId')
    }

    predictions_data = []
    for _, row in test_df.iterrows():
        query_time = pd.to_datetime(row['time'])
        history = vessel_histories.get(row['vesselId'])
        # Rows with a missing timestamp compare as False and are ignored
        earlier = None if history is None else history[history['time'] < query_time]

        if earlier is None or earlier.empty:
            # Nothing known before the requested time: leave the features empty
            # instead of aborting the whole batch
            features = dict.fromkeys(output_columns, missing)
            features['vesselId'] = row['vesselId']
        else:
            last_detection = earlier.iloc[-1]
            features = {'vesselId': last_detection['vesselId']}
            for feature, source in lag_sources.items():
                features[feature] = last_detection[source]
            features['time_diff_seconds'] = (
                query_time - last_detection['time']
            ).total_seconds()

        predictions_data.append(features)

    # Passing the columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(predictions_data, columns=output_columns)

# Prepare all prediction data at once
input_df = prepare_prediction_data(test, known_positions)

# Make predictions for all rows at once. Use the refitted best estimator from
# the grid search: `model` is only the template that GridSearchCV clones and
# is never fitted itself, so calling predict on it would fail.
predictions = best_model.predict(input_df)

# Create results DataFrame; the targets were trained in the order
# (latitude, longitude), so column 0 is latitude and column 1 is longitude
results = pd.DataFrame({
    'ID': test.index,
    'longitude_predicted': predictions[:, 1],
    'latitude_predicted': predictions[:, 0]
})

In [None]:
# Preview up to the first 1000 predictions
results.head(n=1000)

Unnamed: 0,ID,longitude_predicted,latitude_predicted
0,0,-81.347046,31.507151
1,1,120.173126,13.977752
2,2,9.964231,38.457649
3,3,171.827515,-43.112675
4,4,-6.092918,48.517307
5,5,3.205324,51.305489
6,6,-8.274386,43.362976
7,7,3.280089,51.300655
8,8,28.741175,40.884163
9,9,15.915777,38.426678


In [None]:
# Write the predictions to disk without the DataFrame index column
results.to_csv(path_or_buf='results_1.3.csv', index=False)