# DATA PROCESSING FOR XGB TRAINED ON EVERY DAY DISTANCE

In [1]:
import pandas as pd 
import xgboost as xgb
import numpy as np
import geopandas as gpd 
import matplotlib.pyplot as plt 
from calendar import monthrange
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# Show every row and every column when a frame is displayed (no truncation).
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
def preprocess(df):
    """Clean raw AIS records and add time-derived features.

    Steps, in order:

    * ``cog``: the default value 360 becomes NaN, then rows with ``cog`` above
      360 are dropped (rows with a missing ``cog`` are kept).
    * ``sog``: the default value 1023 becomes NaN.
    * ``rot``: the default value 128 becomes NaN, +/-127 are re-coded to
      +/-200 and flagged with ``uncertain_rot`` = 1.
    * ``heading``: the default value 511 becomes NaN.
    * ``time`` is parsed (unparseable values become NaT) and localized to UTC.
    * Calendar parts (``year_rec`` ... ``dayofweek_rec``), ``time_seq``
      (seconds since 1970-01-01 UTC) and ``sog_mean`` (per-vessel mean of
      ``sog`` over a trailing 3-day window) are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``vesselId``, ``time``, ``cog``, ``sog``, ``rot`` and
        ``heading``. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows (original index labels kept)
        with the cleaned and derived columns.
    """
    # Replace the default with NaN because it is too close to valid values,
    # then eliminate the rows whose course is not a valid value.
    cog = df['cog'].replace(360, np.nan)
    keep = (cog <= 360) | cog.isna()
    # Explicit copy: the caller's frame stays untouched and the assignments
    # below never write into a slice of it.
    df = df.loc[keep].copy()
    df['cog'] = cog.loc[keep]

    # Replace the default with NaN because it is too close to valid values
    df['sog'] = df['sog'].replace(1023, np.nan)

    # Replace the default with NaN, move the uncertain values further away
    # from the sample pool and add an uncertainty flag.
    df['rot'] = df['rot'].replace(128, np.nan).replace({127: 200, -127: -200})
    df['uncertain_rot'] = np.where(df['rot'].isin([200, -200]), 1, 0)

    # Replace the default with NaN so the regression does not take it into
    # consideration.
    df['heading'] = df['heading'].replace(511, np.nan)

    # Time handling
    df['time'] = pd.to_datetime(df['time'], errors='coerce').dt.tz_localize('UTC')

    # FEATURE ENGINEERING
    # Separate the date-time into single attributes
    df['year_rec'] = df['time'].dt.year
    df['month_rec'] = df['time'].dt.month
    df['day_rec'] = df['time'].dt.day
    df['hour_rec'] = df['time'].dt.hour
    df['minute_rec'] = df['time'].dt.minute
    df['dayofweek_rec'] = df['time'].dt.dayofweek

    # Seconds since the Unix epoch. Going through a timedelta keeps the result
    # independent of the datetime resolution and of the platform's default
    # integer width, and leaves NaT as NaN instead of a huge negative number.
    epoch = pd.Timestamp('1970-01-01', tz='UTC')
    df['time_seq'] = (df['time'] - epoch).dt.total_seconds()

    # Trailing 3-day mean speed per vessel. The groups are handled one by one
    # and concatenated: groupby.apply stacks the result into a one-row
    # DataFrame when there is a single vessel, which breaks the assignment.
    per_vessel_means = [
        history.sort_values('time').rolling('3D', on='time')['sog'].mean()
        for _, history in df.groupby('vesselId')
    ]
    # Each piece keeps the original row labels, so the assignment aligns on
    # the index.
    df['sog_mean'] = pd.concat(per_vessel_means) if per_vessel_means else np.nan

    return df

In [3]:
def make_training_set(df, steps):
    """Pair every record with the record `steps` positions earlier of the same vessel.

    Rows are ordered by ``vesselId`` and ``time``. For each of latitude,
    longitude, sog, sog_mean, cog, rot and heading a ``<name>_lag`` column is
    added holding the value from `steps` rows earlier within the same vessel.
    ``time_diff`` / ``time_diff_seconds`` hold the time elapsed since that
    earlier row. Rows without such an earlier row are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed records; left unmodified.
    steps : int
        How many rows back (per vessel) the lagged values are taken from.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of `df` with the lag and time-difference columns added.
    """
    lag_sources = ('latitude', 'longitude', 'sog', 'sog_mean', 'cog', 'rot', 'heading')

    # sort_values hands back a new frame, so the input is never touched
    ordered = df.sort_values(by=['vesselId', 'time'])

    # FEATURE ENGINEERING
    # Values observed at the vessel's earlier data collection
    for source in lag_sources:
        ordered[source + '_lag'] = ordered.groupby('vesselId')[source].shift(steps)

    # Time since that earlier data collection
    elapsed = ordered.groupby('vesselId')['time'].diff(steps)
    ordered['time_diff'] = elapsed
    ordered['time_diff_seconds'] = elapsed.dt.total_seconds()

    # The first `steps` rows of every vessel have nothing to look back to
    return ordered.dropna(subset=['time_diff'])

# TRAIN ON DIFFERENT TIME HORIZONS

## Process train

In [None]:
# Read the raw training records (pipe-delimited file)
known_positions = pd.read_csv('ais_train.csv', sep='|')  # Replace with your dataset

# Preprocess a copy so the raw records stay available as `known_positions`
train = preprocess(known_positions.copy())

# One training set per lag step: each pairs a record with the record
# 1, 2, ... 6 positions earlier of the same vessel
train1, train2, train3, train4, train5, train6 = (
    make_training_set(train, lag_steps) for lag_steps in range(1, 7)
)


In [None]:
# Stack the six lag-step training sets into one frame with a fresh 0..n-1 index
train = pd.concat(
    objs=(train1, train2, train3, train4, train5, train6),
    ignore_index=True,
)

NameError: name 'pd' is not defined

In [6]:
# Replace the three identifier-like columns with their integer category codes
# (missing values get the code -1), then clean missing data: keep only rows
# that have a position and a timestamp.
train = train.assign(**{
    column: pd.Categorical(train[column]).codes
    for column in ('navstat', 'vesselId', 'portId')
}).dropna(subset=['latitude', 'longitude', 'time'])

In [None]:

# Model inputs: the encoded vessel plus the lagged observations and the time
# elapsed since them. Coerce to numeric BEFORE splitting so the training and
# validation features go through exactly the same conversion (previously only
# X_train was coerced).
X = train[[
    'vesselId', #try
    'latitude_lag',
    'longitude_lag',
    'sog_lag',
    'sog_mean_lag',
    'cog_lag',
    'rot_lag',
    'heading_lag',
    'time_diff_seconds',
]].apply(pd.to_numeric, errors='coerce')

# Targets: the position of the current record
y = train[['latitude', 'longitude']]

# Hold out 10% of the rows for validation; fixed seed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [10]:
# One XGBoost regressor (default hyper-parameters) is fitted per target
# column, i.e. one for latitude and one for longitude
model = MultiOutputRegressor(estimator=xgb.XGBRegressor())
model.fit(X_train, y_train)

In [11]:
# Mean absolute error on the held-out rows, averaged over both target columns
y_pred_val = model.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred_val)
mae


0.3047931669227297

In [None]:
# Prediction requests (comma-delimited file)
test = pd.read_csv('ais_test.csv', sep=',')
# Known vessel positions come from the pipe-delimited training file; reading
# 'ais_test.csv' a second time with '|' would not split its comma-separated
# columns.
# NOTE(review): assumes `known_pos` is meant to hold the training records,
# as `known_positions` does earlier in the notebook - confirm.
known_pos = pd.read_csv('ais_train.csv', sep='|')
test['time'] = pd.to_datetime(test['time'], errors='coerce').dt.tz_localize('UTC')

In [None]:
def predict_future_position(id, vessel_id, time):
    """Predict where a vessel will be at `time` from its latest known record.

    The most recent row of ``training_data`` for `vessel_id` supplies the
    previous position, speed, course, rotation and heading; course and heading
    are rescaled with ``value / 180 - 1``. The number of seconds between that
    row's time and `time` is the last feature. The same feature row is passed
    to ``model_lat`` and ``model_lon``.

    NOTE(review): ``training_data``, ``model_lat`` and ``model_lon`` are read
    from the notebook namespace but are not defined in the cells visible
    above, which train a single ``model`` on differently named, unscaled
    features. Confirm where these come from before relying on this function.

    Parameters
    ----------
    id : hashable
        Identifier of the prediction request; returned unchanged.
    vessel_id :
        Value matched against ``training_data['vesselId']``.
    time : str or datetime-like
        Moment to predict for; must be comparable with the ``time`` column
        (both timezone-aware or both naive).

    Returns
    -------
    tuple
        ``(id, predicted_latitude, predicted_longitude)``.

    Raises
    ------
    IndexError
        If ``training_data`` holds no row for `vessel_id`.
    """
    # Select the vessel's rows from the same frame the mask is built from
    # (the history used to be taken from an undefined name `t`).
    history = training_data[training_data['vesselId'] == vessel_id]
    if history.empty:
        raise IndexError(f"no known positions for vessel {vessel_id!r}")

    # Latest record; with 'time' as the index, `.name` below is its timestamp
    latest_data_point = history.sort_values(by='time').set_index('time').iloc[-1]

    # Prepare the new data for prediction (insertion order = feature order)
    new_data = {
        'prev_lat': latest_data_point['latitude'],
        'prev_lon': latest_data_point['longitude'],
        'prev_speed': latest_data_point['sog'],
        'prev_course': (latest_data_point['cog'] / 180) - 1,
        'prev_rotation': latest_data_point['rot'],
        'prev_heading': (latest_data_point['heading'] / 180) - 1,
        # Seconds between the latest record and the requested time
        'time_diff_seconds': (pd.to_datetime(time) - latest_data_point.name).total_seconds(),
    }

    # Build the single feature row once and reuse it for both models
    features = [list(new_data.values())]
    return id, model_lat.predict(features)[0], model_lon.predict(features)[0]
