# DATA PROCESSING FOR XGB TRAINED ON EVERY DAY DISTANCE

In [1]:
import pandas as pd 
import xgboost as xgb
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from calendar import monthrange
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [18]:
def preprocess(df):
    # Replacing default with Nan bacause too close to valid values, eliminate non valid values
    df['cog'] = df['cog'].replace(360, np.nan)
    df = df[(df['cog'] <= 360) | (df['cog'].isna())]

    # Replacing default with Nan bacause too close to valid values
    df['sog'] = df['sog'].replace(1023, np.nan)

    # Replacing default with Nan bacause too close to valid values
    # Changing uncertain values to bigger number to be further away from sample pool
    # Adding uncertainty flag
    df['rot'] = df['rot'].replace(128, np.nan)
    df['rot'] = df['rot'].replace({127: 200, -127: -200})
    df['uncertain_rot'] = np.where(df['rot'].isin([200, -200]), 1, 0)

    # Replacing default value with NaN to not get taken in consideration by regression
    df['heading'] = df['heading'].replace(511, np.nan)

    
    df['time'] = pd.to_datetime(df['time'], errors='coerce').dt.tz_localize('UTC')
    df['etaRaw'] = df['etaRaw'].fillna(0)
    df['etaRaw'] = df['etaRaw'].apply(lambda x: f"{2024}-{x}")
    df['etaRaw'] = pd.to_datetime(df['etaRaw'], errors='coerce').dt.tz_localize('UTC')
    df.rename(columns={'etaRaw': 'etaStd'}, inplace=True)

    # Handle first month of the years ETA year to be 2023
    df['etaStd'] = df.apply(lambda row: row['etaStd'].replace(year=row['etaStd'].year - 1)
                            if row['etaStd'].month in [11, 12] and row['time'].month in [1, 2] 
                            else row['etaStd'], axis=1)  
    
    
    return df

In [3]:
def process_time(df):
    # Separate Date-Time in single attributes
    df['year_rec'] = df['time'].dt.year
    df['month_rec'] = df['time'].dt.month
    df['day_rec'] = df['time'].dt.day
    df['hour_rec'] = df['time'].dt.hour
    df['minute_rec'] = df['time'].dt.minute
    df['dayofweek_rec'] = df['time'].dt.dayofweek
    
    df['etaStd_seq'] = df['etaStd'].astype(int) / 10**9  # Converts to seconds
    df['time_seq'] = df['time'].astype(int) / 10**9  # Converts to seconds
    return df

In [4]:
def add_new_trip_flag(df):
    # Selecting a threshold where we tell the model to consider the posibility of a new trip starting
    # Current: 8h20m, 200km traveled (15mph avg speed)
    # Not realistic but reasonable to consider shorter path for training
    # Maybe change later
    df['new_trip'] = np.where(
        (df['time_diff_minutes'] == 0) | (df['time_diff_minutes'] >= 500) |
        (df['distance_km'].isna()) | (df['distance_km'] >= 200),
        1, 0  # Set to 1 if conditions are met, else 0
    )
    # Might add differerence in eta as an indicator
    return df

In [5]:
def add_lag_features(df, features_to_lag, steps=1):
    
    for step in range(1, steps + 1):
            for feature in features_to_lag:
                lag_col = f'{feature}_lag{step}'
                df[lag_col] = df.groupby('vesselId')[feature].shift(step)
        
    return df

# TRAIN ON DIFFERENT TIME HORIZON 

## Process train

In [19]:
# Load train data
train = pd.read_csv('ais_train.csv', sep ='|')  # Replace with your dataset

# Preprocess train
train = preprocess(train)

train['navstat'] = pd.Categorical(train['navstat']).codes
train['vesselId'] = pd.Categorical(train['vesselId']).codes
train['portId'] = pd.Categorical(train['portId']).codes


# Clean missing data
train = train.dropna(subset=['latitude', 'longitude', 'time'])

# Add lags for latitude and longitude
# Based on an average data collection interval of 60 minutes
for days in range(1, 6):
    train[f'latitude_{days}day'] = train['latitude'].shift(-days * 24)
    train[f'longitude_{days}day'] = train['longitude'].shift(-days * 24)


In [22]:
X_train = train.drop(columns=['latitude', 'longitude', 'time'] + [f'latitude_{days}day' for days in range(1, 6)] + [f'longitude_{days}day' for days in range(1, 6)])

X_train = X_train.apply(pd.to_numeric, errors='coerce')

In [23]:
X_train.head()

Unnamed: 0,cog,sog,rot,heading,navstat,etaStd,vesselId,portId,uncertain_rot
0,284.0,0.7,0.0,88.0,0,1704841200000000000,50,40,0
1,109.6,0.0,-6.0,347.0,1,1703880000000000000,189,674,0
2,111.0,11.0,0.0,112.0,0,1704186000000000000,432,353,0
3,96.4,0.0,0.0,142.0,1,1704052800000000000,110,18,0
4,214.0,19.7,0.0,215.0,0,1706184000000000000,356,605,0


In [None]:
test = pd.read_csv('ais_test.csv', sep ='|')

test['time'] = pd.to_datetime(test['time'], errors='coerce').dt.tz_localize('UTC')

In [None]:
# Train a different model for each time horizon

for days in range(1, 6):
    print(f"\n--- Predicting {days} day(s) into the future ---")

    # Define target (Y) for latitude and longitude for the current day horizon
    Y_latitude_future = train[f'latitude_{days}day']
    Y_longitude_future = train[f'longitude_{days}day']

    # Split the data into training and testing sets (80% training, 20% testing)
    X_train, X_test, Y_lat_train, Y_lat_test = train_test_split(X, Y_latitude_future, test_size=0.2, random_state=42)
    X_train, X_test, Y_lon_train, Y_lon_test = train_test_split(X, Y_longitude_future, test_size=0.2, random_state=42)

    # Instanciate model
    rf_reg_lat = RandomForestRegressor(n_estimators=n_estimators, random_state=42, warm_start=True)
    rf_reg_lon = RandomForestRegressor(n_estimators=n_estimators, random_state=42, warm_start=True)

    # Train and display progress for latitude prediction
    for i in tqdm(range(1, n_estimators + 1), desc=f"Training RandomForest for Latitude ({days} days ahead)"):
        rf_reg_lat.set_params(n_estimators=i)
        rf_reg_lat.fit(X_train, Y_lat_train)

    # Train and display progress for longitude prediction
    for i in tqdm(range(1, n_estimators + 1), desc=f"Training RandomForest for Longitude ({days} days ahead)"):
        rf_reg_lon.set_params(n_estimators=i)
        rf_reg_lon.fit(X_train, Y_lon_train)

    # Make predictions on the test set for both latitude and longitude
    lat_predictions = rf_reg_lat.predict(X_test)
    lon_predictions = rf_reg_lon.predict(X_test)