# DATA PROCESSING FOR XGB AUTO-REGRESSION

In [1]:
import pandas as pd 
import xgboost as xgb
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from calendar import monthrange
pd.set_option('display.max_rows', None)

In [2]:
train = pd.read_csv('ais_train.csv', sep='|')
test = pd.read_csv('ais_test.csv')

In [3]:
train.shape

(1522065, 11)

In [4]:
train['source'] = 'train'
test['source'] = 'test'

comb = pd.concat([train, test], axis=0, ignore_index=True)

In [5]:
comb[comb['source'] == 'test'].head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,source,ID,scaling_factor
1522065,2024-05-08 00:03:16,,,,,,,,,61e9f3aeb937134a3c4bfe3d,,test,0.0,0.3
1522066,2024-05-08 00:06:17,,,,,,,,,61e9f473b937134a3c4c02df,,test,1.0,0.3
1522067,2024-05-08 00:10:02,,,,,,,,,61e9f469b937134a3c4c029b,,test,2.0,0.3
1522068,2024-05-08 00:10:34,,,,,,,,,61e9f45bb937134a3c4c0221,,test,3.0,0.3
1522069,2024-05-08 00:12:27,,,,,,,,,61e9f38eb937134a3c4bfd8d,,test,4.0,0.3


In [6]:
test.shape

(51739, 5)

In [7]:
comb.sort_values(by=['vesselId', 'time']).head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,source,ID,scaling_factor
131115,2024-01-12 14:07:47,308.1,17.1,-6.0,316.0,0.0,01-08 06:00,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,61d376b393c6feb83e5eb50c,train,,
131279,2024-01-12 14:31:00,307.6,17.3,5.0,313.0,0.0,01-14 23:30,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,train,,
131514,2024-01-12 14:57:23,306.8,16.9,5.0,312.0,0.0,01-14 23:30,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,train,,
131696,2024-01-12 15:18:48,307.9,16.9,6.0,313.0,0.0,01-14 23:30,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,train,,
131885,2024-01-12 15:39:47,307.0,16.3,7.0,313.0,0.0,01-14 23:30,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,train,,


In [8]:
comb[comb['source'] == 'test'].head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,source,ID,scaling_factor
1522065,2024-05-08 00:03:16,,,,,,,,,61e9f3aeb937134a3c4bfe3d,,test,0.0,0.3
1522066,2024-05-08 00:06:17,,,,,,,,,61e9f473b937134a3c4c02df,,test,1.0,0.3
1522067,2024-05-08 00:10:02,,,,,,,,,61e9f469b937134a3c4c029b,,test,2.0,0.3
1522068,2024-05-08 00:10:34,,,,,,,,,61e9f45bb937134a3c4c0221,,test,3.0,0.3
1522069,2024-05-08 00:12:27,,,,,,,,,61e9f38eb937134a3c4bfd8d,,test,4.0,0.3


## Cleaning AIS data

In [9]:
# Replacing default with Nan bacause too close to valid values, eliminate non valid values
comb['cog'] = comb['cog'].replace(360, np.nan)
comb = comb[(comb['cog'] <= 360) | (comb['cog'].isna())]

In [10]:
comb[comb['source'] == 'test'].head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,source,ID,scaling_factor
1522065,2024-05-08 00:03:16,,,,,,,,,61e9f3aeb937134a3c4bfe3d,,test,0.0,0.3
1522066,2024-05-08 00:06:17,,,,,,,,,61e9f473b937134a3c4c02df,,test,1.0,0.3
1522067,2024-05-08 00:10:02,,,,,,,,,61e9f469b937134a3c4c029b,,test,2.0,0.3
1522068,2024-05-08 00:10:34,,,,,,,,,61e9f45bb937134a3c4c0221,,test,3.0,0.3
1522069,2024-05-08 00:12:27,,,,,,,,,61e9f38eb937134a3c4bfd8d,,test,4.0,0.3


In [11]:
err

NameError: name 'err' is not defined

In [None]:


# Replacing default with Nan bacause too close to valid values
comb['sog'] = comb['sog'].replace(1023, np.nan)

# Replacing default with Nan bacause too close to valid values
# Changing uncertain values to bigger number to be further away from sample pool
# Adding uncertainty flag
comb['rot'] = comb['rot'].replace(128, np.nan)
comb['rot'] = comb['rot'].replace({127: 200, -127: -200})
comb['uncertain_rot'] = np.where(comb['rot'].isin([200, -200]), 1, 0)

# Replacing default value with NaN to not get taken in consideration by regression
comb['heading'] = comb['heading'].replace(511, np.nan)

In [None]:
comb[comb['source'] == 'test'].head()

In [None]:
# One hot encoding for the navigation status
comb = pd.get_dummies(comb, columns=['navstat'], drop_first = True)

In [None]:
# Get Date-Time from standard and raw data
comb['time'] = pd.to_datetime(comb['time'], errors='coerce').dt.tz_localize('UTC')
comb['etaRaw'] = comb['etaRaw'].fillna(0)
comb['etaRaw'] = comb['etaRaw'].apply(lambda x: f"{2024}-{x}")
comb['etaRaw'] = pd.to_datetime(comb['etaRaw'], errors='coerce').dt.tz_localize('UTC')
comb.rename(columns={'etaRaw': 'etaStd'}, inplace=True)

# Handle first month of the years ETA year to be 2023
comb['etaStd'] = comb.apply(lambda row: row['etaStd'].replace(year=row['etaStd'].year - 1)
                        if row['etaStd'].month in [11, 12] and row['time'].month in [1, 2] 
                        else row['etaStd'], axis=1) 
comb['etaStd'].head(10)

## Clean schedules data
TODO: decide if to use the schedule data

In [None]:
schedules = pd.read_csv('schedules_to_may_2024.csv', sep='|')
#schedules.drop(['shippingLineName', 'portName'], axis=1, inplace=True)
#schedules.rename(columns={'portId': 'portId_sched'}, inplace=True)

In [None]:
schedules.sort_values(by=['vesselId', 'arrivalDate']).head(100)

In [None]:
#Schedules has a lot of duplicates
schedules.duplicated().sum(), schedules.drop_duplicates(inplace=True) 

In [None]:
schedules['arrivalDate'] = pd.to_datetime(schedules['arrivalDate'])
schedules['sailingDate'] = pd.to_datetime(schedules['sailingDate'])

In [None]:
# Check information distribution in schedules
schedules['year_month'] = schedules['arrivalDate'].dt.to_period('M')

monthly_counts = schedules.groupby('year_month').size()

plt.figure(figsize=(12, 6))
monthly_counts.plot(kind='bar', color='skyblue')
plt.title('Monthly Frequency of Vessel Information in Schedules')
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
schedules.drop(['year_month'], axis=1, inplace=True)

## Cleaning vessels data

In [None]:
vessels = pd.read_csv('vessels.csv', sep='|')
# Drop useless data
vessels.drop('GT', axis=1, inplace=True)
vessels.drop('NT', axis=1, inplace=True)
vessels.drop('depth', axis=1, inplace=True)
vessels.drop('draft', axis=1, inplace=True)
vessels.drop('homePort', axis=1, inplace=True)
vessels.drop('maxHeight', axis=1, inplace=True)
vessels.drop('maxWidth', axis=1, inplace=True)
# One hot encoding for vessels types
vessels = pd.get_dummies(vessels, columns=['vesselType']) #no drop_first to handle the NaN

In [None]:
comb = pd.merge(comb, vessels, on='vesselId', how='left')
comb.head(10)

# Added: shippingLineId | CEU | DWT | vesselType(1 Hot) | breath | lenght 

## Cleaning ports data

In [None]:
ports = pd.read_csv('ports.csv', sep='|')
# Drop useless
ports.drop('name', axis=1, inplace=True)
ports.drop('portLocation', axis=1, inplace=True)
ports.drop('countryName', axis=1, inplace=True)
ports.drop('UN_LOCODE', axis=1, inplace=True)
ports.drop('ISO', axis=1, inplace=True)
# Rename to differenciate
ports.rename(columns={'longitude': 'portLongitude', 'latitude': 'portLatitude'}, inplace=True)

In [None]:
comb = pd.merge(comb, ports, on='portId', how='left')
comb.head(10)
# Added: portLongitude | portLatitude

# FEATURE ENGINEERING

In [None]:
# Sort train data on vesselId and by time
comb = comb.sort_values(by=['vesselId', 'time'])

### Time since last data collection

In [None]:
comb['time_diff_minutes'] = comb.groupby('vesselId')['time'].diff().dt.total_seconds() / 60
# Make integer changing numbers 0<x<1 to one so that the 0 is only a separator between ships
comb['time_diff_minutes'] = comb['time_diff_minutes'].fillna(0).apply(lambda x: 1 if 0 < x < 1 else x).astype(int)

### Distance since last data collection

In [None]:
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    slightly modified version: of http://stackoverflow.com/a/29546836/2901002

    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees or in radians)

    All (lat, lon) coordinates must have numeric dtypes and be of equal length.

    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    a = np.sin((lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2

    return earth_radius * 2 * np.arcsin(np.sqrt(a))


In [None]:
def calculate_distances(df):
    # Create new columns for previous latitude and longitude
    df['prev_latitude'] = df.groupby('vesselId')['latitude'].shift(1)
    df['prev_longitude'] = df.groupby('vesselId')['longitude'].shift(1)

    # Calculate distance between current and previous position
    df['distance_km'] = df.apply(
        lambda row: haversine(row['latitude'], row['longitude'], 
                               row['prev_latitude'], row['prev_longitude']) 
                     if pd.notna(row['prev_latitude']) and pd.notna(row['prev_longitude']) 
                     else np.nan, axis=1)
    
    # Drop the helper columns if not needed
    df.drop(columns=['prev_latitude', 'prev_longitude'], inplace=True)

    return df

In [None]:
comb = calculate_distances(comb)

### Calculate average speed in previous stretch

In [None]:
comb['mph_last_stretch'] = (comb['distance_km'] *  0.621371) / (comb['time_diff_minutes'] / 60)

In [None]:
comb[comb['source'] == 'test'].head()

In [None]:
# Remove unrealist values over the datadase
comb = comb[comb['mph_last_stretch']<60] 

In [None]:
comb[comb['source'] == 'test'].head()

## Lag features and moving average

In [None]:
# Lagging features 
comb['latitude_lag1'] = comb.groupby('vesselId')['latitude'].shift(1)
comb['longitude_lag1'] = comb.groupby('vesselId')['longitude'].shift(1)
comb['mph_last_stretch_lag'] = comb.groupby('vesselId')['mph_last_stretch'].shift(1)
comb['distance_km_lag'] = comb.groupby('vesselId')['distance_km'].shift(1)

## Time sin/cos encoding

In [None]:
# Separate Date-Time in single attributes
comb['year_rec'] = comb['time'].dt.year
comb['month_rec'] = comb['time'].dt.month
comb['day_rec'] = comb['time'].dt.day
comb['hour_rec'] = comb['time'].dt.hour
comb['minute_rec'] = comb['time'].dt.minute



comb['year_eta'] = comb['etaStd'].dt.year.fillna(0).astype('int32')
comb['month_eta'] = comb['etaStd'].dt.month.fillna(0).astype('int32')
comb['day_eta'] = comb['etaStd'].dt.day.fillna(0).astype('int32')
comb['hour_eta'] = comb['etaStd'].dt.hour.fillna(0).astype('int32')
comb['minute_eta'] = comb['etaStd'].dt.minute.fillna(0).astype('int32')


In [None]:
def get_month_progress(row, time_type = '_rec'):
    if row[f'month{time_type}'] == 0:
        return np.nan
    days_in_month = monthrange(row[f'year{time_type}'], row[f'month{time_type}'])[1]
    day_progress = (row[f'day{time_type}'] - 1 + row[f'hour{time_type}']/24 + row[f'minute{time_type}']/(24*60)) / days_in_month
    return day_progress

In [None]:
# Recordings time
comb['month_sin_rec'] = np.sin(2 * np.pi * comb['month_rec'] / 12)
comb['month_cos_rec'] = np.cos(2 * np.pi * comb['month_rec'] / 12)

comb['day_progress_rec'] = comb.apply(lambda row: get_month_progress(row, '_rec'), axis=1)
comb['day_sin_rec'] = np.sin(2 * np.pi * comb['day_progress_rec'])
comb['day_cos_rec'] = np.cos(2 * np.pi * comb['day_progress_rec'])

hour_progress = comb['hour_rec'] + comb['minute_rec']/60
comb['hour_sin_rec'] = np.sin(2 * np.pi * hour_progress / 24)
comb['hour_cos_rec'] = np.cos(2 * np.pi * hour_progress / 24)

comb['minute_sin_rec'] = np.sin(2 * np.pi * comb['minute_rec'] / 60)
comb['minute_cos_rec'] = np.cos(2 * np.pi * comb['minute_rec'] / 60)



# ETA
comb['month_sin_eta'] = np.sin(2 * np.pi * comb['month_eta'] / 12)
comb['month_cos_eta'] = np.cos(2 * np.pi * comb['month_eta'] / 12)

comb['day_progress_eta'] = comb.apply(lambda row: get_month_progress(row, '_eta'), axis=1)
comb['day_sin_eta'] = np.sin(2 * np.pi * comb['day_progress_eta'])
comb['day_cos_eta'] = np.cos(2 * np.pi * comb['day_progress_eta'])

hour_progress = comb['hour_eta'] + comb['minute_eta']/60
comb['hour_sin_eta'] = np.sin(2 * np.pi * hour_progress / 24)
comb['hour_cos_eta'] = np.cos(2 * np.pi * hour_progress / 24)


## Add a flag if there's a probability of looking at a new trip 

In [None]:
# Selecting a threshold where we tell the model to consider the posibility of a new trip starting
# Current: 8h20m, 200km traveled (15mph avg speed)
# Not realistic but reasonable to consider shorter path for training
# Maybe change later
comb['new_trip'] = np.where(
    (comb['time_diff_minutes'] == 0) | (comb['time_diff_minutes'] >= 500) |
    (comb['distance_km'].isna()) | (comb['distance_km'] >= 200),
    1, 0  # Set to 1 if conditions are met, else 0
)
# Might add differerence in eta as an indicator

# TRAINING

In [None]:
comb.dtypes

In [None]:
# Encode IDs 
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
comb['vesselId_encoded'] = label_encoder.fit_transform(comb['vesselId'])
comb['portId_encoded'] = label_encoder.fit_transform(comb['portId'])
comb['shippingLineId_encoded'] = label_encoder.fit_transform(comb['shippingLineId'])

# Drop non valid columns
comb.drop(columns=['vesselId'], inplace=True)
comb.drop(columns=['portId'], inplace=True)
comb.drop(columns=['shippingLineId'], inplace=True)
comb.drop(columns=['time'], inplace=True)
comb.drop(columns=['etaStd'], inplace=True)

# Split data after porcessing
train_separated = comb[comb['source'] == 'train']
test_separated = comb[comb['source'] == 'train']

In [None]:
test_separated = comb[comb['source'] == 'test']

In [None]:
train_separated.head()

In [None]:
test_separated.head()

In [None]:
err
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# Replace 'target_column1' and 'target_column2' with the names of your target columns
target_columns = ['latitude', 'longitude']
X_train = train_separated.drop(columns=target_columns)
y_train = train_separated[target_columns]

X_test = test_separated.drop(columns=target_columns)

# Instantiate the multi-output model with XGBoost as the base regressor
multi_output_model = MultiOutputRegressor(XGBRegressor(enable_categorical=True))

# Train the model
multi_output_model.fit(X_train, y_train)

# Now you can use multi_output_model to predict on new data
y_pred = multi_output_model.predict(X_test)

In [None]:
# Now you can use multi_output_model to predict on new data
y_pred = multi_output_model.predict(X_test)

In [None]:
y_pred_df = pd.DataFrame(y_pred, columns=['predicted_longitude', 'predicted_latitude'])
y_pred_df.head()