# Make training dataset

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

## Reading in train data and sorting

In [2]:
train_data = pd.read_csv('../data/raw/ais_train.csv', sep='|') #Reading train data

#train =train.drop(['cog','sog','rot','heading','navstat','portId','etaRaw'], axis=1) # Dropping columns

train_data = train_data.drop(['portId','etaRaw'], axis=1) #Dropping portID and etaRaw

train_data['time'] = pd.to_datetime(train_data['time']) #Convert time to datetime object

train_data = train_data.sort_values(by=['vesselId','time']) #Sort by vesselID and time

train_data.head(10) #Printing first rows

Unnamed: 0,time,cog,sog,rot,heading,navstat,latitude,longitude,vesselId
131115,2024-01-12 14:07:47,308.1,17.1,-6,316,0,7.50361,77.5834,61e9f38eb937134a3c4bfd8b
131279,2024-01-12 14:31:00,307.6,17.3,5,313,0,7.57302,77.49505,61e9f38eb937134a3c4bfd8b
131514,2024-01-12 14:57:23,306.8,16.9,5,312,0,7.65043,77.39404,61e9f38eb937134a3c4bfd8b
131696,2024-01-12 15:18:48,307.9,16.9,6,313,0,7.71275,77.31394,61e9f38eb937134a3c4bfd8b
131885,2024-01-12 15:39:47,307.0,16.3,7,313,0,7.77191,77.23585,61e9f38eb937134a3c4bfd8b
132038,2024-01-12 15:54:48,307.6,16.1,5,313,0,7.81285,77.18147,61e9f38eb937134a3c4bfd8b
132237,2024-01-12 16:14:59,309.5,16.1,-6,313,0,7.86929,77.11032,61e9f38eb937134a3c4bfd8b
132394,2024-01-12 16:35:24,308.7,16.0,2,311,0,7.92585,77.03811,61e9f38eb937134a3c4bfd8b
132538,2024-01-12 16:55:24,310.4,16.0,-1,311,0,7.98258,76.9688,61e9f38eb937134a3c4bfd8b
132673,2024-01-12 17:14:36,307.5,16.1,6,307,0,8.03598,76.90095,61e9f38eb937134a3c4bfd8b


### (Optional): Include other datasets in prediction

In [5]:
vessel_data = pd.read_csv('../data/formatted/vessels_converted.csv', sep=',') #Importing vessel data
ports_data = None
schedules_data = None


#### (Optional vessel): Include vessel data in train dataset

In [3]:
# Merge with training data
train_data = train_data.merge(vessel_data, on='vesselId', how='left')

# Handle missing values in vessel data (if any)
# For demonstration, fill missing numerical values with median and categorical with mode
vessel_numerical_cols = ['CEU', 'DWT', 'GT', 'NT', 'breadth', 'depth', 'draft', 'enginePower', 'maxHeight', 'maxSpeed', 'maxWidth', 'rampCapacity', 'yearBuilt']
vessel_categorical_cols = ['vesselType', 'homePort', 'fuel', 'freshWater']

for col in vessel_numerical_cols:
    train_data[col] = train_data[col].fillna(train_data[col].median())

for col in vessel_categorical_cols:
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])

# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

# Combine train and test vesselType categories to avoid unseen categories in test set
vessel_type_le = LabelEncoder()
train_data['vesselType_encoded'] = vessel_type_le.fit_transform(train_data['vesselType'])

home_port_le = LabelEncoder()
train_data['homePort_encoded'] = home_port_le.fit_transform(train_data['homePort'])

# Calculate vessel age at the time of each observation
train_data['vessel_age'] = train_data['time'].dt.year - train_data['yearBuilt']

# Drop original categorical columns if not needed
#train = train.drop(['vesselType', 'homePort', 'fuel', 'freshWater', 'yearBuilt'], axis=1)

# Reorder columns if desired
train_data.head(10)

NameError: name 'vessel_data' is not defined

In [None]:
print(train_data["vesselId"].value_counts())

## Creating the training dataset with past observations

`train_df`: The preprocessed training dataframe

`N_KEEP_PAST`: The number og past (historical) observations to retain as features

The function creates a new dataset containing historical data for each vesselID in the ais_train dataset in a increasing manner. 




In [4]:
def create_training_data(train_df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to collect all the data rows
    vessel_list = train_df['vesselId'].unique() # List of all unique vesselIDs
    
    for vessel in tqdm(vessel_list):
        
        vessel_data = train_df[train_df['vesselId'] == vessel].sort_values(by='time').reset_index(drop=True) #Sortin the data for each vessel chronologically by time
        
        num_rows = vessel_data.shape[0] #number of rows for the current vesselID
        
        if num_rows <= N_KEEP_PAST: # If a vessel have fewer records than N_KEEP_PAST, then skip
            continue  # Skip vessels with insufficient data
        
        for i in range(N_KEEP_PAST, num_rows): # For i in range (5 -> number of rows for that vessel) ---> skipping to first record which has variable histroical records behind
            
            # Collect past N_KEEP_PAST locations and timestamps
            past_data = vessel_data.loc[i - N_KEEP_PAST:i - 1].reset_index(drop=True) #Extracting past N_KEEP_PAST records
            
            current_data = vessel_data.loc[i] #The current record
            
            # Prepare a dictionary to hold the features and target
            data_row = {}
            target_time = current_data['time'] #The current time of the current record
            
            for j in range(N_KEEP_PAST): #For j in range 0 -> 5
                past_time = past_data.loc[j, 'time']
                
                # Calculate the difference in minutes between past time and target time
                time_diff = (target_time - past_time).total_seconds() / 60.0  # Difference in minutes
                data_row[f'minutes_from_target_{j}'] = time_diff
                data_row[f'lat_{j}'] = past_data.loc[j, 'latitude']
                data_row[f'lon_{j}'] = past_data.loc[j, 'longitude']
                #######
                data_row[f'cog_{j}'] = past_data.loc[j, 'cog']
                data_row[f'sog_{j}'] = past_data.loc[j, 'sog']
                data_row[f'rot_{j}'] = past_data.loc[j, 'rot']
                data_row[f'heading_{j}'] = past_data.loc[j, 'heading']
                data_row[f'navstat_{j}'] = past_data.loc[j, 'navstat']
                ######
                
            #Vessel attributes from current record is included (Will trigger errror if not vessel data included in train dataset)
            
            # for col in ['CEU', 'DWT', 'GT', 'NT', 'breadth', 'depth', 'draft',
            #             'enginePower', 'maxHeight', 'maxSpeed', 'maxWidth', 'rampCapacity',
            #             'vesselType_encoded', 'homePort_encoded', 'vessel_age']:
            #     data_row[col] = current_data[col]
                
                
            # Add current location as the target_
            data_row['target_lat'] = current_data['latitude'] #Current latitude
            
            data_row['target_lon'] = current_data['longitude'] #Current longitude
            
            data_row['vesselId'] = vessel  # Include vesselId if needed
            
            data_row['target_time'] = target_time # Remove eventually
            
            # Append the row to the list
            data_rows.append(data_row)
            
    # Create final DataFrame from the list of data rows
    final_df = pd.DataFrame(data_rows)
    return final_df
        

### (Optional): Sampling dataset to see output

In [10]:
#Modifying train
train_data = train_data.iloc[0:50]

## Defying N_KEEP_PAST and creating train dataset

In [5]:
N_KEEP_PAST = 7 #Declare N_KEEP_PAST

final_train = create_training_data(train_data, N_KEEP_PAST) #Creating train dataset
final_train.to_csv('final_train_data.csv', index=False) #Writing to csv file

# print(final_train.shape) #Print dimensions of dataframe
# display(final_train.tail()) #Printing tail of dataset


100%|██████████| 688/688 [08:53<00:00,  1.29it/s]  


# Modify Test dataset

In [6]:
test_data = pd.read_csv('../data/raw/ais_test.csv') #Reading test dataset

test_data['time'] = pd.to_datetime(test_data['time']) #Converting time column to datetime object

### (Optional): Include vessel data in test dataset

In [None]:
test_data = test_data.merge(vessel_data, on='vesselId', how='left') #Merging with vessels data

# Encoding categorical variables in test data (such as in train) 
#Needs to be excluded when dropping vessel
test_data['vesselType_encoded'] = vessel_type_le.transform(test_data['vesselType'])
test_data['homePort_encoded'] = home_port_le.transform(test_data['homePort'])


for col in vessel_numerical_cols:
    test_data[col] = test_data[col].fillna(train_data[col].median())  # Use median from train

for col in vessel_categorical_cols:
    test_data[col] = test_data[col].fillna(train_data[col].mode()[0]) 
    
if 'yearBuilt' in test_data.columns:
    test_data['vessel_age'] = test_data['time'].dt.year - test_data['yearBuilt']
else:
    print("'yearBuilt' is not present in test data after merging.")
    # Decide how to handle this case
    # For example, you can set a default vessel age
    test_data['vessel_age'] = test_data['time'].dt.year - train_data['yearBuilt'].median()

In [None]:
print(test_data.shape)
display(test_data.head())

In [7]:
def create_test_features(train_df: pd.DataFrame, test_df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to collect all the data rows
    vessel_list = test_df['vesselId'].unique() #List of all unique vesselIDs
    
    for vessel in tqdm(vessel_list):
        
        test_vessel_data = test_df[test_df['vesselId'] == vessel] #Get test data for the current vessel
        
        for idx, test_row in test_vessel_data.iterrows():
            target_time = test_row['time']
            ID = test_row['ID']
            scaling_factor = test_row['scaling_factor']  # If needed later
            
            
            # Get past data from train_data for this vessel before the target time
            vessel_train_data = train_df[(train_df['vesselId'] == vessel) & (train_df['time'] < target_time)] #Retrieving data before target time
            vessel_train_data = vessel_train_data.sort_values(by='time').reset_index(drop=True) #Sorted by time
            num_past_points = vessel_train_data.shape[0] #Number of past records before target time
            
            if num_past_points < N_KEEP_PAST:
                # Not enough past data; decide how to handle (skip or pad with NaNs)
                continue  # Or handle as per your requirement
            
            # Get the last N_KEEP_PAST records
            past_data = vessel_train_data.iloc[-N_KEEP_PAST:].reset_index(drop=True)
            
            # Prepare a dictionary to hold the features
            data_row = {}
            for j in range(N_KEEP_PAST):
                past_time = past_data.loc[j, 'time']
                # Calculate the difference in minutes between past time and target time
                time_diff = (target_time - past_time).total_seconds() / 60.0  # Difference in minutes
                data_row[f'minutes_from_target_{j}'] = time_diff
                data_row[f'lat_{j}'] = past_data.loc[j, 'latitude']
                data_row[f'lon_{j}'] = past_data.loc[j, 'longitude']
                ###
                data_row[f'cog_{j}'] = past_data.loc[j, 'cog']
                data_row[f'sog_{j}'] = past_data.loc[j, 'sog']
                data_row[f'rot_{j}'] = past_data.loc[j, 'rot']
                data_row[f'heading_{j}'] = past_data.loc[j, 'heading']
                data_row[f'navstat_{j}'] = past_data.loc[j, 'navstat']
                ###
            # for col in ['CEU', 'DWT', 'GT', 'NT', 'breadth', 'depth', 'draft',
            #             'enginePower', 'maxHeight', 'maxSpeed', 'maxWidth', 'rampCapacity',
            #             'vesselType_encoded', 'homePort_encoded', 'vessel_age']:
            #     data_row[col] = test_row[col]
            # Include vesselId and ID for result matching
            data_row['vesselId'] = vessel
            data_row['ID'] = ID
            # Append the row to the list
            data_rows.append(data_row)
    
    # Create test features DataFrame from the list of data rows
    test_features = pd.DataFrame(data_rows)
    return test_features

In [8]:
test_final = create_test_features(train_data, test_data, N_KEEP_PAST)

100%|██████████| 215/215 [36:36<00:00, 10.22s/it]


In [9]:
display(test_final.head())

Unnamed: 0,minutes_from_target_0,lat_0,lon_0,cog_0,sog_0,rot_0,heading_0,navstat_0,minutes_from_target_1,lat_1,...,minutes_from_target_6,lat_6,lon_6,cog_6,sog_6,rot_6,heading_6,navstat_6,vesselId,ID
0,846.083333,31.14645,-81.49791,316.3,0.0,-4,344,5,810.116667,31.14645,...,15.0,31.14647,-81.49789,179.6,0.0,0,344,5,61e9f3aeb937134a3c4bfe3d,0
1,879.05,31.14645,-81.49791,316.3,0.0,-4,344,5,843.083333,31.14645,...,47.966667,31.14647,-81.49789,179.6,0.0,0,344,5,61e9f3aeb937134a3c4bfe3d,143
2,894.016667,31.14645,-81.49791,316.3,0.0,-4,344,5,858.05,31.14645,...,62.933333,31.14647,-81.49789,179.6,0.0,0,344,5,61e9f3aeb937134a3c4bfe3d,282
3,915.05,31.14645,-81.49791,316.3,0.0,-4,344,5,879.083333,31.14645,...,83.966667,31.14647,-81.49789,179.6,0.0,0,344,5,61e9f3aeb937134a3c4bfe3d,426
4,933.05,31.14645,-81.49791,316.3,0.0,-4,344,5,897.083333,31.14645,...,101.966667,31.14647,-81.49789,179.6,0.0,0,344,5,61e9f3aeb937134a3c4bfe3d,551


In [10]:
final_train.to_csv('final_train.csv', index=False)
test_final.to_csv('test_final.csv', index=False)

# Machine learning part

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
import xgboost as xgb
import lightgbm as lgb
from scipy.optimize import minimize

# Load the data
test_final = pd.read_csv('test_final.csv')
final_train = pd.read_csv('final_train.csv')

# Define features and targets
features = final_train.drop(columns=['target_lat', 'target_lon', 'vesselId', 'target_time'])
targets = final_train[['target_lat', 'target_lon']]

feature_columns = features.columns
X_test = test_final[feature_columns]

# Define the models
models = {
    'RandomForest': MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, max_depth=4, n_jobs=8)),
    'XGBoost': MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, seed=42)),
    'ElasticNet': MultiOutputRegressor(ElasticNet(random_state=42)),
    'LightGBM': MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1))
}

# Prepare arrays to hold OOF predictions and test predictions
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = {model_name: np.zeros((features.shape[0], targets.shape[1])) for model_name in models}
test_preds = {model_name: np.zeros((X_test.shape[0], targets.shape[1], n_splits)) for model_name in models}

# Perform cross-validation and collect predictions
for fold, (train_idx, valid_idx) in enumerate(kf.split(features, targets)):
    X_train, y_train = features.iloc[train_idx], targets.iloc[train_idx]
    X_valid, y_valid = features.iloc[valid_idx], targets.iloc[valid_idx]
    
    for model_name, model in models.items():
        print(f"Training and predicting with model: {model_name}")
        clf = model
        clf.fit(X_train, y_train)
        y_pred_valid = clf.predict(X_valid)
        y_pred_test = clf.predict(X_test)
        
        # Save OOF predictions
        oof_preds[model_name][valid_idx] = y_pred_valid
        # Save test predictions
        test_preds[model_name][:,:,fold] = y_pred_test

# Define the loss function for optimization
def mse_loss(weights):
    weights = np.array(weights)
    # Normalize weights to sum to 1
    weights = weights / np.sum(weights)
    # Combine OOF predictions using the weights
    final_oof = np.zeros_like(targets.values)
    for i, model_name in enumerate(models):
        final_oof += weights[i] * oof_preds[model_name]
    # Compute mean squared error
    mse = mean_squared_error(targets.values, final_oof)
    return mse

# Optimization methods to try
#methods = [
#    'Nelder-Mead', 'Powell', 'trust-constr', 'CG', 'BFGS', 'Newton-CG',
#    'L-BFGS-B', 'TNC', 'COBYLA', 'SLSQP', 'dogleg', 'trust-ncg',
#    'trust-exact', 'trust-krylov'
#]
methods = [
    'Nelder-Mead', 'Powell',  'CG', 'BFGS',
    'L-BFGS-B', 'TNC', 'SLSQP', 
] # 'trust-constr' is just really slow and not performing well at the this moment

# Initial weights
initial_weights = np.ones(len(models)) / len(models)

# Constraints and bounds
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1)] * len(models)

# Optimize weights using different methods
best_mse = np.inf
best_weights = None
best_method = None

for method in methods:
    print(f"Optimizing weights using method: {method}")
    try:
        if method in ['trust-constr', 'COBYLA', 'SLSQP', 'trust-ncg', 'trust-krylov', 'trust-exact']:
            res = minimize(mse_loss, initial_weights, method=method, bounds=bounds, constraints=constraints)
        elif method in ['L-BFGS-B', 'TNC']:
            res = minimize(mse_loss, initial_weights, method=method, bounds=bounds)
        else:
            # For unconstrained methods, weights will be normalized in mse_loss
            res = minimize(mse_loss, initial_weights, method=method)
        if res.fun < best_mse:
            best_mse = res.fun
            best_weights = res.x / np.sum(res.x)  # Normalize weights
            best_method = method
        print(f"Method: {method}, MSE: {res.fun}")
    except Exception as e:
        print(f"Method: {method}, failed with error: {e}")

# Average test predictions over folds for each model
for model_name in models:
    test_preds[model_name] = np.mean(test_preds[model_name], axis=2)  # Average over folds

# Combine the test predictions using the best weights
final_test_pred = np.zeros((X_test.shape[0], targets.shape[1]))
for i, model_name in enumerate(models):
    final_test_pred += best_weights[i] * test_preds[model_name]

# Create a DataFrame with IDs and predictions
prediction_df = pd.DataFrame({
    'ID': test_final['ID'],
    'longitude_predicted': final_test_pred[:, 1],
    'latitude_predicted': final_test_pred[:, 0]
})

# Save the predictions to a CSV file
prediction_df.to_csv('predictions10.csv', index=False)

# Print the best method and weights
print(f"Best optimization method: {best_method}")
print("Best weights:")
for i, model_name in enumerate(models):
    print(f"{model_name}: {best_weights[i]:.4f}")


Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM


2.4149950074734456
