# Make training dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
from sklearn.preprocessing import LabelEncoder

# --- Raw AIS observations -------------------------------------------------
# The raw file is '|'-delimited. portId and etaRaw are discarded immediately;
# the remaining columns (including cog/sog/rot/heading/navstat) are kept.
train = (
    pd.read_csv('../data/raw/ais_train.csv', sep='|')
    .drop(['portId', 'etaRaw'], axis=1)
)
train['time'] = pd.to_datetime(train['time'])
train = train.sort_values(by=['vesselId', 'time'])

# --- Static vessel attributes ---------------------------------------------
# Left join: every AIS row is kept, rows whose vesselId is missing from the
# vessels table get NaN attributes that are imputed below.
vessels = pd.read_csv('../data/cleaned/cleaned_vessels.csv', sep=',')
train = train.merge(vessels, on='vesselId', how='left')
print(train.columns)

# --- Imputation -------------------------------------------------------------
# Numerical gaps are filled with the column median, categorical gaps with the
# most frequent value. Both statistics are computed on the merged train frame.
numerical_cols = [
    'CEU', 'DWT', 'GT', 'NT', 'breadth', 'depth', 'draft', 'enginePower',
    'maxHeight', 'maxSpeed', 'maxWidth', 'rampCapacity', 'yearBuilt',
]
categorical_cols = ['vesselType', 'homePort', 'fuel', 'freshWater']

for col in numerical_cols:
    train[col] = train[col].fillna(train[col].median())

for col in categorical_cols:
    train[col] = train[col].fillna(train[col].mode()[0])

# --- Label encoding ---------------------------------------------------------
# The encoders are fit on the train frame only; they are kept as globals so
# the test cell can reuse them with .transform().
vessel_type_le = LabelEncoder()
train['vesselType_encoded'] = vessel_type_le.fit_transform(train['vesselType'])

home_port_le = LabelEncoder()
train['homePort_encoded'] = home_port_le.fit_transform(train['homePort'])

# Vessel age in years at the time of each observation.
train['vessel_age'] = train['time'].dt.year - train['yearBuilt']

# The original categorical columns and yearBuilt are intentionally kept.
train.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/ais_train.csv'

In [None]:
# Show how many rows each vesselId contributes to `train`.
print(train["vesselId"].value_counts())

# Debugging aid (disabled): uncomment the line below to restrict `train` to
# the two vessels 6323f2287abc89c0a9631e57 and 61e9f466b937134a3c4c0273.

#train = train[train['vesselId'].isin(['6323f2287abc89c0a9631e57', '61e9f466b937134a3c4c0273'])]

In [52]:
def create_training_data(df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    """Build a supervised table of sliding windows over each vessel's track.

    Each vessel's rows are ordered by ``time``. Every observation that has at
    least ``N_KEEP_PAST`` earlier observations becomes one output row holding:

    * for each of the ``N_KEEP_PAST`` preceding observations ``j`` (oldest
      first): ``minutes_from_target_j`` (minutes between that observation and
      the target), ``lat_j``, ``lon_j``, ``cog_j``, ``sog_j``, ``rot_j``,
      ``heading_j`` and ``navstat_j``;
    * the static vessel columns, read from the target observation;
    * ``target_lat`` / ``target_lon`` (position of the target observation),
      ``vesselId`` and ``target_time``.

    Vessels with ``N_KEEP_PAST`` rows or fewer contribute nothing.

    Args:
        df: Frame containing ``vesselId``, ``time`` (datetime), ``latitude``,
            ``longitude``, ``cog``, ``sog``, ``rot``, ``heading``, ``navstat``
            and the static vessel columns listed in ``static_cols`` below.
        N_KEEP_PAST: Number of preceding observations used as features.

    Returns:
        One row per (vessel, target observation); an empty DataFrame when no
        vessel has enough rows.

    Raises:
        ValueError: If ``N_KEEP_PAST`` is negative.
        KeyError: If a required column is missing from ``df``.
    """
    if N_KEEP_PAST < 0:
        raise ValueError(f"N_KEEP_PAST must be non-negative, got {N_KEEP_PAST}")

    past_cols = ['time', 'latitude', 'longitude', 'cog', 'sog', 'rot', 'heading', 'navstat']
    static_cols = ['CEU', 'DWT', 'GT', 'NT', 'breadth', 'depth', 'draft',
                   'enginePower', 'maxHeight', 'maxSpeed', 'maxWidth', 'rampCapacity',
                   'vesselType_encoded', 'homePort_encoded', 'vessel_age']

    data_rows = []  # One dict per output row; turned into a DataFrame once at the end
    # A single groupby pass replaces a full-frame boolean filter per vessel.
    # sort=False keeps vessels in order of first appearance.
    for vessel, vessel_data in tqdm(df.groupby('vesselId', sort=False)):
        vessel_data = vessel_data.sort_values(by='time')
        num_rows = len(vessel_data)
        if num_rows <= N_KEEP_PAST:
            continue  # Skip vessels with insufficient data

        # Materialise the track once as plain dicts so the inner loops avoid
        # repeated scalar .loc lookups on the DataFrame.
        records = vessel_data[past_cols + static_cols].to_dict('records')

        for i in range(N_KEEP_PAST, num_rows):
            current = records[i]
            target_time = current['time']
            data_row = {}
            for j, past in enumerate(records[i - N_KEEP_PAST:i]):
                # Minutes elapsed between the past observation and the target
                data_row[f'minutes_from_target_{j}'] = (target_time - past['time']).total_seconds() / 60.0
                data_row[f'lat_{j}'] = past['latitude']
                data_row[f'lon_{j}'] = past['longitude']
                data_row[f'cog_{j}'] = past['cog']
                data_row[f'sog_{j}'] = past['sog']
                data_row[f'rot_{j}'] = past['rot']
                data_row[f'heading_{j}'] = past['heading']
                data_row[f'navstat_{j}'] = past['navstat']
            for col in static_cols:
                data_row[col] = current[col]
            # The target observation's own position is the label
            data_row['target_lat'] = current['latitude']
            data_row['target_lon'] = current['longitude']
            data_row['vesselId'] = vessel
            data_row['target_time'] = target_time  # Dropped again before model fitting
            data_rows.append(data_row)

    return pd.DataFrame(data_rows)
        

In [None]:
# Number of preceding observations used as features for each target row;
# the same value is passed to create_test_features further down.
N_KEEP_PAST = 5
# Build the windowed training table from the preprocessed `train` frame.
final_train = create_training_data(train, N_KEEP_PAST)
final_train.head()

In [None]:
# Sanity check: size of the windowed training table and its last rows.
print(final_train.shape)
display(final_train.tail())

# Modify Test dataset

In [1]:
# Load the test rows, parse timestamps and attach the static vessel
# attributes (left join keeps every test row).
# Requires `vessels`, `train`, `numerical_cols`, `categorical_cols`,
# `vessel_type_le` and `home_port_le` from the training cells above.
test = pd.read_csv('../data/raw/ais_test.csv')
test['time'] = pd.to_datetime(test['time'])
test = test.merge(vessels, on='vesselId', how='left')

# Impute BEFORE encoding: the encoders were fit on the imputed train columns,
# so a NaN reaching .transform() would be rejected as an unseen label.
# Fill values come from train so both frames are imputed identically.
for col in numerical_cols:
    test[col] = test[col].fillna(train[col].median())

for col in categorical_cols:
    test[col] = test[col].fillna(train[col].mode()[0])

# Encode categorical variables with the encoders fit on train.
# NOTE(review): .transform() still raises on a category that never occurs in
# train; confirm every test vessel's vesselType/homePort is present in train.
test['vesselType_encoded'] = vessel_type_le.transform(test['vesselType'])
test['homePort_encoded'] = home_port_le.transform(test['homePort'])

# Vessel age in years at the time of each test row. 'yearBuilt' is guaranteed
# to exist here: it is in numerical_cols, so the fill loop above would already
# have raised KeyError if the merge had not produced it.
test['vessel_age'] = test['time'].dt.year - test['yearBuilt']


NameError: name 'pd' is not defined

In [None]:
# Shape of the preprocessed test frame before the optional debug filter.
print(test.shape)
# Debugging aid (disabled): uncomment the line below to restrict `test` to
# the two vessels 6323f2287abc89c0a9631e57 and 61e9f466b937134a3c4c0273.
#test = test[test['vesselId'].isin(['6323f2287abc89c0a9631e57', '61e9f466b937134a3c4c0273'])]
# Shape after the filter; identical to the first print while the filter is commented out.
print(test.shape)
display(test.head())

In [64]:
def create_test_features(train_df: pd.DataFrame, test_df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    """Build model features for each test row from the vessel's train history.

    For every row of ``test_df`` the ``N_KEEP_PAST`` most recent observations
    of the same vessel in ``train_df`` with ``time`` strictly before the test
    row's ``time`` are turned into ``minutes_from_target_j``, ``lat_j``,
    ``lon_j``, ``cog_j``, ``sog_j``, ``rot_j``, ``heading_j`` and
    ``navstat_j`` (``j = 0`` is the oldest). The static vessel columns are
    read from the test row itself, and ``vesselId`` and ``ID`` are carried
    along so predictions can be matched back.

    Test rows with fewer than ``N_KEEP_PAST`` earlier train observations are
    skipped, so their ``ID`` is absent from the result.

    Args:
        train_df: Frame with ``vesselId``, ``time`` (datetime), ``latitude``,
            ``longitude``, ``cog``, ``sog``, ``rot``, ``heading``, ``navstat``.
        test_df: Frame with ``ID``, ``vesselId``, ``time`` (datetime) and the
            static vessel columns listed in ``static_cols`` below.
        N_KEEP_PAST: Number of past observations used as features.

    Returns:
        One row per test row that had enough history, ordered by first
        appearance of the vessel in ``test_df`` and then by test row order.

    Raises:
        ValueError: If ``N_KEEP_PAST`` is negative.
        KeyError: If a required column is missing.
    """
    if N_KEEP_PAST < 0:
        raise ValueError(f"N_KEEP_PAST must be non-negative, got {N_KEEP_PAST}")

    past_cols = ['time', 'latitude', 'longitude', 'cog', 'sog', 'rot', 'heading', 'navstat']
    static_cols = ['CEU', 'DWT', 'GT', 'NT', 'breadth', 'depth', 'draft',
                   'enginePower', 'maxHeight', 'maxSpeed', 'maxWidth', 'rampCapacity',
                   'vesselType_encoded', 'homePort_encoded', 'vessel_age']

    data_rows = []  # One dict per output row
    # Index train by vessel once instead of filtering the full frame per test row.
    train_groups = train_df.groupby('vesselId', sort=False)
    known_vessels = train_groups.indices

    for vessel, test_vessel_data in tqdm(test_df.groupby('vesselId', sort=False)):
        # History of this vessel, sorted once; an empty frame if it never
        # appears in train. A stable sort keeps tied timestamps in input order.
        if vessel in known_vessels:
            history = train_groups.get_group(vessel)
        else:
            history = train_df.iloc[0:0]
        history = history[past_cols].sort_values(by='time', kind='stable')
        history_times = history['time']

        for test_row in test_vessel_data.to_dict('records'):
            target_time = test_row['time']

            # Only observations strictly before the target time may be used.
            past = history[history_times < target_time]
            if len(past) < N_KEEP_PAST:
                # Not enough past data: the row is dropped from the output.
                continue

            # The last N_KEEP_PAST observations, oldest first.
            window = past.tail(N_KEEP_PAST).to_dict('records')

            data_row = {}
            for j, past_row in enumerate(window):
                # Minutes elapsed between the past observation and the target
                data_row[f'minutes_from_target_{j}'] = (target_time - past_row['time']).total_seconds() / 60.0
                data_row[f'lat_{j}'] = past_row['latitude']
                data_row[f'lon_{j}'] = past_row['longitude']
                data_row[f'cog_{j}'] = past_row['cog']
                data_row[f'sog_{j}'] = past_row['sog']
                data_row[f'rot_{j}'] = past_row['rot']
                data_row[f'heading_{j}'] = past_row['heading']
                data_row[f'navstat_{j}'] = past_row['navstat']
            for col in static_cols:
                data_row[col] = test_row[col]
            # Include vesselId and ID for result matching
            data_row['vesselId'] = vessel
            data_row['ID'] = test_row['ID']
            data_rows.append(data_row)

    return pd.DataFrame(data_rows)

In [None]:
# Build test features from the train history, using the same window length
# (N_KEEP_PAST) as the training table.
test_final = create_test_features(train, test, N_KEEP_PAST)

In [None]:
# Preview the first rows of the generated test features.
display(test_final.head())

In [None]:
# Persist both tables to the working directory; the machine-learning cell
# below reads these two files back in.
final_train.to_csv('final_train.csv', index=False)
test_final.to_csv('test_final.csv', index=False)

# Machine learning part

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
import xgboost as xgb
import lightgbm as lgb
from scipy.optimize import minimize

# Load the windowed train/test tables written by the dataset cells above.
test_final = pd.read_csv('test_final.csv')
final_train = pd.read_csv('final_train.csv')



# Define features and targets: everything except the two target columns and
# the bookkeeping columns (vesselId, target_time) is used as a feature.
features = final_train.drop(columns=['target_lat', 'target_lon', 'vesselId', 'target_time'])
targets = final_train[['target_lat', 'target_lon']]

# Select the same columns, in the same order, from the test table.
feature_columns = features.columns
X_test = test_final[feature_columns]

# Define the models. Each base regressor is wrapped in MultiOutputRegressor
# so that it predicts both target columns.
models = {
    'RandomForest': MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, max_depth=4, n_jobs=8)),
    'XGBoost': MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, seed=42)),
    'ElasticNet': MultiOutputRegressor(ElasticNet(random_state=42)),
    'LightGBM': MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1))
}

# Prepare arrays to hold OOF predictions and test predictions
# NOTE(review): rows are shuffled across folds without regard to vesselId or
# time, and neighbouring windows of one vessel share most of their past
# points; the OOF error may therefore be optimistic -- confirm this is intended.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# oof_preds[name]: (n_train_rows, n_targets), filled fold by fold.
# test_preds[name]: (n_test_rows, n_targets, n_splits), one slice per fold.
oof_preds = {model_name: np.zeros((features.shape[0], targets.shape[1])) for model_name in models}
test_preds = {model_name: np.zeros((X_test.shape[0], targets.shape[1], n_splits)) for model_name in models}

# Perform cross-validation and collect predictions
for fold, (train_idx, valid_idx) in enumerate(kf.split(features, targets)):
    X_train, y_train = features.iloc[train_idx], targets.iloc[train_idx]
    X_valid, y_valid = features.iloc[valid_idx], targets.iloc[valid_idx]
    
    for model_name, model in models.items():
        print(f"Training and predicting with model: {model_name}")
        # `clf` is an alias, not a copy: the same wrapper object is re-fit on every fold.
        clf = model
        clf.fit(X_train, y_train)
        y_pred_valid = clf.predict(X_valid)
        y_pred_test = clf.predict(X_test)
        
        # Save OOF predictions for the rows held out in this fold
        oof_preds[model_name][valid_idx] = y_pred_valid
        # Save this fold's test predictions; they are averaged over folds later
        test_preds[model_name][:,:,fold] = y_pred_test

# Define the loss function for optimization
def mse_loss(weights):
    """Return the MSE of the weighted blend of out-of-fold predictions.

    ``weights`` holds one entry per model, in the iteration order of the
    global ``models`` dict. The weights are rescaled to sum to 1 before the
    per-model arrays in the global ``oof_preds`` are combined, and the blend
    is scored against the global ``targets``.
    """
    normalized = np.array(weights)
    normalized = normalized / np.sum(normalized)

    truth = targets.values
    blended = np.zeros_like(truth)
    # Lazily yields each model's weighted OOF predictions in dict order
    weighted_parts = (
        normalized[position] * oof_preds[name]
        for position, name in enumerate(models)
    )
    for part in weighted_parts:
        blended += part

    return mean_squared_error(truth, blended)

# Optimization methods to try. Other scipy methods (e.g. 'COBYLA', 'Newton-CG',
# 'dogleg', the 'trust-*' family) are deliberately left out; 'trust-constr' in
# particular is really slow and not performing well at the moment.
methods = [
    'Nelder-Mead', 'Powell', 'CG', 'BFGS',
    'L-BFGS-B', 'TNC', 'SLSQP',
]

# Initial weights: every model contributes equally
initial_weights = np.ones(len(models)) / len(models)

# Constraints and bounds: weights sum to 1 and each lies in [0, 1]
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1)] * len(models)

# Methods that are given both bounds and the sum-to-one constraint, and
# methods that are given bounds only. Everything else runs unconstrained and
# relies on the normalisation inside mse_loss.
constrained_methods = ('SLSQP', 'trust-constr')
bounded_methods = ('L-BFGS-B', 'TNC')

# Optimize weights using different methods and keep the best result
best_mse = np.inf
best_weights = None
best_method = None

for method in methods:
    print(f"Optimizing weights using method: {method}")
    try:
        if method in constrained_methods:
            res = minimize(mse_loss, initial_weights, method=method, bounds=bounds, constraints=constraints)
        elif method in bounded_methods:
            res = minimize(mse_loss, initial_weights, method=method, bounds=bounds)
        else:
            res = minimize(mse_loss, initial_weights, method=method)
        candidate_weights = res.x / np.sum(res.x)  # Normalize weights
        # Only accept a result whose normalized weights are usable numbers
        if res.fun < best_mse and np.all(np.isfinite(candidate_weights)):
            best_mse = res.fun
            best_weights = candidate_weights
            best_method = method
        print(f"Method: {method}, MSE: {res.fun}")
    except Exception as e:
        # Best effort: a failing optimizer must not abort the other methods
        print(f"Method: {method}, failed with error: {e}")

# If no optimizer produced usable weights, blend with equal weights instead of
# crashing below on `best_weights[i]` with best_weights still None.
if best_weights is None:
    print("No optimization method succeeded; falling back to equal weights.")
    best_weights = initial_weights.copy()
    best_method = 'equal weights (fallback)'

# Average test predictions over folds for each model
# (replaces each (n_test, n_targets, n_splits) array with its fold mean)
for model_name in models:
    test_preds[model_name] = np.mean(test_preds[model_name], axis=2)  # Average over folds

# Combine the test predictions using the best weights
final_test_pred = np.zeros((X_test.shape[0], targets.shape[1]))
for i, model_name in enumerate(models):
    final_test_pred += best_weights[i] * test_preds[model_name]

# Create a DataFrame with IDs and predictions.
# Column 0 of the predictions is target_lat, column 1 is target_lon.
prediction_df = pd.DataFrame({
    'ID': test_final['ID'],
    'longitude_predicted': final_test_pred[:, 1],
    'latitude_predicted': final_test_pred[:, 0]
})

# Save the predictions to a CSV file
prediction_df.to_csv('predictions_ensemble2.csv', index=False)

# Print the best method and weights
print(f"Best optimization method: {best_method}")
print("Best weights:")
for i, model_name in enumerate(models):
    print(f"{model_name}: {best_weights[i]:.4f}")


2.4149950074734456
