# Preprocessing the training dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [16]:
# Load AIS train dataset
train_dataset = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/raw/ais_train.csv', sep='|')

# Load vessels.csv dataset
vessels_df = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/cleaned/cleaned_vessels.csv', delimiter=',')

# Drop columns we don't want to use in the AIS dataset
train_dataset = train_dataset.drop(['etaRaw', 'portId', 'cog', 'sog', 'rot', 'navstat'], axis=1)

# Transform time to datetime
train_dataset['time'] = pd.to_datetime(train_dataset['time'])

vessels_df = vessels_df.drop(['DWT', 'NT', 'CEU', 'breadth', 'length'], axis=1)

# Merge the train dataset with vessels dataset on 'vesselId'
merged_dataset = pd.merge(train_dataset, vessels_df, on='vesselId', how='left')

# Sort the merged data by vesselId and time
merged_dataset = merged_dataset.sort_values(by=['vesselId', 'time'])

# Display the first few rows to verify the merge
merged_dataset.head()


Unnamed: 0,time,heading,latitude,longitude,vesselId,shippingLineId,GT,vesselType,depth,draft,enginePower,freshWater,fuel,homePort,maxHeight,maxSpeed,maxWidth,rampCapacity,yearBuilt
131115,2024-01-12 14:07:47,316,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,61a8e672f9cba188601e84ab,58684,83.0,22.2,,0.0,,,OSLO,5.0,18.6,15.2,150.0,2000
131279,2024-01-12 14:31:00,313,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,61a8e672f9cba188601e84ab,58684,83.0,22.2,,0.0,,,OSLO,5.0,18.6,15.2,150.0,2000
131514,2024-01-12 14:57:23,312,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,61a8e672f9cba188601e84ab,58684,83.0,22.2,,0.0,,,OSLO,5.0,18.6,15.2,150.0,2000
131696,2024-01-12 15:18:48,313,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,61a8e672f9cba188601e84ab,58684,83.0,22.2,,0.0,,,OSLO,5.0,18.6,15.2,150.0,2000
131885,2024-01-12 15:39:47,313,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,61a8e672f9cba188601e84ab,58684,83.0,22.2,,0.0,,,OSLO,5.0,18.6,15.2,150.0,2000


In [17]:
def create_training_data(df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to store data rows
    unique_vessels = df['vesselId'].unique()  # Get unique vessel IDs

    for vessel in tqdm(unique_vessels):
        vessel_data = df[df['vesselId'] == vessel].sort_values(by='time').reset_index(drop=True)
        total_rows = len(vessel_data)
        
        if total_rows <= N_KEEP_PAST:
            continue

        # Convert time to seconds for performance reasons
        vessel_data['time_seconds'] = vessel_data['time'].astype(np.int64) // 10**9

        # Loop through the dataset only once to collect features
        for index in range(N_KEEP_PAST, total_rows):
            current_record = vessel_data.iloc[index]
            past_records = vessel_data.iloc[index - N_KEEP_PAST:index]

            feature_row = {
                'vesselId': vessel,
                'target_lat': current_record['latitude'],
                'target_lon': current_record['longitude'],
                'target_time': current_record['time']
            }

            # Incorporate features from vessels.csv only once per vessel (assuming static)
            if not feature_row.get('vesselType'):  # Add only if not already added
                for col in vessels_df.columns:
                    if col not in ['vesselId']:  # Avoid duplicating the key column
                        feature_row[col] = vessel_data[col].iloc[0]

            # Collect past features efficiently
            time_diffs = (current_record['time_seconds'] - past_records['time_seconds']).values / 60.0
            feature_row.update({f'minutes_from_target_{i}': time_diffs[i] for i in range(N_KEEP_PAST)})
            feature_row.update({f'lat_{i}': past_records['latitude'].iloc[i] for i in range(N_KEEP_PAST)})
            feature_row.update({f'lon_{i}': past_records['longitude'].iloc[i] for i in range(N_KEEP_PAST)})

            data_rows.append(feature_row)

        # Drop the temporary time column to save memory
        vessel_data.drop(columns=['time_seconds'], inplace=True)
    
    return pd.DataFrame(data_rows)


In [18]:
processed_train = create_training_data(merged_dataset, 3)

  vessel_data['time_seconds'] = vessel_data['time'].astype(np.int64) // 10**9
100%|██████████| 688/688 [10:03<00:00,  1.14it/s]


In [5]:
processed_train.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_train_mina.csv', index=False)

In [6]:
def sort_training_data(merged_df: pd.DataFrame) -> pd.DataFrame:
    # Ensure that the 'time' column is of datetime type for correct sorting
    if not pd.api.types.is_datetime64_any_dtype(merged_df['time']):
        merged_df['time'] = pd.to_datetime(merged_df['time'])
    
    # Sort the DataFrame by 'vesselId' and 'time' in ascending order
    sorted_df = merged_df.sort_values(by=['vesselId', 'time'], ascending=True).reset_index(drop=True)
    
    return sorted_df

In [7]:
sorted_train = sort_training_data(merged_dataset)

# Preprocessing the test dataset

In [8]:
test_dataset = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/raw/ais_test.csv')

test_dataset['time'] = pd.to_datetime(test_dataset['time'])

test_dataset.head()

Unnamed: 0,ID,vesselId,time,scaling_factor
0,0,61e9f3aeb937134a3c4bfe3d,2024-05-08 00:03:16,0.3
1,1,61e9f473b937134a3c4c02df,2024-05-08 00:06:17,0.3
2,2,61e9f469b937134a3c4c029b,2024-05-08 00:10:02,0.3
3,3,61e9f45bb937134a3c4c0221,2024-05-08 00:10:34,0.3
4,4,61e9f38eb937134a3c4bfd8d,2024-05-08 00:12:27,0.3


In [9]:
import pandas as pd
from tqdm import tqdm

def create_test_data(merged_df: pd.DataFrame, test_df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to collect all the data rows
    unique_vessels = test_df['vesselId'].unique()  # Get unique vessel IDs

    # Iterate over each unique vessel
    for vessel in tqdm(unique_vessels):
        # Filter out the train data for this vessel that is less than the max time in test data
        vessel_train_data = merged_df[merged_df['vesselId'] == vessel]
        # Sort the train data by time, if not already sorted
        vessel_train_data = vessel_train_data.sort_values(by='time').reset_index(drop=True)

        # Get test data for the current vessel
        test_vessel_data = test_df[test_df['vesselId'] == vessel]

        for _, test_row in test_vessel_data.iterrows():
            target_time = test_row['time']
            ID = test_row['ID']
            # Get the last N_KEEP_PAST records from the train data before the target time
            past_data = vessel_train_data[vessel_train_data['time'] < target_time].tail(N_KEEP_PAST)

            # Check if we have enough past data
            if len(past_data) < N_KEEP_PAST:
                continue  # Not enough past data; skip this test row

            # Prepare a dictionary to hold the features
            feature_row = {
                'vesselId': vessel,
                'target_time': target_time,  # Only include target time
                'ID': ID
            }

            # Incorporate vessel-specific features from vessels.csv for the current vessel
            vessel_info = vessel_train_data.iloc[0]  # Assume all records for this vessel share the same static features
            for col in vessels_df.columns:
                if col not in ['vesselId']:  # Avoid duplicating the key column
                    feature_row[col] = vessel_info[col]

            # Loop through past records to collect features
            for j in range(N_KEEP_PAST):
                past_record = past_data.iloc[j]
                time_diff = (target_time - past_record['time']).total_seconds() / 60.0  # Difference in minutes

                # Add features for minutes from target and heading
                feature_row[f'minutes_from_target_{j}'] = time_diff
                feature_row[f'heading_{j}'] = past_record['heading']
                feature_row[f'lat_{j}'] = past_record['latitude']
                feature_row[f'lon_{j}'] = past_record['longitude']

            # Append the row to the list
            data_rows.append(feature_row)

    # Convert the list of feature rows to a DataFrame
    return pd.DataFrame(data_rows)



In [10]:
processed_test = create_test_data(sorted_train, test_dataset, 5)

100%|██████████| 215/215 [01:11<00:00,  3.00it/s]


In [11]:
processed_test.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_test.csv', index=False)

# Code for making predictions

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from scipy.optimize import minimize
from sklearn.preprocessing import LabelEncoder

# Load datasets into pandas DataFrames
train_df = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_train_mina.csv')
test_df = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_test.csv')

# Encode the vesselId using LabelEncoder
le_vessel = LabelEncoder()
train_df['vesselId_encoded'] = le_vessel.fit_transform(train_df['vesselId'])
test_df['vesselId_encoded'] = le_vessel.transform(test_df['vesselId'])

# Drop the original vesselId column now that we have the encoded version
train_df = train_df.drop(columns=['vesselId'])
test_df = test_df.drop(columns=['vesselId'])

# Identify non-numeric columns
non_numeric_cols = train_df.select_dtypes(include=['object']).columns
print("Non-numeric columns in training set:", non_numeric_cols)

# Combine train and test data for consistent encoding
combined_df = pd.concat([train_df, test_df], keys=['train', 'test'])

for col in non_numeric_cols:
    nunique = combined_df[col].nunique()
    if nunique < 100:
        # One-hot encode columns with few unique values
        combined_df = pd.get_dummies(combined_df, columns=[col], prefix=col)
    else:
        # Apply Label Encoding for high-cardinality columns
        le = LabelEncoder()
        combined_df[col] = le.fit_transform(combined_df[col].astype(str))

# Split the combined data back into train and test sets
train_df = combined_df.xs('train')
test_df = combined_df.xs('test')

# Ensure that the target columns are not included in the test set
target_columns = ['target_lat', 'target_lon', 'target_time']
X = train_df.drop(columns=target_columns)
y = train_df[['target_lat', 'target_lon']]
X_test = test_df[X.columns]  # Ensure X_test has the same columns as X

# Initialize XGBoost model wrapped in a MultiOutputRegressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, seed=42)
multi_output_model = MultiOutputRegressor(xgb_model)

# Set up K-Fold cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize arrays for predictions
oof_preds = np.zeros((X.shape[0], y.shape[1]))
test_preds = np.zeros((X_test.shape[0], y.shape[1], n_folds))

# Perform training and predictions using K-Fold
for fold_index, (train_indices, valid_indices) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_val, y_val = X.iloc[valid_indices], y.iloc[valid_indices]
    
    print(f"Training fold {fold_index + 1}/{n_folds} using XGBoost...")
    multi_output_model.fit(X_train, y_train)

    # Make predictions
    val_preds = multi_output_model.predict(X_val)
    test_fold_preds = multi_output_model.predict(X_test)

    # Save the predictions
    oof_preds[valid_indices] = val_preds
    test_preds[:, :, fold_index] = test_fold_preds

# Define the function to compute mean squared error for weight optimization
def compute_mse(weights):
    normalized_weights = np.array(weights) / np.sum(weights)  # Normalize weights
    combined_preds = normalized_weights[0] * oof_preds  # Since there's only one model
    mse = mean_squared_error(y, combined_preds)
    return mse

# Initial weight for optimization
initial_weight = [1.0]  # Single model
bounds = [(0, 1)]  # Weights should be between 0 and 1

# Optimize the weights to minimize the MSE
optimal_mse = float('inf')
optimal_weights = None

print("Starting weight optimization...")
try:
    optimization_result = minimize(compute_mse, initial_weight, method='Nelder-Mead', bounds=bounds)
    optimal_mse = optimization_result.fun
    optimal_weights = optimization_result.x / np.sum(optimization_result.x)  # Normalize weights
except Exception as e:
    print(f"Error during optimization: {e}")

# Average the test predictions across folds
mean_test_predictions = np.mean(test_preds, axis=2)

# Finalize predictions using optimized weights
final_predictions = optimal_weights[0] * mean_test_predictions  # Since there's only one model

# Create a DataFrame for output
results_df = pd.DataFrame({
    'ID': test_df['ID'],
    'longitude_predicted': final_predictions[:, 1],
    'latitude_predicted': final_predictions[:, 0]
})

# Save the predictions to a CSV file
results_df.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions6.csv', index=False)

# Print optimized results
print(f"Optimized Mean Squared Error: {optimal_mse}")
print(f"Final Prediction Weight: {optimal_weights[0]:.4f}")
