# In [None]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks

# === Step 1: Load the data ===
# Read the combined buoy trajectory / ERA5 wind table from disk
buoy_data = pd.read_csv('../combined_buoy_data.csv')

# Restrict the frame to the columns this experiment actually uses
columns_to_keep = [
    'Latitude', 'Longitude', 'BuoyID', 'datetime',
    'era5_uwnd', 'era5_vwnd', 'displacement', 'heading',
]
buoy_data = buoy_data.loc[:, columns_to_keep].copy()

# Parse the timestamp column into pandas datetime values
buoy_data['datetime'] = pd.to_datetime(buoy_data['datetime'])

# === Step 2: Define features and targets ===
# Model inputs (position + wind) plus the bookkeeping columns BuoyID/datetime,
# which are dropped again right before training
X = buoy_data.loc[:, ['Latitude', 'Longitude', 'era5_uwnd', 'era5_vwnd', 'BuoyID', 'datetime']]

# Regression targets: displacement and heading
y = buoy_data.loc[:, ['displacement', 'heading']]

# GroupKFold keeps all rows sharing a BuoyID in the same fold
groups = buoy_data.loc[:, 'BuoyID']

# Per-fold prediction CSVs are written here; create the folder if missing
predictions_dir = '../data/processed/predictions'
os.makedirs(predictions_dir, exist_ok=True)

# === Step 3: Set up GroupKFold for cross-validation ===
cv_folds = 5
group_kf = GroupKFold(n_splits=cv_folds)

# === Step 4: Define deep learning architectures ===

# Fully Connected Network (FCN)
def build_fcn(input_shape):
    """
    Build the fully connected baseline network.

    Layer stack: Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu) -> Dense(2).

    Args:
        input_shape: Per-sample input shape (without the batch axis).

    Returns:
        An uncompiled Sequential model with a 2-unit linear output.
    """
    network = models.Sequential()
    network.add(layers.Input(shape=input_shape))

    # Hidden block 1, followed by dropout for regularization
    network.add(layers.Dense(128, activation='relu'))
    network.add(layers.Dropout(0.2))

    # Hidden block 2
    network.add(layers.Dense(64, activation='relu'))

    # Linear head: one unit each for displacement and heading
    network.add(layers.Dense(2))
    return network

# Recurrent Neural Network (RNN)
def build_rnn(input_shape):
    """
    Build a two-layer LSTM network with a 2-unit linear output.

    Args:
        input_shape: Per-sample input shape (without the batch axis). Either
            ``(timesteps, features)`` or a flat ``(n_features,)`` shape. A
            flat shape is reshaped to ``(1, n_features)``, i.e. each row is
            treated as a one-step sequence, because LSTM layers require a
            time axis.

    Returns:
        An uncompiled Sequential model.
    """
    input_shape = tuple(input_shape)
    stack = [layers.Input(shape=input_shape)]

    if len(input_shape) == 1:
        # LSTM rejects 2-D (batch, features) input; add a length-1 time axis.
        stack.append(layers.Reshape((1, input_shape[0])))

    stack += [
        layers.LSTM(64, return_sequences=True),  # First LSTM layer, emits the full sequence
        layers.LSTM(32),  # Second LSTM layer, emits only the final state
        layers.Dense(2),  # Output layer for displacement and heading
    ]
    return models.Sequential(stack)

# Convolutional Neural Network (CNN)
def build_cnn(input_shape):
    """
    Build a 1-D convolutional network with a 2-unit linear output.

    Args:
        input_shape: Per-sample input shape (without the batch axis). Either
            ``(steps, channels)`` or a flat ``(n_features,)`` shape. A flat
            shape is reshaped to ``(n_features, 1)`` so the convolutions
            slide across the feature axis.

    Returns:
        An uncompiled Sequential model.
    """
    input_shape = tuple(input_shape)
    is_flat = len(input_shape) == 1
    steps = input_shape[0] if is_flat else input_shape[-2]

    stack = [layers.Input(shape=input_shape)]
    if is_flat:
        # Conv1D rejects 2-D (batch, features) input; add a channel axis.
        stack.append(layers.Reshape((input_shape[0], 1)))

    # Two unpadded kernel-3 convolutions shorten the sequence by 4 steps, so
    # inputs with fewer than 5 steps would end up with a non-positive length.
    # Pad only in that case; longer (or variable-length) inputs keep the
    # original unpadded behaviour.
    padding = 'valid' if steps is None or steps >= 5 else 'same'

    stack += [
        layers.Conv1D(32, kernel_size=3, activation='relu', padding=padding),  # First convolutional layer
        layers.Conv1D(64, kernel_size=3, activation='relu', padding=padding),  # Second convolutional layer
        layers.GlobalAveragePooling1D(),  # Average over the step axis
        layers.Dense(64, activation='relu'),  # Dense layer with ReLU
        layers.Dense(2),  # Output layer for displacement and heading
    ]
    return models.Sequential(stack)

# === Step 5: Define the physics-informed loss function ===
def physics_informed_loss(y_true, y_pred, wind_u, wind_v, lambda1=1.0, lambda2=0.1):
    """
    Combine an MSE loss with a wind-based penalty term.

    Args:
        y_true: Ground-truth targets, same shape as ``y_pred``.
        y_pred: Model predictions, one row per sample.
        wind_u: Wind u-component, one value per sample (or a scalar). May be
            shaped ``(batch,)`` or ``(batch, 1)``.
        wind_v: Wind v-component, same layout as ``wind_u``.
        lambda1: Weight of the MSE term.
        lambda2: Weight of the wind penalty term.

    Returns:
        Scalar tensor ``lambda1 * mse + lambda2 * penalty``.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    dtype = y_pred.dtype

    # Bring every operand to the prediction dtype; TensorFlow does not
    # implicitly mix float32 predictions with float64 inputs.
    y_true = tf.cast(y_true, dtype)
    wind_sum = tf.cast(wind_u, dtype) + tf.cast(wind_v, dtype)

    # Force one wind value per row, shaped (batch, 1), so it broadcasts across
    # the output columns. A 1-D (batch,) vector would otherwise be matched
    # against the output axis instead of the sample axis.
    wind_sum = tf.reshape(wind_sum, (-1, 1))

    # Mean Squared Error (MSE) loss for trajectory predictions
    mse_loss = tf.reduce_mean(tf.square(y_true - y_pred))

    # Penalty: squared distance between every predicted column and (u + v).
    # NOTE(review): this pulls all outputs (heading included) toward u + v;
    # confirm that this is the intended physical constraint.
    physical_constraint = tf.reduce_mean(tf.square(y_pred - wind_sum))

    # Combined loss
    return lambda1 * mse_loss + lambda2 * physical_constraint

# === Step 6: Train and evaluate models ===
results = []  # One summary row per architecture
architectures = {'FCN': build_fcn, 'RNN': build_rnn, 'CNN': build_cnn}  # Label -> model builder

# Columns kept in X for bookkeeping only; they are not fed to the network.
NON_FEATURE_COLUMNS = ['BuoyID', 'datetime']
# Wind components consumed by the physics term of the loss.
WIND_COLUMNS = ['era5_uwnd', 'era5_vwnd']
# Number of real regression targets (the columns of y).
N_TARGETS = y.shape[1]


def _pack_targets(targets, features):
    """Return float32 rows laid out as [targets..., era5_uwnd, era5_vwnd]."""
    return np.hstack(
        [targets.to_numpy(), features[WIND_COLUMNS].to_numpy()]
    ).astype('float32')


def _packed_physics_loss(y_packed, y_pred):
    """Keras loss: unpack the per-row wind from y_packed and apply physics_informed_loss."""
    y_packed = tf.cast(y_packed, y_pred.dtype)
    y_true = y_packed[:, :N_TARGETS]
    # Slice (not index) so the wind stays a (batch, 1) column that broadcasts
    # row-by-row against the (batch, N_TARGETS) predictions.
    wind_u = y_packed[:, N_TARGETS:N_TARGETS + 1]
    wind_v = y_packed[:, N_TARGETS + 1:N_TARGETS + 2]
    return physics_informed_loss(y_true, y_pred, wind_u, wind_v)


def _trajectory_mse(y_packed, y_pred):
    """Keras metric: per-sample MSE over the real targets, ignoring the packed wind columns."""
    y_true = tf.cast(y_packed[:, :N_TARGETS], y_pred.dtype)
    return tf.reduce_mean(tf.square(y_true - y_pred), axis=-1)


for arch_name, build_model in architectures.items():
    print(f"\nTesting architecture: {arch_name}")
    model_scores = []  # RMSE for each fold
    fold_times = []  # Time taken for each fold

    for fold_num, (train_idx, val_idx) in enumerate(group_kf.split(X, y, groups=groups)):
        print(f"\nFold {fold_num + 1}")
        start_time = time.time()

        # Release graph state left behind by the previous fold's model
        tf.keras.backend.clear_session()

        # === Prepare train and validation data ===
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # The loss needs the wind of exactly the rows in each mini-batch.
        # Keras only hands (y_true, y_pred) to a loss, so the wind columns
        # travel alongside the targets and are split off again in the loss.
        y_train_packed = _pack_targets(y_train, X_train)
        y_val_packed = _pack_targets(y_val, X_val)

        # Drop non-numerical features and hand Keras plain float32 arrays
        X_train = X_train.drop(columns=NON_FEATURE_COLUMNS).to_numpy(dtype='float32')
        X_val = X_val.drop(columns=NON_FEATURE_COLUMNS).to_numpy(dtype='float32')

        # === Build and compile the model ===
        model = build_model(input_shape=(X_train.shape[1],))
        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.001),  # Adam optimizer
            loss=_packed_physics_loss,  # Physics-informed loss with per-row wind
            metrics=[_trajectory_mse]  # MSE on the real targets (logged as 'trajectory_mse')
        )

        # === Train the model ===
        history = model.fit(
            X_train, y_train_packed,
            validation_data=(X_val, y_val_packed),
            epochs=50,  # Maximum 50 epochs
            batch_size=32,  # Mini-batch size
            verbose=1,
            callbacks=[callbacks.EarlyStopping(patience=5, restore_best_weights=True)]  # Stop early if no improvement
        )

        # === Evaluate the model ===
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))  # Root mean squared error
        model_scores.append(rmse)
        print(f"Fold {fold_num + 1} RMSE: {rmse:.3f}")

        # Record fold time
        fold_time = time.time() - start_time
        fold_times.append(fold_time)
        print(f"Fold {fold_num + 1} time: {fold_time:.2f} seconds")

        # === Save predictions for analysis ===
        predictions_df = pd.DataFrame({
            'BuoyID': X.iloc[val_idx]['BuoyID'].values,
            'True displacement': y_val['displacement'].values,
            'True heading': y_val['heading'].values,
            'Predicted displacement': y_pred[:, 0],
            'Predicted heading': y_pred[:, 1]
        })
        predictions_file = os.path.join(predictions_dir, f"{arch_name}_fold{fold_num + 1}_predictions.csv")
        predictions_df.to_csv(predictions_file, index=False)

    # === Record results for this architecture ===
    mean_rmse = np.mean(model_scores)  # Average RMSE
    std_rmse = np.std(model_scores)  # Standard deviation of RMSE
    total_time = sum(fold_times)  # Total time for training and evaluation

    results.append({
        'Model': arch_name,
        'Mean RMSE': mean_rmse,
        'RMSE StdDev': std_rmse,
        'Total Time (s)': total_time,
        'Mean Time per Fold (s)': np.mean(fold_times)
    })

    print(f"\nCompleted cross-validation for {arch_name}. "
          f"Mean RMSE: {mean_rmse:.3f}, Std. Dev: {std_rmse:.3f}, Total Time: {total_time:.2f} seconds")

# === Save and summarize results ===
results_df = pd.DataFrame(results)
results_df.to_csv('model_comparison_results.csv', index=False)

# Identify the best model based on Mean RMSE
best_model_row = results_df.loc[results_df['Mean RMSE'].idxmin()]
print(f"\n=== Best model selected: {best_model_row['Model']} ===")
print(f"Mean RMSE: {best_model_row['Mean RMSE']:.3f}, Total Time: {best_model_row['Total Time (s)']:.2f} seconds")
