In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings — consider narrowing it.
warnings.filterwarnings(action='ignore')

Import tabular training data

In [None]:
# Input locations (placeholders here); both files are read with pandas' CSV defaults.
LABELS_CSV = 'FILEPATH'
FEATURES_CSV = 'FILEPATH'

labels = pd.read_csv(LABELS_CSV)
df = pd.read_csv(FEATURES_CSV)

Prepare data for modeling

In [None]:
# Join the two tables on their shared 'ID' key, drop the key column, then keep
# only rows without missing values.
#
# The merged frame gets its own name instead of overwriting `df`: rebinding
# `df` to the ID-less result made this cell fail with KeyError: 'ID' when it
# was executed a second time on a live kernel.
df_merged = pd.merge(labels, df, on='ID').drop(columns=['ID'])
df_wlabs = df_merged.dropna()

In [None]:
# Split the cleaned frame into a predictor matrix and a target vector.
target_column = 'HWMdepth_m'
X = df_wlabs.drop(columns=[target_column]).values
y = df_wlabs[target_column].values

# Quick visual check: X is 2-D, so matplotlib draws one line per feature
# column, each plotted against the target values in y.
plt.plot(X, y)
plt.show()

In [None]:
# Shuffle the rows with a *seeded* generator so the permutation — and with it
# the membership of the train/validation/test sets — is the same on every
# Restart & Run All. The seed matches the random_state used by the rest of
# the notebook.
# Note: X and y are reassigned in place, so re-running only this cell
# permutes the already-permuted arrays again.
shuffle_rng = np.random.default_rng(54)
indices = shuffle_rng.permutation(len(X))
X = X[indices]
y = y[indices]

In [None]:
# Stage 1: keep 70% for training and hold out 30% for the next stage.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=54
)

# Stage 2: cut the held-out 30% in half -> 15% validation, 15% test overall.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=54
)

# Report the size of each partition and its share of the full dataset.
n_total = len(y)
for split_label, split_targets in (
    ('training', y_train),
    ('validation', y_val),
    ('test', y_test),
):
    n_split = split_targets.shape[0]
    print(f'Number of {split_label} observations:', n_split, f'({n_split/n_total*100:.1f}%)')

In [None]:
# Z-score standardization. The per-feature mean and standard deviation are
# estimated on the training set only and then reused for validation and test,
# so no information from the held-out sets leaks into the transform.
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)

# A feature that is constant in the training set has std == 0; dividing by it
# would fill that column with NaN/inf and break the model fit downstream.
# Using 1.0 instead leaves such a column at (x - mean), i.e. all zeros on train.
X_std = np.where(X_std == 0, 1.0, X_std)

# Apply z-score standardization: (X - mean) / std
X_train_norm = (X_train - X_mean) / X_std
X_val_norm = (X_val - X_mean) / X_std
X_test_norm = (X_test - X_mean) / X_std

standardized_sets = (
    ('Training', X_train_norm),
    ('Validation', X_val_norm),
    ('Test', X_test_norm),
)

for set_label, set_values in standardized_sets:
    print(f'{set_label} set standardized - shape:', set_values.shape)

print('\nTraining set statistics used for standardization:')
print('Mean:', X_mean)
print('Std:', X_std)  # zero-variance features show up here as 1.0

# Only the training set is expected to come out at mean ~0 / std ~1; the other
# two were scaled with training statistics and will deviate slightly.
print('\nAfter z-score standardization:')
for set_label, set_values in standardized_sets:
    print('{} set - Mean: {:.6f}, Std: {:.6f}'.format(set_label, set_values.mean(), set_values.std()))

Hyperparameter Tuning and Training of the RF Model

In [None]:
# Randomized hyperparameter search for the random forest, scored by
# cross-validation on the (standardized) training set only.
rf = RandomForestRegressor(random_state=54)

# Candidate values per hyperparameter; the search draws combinations from
# these lists (4 * 4 * 3 * 3 * 3 * 2 = 864 possible combinations).
rf_params = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)

rf_cv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_params,
    n_iter=100,                         # number of sampled combinations
    cv=5,                               # 5-fold cross-validation
    scoring='neg_mean_squared_error',   # negated MSE: values closer to 0 are better
    random_state=54,                    # fixed seed for the sampling
    n_jobs=-1,                          # use all available cores
    verbose=1,
)

print("Starting hyperparameter optimization...")
# fit() returns the fitted search object itself.
rf_optimized = rf_cv.fit(X_train_norm, y_train)

best_params_found = rf_optimized.best_params_
best_cv_score = rf_optimized.best_score_
print('Best parameters:', best_params_found)
print('Best cross-validation score (neg_MSE):', best_cv_score)

In [None]:
# Evaluate the tuned model on every partition.
best_rf = rf_optimized.best_estimator_

# One prediction vector per partition, in train / validation / test order.
y_train_pred, y_val_pred, y_test_pred = (
    best_rf.predict(feature_matrix)
    for feature_matrix in (X_train_norm, X_val_norm, X_test_norm)
)

section_rule = "=" * 60
print(section_rule)
print("MODEL PERFORMANCE METRICS")
print(section_rule)

def calculate_regression_metrics(y_true, y_pred, set_name):
    """Print and return the standard regression scores for one data split.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions for the same observations.
    set_name : str
        Label used in the printed header (e.g. "Training").

    Returns
    -------
    dict
        Keys 'mse', 'rmse', 'mae' and 'r2', in that order.
    """
    squared_error = metrics.mean_squared_error(y_true, y_pred)
    scores = {
        'mse': squared_error,
        'rmse': np.sqrt(squared_error),  # same units as the target
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        'r2': metrics.r2_score(y_true, y_pred),
    }

    print(f"\n{set_name} Set Metrics:")
    print("-" * 30)
    report_rows = (
        ("Mean Squared Error (MSE)", 'mse'),
        ("Root Mean Squared Error (RMSE)", 'rmse'),
        ("Mean Absolute Error (MAE)", 'mae'),
        ("R² Score", 'r2'),
    )
    for row_label, score_key in report_rows:
        print(f"{row_label}: {scores[score_key]:.4f}")

    return scores

# Score each partition; every call prints its report and returns a dict of scores.
train_metrics = calculate_regression_metrics(y_true=y_train, y_pred=y_train_pred, set_name="Training")
val_metrics = calculate_regression_metrics(y_true=y_val, y_pred=y_val_pred, set_name="Validation")
test_metrics = calculate_regression_metrics(y_true=y_test, y_pred=y_test_pred, set_name="Test")

In [None]:
# Actual-vs-predicted scatter plots: one row of three panels (train / validation / test).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

def plot_actual_vs_predicted(y_true, y_pred, ax, title, metrics):
    """Scatter predictions against observations on ``ax`` with a y = x reference line.

    Parameters
    ----------
    y_true : array-like
        Observed target values (x-axis).
    y_pred : array-like
        Predicted values (y-axis).
    ax : matplotlib Axes
        Axes to draw on.
    title : str
        Split name shown in the panel title (e.g. "Training").
    metrics : dict
        Must contain 'r2' and 'rmse'; both are shown in the title.
        Note: inside this function the name shadows the module-level
        ``sklearn.metrics`` import, which is not used here.
    """
    ax.scatter(y_true, y_pred, alpha=0.6)

    # Span of the reference line: the overall min/max across both series.
    # Computed with vectorized NumPy reductions on the flattened values rather
    # than Python's builtin min()/max(), which iterate element by element.
    all_values = np.concatenate([np.ravel(y_true), np.ravel(y_pred)])
    min_val = all_values.min()
    max_val = all_values.max()
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Prediction')

    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(f'{title} Set\nR² = {metrics["r2"]:.3f}, RMSE = {metrics["rmse"]:.3f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Fill the three panels left to right: training, validation, test.
scatter_panels = (
    (y_train, y_train_pred, 'Training', train_metrics),
    (y_val, y_val_pred, 'Validation', val_metrics),
    (y_test, y_test_pred, 'Test', test_metrics),
)
for panel_ax, (observed, predicted, split_name, split_scores) in zip(axes, scatter_panels):
    plot_actual_vs_predicted(observed, predicted, panel_ax, split_name, split_scores)

plt.tight_layout()
plt.show()

# Fresh 1x3 figure for the residual plots (rebinds `fig` and `axes`).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

def plot_residuals(y_true, y_pred, ax, title):
    """Draw residuals (observed minus predicted) against predictions on ``ax``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; plotted on the x-axis.
    ax : matplotlib Axes
        Axes to draw on.
    title : str
        Split name shown in the panel title (e.g. "Training").
    """
    # Flatten both inputs to 1-D NumPy arrays before subtracting. This lets
    # plain lists work, avoids index alignment when pandas Series are passed,
    # and prevents an (n,) vs (n, 1) pair from broadcasting into an (n, n) grid.
    observed = np.ravel(y_true)
    predicted = np.ravel(y_pred)
    residuals = observed - predicted

    ax.scatter(predicted, residuals, alpha=0.6)
    ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    ax.set_title(f'{title} Set - Residual Plot')
    ax.grid(True, alpha=0.3)

# One residual panel per partition, left to right: training, validation, test.
residual_panels = (
    (y_train, y_train_pred, 'Training'),
    (y_val, y_val_pred, 'Validation'),
    (y_test, y_test_pred, 'Test'),
)
for panel_ax, (observed, predicted, split_name) in zip(axes, residual_panels):
    plot_residuals(observed, predicted, panel_ax, split_name)

plt.tight_layout()
plt.show()

In [None]:
# Feature Importance Analysis
feature_importance = best_rf.feature_importances_

# Take the feature names from the cleaned dataframe: every column except the target.
# Only the two failures this lookup can actually produce are caught
# (dataframe not defined / target column missing); a bare `except:` would
# also swallow KeyboardInterrupt and hide unrelated bugs.
try:
    feature_names = df_wlabs.drop(['HWMdepth_m'], axis=1).columns
except (NameError, KeyError):
    feature_names = None

# Fall back to generic names when the real ones are unavailable or do not
# line up one-to-one with the importances.
if feature_names is None or len(feature_names) != len(feature_importance):
    feature_names = [f'Feature_{i}' for i in range(len(feature_importance))]

# Create feature importance dataframe, most important feature first
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("="*60)
print("FEATURE IMPORTANCE RANKINGS")
print("="*60)
print(importance_df.to_string(index=False))

# Two stacked panels: ranked bars on top, cumulative curve underneath.
importance_fig, (bar_ax, cumulative_ax) = plt.subplots(2, 1, figsize=(12, 8))

# --- Top panel: horizontal bars for the 20 highest-ranked features ---
top_features = importance_df.head(20)
bar_positions = range(len(top_features))
bar_ax.barh(bar_positions, top_features['importance'])
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(top_features['feature'])
bar_ax.set_xlabel('Feature Importance')
bar_ax.set_title('Top 20 Feature Importance (Random Forest)')
bar_ax.invert_yaxis()  # largest importance at the top

# Annotate each bar with its value, nudged slightly past the bar end.
for bar_index, bar_value in enumerate(top_features['importance']):
    bar_ax.text(bar_value + 0.001, bar_index, f'{bar_value:.3f}', va='center')

# --- Bottom panel: running total of importance over the ranked features ---
cumulative_importance = np.cumsum(importance_df['importance'].values)
feature_counts = range(1, len(cumulative_importance) + 1)
cumulative_ax.plot(feature_counts, cumulative_importance, 'b-', linewidth=2)
cumulative_ax.axhline(y=0.8, color='r', linestyle='--', label='80% of importance')
cumulative_ax.axhline(y=0.9, color='orange', linestyle='--', label='90% of importance')
cumulative_ax.set_xlabel('Number of Features')
cumulative_ax.set_ylabel('Cumulative Importance')
cumulative_ax.set_title('Cumulative Feature Importance')
cumulative_ax.legend()
cumulative_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Smallest number of top-ranked features whose cumulative importance reaches
# each threshold: argmax returns the first True position (0-based), hence +1.
reaches_80 = cumulative_importance >= 0.8
reaches_90 = cumulative_importance >= 0.9
idx_80 = np.argmax(reaches_80) + 1
idx_90 = np.argmax(reaches_90) + 1

insight_lines = [
    "\nFeature Selection Insights:",
    f"Number of features needed for 80% of importance: {idx_80}",
    f"Number of features needed for 90% of importance: {idx_90}",
    f"Total number of features: {len(feature_importance)}",
]
for insight_line in insight_lines:
    print(insight_line)

In [None]:
# Final model summary: chosen hyperparameters, test scores, and a simple
# train-vs-held-out R² comparison.
summary_rule = "=" * 80
print(summary_rule)
print("FINAL MODEL SUMMARY")
print(summary_rule)
print(f"Best Random Forest Parameters: {rf_optimized.best_params_}")
print(f"Best Cross-Validation Score (neg_MSE): {rf_optimized.best_score_:.4f}")

print("\nFinal Test Set Performance:")
for score_label, score_key in (
    ("R² Score", 'r2'),
    ("RMSE", 'rmse'),
    ("MAE", 'mae'),
    ("MSE", 'mse'),
):
    print(f"  {score_label}: {test_metrics[score_key]:.4f}")

# Overfitting check: compare the training R² with validation and test R².
train_r2 = train_metrics['r2']
val_r2 = val_metrics['r2']
test_r2 = test_metrics['r2']
train_val_gap = train_r2 - val_r2
train_test_gap = train_r2 - test_r2

print("\nOverfitting Check:")
print(f"  Training R²: {train_r2:.4f}")
print(f"  Validation R²: {val_r2:.4f}")
print(f"  Test R²: {test_r2:.4f}")
print(f"  Train-Val R² Gap: {train_val_gap:.4f}")
print(f"  Train-Test R² Gap: {train_test_gap:.4f}")

# The threshold is an absolute R² difference of 0.2 between train and test.
generalization_verdict = (
    "  WARNING: Potential overfitting detected (>20% R² gap between train and test)"
    if train_test_gap > 0.2
    else "  Model shows good generalization"
)
print(generalization_verdict)

print("\n" + summary_rule)