# Supervised Learning Task Part 1 - Deep Learning Models
SIADS 696 Milestone 2 - Team 24

Notebook author: Seungdo Woo

This notebook trains and tunes a scikit-learn `MLPRegressor` to predict the `Oxygen` column of the processed ocean dataset. `HalvingGridSearchCV` is still experimental, so `enable_halving_search_cv` is imported from `sklearn.experimental` before it is used; without that import, importing `HalvingGridSearchCV` raises an ImportError. The workflow uses 5-fold cross-validation for tuning and evaluation and covers data upload (optional, Colab only), preprocessing with float32 conversion, baseline model evaluation, memory-efficient hyperparameter tuning on a 20% sample (with an optional 40% comparison), and final model evaluation on the held-out test set.

In [2]:
# # Upload the dataset file from your local machine into this Colab environment.
# # This will prompt you to select the local 'OCEAN_PROCESSED.CSV' file for upload.
# from google.colab import files
# uploaded = files.upload()

# # Rename the uploaded file to 'OCEAN_PROCESSED.CSV' if necessary.
# import os
# for fn in uploaded.keys():
#     if fn != 'OCEAN_PROCESSED.CSV':
#         os.rename(fn, 'OCEAN_PROCESSED.CSV')


## Library Imports

In [3]:
# Import necessary libraries
import time

import numpy as np
import pandas as pd

# Experimental import to enable HalvingGridSearchCV (must run before it is imported below)
from sklearn.experimental import enable_halving_search_cv  # noqa: F401

# Scikit-learn utilities for model training and evaluation
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_validate, HalvingGridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder


## Data Ingestion

In [None]:
# Location of the processed ocean dataset.
# Google Colab alternative (uncomment to use):
# file_path_formatted = '/content/drive/MyDrive/Colab Notebooks/MADS/SIADS 696/Data/ocean_processed.csv'
file_path_formatted = '../Data/ocean_processed.csv'  # local relative path

# Read the CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=file_path_formatted)

df.head(n=5)

       CAST  Latitude  Longitude  Year  Month  Day   Time Country  \
0  16493420    54.207     13.567  2000      1    3  10.72      DE   
1  16493421    54.007     14.233  2000      1    3  13.68      DE   
2  16493422    53.938     14.225  2000      1    3  14.63      DE   
3  16493423    54.113     14.117  2000      1    3  16.02      DE   
4  13746677    35.795    129.532  2000      1    4   3.92      KR   

   Bottom depth  Depth  Temperature  Salinity  Oxygen  Phosphate  Silicate  \
0           9.0    1.5         2.50      7.40   403.0       0.53     27.30   
1          11.0    1.5         2.10      7.10   406.0       1.15     30.30   
2           7.0    1.5         2.30      6.40   381.0       1.43     32.90   
3          14.0    1.5         2.40      8.50   427.0       0.70     20.90   
4          46.0    0.0        14.62     34.19   245.0       0.43      8.13   

   Nitrate  Pressure  
0     9.70   1.50000  
1    18.20   1.50000  
2    20.20   1.50000  
3    10.00   1.50000  
4

## Deep Learning Models

In [5]:
# Separate features and target variable.
# 'CAST' is dropped together with the target; it appears to be a record identifier
# rather than a predictor -- NOTE(review): confirm.
X = df.drop(columns=['CAST', 'Oxygen'])
y = df['Oxygen']

# Label encode the 'Country' column whenever it is not already numeric.
# Testing for "not numeric" (instead of `dtype == object`) also covers pandas'
# dedicated string dtype and categoricals; with those dtypes the old check was
# False, the column stayed as text, and the float32 conversion below failed on
# values such as 'DE'.
# NOTE(review): integer codes impose an arbitrary ordering on countries; one-hot
# encoding may be a better fit for an MLP -- left unchanged here.
if not pd.api.types.is_numeric_dtype(X['Country']):
    le = LabelEncoder()
    X['Country'] = le.fit_transform(X['Country'])

# Split into training and test sets (80/20); fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to float32 to reduce memory usage
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

print('Training samples:', X_train.shape[0], 'Test samples:', X_test.shape[0])


Training samples: 482320 Test samples: 120580


In [6]:
# Baseline: one hidden layer of 100 units, scored with 5-fold CV on the training split.

# Shuffled 5-fold splitter with a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The scaler sits inside the pipeline, so it is fit as part of the pipeline on
# whatever data the pipeline is fit on.
# A generous max_iter together with early_stopping reduces convergence warnings.
baseline_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(
        hidden_layer_sizes=(100,),
        max_iter=500,
        early_stopping=True,
        n_iter_no_change=10,
        tol=1e-4,
        random_state=42,
    )),
])

# Cross-validate the baseline on the training data, collecting RMSE and R² per fold
cv_results = cross_validate(
    estimator=baseline_pipeline,
    X=X_train,
    y=y_train,
    scoring={'rmse': 'neg_root_mean_squared_error', 'r2': 'r2'},
    cv=cv,
    return_train_score=False,
)

# The RMSE scorer reports negated values; flip the sign to get positive RMSE
r2_scores = cv_results['test_r2']
rmse_scores = np.negative(cv_results['test_rmse'])

print(f'Baseline CV RMSE: {rmse_scores.mean():.2f} ± {rmse_scores.std():.2f}')
print(f'Baseline CV R²:  {r2_scores.mean():.4f} ± {r2_scores.std():.4f}')


Baseline CV RMSE: 47.65 ± 21.19
Baseline CV R²:  0.7480 ± 0.2453


In [7]:
# Hyperparameter tuning using HalvingGridSearchCV with 5-fold CV

# Create a subsample of the training data for tuning (20%) to keep the search affordable
X_tune, _, y_tune, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=42)

# Define hyperparameter grid: 3 architectures x 2 L2 penalties = 6 candidates
param_grid = {
    'mlp__hidden_layer_sizes': [(50,), (100,), (100, 100)],
    'mlp__alpha': [0.0001, 0.001]
}

# Create the tuning pipeline (same scaler + MLP settings as the baseline,
# with the tuned parameters left to the grid)
tune_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(max_iter=500, random_state=42, early_stopping=True,
                         n_iter_no_change=10, tol=1e-4))
])

# Use 5-fold CV for tuning
cv_tune = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform halving grid search.
# random_state seeds the sample subsampling done in each halving iteration;
# without it the surviving candidates (and the reported best score) can change
# from run to run even though every other component is seeded.
grid_search = HalvingGridSearchCV(
    estimator=tune_pipeline,
    param_grid=param_grid,
    factor=2,
    cv=cv_tune,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=1,
    verbose=2
)

# Fit the grid search on the subsampled data
grid_search.fit(X_tune, y_tune)

# Output best parameters and CV RMSE (the scorer is negated RMSE, so flip the sign)
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print('Best parameters:', best_params)
print(f'Best CV RMSE (mean): {best_cv_rmse:.2f}')


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 24116
max_resources_: 96464
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 6
n_resources: 24116
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ...mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,); total time=   4.0s
[CV] END ...mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,); total time=   5.5s
[CV] END ...mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,); total time=   4.9s
[CV] END ...mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,); total time=   4.2s
[CV] END ...mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,); total time=   4.9s
[CV] END ..mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,); total time=  10.0s
[CV] END ..mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,); total time=   6.0s
[CV] END ..mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,); total time=   8.5s
[CV] END ..mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,); total time=   7.8s
[CV] EN

In [8]:
# Optional: Compare 20% vs 40% sampling for hyperparameter tuning
def tune_with_sample(sample_fraction, random_state=42):
    """Run the halving grid search on a random subsample of the training data.

    Reads the notebook-level ``X_train`` / ``y_train`` and searches the same
    grid of hidden-layer sizes and ``alpha`` values used in the main tuning cell.

    Parameters
    ----------
    sample_fraction : float
        Passed to ``train_test_split`` as ``train_size``, e.g. ``0.2`` for 20%.
    random_state : int, default=42
        Seed used for the subsample split, the CV folds, the MLP, and the
        halving search's own resource subsampling. Seeding the search makes
        the comparison between sample sizes repeatable.

    Returns
    -------
    tuple
        ``(best_params, best_cv_rmse)``: the winning parameter dict and its
        mean cross-validated RMSE as a positive number.
    """
    X_sample, _, y_sample, _ = train_test_split(
        X_train, y_train, train_size=sample_fraction, random_state=random_state
    )
    param_grid = {
        'mlp__hidden_layer_sizes': [(50,), (100,), (100, 100)],
        'mlp__alpha': [0.0001, 0.001]
    }
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(max_iter=500, random_state=random_state, early_stopping=True,
                             n_iter_no_change=10, tol=1e-4))
    ])
    cv_local = KFold(n_splits=5, shuffle=True, random_state=random_state)
    halving_search = HalvingGridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        factor=2,
        cv=cv_local,
        scoring='neg_root_mean_squared_error',
        # Seed the per-iteration subsampling; it was previously unseeded.
        random_state=random_state,
        n_jobs=1,
        verbose=1
    )
    halving_search.fit(X_sample, y_sample)
    # The scorer is negated RMSE, so flip the sign before returning
    return halving_search.best_params_, -halving_search.best_score_

# Repeat the search on 20% and 40% of the training data and report each winner
for frac in (0.2, 0.4):
    params, rmse = tune_with_sample(sample_fraction=frac)
    print(f'Sample {int(frac*100)}% -> Best params: {params}, CV RMSE: {rmse:.2f}')


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 24116
max_resources_: 96464
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 6
n_resources: 24116
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 1
n_candidates: 3
n_resources: 48232
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 2
n_candidates: 2
n_resources: 96464
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Sample 20% -> Best params: {'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (50,)}, CV RMSE: 26.12
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 48232
max_resources_: 192928
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 6
n_resources: 48232
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 1
n_candidates: 3
n_resources: 96464
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 2
n_candidat

In [10]:
# Train the best model on the full training set and evaluate on the test set

# Work on an unfitted copy of the winning pipeline. Calling fit() directly on
# `grid_search.best_estimator_` would overwrite the estimator stored inside the
# search object (fitted on the tuning subsample), leaving it out of sync with
# the search's recorded results.
best_model = clone(grid_search.best_estimator_)

# Fit the model on full training data and record the wall-clock training time
start_time = time.time()
best_model.fit(X_train, y_train)
train_time = time.time() - start_time

# Predict on test set
y_pred = best_model.predict(X_test)

# Test metrics
# Compute RMSE manually because older versions of scikit-learn may not support 'squared' parameter
mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(mse)
test_r2 = r2_score(y_test, y_pred)

print(f'Training time: {train_time:.2f} s')
print(f'Test RMSE: {test_rmse:.2f}')
print(f'Test R²:  {test_r2:.4f}')


Training time: 601.61 s
Test RMSE: 33.08
Test R²:  0.9006
