In [1]:
import pandas as pd

In [2]:
# Relative path: resolved against the notebook's working directory.
fp = 'nns_vr_tmax_toc.csv'
# Hand the path straight to pandas so it opens, decodes (UTF-8) and closes the
# file itself; skipinitialspace drops blanks that follow each delimiter.
df = pd.read_csv(fp, skipinitialspace=True, encoding='utf-8')
df

Unnamed: 0,Base MD.Sample(m),Lat.Any(°),Long.Any(°),Type.Sample,Lith.Phys,TOC.Any(%wt),T-max.Any(°C),HI.Any(ppt (mg/g) (a/b)),S1.Any(ppt (mg/g)),S2.Any(ppt (mg/g)),VR Mean.Any(%)
0,4059.935994,56.396339,3.315175,Cuttings,Claystone,2.83,445,186.572438,0.93,5.28,0.28
1,4320.000000,56.146997,3.454261,pCuttings,Coal,21.15,446,218.061466,3.66,46.12,0.83
2,3105.000000,56.234464,3.707639,pCuttings,Shale/Claystone,0.70,426,100.000000,0.50,0.70,0.39
3,3105.000000,56.234464,3.707639,pCuttings,Claystone,0.20,408,50.000000,0.20,0.10,0.38
4,4170.000000,56.146997,3.454261,pCuttings,Shale/Claystone,2.96,445,77.702703,0.39,2.30,0.71
...,...,...,...,...,...,...,...,...,...,...,...
1795,4522.000000,59.316653,2.211850,Cuttings,Claystone,5.77,457,139.168111,0.15,8.03,1.09
1796,4552.000000,59.316653,2.211850,Cuttings,Claystone,7.63,462,130.275229,0.28,9.94,1.15
1797,4627.000000,59.316653,2.211850,Cuttings,Claystone,3.94,450,142.639594,0.12,5.62,1.15
1798,4681.000000,59.316653,2.211850,Cuttings,Claystone,2.10,447,263.333333,0.12,5.53,1.23


In [3]:
def get_n_hidden_units(n_rows: int, n_features: int) -> int:
    """Pick a hidden-layer width from the size of the data set.

    The raw estimate is ``n_rows * n_features / 5``; it is then clamped to the
    range 10..100 and truncated to an integer.

    Args:
        n_rows: Number of rows in the data set.
        n_features: Number of columns in the data set.

    Returns:
        The clamped, truncated number of hidden units.
    """
    fewest_units, most_units = 10, 100
    raw_estimate = (n_features * n_rows) / 5
    # Apply the lower bound first, then the upper bound, then truncate.
    raised = max(fewest_units, raw_estimate)
    capped = min(most_units, raised)
    return int(capped)

# A single hidden layer, sized from the frame's (rows, columns) shape.
# Note: df.shape[1] counts every column of df, not only the model's predictors.
hidden_layer_sizes = [get_n_hidden_units(df.shape[0], df.shape[1])]
print(f"Hidden layer sizes: {hidden_layer_sizes}")

Hidden layer sizes: [100]


In [4]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
import numpy as np

seed = 42

# The last column is the regression target; everything before it is a
# candidate predictor.
targets = df.columns.values[-1]
features = df.columns.values[:-1].tolist()
# Keep positions 0-2 and 5 onward: the text columns in between are excluded for now.
features = features[:3] + features[5:]

# 70 % of the rows go to training, the rest to the held-out test split.
X = df[features]
y = df[targets]
X_trn, X_test, y_trn, y_test = train_test_split(
    X, y, train_size=0.7, random_state=seed
)

# Baseline network; the same seed is reused for the split and the regressor.
regressor = MLPRegressor(
    hidden_layer_sizes=hidden_layer_sizes,
    random_state=seed,
)

def build_and_eval_pipeline(regressor):
    """Fit a scale -> impute -> regress pipeline and print its test metrics.

    The pipeline is fitted on the notebook-level ``X_trn`` / ``y_trn`` split and
    scored on ``X_test`` / ``y_test``; R² and RMSE are printed, nothing is
    returned.

    Args:
        regressor: Estimator used as the final pipeline step (a plain regressor
            or a search object wrapping one).
    """
    steps = [
        ('Scale', StandardScaler()),
        ('Impute', KNNImputer()),
        ('Regression', regressor),
    ]
    model = Pipeline(steps=steps)
    model.fit(X_trn, y_trn)

    # Score on the held-out split only.
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(f"R²: {r2:.3f}, RMSE: {rmse:.2f}")
    
# Baseline run: the single-hidden-layer regressor, scored on the held-out split.
build_and_eval_pipeline(regressor)

R²: 0.579, RMSE: 0.17


In [5]:
# try with grid search
from sklearn.model_selection import GridSearchCV, KFold

# Search space for the MLP: layer layouts of varying depth and width, crossed
# with two solvers. Every layout is a tuple with one entry per hidden layer;
# the single-layer ones need the trailing comma, otherwise `(100)` is just the
# int 100 rather than a 1-tuple.
param_grid = {
    "hidden_layer_sizes": [
        (100,), (150,), (200,),
        (50, 50), (60, 40), (40, 60),
        (33, 33, 33), (25, 50, 25),
        (25, 25, 25, 25),
        (20, 20, 20, 20, 20)],
    "solver": ["lbfgs", "adam"]}

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Install an "ignore" filter for ConvergenceWarning so the search log is not
# drowned in warnings.
simplefilter(action="ignore", category=ConvergenceWarning)

# Shuffled 10-fold CV, seeded so the folds are repeatable.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
regressorSearch = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=kfold,
    verbose=2,
)
# The search object takes the place of the plain regressor as the pipeline's
# final step, then the fitted pipeline is scored on the test split.
build_and_eval_pipeline(regressorSearch)

regressorSearch.best_params_

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ...............hidden_layer_sizes=100, solver=lbfgs; total time=   0.5s
[CV] END ................hidden_layer_sizes=100, solver=adam; total time=   0.2s
[CV] END ................hidden_layer_sizes=10

[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.7s
[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.7s
[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.8s
[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.7s
[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.7s
[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.7s
[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.8s
[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.8s
[CV] END ..........hidden_layer_sizes=(40, 60), solver=lbfgs; total time=   0.8s
[CV] END ...........hidden_layer_sizes=(40, 60), solver=adam; total time=   0.6s
[CV] END ...........hidden_layer_sizes=(40, 60), solver=adam; total time=   0.4s
[CV] END ...........hidden_layer_sizes=(40, 60), solver=adam; total time=   0.5s
[CV] END ...........hidden_l

{'hidden_layer_sizes': (25, 50, 25), 'solver': 'adam'}

In [9]:
# Tabulate the search results, best-ranked candidates first.
(
    pd.DataFrame(regressorSearch.cv_results_)
    .sort_values(by="rank_test_score")
    .head(10)  # top 10
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_hidden_layer_sizes,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
15,0.247675,0.024059,0.0003,0.000458,"(25, 50, 25)",adam,"{'hidden_layer_sizes': (25, 50, 25), 'solver':...",0.72947,0.600421,0.477587,0.630348,0.67658,0.660431,0.715417,0.682975,0.745444,0.674447,0.659312,0.073422,1
11,0.586559,0.070133,0.0006,0.00049,"(40, 60)",adam,"{'hidden_layer_sizes': (40, 60), 'solver': 'ad...",0.730774,0.576561,0.47516,0.629478,0.656643,0.686364,0.72715,0.682413,0.704693,0.654726,0.652396,0.073643,2
3,0.383735,0.061579,0.0008,0.0004,150,adam,"{'hidden_layer_sizes': 150, 'solver': 'adam'}",0.748153,0.539177,0.478367,0.618707,0.653903,0.663037,0.712688,0.665379,0.693545,0.676879,0.644984,0.076858,3
5,0.549758,0.088224,0.0007,0.000458,200,adam,"{'hidden_layer_sizes': 200, 'solver': 'adam'}",0.703845,0.573315,0.462783,0.648571,0.623847,0.615554,0.675833,0.662362,0.722593,0.703408,0.639211,0.073131,4
17,0.273793,0.032703,0.000503,0.000503,"(25, 25, 25, 25)",adam,"{'hidden_layer_sizes': (25, 25, 25, 25), 'solv...",0.71341,0.545721,0.468406,0.610872,0.646042,0.581047,0.728785,0.628244,0.731408,0.668568,0.63225,0.080421,5
9,0.537717,0.059958,0.000901,0.0003,"(60, 40)",adam,"{'hidden_layer_sizes': (60, 40), 'solver': 'ad...",0.709555,0.543099,0.447527,0.615752,0.576486,0.657738,0.712458,0.647408,0.704672,0.683542,0.629824,0.081583,6
7,0.60076,0.04462,0.000897,0.000299,"(50, 50)",adam,"{'hidden_layer_sizes': (50, 50), 'solver': 'ad...",0.70731,0.549467,0.43711,0.584262,0.63561,0.654207,0.693803,0.647803,0.681773,0.660871,0.625222,0.07757,7
13,0.186797,0.022567,0.0003,0.000458,"(33, 33, 33)",adam,"{'hidden_layer_sizes': (33, 33, 33), 'solver':...",0.694905,0.537817,0.463642,0.590998,0.63723,0.652958,0.70087,0.59952,0.680926,0.675565,0.623443,0.072579,8
19,0.3827,0.05061,0.0006,0.00049,"(20, 20, 20, 20, 20)",adam,"{'hidden_layer_sizes': (20, 20, 20, 20, 20), '...",0.670297,0.565596,0.388367,0.612417,0.580607,0.609389,0.728867,0.620757,0.716343,0.672052,0.616469,0.09197,9
1,0.338899,0.024631,0.0007,0.000458,100,adam,"{'hidden_layer_sizes': 100, 'solver': 'adam'}",0.666358,0.518587,0.425992,0.572632,0.588353,0.681602,0.624957,0.558764,0.646469,0.669593,0.595331,0.076132,10


In [7]:
# Show the predictor columns that were selected for the model.
features

['Base MD.Sample(m)',
 'Lat.Any(°)',
 'Long.Any(°)',
 'TOC.Any(%wt)',
 'T-max.Any(°C)',
 'HI.Any(ppt (mg/g) (a/b))',
 'S1.Any(ppt (mg/g))',
 'S2.Any(ppt (mg/g))']