# 02 — Simple 1D CNN (Keras)
Apply the same preprocessing (SNV + Savitzky–Golay first derivative), train a small 1D CNN, and report test-set metrics (R², RMSE, MAE, RPD) together with a parity plot.


In [None]:

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.signal import savgol_filter
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Location of the input CSV and the name of the column to predict.
DATA_PATH = Path("../data/soil_spectra_teaching.csv")
TARGET = "SOC"  # change if needed

df = pd.read_csv(DATA_PATH)

# Spectral columns are identified purely by their "wl_" name prefix.
spec_cols = [c for c in df.columns if str(c).startswith("wl_")]
# Raise explicitly rather than `assert`: assertions are stripped when Python
# runs with -O, which would silently skip this sanity check.
if len(spec_cols) <= 10:
    raise ValueError("Need >10 spectral columns named like wl_400, wl_402, ...")

# Feature matrix (one row per CSV row, one column per spectral column) and
# the target vector, both as float arrays.
X = df[spec_cols].to_numpy(dtype=float)
y = df[TARGET].to_numpy(dtype=float)



# Preprocessing: SNV + SG 1st derivative
def snv(mat):
    """Apply Standard Normal Variate scaling to every row of ``mat``.

    Each row has its own mean subtracted and is then divided by its own
    standard deviation. A 1e-12 offset is added to the divisor so that a
    constant row (zero standard deviation) yields zeros instead of NaN/inf.

    Args:
        mat: 2-D array whose rows are scaled independently (axis 1 is reduced).

    Returns:
        Array with the same shape as ``mat``.
    """
    row_mean = mat.mean(axis=1, keepdims=True)
    row_scale = mat.std(axis=1, keepdims=True) + 1e-12
    centred = mat - row_mean
    return centred / row_scale

# Step 1: row-wise SNV scaling.
X_snv = snv(X)

# Step 2: Savitzky-Golay first derivative along each row.
# The window is an odd number derived from the column count (2 * (n // 20) + 1)
# and then clamped to the range 5..21.
_n_cols = X_snv.shape[1]
window = 2 * (_n_cols // 20) + 1
window = max(5, window)
window = min(21, window)
X_sg = savgol_filter(X_snv, window, 2, deriv=1, axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_sg,
    y,
    test_size=0.2,
    random_state=42,
)

# Append a trailing axis of length 1 so each row becomes (n_points, 1),
# matching the Input shape declared for the Conv1D model.
X_train_r = np.expand_dims(X_train, axis=-1)
X_test_r = np.expand_dims(X_test, axis=-1)



# Model: three Conv1D stages (the first two each followed by 2x max-pooling),
# global average pooling, a 32-unit dense layer and a single linear output.
inputs = keras.Input(shape=(X_train_r.shape[1], 1))

# (filters, kernel size, whether a MaxPooling1D(2) follows the convolution)
_conv_stages = (
    (16, 7, True),
    (32, 5, True),
    (64, 3, False),
)

x = inputs
for _filters, _kernel, _pool_after in _conv_stages:
    x = layers.Conv1D(_filters, _kernel, padding="same", activation="relu")(x)
    if _pool_after:
        x = layers.MaxPooling1D(2)(x)

x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(1)(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss="mse",
    metrics=["mae"],
)
model.summary()



# Train: up to 200 epochs, with the last 20% of the training rows used for
# validation. Early stopping watches val_loss, waits 10 epochs without
# improvement, then restores the best weights seen.
es = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)
_fit_options = dict(
    validation_split=0.2,
    epochs=200,
    batch_size=32,
    callbacks=[es],
    verbose=0,
)
hist = model.fit(X_train_r, y_train, **_fit_options)

# Report the lowest validation loss recorded over all epochs.
print("Best val_loss:", np.asarray(hist.history["val_loss"]).min())



# Evaluate on the held-out test split.
y_pred = model.predict(X_test_r, verbose=0).ravel()
r2 = r2_score(y_test, y_pred)
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, whereas the explicit
# square root works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

def rpd(y_true, y_hat):
    """Return the ratio of performance to deviation (RPD).

    Computed as the sample standard deviation (ddof=1) of ``y_true`` divided
    by the RMSE between ``y_true`` and ``y_hat``. A 1e-12 offset on the
    divisor avoids division by zero when the predictions are exact.

    Args:
        y_true: Observed values.
        y_hat: Predicted values, aligned with ``y_true``.

    Returns:
        The RPD as a float.
    """
    sd = np.std(y_true, ddof=1)
    rmse_ = np.sqrt(mean_squared_error(y_true, y_hat))
    return sd / (rmse_ + 1e-12)

print({"R2": r2, "RMSE": rmse, "MAE": mae, "RPD": rpd(y_test, y_pred)})



# Parity plot: predicted vs. observed test values, with a dashed 1:1 line
# spanning the combined range of both.
_fig, _ax = plt.subplots(figsize=(5, 5))
_ax.scatter(y_test, y_pred, alpha=0.6, edgecolor='none')

# Shared lower/upper bounds so the reference line covers every point.
mn = min(y_test.min(), y_pred.min())
mx = max(y_test.max(), y_pred.max())
_ax.plot([mn, mx], [mn, mx], linestyle="--")

_ax.set_xlabel(f"Observed {TARGET}")
_ax.set_ylabel(f"Predicted {TARGET}")
_ax.set_title(f"CNN Parity Plot — {TARGET} (R2={r2:.2f})")
_fig.tight_layout()
plt.show()
