In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
import paths_cpt
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm
import joblib
from hmmlearn import hmm
# ---------------------------------------------------------------
# Step 1: Load your dataset
# ---------------------------------------------------------------
# df should contain columns: ['qc','fs','rf','qtn','fr','icn','diepte','lithostrat_id','sondeernummer']
df = pd.read_parquet(paths_cpt.PATH_TO_PARQUET)


In [2]:
df.head()

Unnamed: 0,sondering_id,index,pkey_sondering,sondeernummer,x,y,start_sondering_mtaw,diepte_sondering_tot,diepte,diepte_mtaw,qc,fs,qtn,rf,fr,icn,sbt,ksbt,lithostrat_id
0,314,2593,https://www.dov.vlaanderen.be/data/sondering/1...,GEO-97/127-S2,153278.2,181734.6,15.26,25.4,1.6,13.66,1.17,0.035,35.894004,2.991453,3.058371,2.56434,5.0,1.434e-07,Quartair
1,314,2594,https://www.dov.vlaanderen.be/data/sondering/1...,GEO-97/127-S2,153278.2,181734.6,15.26,25.4,1.7,13.56,1.57,0.033,42.562319,2.101911,2.138968,2.406724,5.0,4.321e-07,Quartair
2,314,2595,https://www.dov.vlaanderen.be/data/sondering/1...,GEO-97/127-S2,153278.2,181734.6,15.26,25.4,1.8,13.46,1.43,0.036,38.536991,2.517483,2.569226,2.491219,5.0,2.392e-07,Quartair
3,314,2596,https://www.dov.vlaanderen.be/data/sondering/1...,GEO-97/127-S2,153278.2,181734.6,15.26,25.4,1.9,13.36,0.5,0.024,15.678501,4.8,5.111166,2.982185,3.0,7.7e-09,Quartair
4,314,2597,https://www.dov.vlaanderen.be/data/sondering/1...,GEO-97/127-S2,153278.2,181734.6,15.26,25.4,2.0,13.26,1.33,0.023,33.203119,1.729323,1.77211,2.440158,5.0,3.419e-07,Quartair


In [4]:
print("Rows, cols:", df.shape)
print("Columns:", df.columns.tolist())

Rows, cols: (1220548, 19)
Columns: ['sondering_id', 'index', 'pkey_sondering', 'sondeernummer', 'x', 'y', 'start_sondering_mtaw', 'diepte_sondering_tot', 'diepte', 'diepte_mtaw', 'qc', 'fs', 'qtn', 'rf', 'fr', 'icn', 'sbt', 'ksbt', 'lithostrat_id']


In [5]:
# Keep only rows with valid lithostratigraphic labels
df = df[df["lithostrat_id"].notna()].copy()

print("Filtered dataset shape:", df.shape)
print("Unique lithostrat units:", df["lithostrat_id"].nunique())


Filtered dataset shape: (267174, 19)
Unique lithostrat units: 35


In [5]:
# Define features and target (without x and y)
feature_cols = [
    "diepte_mtaw",    # depth relative to sea level
    "qc",              # cone resistance
    "fs",              # sleeve friction
    "qtn",             # normalized cone resistance
    "rf",              # friction ratio
    "fr",              # another friction ratio variant
    "icn",             # normalized soil behavior index
    "sbt",             # soil behavior type
    "ksbt"             # another soil behavior type index
]

target_col = "lithostrat_id"

X = df[feature_cols]
y = df[target_col]

print("Feature matrix shape:", X.shape)
print("Target variable distribution:")
print(y.value_counts(normalize=True).head())


Feature matrix shape: (267174, 9)
Target variable distribution:
lithostrat_id
Quartair          0.240424
Brussel           0.179935
Lede              0.093613
Mons_en_Pevele    0.082029
Mont_Panisel      0.080390
Name: proportion, dtype: float64


In [6]:
# Define the main lithostratigraphic units of interest
segments_oi = [
    "Quartair", "Diest", "Bolderberg", "Sint_Huibrechts_Hern",
    "Ursel", "Asse", "Wemmel", "Lede", "Brussel", "Merelbeke",
    "Kwatrecht", "Mont_Panisel", "Aalbeke", "Mons_en_Pevele"
]

# Keep only rows where lithostrat_id is one of these
df_labelled = df[df['lithostrat_id'].isin(segments_oi)].copy()

print("Filtered dataset shape:", df_labelled.shape)
print("Unique lithostratigraphic units:", df_labelled['lithostrat_id'].unique())


Filtered dataset shape: (236393, 19)
Unique lithostratigraphic units: ['Quartair' 'Mont_Panisel' 'Aalbeke' 'Mons_en_Pevele' 'Brussel' 'Ursel'
 'Asse' 'Wemmel' 'Lede' 'Bolderberg' 'Merelbeke' 'Kwatrecht' 'Diest'
 'Sint_Huibrechts_Hern']


In [7]:
# Drop unwanted columns
X = df_labelled.drop(columns=['sondering_id', 'index', 'pkey_sondering',
                              'sondeernummer', 'x', 'y', 'lithostrat_id'])

# Encode labels numerically
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df_labelled['lithostrat_id'])

print("Number of classes:", len(le.classes_))


Number of classes: 14


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
import paths_cpt
# ==========================================
# 6. XGBoost model tuning
# ==========================================
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# ==========================================
# 7. Fit tuned model
# ==========================================
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("\nBest parameters:", grid_search.best_params_)

# ==========================================
# 8. Evaluate model
# ==========================================
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\n✅ XGBoost Model Accuracy: {acc*100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))



Fitting 3 folds for each of 48 candidates, totalling 144 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 200, 'subsample': 1.0}

✅ XGBoost Model Accuracy: 99.61%

Classification Report:
                      precision    recall  f1-score   support

             Aalbeke       1.00      1.00      1.00      2293
                Asse       0.99      0.99      0.99       981
          Bolderberg       1.00      1.00      1.00       436
             Brussel       1.00      0.99      1.00      9615
               Diest       1.00      1.00      1.00      1742
           Kwatrecht       1.00      0.99      0.99       860
                Lede       1.00      0.99      1.00      5003
           Merelbeke       0.98      1.00      0.99       268
      Mons_en_Pevele       1.00      1.00      1.00      4383
        Mont_Panisel       1.00      1.00      1.00      4296
            Quartair       0.99      1.00      1.00     12847
Sint_Huibrechts_Hern       1.00      1.00      1.00      1625
              

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate performance
print("Test Set Classification Report:\n")
print(classification_report(y_test, y_pred))

# Confusion matrix (optional but informative)
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


Test Set Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2293
           1       0.99      0.99      0.99       981
           2       1.00      1.00      1.00       436
           3       1.00      0.99      1.00      9615
           4       1.00      1.00      1.00      1742
           5       1.00      0.99      0.99       860
           6       1.00      0.99      1.00      5003
           7       0.98      1.00      0.99       268
           8       1.00      1.00      1.00      4383
           9       1.00      1.00      1.00      4296
          10       0.99      1.00      1.00     12847
          11       1.00      1.00      1.00      1625
          12       0.99      0.99      0.99       793
          13       0.99      0.99      0.99      2137

    accuracy                           1.00     47279
   macro avg       0.99      1.00      0.99     47279
weighted avg       1.00      1.00      1.00    

In [None]:

from sklearn.model_selection import train_test_split

# Step 1: unique sonderings
unique_sonderings = df_labelled['sondering_id'].unique()

# Step 2: split sonderings (not rows)
train_sonderings, test_sonderings = train_test_split(
    unique_sonderings, test_size=0.2, random_state=42
)

# Step 3: assign rows based on sondering_id
train_df = df_labelled[df_labelled['sondering_id'].isin(train_sonderings)]
test_df = df_labelled[df_labelled['sondering_id'].isin(test_sonderings)]

# Step 4: drop identifiers for modeling
X_train = train_df.drop(columns=['lithostrat_id', 'sondering_id', 'x', 'y'])
y_train = train_df['lithostrat_id']
X_test = test_df.drop(columns=['lithostrat_id', 'sondering_id', 'x', 'y'])
y_test = test_df['lithostrat_id']

# Optional: verify no leakage
print("Overlap in sondering_id:", set(train_df['sondering_id']).intersection(set(test_df['sondering_id'])))


Overlap in sondering_id: set()
