In [1]:
import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Make modules that sit in the current working directory importable.
working_dir = str(Path.cwd())
if working_dir not in sys.path:
    sys.path.append(working_dir)

# Load the project's data_processing helpers. If the first attempt fails
# (e.g. the notebook was started from the project root), add the
# "modeling" directory to the search path and try once more.
try:
    import data_processing as dp
except ImportError:
    sys.path.append("modeling")
    import data_processing as dp

print("Imported data_processing successfully.")

Parameters:
  EXTRACT_TREND: True
  BIN_W: 0.6
  SET_SEED: 100
  EXTRACT_TREND_TYPE: multiplicative
Imported data_processing successfully.


In [2]:
# Configuration
#
# --- Paths -----------------------------------------------------------------
# Both folders are relative to the notebook's working directory; adjust if
# the notebook is launched from somewhere else.
DATA_FOLDER = Path("../data")
PARQUET_FILENAME = "vw_cpt_brussels_params_completeset_20250318_remapped.parquet"
RESULTS_FOLDER = Path("../results")
# Create the results folder up front (no error if it already exists).
RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)

# --- Processing parameters ---------------------------------------------------
# Passed as bin_w to dp.process_test_train.
BIN_W = 0.6
# Passed as do_extract_trend / trend_type to dp.process_test_train.
# NOTE(review): the recorded output of the import cell reports
# "EXTRACT_TREND_TYPE: multiplicative" while this notebook passes "additive"
# - confirm which one is intended.
EXTRACT_TREND = True
TREND_TYPE = "additive"
# Seed for the stratified split. The active split code loads a saved split,
# so this is only relevant when the split is regenerated.
# NOTE(review): the recorded import output reports "SET_SEED: 100" - confirm
# that 42 is the seed the saved split was produced with.
SEED = 42

## 1. Load Data

In [3]:
# Locate the input file and fail early with a clear message if it is missing.
parquet_path = DATA_FOLDER / PARQUET_FILENAME
if not parquet_path.exists():
    raise FileNotFoundError(f"Parquet file not found: {parquet_path}")

# Read the measurements and keep only rows that carry a lithostrat_id label.
cpt_df = (
    pd.read_parquet(parquet_path, engine="fastparquet")
    .loc[lambda frame: frame["lithostrat_id"].notna()]
    .copy()
)

print(f"Loaded data with {len(cpt_df)} rows.")

Loaded data with 267174 rows.


## 2. Split Train/Test IDs

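The stratified train/test split is defined on the raw sondering IDs. Rare lithostratigraphic classes are removed first; the train and test IDs are then loaded from the previously saved `split_res.json` in the results folder rather than being recomputed.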

In [5]:
# Filter rare classes (logic copied from data_processing.py to ensure consistency).
# N counts the distinct (sondering_id, lithostrat_id) pairs per class, i.e. the
# number of soundings in which each class occurs.
MIN_SONDERINGS_PER_CLASS = 5
litho_counts = (
    cpt_df.drop_duplicates(subset=["sondering_id", "lithostrat_id"])
          .groupby("lithostrat_id", dropna=False)
          .size()
          .reset_index(name="N")
)
rare_litho = set(
    litho_counts.loc[litho_counts["N"] < MIN_SONDERINGS_PER_CLASS, "lithostrat_id"]
)
if rare_litho:
    print(f"Removing rare classes: {rare_litho}")
    cpt_df = cpt_df[~cpt_df["lithostrat_id"].isin(rare_litho)].copy()

# One row per (sounding, class) pair: the input expected by the split routine.
cpt_unique = cpt_df.drop_duplicates(subset=["sondering_id", "lithostrat_id"]).copy()

# The split is loaded from a saved result instead of being recomputed.
# To regenerate it, replace the load below with:
#   split_res = dp.group_strat_split(cpt_unique, prop=0.7, tol=0.05, seed=SEED)
split_path = RESULTS_FOLDER / "split_res.json"
with open(split_path, "r", encoding="utf-8") as f:
    split_res = json.load(f)
train_ids = set(split_res["train_ids"])
test_ids = set(split_res["test_ids"])

# A sounding present in both sets would leak test data into training.
overlapping_ids = train_ids & test_ids
if overlapping_ids:
    raise ValueError(
        f"{len(overlapping_ids)} sondering IDs appear in both train and test "
        f"sets of {split_path}: {sorted(overlapping_ids)[:10]}"
    )

print(f"Train IDs: {len(train_ids)}")
print(f"Test IDs:  {len(test_ids)}")

# Create separate DataFrames
train_raw_df = cpt_df[cpt_df["sondering_id"].isin(train_ids)].copy()
test_raw_df = cpt_df[cpt_df["sondering_id"].isin(test_ids)].copy()

# An empty frame means none of the saved IDs matched cpt_df["sondering_id"]
# (for example IDs stored as strings in the JSON but as integers in the data).
if train_raw_df.empty or test_raw_df.empty:
    raise ValueError(
        f"Split from {split_path} selected no rows "
        f"(train rows: {len(train_raw_df)}, test rows: {len(test_raw_df)}); "
        "check that the saved IDs match the type and values of 'sondering_id'."
    )

Train IDs: 199
Test IDs:  86


## 3. Process Data

We use the `process_test_train` function from `data_processing` to process each set independently.

In [6]:
# Settings shared by both calls, so the train and test sets are processed
# with exactly the same parameters.
shared_settings = dict(
    bin_w=BIN_W,
    do_extract_trend=EXTRACT_TREND,
    trend_type=TREND_TYPE,
)

print("Processing Training Data...")
train_processed = dp.process_test_train(
    cpt_df=train_raw_df,
    sondering_ids=list(train_ids),
    **shared_settings,
)

print("Processing Test Data...")
# Show the first raw test rows before processing.
print(test_raw_df.head())
test_processed = dp.process_test_train(
    cpt_df=test_raw_df,
    sondering_ids=list(test_ids),
    **shared_settings,
)

# Show the first processed test rows.
print(test_processed.head())

Processing Training Data...


  .apply(_trend_and_fill))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Processing Test Data...
   sondering_id  index                                     pkey_sondering  \
0           314   2593  https://www.dov.vlaanderen.be/data/sondering/1...   
1           314   2594  https://www.dov.vlaanderen.be/data/sondering/1...   
2           314   2595  https://www.dov.vlaanderen.be/data/sondering/1...   
3           314   2596  https://www.dov.vlaanderen.be/data/sondering/1...   
4           314   2597  https://www.dov.vlaanderen.be/data/sondering/1...   

   sondeernummer         x         y  start_sondering_mtaw  \
0  GEO-97/127-S2  153278.2  181734.6                 15.26   
1  GEO-97/127-S2  153278.2  181734.6                 15.26   
2  GEO-97/127-S2  153278.2  181734.6                 15.26   
3  GEO-97/127-S2  153278.2  181734.6                 15.26   
4  GEO-97/127-S2  153278.2  181734.6                 15.26   

   diepte_sondering_tot  diepte  diepte_mtaw    qc     fs        qtn  \
0                  25.4     1.6        13.66  1.17  0.035  35.894004

  .apply(_trend_and_fill))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


   sondering_id   depth_bin     qc_sd     fs_sd     rf_sd    qtn_sd     fr_sd  \
0           314  (1.2, 1.8]  0.020355  0.000424  0.629001  0.768622  0.650116   
1           314  (1.8, 2.4]  0.100368  0.001635  1.274885  2.863361  1.375155   
2           314  (2.4, 3.0]  0.186748  0.001380  0.365309  4.393847  0.372325   
3           314  (3.0, 3.6]  0.106500  0.002760  1.633421  2.370866  2.122284   
4           314  (3.6, 4.2]  0.099661  0.002709  1.012828  2.390063  1.117128   

   diepte_sd  diepte_mtaw_sd   qc_mean  ...  diepte_whole_q90  \
0   0.070711        0.070711  1.871289  ...             23.02   
1   0.216025        0.216025  1.698995  ...             23.02   
2   0.187083        0.187083  1.639667  ...             23.02   
3   0.158114        0.158114  1.143800  ...             23.02   
4   0.216025        0.216025  0.687500  ...             23.02   

   diepte_mtaw_whole_q90  diepte_whole_cv  diepte_mtaw_whole_cv  \
0                  11.28          0.51213              