In [None]:
# ======================================================
# Simple Random Forest for CPT data
# ======================================================
# 
# Purpose: Predict lithostratigraphic units from CPT data using a Random Forest classifier
# ======================================================

In [1]:
# =========================================
# 1. Imports
# =========================================
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# =========================================
# 2. Read data
# =========================================
# The dataset is looked up under the current user's home directory; the folder
# layout below is machine-specific and must exist for the read to succeed.
home = Path.home()
path_to_parquet = home.joinpath(
    "Documents", "MSC_2026", "Project_DataScience", "Data", "remapped.parquet"
)

# Load the full table and report its size before any filtering happens.
print("Reading:", path_to_parquet)
df = pd.read_parquet(path_to_parquet)
print("Raw shape:", df.shape)

# =========================================
# 3. Basic config
# =========================================
# Column names: CPT identifier, depth along the sounding, and the target label.
cpt_col, depth_col, label_col = "sondeernummer", "diepte", "lithostrat_id"

# Lithostratigraphic units kept for modelling; the order of this list is preserved.
segments_oi = [
    "Quartair", "Diest", "Bolderberg", "Sint_Huibrechts_Hern",
    "Ursel", "Asse", "Wemmel", "Lede",
    "Brussel", "Merelbeke", "Kwatrecht", "Mont_Panisel",
    "Aalbeke", "Mons_en_Pevele",
]

# Placeholder strings that are treated as "no usable label".
invalid_labels = {"onbekend", "nan", "none", ""}

# =========================================
# 4. Imputation function (same as before)
# =========================================
def impute_params(df_in: pd.DataFrame, overwrite: bool = False) -> pd.DataFrame:
    """Derive missing ``icn``, ``sbt`` and ``ksbt`` values from other CPT columns.

    The computation runs on a copy of ``df_in``; the input frame is not modified.

    Parameters
    ----------
    df_in : pd.DataFrame
        Frame containing the columns ``qtn``, ``fr``, ``icn``, ``sbt`` and ``ksbt``.
    overwrite : bool, default False
        If False, only entries that are currently NaN are filled.  If True, all
        rows are candidates for recomputation.  Even then ``icn`` is only
        recomputed where both ``qtn`` and ``fr`` are strictly positive (the
        log10 terms need positive inputs); other rows keep their existing ``icn``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_in`` with ``icn``, ``sbt`` and ``ksbt`` filled where possible.
        ``ksbt`` is coerced to numeric first, so unparseable entries become NaN.
    """
    df = df_in.copy()

    # --- icn: computed from qtn and fr wherever both are strictly positive ---
    mask_icn = df["icn"].isna() if not overwrite else np.ones(len(df), dtype=bool)
    valid_icn = mask_icn & df["qtn"].gt(0) & df["fr"].gt(0)

    icn_new = np.sqrt(
        (3.47 - np.log10(df.loc[valid_icn, "qtn"])) ** 2
        + (np.log10(df.loc[valid_icn, "fr"]) + 1.22) ** 2
    )
    df.loc[valid_icn, "icn"] = icn_new

    # --- sbt: class index looked up from icn ---
    def sbt_from_icn(icn):
        """Map one icn value to a class index 1..6 (NaN stays NaN)."""
        # Classes are numbered 1 (lowest icn) to 6 (highest icn).
        # NOTE(review): confirm this numbering matches the encoding of the sbt
        # values already present in the data.
        if pd.isna(icn):
            return np.nan
        if icn < 1.31:
            return 1
        elif icn < 2.05:
            return 2
        elif icn < 2.60:
            return 3
        elif icn < 2.95:
            return 4
        elif icn < 3.60:
            return 5
        else:
            return 6

    mask_sbt = df["sbt"].isna() if not overwrite else np.ones(len(df), dtype=bool)
    df.loc[mask_sbt, "sbt"] = df.loc[mask_sbt, "icn"].apply(sbt_from_icn)

    # --- ksbt: estimated from icn ---
    # Coerce first so the NaN test below also catches non-numeric entries.
    df["ksbt"] = pd.to_numeric(df["ksbt"], errors="coerce")

    def ksbt_from_icn(icn):
        """Estimate ksbt for one icn value; NaN where no relation applies."""
        if pd.isna(icn):
            return np.nan
        if 1.0 < icn <= 3.27:
            return 10 ** (0.952 - 3.04 * icn)
        elif icn > 3.27:
            return 10 ** (-4.52 - 1.37 * icn)
        # No relation is defined for icn <= 1.0.  Return NaN explicitly so the
        # result stays numeric instead of carrying a Python None.
        return np.nan

    mask_ksbt = df["ksbt"].isna() if not overwrite else np.ones(len(df), dtype=bool)
    df.loc[mask_ksbt, "ksbt"] = df.loc[mask_ksbt, "icn"].apply(ksbt_from_icn)

    return df

# =========================================
# 5. Filter to segments of interest and clean labels
# =========================================
# Keep only rows labelled with one of the units of interest.
keep_unit = df[label_col].isin(segments_oi)
df_known = df.loc[keep_unit].copy()
print("Known before imputation:", df_known.shape)

# Fill missing icn / sbt / ksbt values; existing values are left alone.
df_known = impute_params(df_known, overwrite=False)
print("Known after imputation:", df_known.shape)

# Remove placeholder labels, then rows whose label is missing altogether.
has_real_label = ~df_known[label_col].isin(invalid_labels)
df_known = df_known.loc[has_real_label].dropna(subset=[label_col])
print("Known after label cleaning:", df_known.shape)

# Order rows by CPT, then by depth within each CPT. The forest does not
# depend on row order; this only keeps the frame easy to inspect.
df_known = df_known.sort_values(by=[cpt_col, depth_col])

# =========================================
# 6. Feature engineering (same ideas as CRF)
# =========================================

# Depth expressed as a fraction of the deepest reading of the same CPT.
# The maximum is taken over the rows currently in df_known.
depth_by_cpt = df_known.groupby(cpt_col)[depth_col]
df_known["max_depth_cpt"] = depth_by_cpt.transform("max")
df_known["depth_norm"] = df_known[depth_col].div(df_known["max_depth_cpt"])

# qc / Rf bins
def add_qc_rf_bins(df_in):
    """Return a copy of ``df_in`` with categorical ``qc_bin`` and ``rf_bin`` columns.

    ``qc_bin`` is cut from the ``qc`` column and ``rf_bin`` from the ``fr``
    column, each into five labelled intervals.  Intervals are closed on the
    right (the ``pd.cut`` default), so a value equal to an edge falls in the
    lower bin.  The input frame is not modified.
    """
    qc_edges = [-np.inf, 2, 5, 10, 20, np.inf]
    qc_names = ["qc_vlow", "qc_low", "qc_med", "qc_high", "qc_vhigh"]
    rf_edges = [-np.inf, 1, 2, 4, 6, np.inf]
    rf_names = ["rf_vlow", "rf_low", "rf_med", "rf_high", "rf_vhigh"]

    binned = df_in.copy()
    binned["qc_bin"] = pd.cut(binned["qc"], bins=qc_edges, labels=qc_names)
    binned["rf_bin"] = pd.cut(binned["fr"], bins=rf_edges, labels=rf_names)
    return binned

df_known = add_qc_rf_bins(df_known)

# Ratio features; eps is added to the absolute denominators so they never hit zero.
eps = 1e-6
fs_magnitude = df_known["fs"].abs()
fr_magnitude = df_known["fr"].abs()
df_known["qc_fs_ratio"] = df_known["qc"].div(fs_magnitude + eps)
df_known["qtn_fr_ratio"] = df_known["qtn"].div(fr_magnitude + eps)

# Rule-based indicator flags built from qc and fr thresholds, stored as 0/1 integers.
clay_rule = df_known["qc"].lt(4) & df_known["fr"].gt(3)
merelbeke_rule = df_known["qc"].lt(2) & df_known["fr"].gt(5)
df_known["is_clay_like"] = clay_rule.astype(int)
df_known["is_merelbeke_like"] = merelbeke_rule.astype(int)

# =========================================
# 7. Build feature matrix X and target y
# =========================================

# Numeric inputs, passed to the model unchanged.
raw_measurements = ["qc", "fs", "rf", "qtn", "fr", "icn", "ksbt"]
depth_inputs = [depth_col, "depth_norm"]
ratio_inputs = ["qc_fs_ratio", "qtn_fr_ratio"]
flag_inputs = ["is_clay_like", "is_merelbeke_like"]
num_feats = raw_measurements + depth_inputs + ratio_inputs + flag_inputs

# Categorical inputs, expanded into indicator columns below.
cat_feats = ["sbt", "qc_bin", "rf_bin"]

# Rows with a missing numeric feature are dropped. Categorical columns are not
# part of that check; get_dummies is called without dummy_na, so a missing
# category does not get a dedicated indicator column.
needed_cols = [*num_feats, *cat_feats, label_col, cpt_col]
df_model = df_known.loc[:, needed_cols].dropna(subset=num_feats)

# Feature matrix (numeric columns + one-hot categoricals), target, and CPT ids.
feature_cols = num_feats + cat_feats
X = pd.get_dummies(df_model[feature_cols], columns=cat_feats)
y = df_model[label_col]
groups = df_model[cpt_col]   # used to split by CPT rather than by row

print("Feature matrix shape:", X.shape)

# =========================================
# 8. Train/test split by CPT (no leakage)
# =========================================

# Split the CPT identifiers, not the rows, so every CPT lands entirely in
# either the training set or the test set.
unique_cpts = groups.unique()
train_cpts, test_cpts = train_test_split(unique_cpts, random_state=22, test_size=0.3)

train_mask = groups.isin(train_cpts)
test_mask = groups.isin(test_cpts)

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

print("Train rows:", len(X_train), " Test rows:", len(X_test))

# =========================================
# 9. Random Forest training
# =========================================
# 300 unrestricted-depth trees, fixed seed, all available cores.
# class_weight="balanced" reweights classes to soften the label imbalance.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    class_weight="balanced",
    random_state=22,
    n_jobs=-1,
).fit(X_train, y_train)

# =========================================
# 10. Evaluation: accuracy + report
# =========================================
# Predict on the held-out CPTs and report overall accuracy.
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nRandom Forest accuracy: {acc:.3f}\n")

# Per-class precision / recall / F1.
print("Classification report:", classification_report(y_test, y_pred), sep="\n")

# (optional) The 15 most important features - handy for slides.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
ranked_importances = importances.sort_values(ascending=False)
print("\nTop 15 features by importance:")
print(ranked_importances.iloc[:15])


Reading: C:\Users\dorothy.chepkoech\Documents\MSC_2026\Project_DataScience\Data\remapped.parquet
Raw shape: (1220548, 19)
Known before imputation: (236393, 19)
Known after imputation: (236393, 19)
Known after label cleaning: (236393, 19)
Feature matrix shape: (236393, 29)
Train rows: 160363  Test rows: 76030

Random Forest accuracy: 0.534

Classification report:
                      precision    recall  f1-score   support

             Aalbeke       0.49      0.43      0.46      4473
                Asse       0.28      0.28      0.28      1609
          Bolderberg       0.03      0.00      0.01      1551
             Brussel       0.54      0.72      0.62     13133
               Diest       0.51      0.34      0.41      2572
           Kwatrecht       0.08      0.10      0.09      1103
                Lede       0.37      0.33      0.35      5330
           Merelbeke       0.32      0.32      0.32       326
      Mons_en_Pevele       0.35      0.34      0.34      7288
        Mont_P