In [None]:
from __future__ import annotations

# MLflow Experiment Tracking: Random Forest on Geotechnical Datasets

This notebook demonstrates end-to-end experiment tracking with MLflow for geotechnical machine learning.

What you'll learn:
- How to set up a local MLflow tracking store (file-based) in this repository (`./experiments`).
- How to create a small synthetic CPT-like dataset and train a Random Forest classifier.
- How to track parameters, metrics, and artifacts (confusion matrix) with MLflow.
- How to save trained models both as MLflow artifacts and to the local `models/` directory.
- Optional: Repeat the workflow on a numeric-only slice of the provided earthquake dataset.

Teaching flow:
1. Briefly explain MLflow concepts (experiments, runs, params, metrics, artifacts).
2. Run each cell live. Students run alongside and inspect outputs.

Notes and constraints for this lesson:
- CPU-only; no GPU dependencies.
- Simple train/test split (no CV) to keep runtime fast and concepts clear.
- Keep preprocessing minimal (Random Forest is robust to feature scaling).
- No fixed global random seed (to illustrate natural variability across runs).
- The confusion matrix is the only plot we log as an artifact (keeps focus); the other artifacts are small text files and the trained model.

## Setup: Imports and MLflow configuration
We'll configure MLflow to use a local file-based tracking URI under the repository's `experiments/` directory. If it doesn't exist, we'll create it. We'll also pick a descriptive experiment name.

In [None]:
import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split

# Silence library warnings and apply seaborn's default theme for all plots.
warnings.filterwarnings("ignore")
sns.set_theme()

# Locate the project root. The notebook may be started from notebooks/ or from
# the repository root, so probe the working directory first, then its parent,
# for a folder that already contains both experiments/ and models/.
cwd = Path.cwd().resolve()
repo_root_candidates = [cwd, cwd.parent]
project_root: Path = next(
    (
        base
        for base in repo_root_candidates
        if (base / "experiments").exists() and (base / "models").exists()
    ),
    cwd.parent,  # no candidate matched: assume we were launched from notebooks/
)

# Output folders are created on demand if they are missing.
experiments_dir = project_root / "experiments" / "mlruns"
models_dir = project_root / "models"
for folder in (experiments_dir, models_dir):
    folder.mkdir(parents=True, exist_ok=True)

# Point MLflow at the local file-based store and select the lesson's experiment.
mlflow.set_tracking_uri(experiments_dir.as_uri())
mlflow.set_experiment("rf_geotech_demo")

print(
    f"MLflow tracking URI set to: {mlflow.get_tracking_uri()}",
    f"Experiment directory: {experiments_dir}",
    f"Models directory: {models_dir}",
    sep="\n",
)

MLflow tracking URI set to: file:///C:/Users/TFH/git_projects/course-machine-learning-for-geotechnics/experiments/mlruns
Experiment directory: C:\Users\TFH\git_projects\course-machine-learning-for-geotechnics\experiments\mlruns
Models directory: C:\Users\TFH\git_projects\course-machine-learning-for-geotechnics\models


## Create a small synthetic CPT-like dataset
We'll synthesize a few common CPT features and assign simplified soil classes.
- depth: m (0–30 m)
- qc: cone resistance (MPa), higher in sands
- fs: sleeve friction (MPa)
- Rf: friction ratio (%) = 100 * fs / qc

We'll generate three classes (1=gravel/sand-like, 2=silt-like, 3=clay-like) based on plausible value ranges with noise.

In [2]:
# Total number of synthetic samples; kept small so live teaching runs stay fast
n = 900

# Three roughly equal classes; the last one absorbs any remainder
n1 = n // 3
n2 = n // 3
n3 = n - n1 - n2

feature_names = ["depth", "qc", "fs", "Rf"]

# One entry per soil class:
#   (label, sample count, (qc mean, qc std), (fs mean, fs std))
#   1: sand/gravel-like -> higher qc, lower Rf
#   2: silt-like        -> moderate qc, moderate Rf
#   3: clay-like        -> lower qc, higher Rf
class_specs = [
    (1, n1, (12.0, 2.5), (0.2, 0.08)),
    (2, n2, (6.0, 1.5), (0.25, 0.09)),
    (3, n3, (2.5, 0.8), (0.22, 0.07)),
]

feature_blocks = []
label_blocks = []
for label, size, (qc_mean, qc_std), (fs_mean, fs_std) in class_specs:
    # Draw order per class is depth, then qc, then fs.
    depth_vals = 30 * np.random.rand(size)
    # Clipping keeps qc and fs strictly positive, so the ratio below is finite.
    qc_vals = np.random.normal(loc=qc_mean, scale=qc_std, size=size).clip(min=1e-3)
    fs_vals = np.random.normal(loc=fs_mean, scale=fs_std, size=size).clip(min=1e-4)
    rf_vals = 100 * fs_vals / qc_vals  # friction ratio in percent

    # Columns follow the order given in feature_names.
    feature_blocks.append(np.column_stack([depth_vals, qc_vals, fs_vals, rf_vals]))
    label_blocks.append(np.full(size, label))

# Stack the per-class blocks: class 1 rows first, then class 2, then class 3.
X = np.concatenate(feature_blocks)
y = np.concatenate(label_blocks)

df = pd.DataFrame(X, columns=feature_names)
df["soil_class"] = y
df.head()

Unnamed: 0,depth,qc,fs,Rf,soil_class
0,25.88635,12.535763,0.144508,1.15277,1
1,10.831681,14.64012,0.199889,1.365348,1
2,14.422509,14.899111,0.137522,0.923022,1
3,10.093557,9.946199,0.349857,3.517491,1
4,11.589926,8.406629,0.082164,0.977368,1


## Train/test split
We'll use a simple train/test split to keep runtime and complexity low. No scaling or advanced preprocessing is applied.

In [3]:
# Pull plain NumPy arrays back out of the frame: features in `feature_names`
# order, labels from the soil_class column.
X = df[feature_names].to_numpy()
y = df["soil_class"].to_numpy()

# Hold out 25% of the rows for testing, stratified on the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
)
X_train.shape, X_test.shape

((675, 4), (225, 4))

## Train Random Forest and track with MLflow
We'll record: parameters, accuracy and F1 metrics, a confusion matrix figure artifact, the classification report text, and the trained model (both as an MLflow artifact and to `models/`).

In [11]:
def log_confusion_matrix(
    y_true, y_pred, labels, artifact_name="confusion_matrix_rf.png"
):
    """Plot a confusion matrix and log it as an image artifact with MLflow.

    The image is stored under the ``plots/`` artifact folder using
    ``artifact_name`` as its file name.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        labels: Class labels passed to ``confusion_matrix`` and used as the
            display labels of the plot.
        artifact_name: File name of the logged image.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    fig, ax = plt.subplots(figsize=(4.5, 4.5), dpi=120)
    try:
        disp.plot(ax=ax, cmap="Blues", colorbar=False)
        ax.set_title("Confusion Matrix")
        ax.grid(False)
        fig.tight_layout()
        # Hand the figure straight to MLflow instead of writing a scratch PNG
        # into the working directory and deleting it afterwards: nothing is
        # left behind on failure and a read-only working directory is fine.
        mlflow.log_figure(fig, artifact_file=f"plots/{artifact_name}")
    finally:
        # Always release the figure, even if plotting or logging raised.
        plt.close(fig)


# Random Forest hyperparameters. No random_state is set on purpose: the lesson
# wants repeated runs to differ slightly so they can be compared in MLflow.
params = {
    "n_estimators": 200,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
}

with mlflow.start_run(run_name="rf_cpt_synthetic"):
    # Record the configuration and the feature order used for training.
    mlflow.log_params(params)
    mlflow.log_text(",".join(feature_names), artifact_file="feature_columns.txt")

    clf = RandomForestClassifier(**params)
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1w = f1_score(y_test, y_pred, average="weighted")

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_weighted", f1w)
    mlflow.log_text(
        classification_report(y_test, y_pred, digits=3),
        artifact_file="classification_report.txt",
    )

    # Confusion matrix artifact (only plot we log as per lesson scope).
    # Labels come from train and test together so every class gets a row/column.
    unique_labels = np.unique(np.concatenate([y_train, y_test]))
    log_confusion_matrix(y_test, y_pred, labels=unique_labels)

    # Log model to MLflow (artifacts)
    mlflow.sklearn.log_model(clf, artifact_path="model")

    # Also save locally to models/ for easy reuse outside MLflow.
    # NOTE: only unpickle model files that come from a trusted source.
    local_model_path = models_dir / "rf_cpt_synthetic.pkl"
    with open(local_model_path, "wb") as f:
        pickle.dump(clf, f)

    print(f"Logged metrics: accuracy={acc:.3f}, f1_weighted={f1w:.3f}")
    print(f"Local model saved to: {local_model_path}")

acc, f1w



(0.92, 0.9191444520391889)

## Explore MLflow UI (optional)
Start the UI locally in a separate terminal to browse runs, parameters, metrics, and artifacts:

1. Change directory to the experiments folder.
2. Launch MLflow UI.

Commands (PowerShell):

```powershell
cd ..  # go to repo root if your terminal is in notebooks/ (skip if already there)
cd experiments
uv run mlflow ui --port 5000
```

Then open http://127.0.0.1:5000 in your browser.

## Optional: Earthquake dataset example (predicting tsunami flag)
We'll reuse the same pattern on `data/raw/earthquake_data.csv` to predict whether an event has a tsunami flag (0/1).
We only use numeric columns to keep preprocessing minimal.

In [9]:
# Location of the raw earthquake CSV inside the project
data_raw_dir = project_root / "data" / "raw"
eq_path = data_raw_dir / "earthquake_data.csv"

# Numeric predictors only, so no encoding or heavy preprocessing is required.
numeric_cols = [
    "magnitude",
    "cdi",
    "mmi",
    "sig",
    "nst",
    "dmin",
    "gap",
    "depth",
    "latitude",
    "longitude",
]
target_col = "tsunami"  # 0/1

# Read the file, keep just the predictors plus the target, and drop every row
# that has a missing value in any of those columns.
eq_df = pd.read_csv(eq_path)
eq_df = eq_df.loc[:, [*numeric_cols, target_col]].dropna()

# Feature matrix and integer target vector
Xe = eq_df[numeric_cols].to_numpy()
ye = eq_df[target_col].to_numpy().astype(int)

# 75/25 split, stratified on the target
Xe_train, Xe_test, ye_train, ye_test = train_test_split(
    Xe,
    ye,
    test_size=0.25,
    stratify=ye,
)
Xe_train.shape, Xe_test.shape, np.bincount(ye)

((586, 10), (196, 10), array([478, 304], dtype=int64))

In [None]:
# Random Forest hyperparameters for the earthquake example (again unseeded).
params_eq = {
    "n_estimators": 300,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
}

# Select the experiment BEFORE opening the run: a run is attached to an
# experiment when it is created, so switching experiments inside the `with`
# block would not affect the run that is already active.
mlflow.set_experiment("rf_geotech_demo")  # ensure same experiment

with mlflow.start_run(run_name="rf_earthquake_tsunami"):
    # Record the configuration and the feature order used for training.
    mlflow.log_params(params_eq)
    mlflow.log_text(",".join(numeric_cols), artifact_file="feature_columns.txt")

    clf_eq = RandomForestClassifier(**params_eq)
    clf_eq.fit(Xe_train, ye_train)

    # Evaluate on the held-out split.
    ye_pred = clf_eq.predict(Xe_test)
    acc_eq = accuracy_score(ye_test, ye_pred)
    f1w_eq = f1_score(ye_test, ye_pred, average="weighted")

    mlflow.log_metric("accuracy", acc_eq)
    mlflow.log_metric("f1_weighted", f1w_eq)
    mlflow.log_text(
        classification_report(ye_test, ye_pred, digits=3),
        artifact_file="classification_report.txt",
    )

    log_confusion_matrix(ye_test, ye_pred, labels=np.unique(ye))
    mlflow.sklearn.log_model(clf_eq, artifact_path="model")

    # Also save locally to models/ for easy reuse outside MLflow.
    # NOTE: only unpickle model files that come from a trusted source.
    local_eq_model_path = models_dir / "rf_earthquake_tsunami.pkl"
    with open(local_eq_model_path, "wb") as f:
        pickle.dump(clf_eq, f)

    print(f"Logged EQ metrics: accuracy={acc_eq:.3f}, f1_weighted={f1w_eq:.3f}")
    print(f"Local EQ model saved to: {local_eq_model_path}")

acc_eq, f1w_eq



Logged EQ metrics: accuracy=0.898, f1_weighted=0.899
Local EQ model saved to: C:\Users\TFH\git_projects\course-machine-learning-for-geotechnics\models\rf_earthquake_tsunami.pkl


(0.8979591836734694, 0.8988903129689879)

## Student task
- Modify Random Forest hyperparameters (e.g., `n_estimators`, `max_depth`).
- Re-run the training cells to generate new MLflow runs.
- Open the MLflow UI and compare metrics across runs.
- Optional: Add one more numeric feature to the earthquake example (e.g., compute `magnitude^2`) and see if it impacts performance.