Imports - config - repo paths

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import json

# Bootstrap: put the parent of the current working directory on sys.path
# before importing from `helper` (presumably the notebook is run from a
# direct sub-folder of the repository — verify if the layout changes).
repo_root = Path.cwd().parents[0]
bootstrap_entry = str(repo_root)
if bootstrap_entry not in sys.path:
    sys.path.append(bootstrap_entry)

from helper.utils import (
    ensure_dir,
    find_repo_root,
    load_pipeline_config,
    save_run_log,
    sha256_file,
)

# From here on, the root reported by the helper replaces the bootstrap guess.
repo_root = find_repo_root()
cfg = load_pipeline_config(repo_root)

# Data directories are read from the "paths" section of the config and
# joined onto the repo root.
paths_cfg = cfg["paths"]
raw_dir = repo_root / paths_cfg["raw_dir"]
processed_dir = repo_root / paths_cfg["processed_dir"]

ensure_dir(processed_dir)

# Last expression of the cell: display the dataset and ML config sections.
cfg["dataset"], cfg["ml"]

({'source': 'zenodo',
  'doi': '10.5281/zenodo.16256961',
  'raw_filename': 'BioFairNet_Pilot1_Testrun.csv',
  'separator': ';',
  'encoding': 'utf-8',
  'dummy_mode': False},
 {'features': ['time', 'temperature', 'Stiring'],
  'target': 'pressure',
  'random_seed': 42,
  'splits': {'holdout_fraction': 0.2, 'train_fraction_within_train': 0.8},
  'models_default': ['linreg', 'rf', 'gbr', 'svr', 'mlp'],
  'gridsearch': {'cv_folds': 5,
   'n_jobs': -1,
   'refit_metric': 'rmse',
   'scoring': ['rmse', 'r2']}})

### Load raw dataset

In [13]:
# Resolve the raw CSV named in the pipeline config and load it into `df`.
dataset_cfg = cfg["dataset"]
raw_filename = dataset_cfg["raw_filename"]
sep = dataset_cfg.get("separator", ";")
# The config carries an explicit encoding; pass it on instead of relying on
# the reader's default.
encoding = dataset_cfg.get("encoding", "utf-8")

raw_path = raw_dir / raw_filename
# Raise explicitly: an `assert` is removed when Python runs with -O, which
# would defer the failure to a less readable error inside read_csv.
if not raw_path.is_file():
    raise FileNotFoundError(f"Raw file not found: {raw_path}")

df = pd.read_csv(raw_path, sep=sep, encoding=encoding)
print("Shape:", df.shape)
df.head()

Shape: (9670, 4)


Unnamed: 0,Time (min),Temperature (°C),Stiring,Pressure (bar)
0,14:12:16,1148,1,7
1,14:12:17,1149,1,7
2,14:12:18,1149,1,7
3,14:12:19,1149,1,7
4,14:12:20,1149,1,7


### Basic cleaning & standardization
This handles:
- decimal commas --> dots
- numeric coercion
- NaN truncation
- column name normalization

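The cell below prints the available columns and asks the user for the time column (by name) and for the feature and target columns (by index). The selected features and target are written to `metadata/prepared_schema.json` by the "Save schema and hashes" cell further down; updating `pipeline_config.json` with these choices is optional and is not done by this cell.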

<div class="alert alert-block alert-info">
<b>Info:</b> Depending on the dataset, additional cleaning steps may be required.
</div>

In [17]:
import pandas as pd
import numpy as np

# Interactive column selection and cleaning.
#
# Reads the raw frame `df`, asks which columns are time / features / target,
# rebuilds a time axis in seconds starting at 0, coerces the selected columns
# to numeric and drops unusable rows. The cleaned frame is re-bound to `df` at
# the end, so after this cell has run, re-run the load cell before running it
# again (otherwise the selection is applied to the already-cleaned frame).

FALLBACK_DT_S = 15.0        # step (seconds) used when no positive step can be derived from the data
MAX_TIME_NAN_RATIO = 0.95   # abort when more than this share of time values fails to parse


def _parse_column_indices(raw, n_columns, label):
    """Parse a comma-separated string of column indices into a list of ints.

    Empty tokens are skipped. Raises ValueError with a readable message when a
    token is not an integer or does not address one of the `n_columns` columns.
    """
    indices = []
    for token in raw.split(","):
        token = token.strip()
        if not token:
            continue
        try:
            idx = int(token)
        except ValueError:
            raise ValueError(f"{label} index '{token}' is not an integer.") from None
        # Negative (Python-style) indices keep working; anything else outside
        # the column range is rejected here instead of failing with IndexError.
        if not -n_columns <= idx < n_columns:
            raise ValueError(
                f"{label} index {idx} is out of range; valid indices are 0..{n_columns - 1}."
            )
        indices.append(idx)
    return indices


# Keep original column names but strip whitespace
df.columns = [str(c).strip() for c in df.columns]

print("DF shape", df.shape)
print("Available columns:")
for i, c in enumerate(df.columns):
    print(f"  [{i}] {c}")

# ---- Interactive selection ----
time_col = input("\nEnter TIME column name exactly as shown (or leave empty if none): ").strip()
feat_raw = input("Enter FEATURE column indices (comma-separated, e.g., 1,2,3): ").strip()
tgt_raw  = input("Enter TARGET column index (single number): ").strip()

# ---- Validate the selection before touching the data ----
if time_col and time_col not in df.columns:
    raise ValueError(f"Time column '{time_col}' not found. Available columns: {list(df.columns)}")

n_columns = len(df.columns)
feat_idx = _parse_column_indices(feat_raw, n_columns, "FEATURE")
tgt_candidates = _parse_column_indices(tgt_raw, n_columns, "TARGET")
if len(tgt_candidates) != 1:
    raise ValueError("Enter exactly one TARGET column index.")
tgt_idx = tgt_candidates[0]

# A feature index entered twice would create duplicate columns; keep the first.
features = list(dict.fromkeys(df.columns[i] for i in feat_idx))
target   = df.columns[tgt_idx]

if target in features:
    raise ValueError(f"Target column '{target}' must not also be selected as a feature.")
if time_col and target == time_col:
    raise ValueError(f"Target column '{target}' must not be the time column.")
if not features and not time_col:
    raise ValueError("Select at least one feature column or a time column.")

# De-duplicate so a time column that was also entered as a feature is selected once.
required_cols = list(dict.fromkeys(([time_col] if time_col else []) + features + [target]))
work = df[required_cols].copy()

print("\n✅ Selected:")
print("Time:", time_col if time_col else "(none)")
print("Features:", features)
print("Target:", target)

print("work shape before time handling:", work.shape)
# ---- TIME handling (hh:mm:ss → seconds; rebuild series starting at 0) ----
# This runs before any rows are dropped, so every original row contributes
# one step to the rebuilt time axis.
if time_col:
    col_data = work.loc[:, time_col]

    # If the frame itself has several columns with this name, take the first
    if isinstance(col_data, pd.DataFrame):
        print("⚠️ Duplicate time columns detected, using the first one.")
        col_data = col_data.iloc[:, 0]

    # Ensure we actually have rows
    if len(col_data) == 0:
        raise ValueError("No rows available after selecting required columns. Check your column indices / filtering.")

    time_text = col_data.astype(str).str.strip()

    # Unparseable entries become NaT instead of raising
    parsed = pd.to_timedelta(time_text, errors="coerce")

    # Diagnostics
    na_ratio = float(parsed.isna().mean())
    print(f"Time parsing NaN ratio: {na_ratio:.3f} (0.0 is perfect)")

    if na_ratio > MAX_TIME_NAN_RATIO:
        bad = time_text[parsed.isna()].head(10).tolist()
        raise ValueError(
            f"Time parsing failed for >95% of rows in '{time_col}'. "
            f"Examples: {bad}"
        )

    t_sec = parsed.dt.total_seconds()

    # Step between consecutive rows; non-positive steps (wrap/reset) and steps
    # next to an unparseable value become NaN here ...
    step_s = t_sec.diff()
    step_s = step_s.where(step_s > 0)

    # ... and are replaced by the median positive step (fallback: FALLBACK_DT_S)
    default_dt = step_s.dropna().median()
    if pd.isna(default_dt) or default_dt <= 0:
        default_dt = FALLBACK_DT_S

    step_s = step_s.fillna(default_dt)

    # The series is non-empty (checked above); the first row defines t = 0
    step_s.iloc[0] = 0.0

    work["time_s"] = step_s.cumsum().astype(float)

    # drop original time column
    work = work.drop(columns=[time_col])

    # the rebuilt time axis becomes the first feature
    features = ["time_s"] + [c for c in features if c != time_col]


# ---- Numeric cleanup ONLY for feature/target columns (exclude time strings) ----
num_cols = features + [target]

for col in num_cols:
    # Allow commas as decimal separators; keep minus signs; strip spaces.
    # Values that still cannot be read as numbers become NaN.
    as_text = work[col].astype(str).str.strip().str.replace(",", ".", regex=False)
    work[col] = pd.to_numeric(as_text, errors="coerce")

# ---- Drop rows where target is NaN, and where ALL features are NaN ----
work = work.dropna(subset=[target])
work = work.dropna(subset=features, how="all")
work = work.reset_index(drop=True)

# Fail here with a clear message rather than later inside the split
if work.empty:
    raise ValueError(
        "No rows left after cleaning. Check that the selected feature/target columns contain numeric data."
    )

print("\n✅ Cleaned working frame shape:", work.shape)
print(work.head())

# Make downstream cells use this cleaned df
df = work

DF shape (9670, 4)
Available columns:
  [0] Time (min)
  [1] Temperature (°C)
  [2] Stiring
  [3] Pressure (bar)



Enter TIME column name exactly as shown (or leave empty if none):  Time (min)
Enter FEATURE column indices (comma-separated, e.g., 1,2,3):  1,2
Enter TARGET column index (single number):  3



✅ Selected:
Time: Time (min)
Features: ['Temperature (°C)', 'Stiring']
Target: Pressure (bar)
work shape before time handling: (9670, 4)
Time parsing NaN ratio: 0.308 (0.0 is perfect)

✅ Cleaned working frame shape: (9225, 4)
   Temperature (°C)  Stiring  Pressure (bar)  time_s
0             114.8      0.1             0.7     0.0
1             114.9      0.1             0.7     1.0
2             114.9      0.1             0.7     2.0
3             114.9      0.1             0.7     3.0
4             114.9      0.1             0.7     4.0


### Train/Test/Validation split (80/20 + 80/20)

In [18]:
from sklearn.model_selection import train_test_split

# Feature matrix and target vector taken from the cleaned frame.
X, y = df[features], df[target]

ml_cfg = cfg["ml"]
split_cfg = ml_cfg["splits"]
seed = ml_cfg["random_seed"]

# Stage 1: set aside the holdout part; it is kept under the *_val names.
X_train_full, X_val, y_train_full, y_val = train_test_split(
    X,
    y,
    test_size=split_cfg["holdout_fraction"],
    random_state=seed,
)

# Stage 2: split the remainder into train and test. The config stores the
# train share, so the test share is its complement.
inner_test_size = 1 - split_cfg["train_fraction_within_train"]
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full, y_train_full, test_size=inner_test_size, random_state=seed
)

for label, part in (("Train:", X_train), ("Test:", X_test), ("Validation:", X_val)):
    print(label, part.shape)

Train: (5904, 3)
Test: (1476, 3)
Validation: (1845, 3)


### Save processed datasets ###
Directory structure:

- `data/processed/`
  - `Train/`
  - `Test/`
  - `Validation/`

In [19]:
# One sub-folder per split underneath the processed-data directory.
train_dir, test_dir, val_dir = (
    processed_dir / folder for folder in ("Train", "Test", "Validation")
)

for split_dir in (train_dir, test_dir, val_dir):
    ensure_dir(split_dir)

# (destination folder, file name, data), listed in the order they are written.
outputs = [
    (train_dir, "X_train.csv", X_train),
    (train_dir, "y_train.csv", y_train),
    (test_dir, "X_test.csv", X_test),
    (test_dir, "y_test.csv", y_test),
    (val_dir, "X_val.csv", X_val),
    (val_dir, "y_val.csv", y_val),
]
for folder, filename, data in outputs:
    # index=False: the row index is not written to the CSV
    data.to_csv(folder / filename, index=False)

print("✅ Saved processed datasets")


✅ Saved processed datasets


### Save schema and hashes

Important for reproducibility.

In [20]:
# Write the prepared-data schema and the SHA-256 hashes of the split files.
#
# The split file locations are derived from `processed_dir` (the directory the
# CSVs were written under) instead of being hard-coded, so schema and hashes
# stay correct when the configured processed directory changes.
split_files = {
    "X_train": processed_dir / "Train" / "X_train.csv",
    "y_train": processed_dir / "Train" / "y_train.csv",
    "X_test": processed_dir / "Test" / "X_test.csv",
    "y_test": processed_dir / "Test" / "y_test.csv",
    "X_val": processed_dir / "Validation" / "X_val.csv",
    "y_val": processed_dir / "Validation" / "y_val.csv",
}

schema = {
    "features": features,
    "target": target,
    "rows_total": len(df),
    "splits": {
        "train": len(X_train),
        "test": len(X_test),
        "validation": len(X_val)
    },
    # Paths are stored relative to the repo root with forward slashes so the
    # JSON is identical regardless of the operating system it was written on.
    "source_file": raw_path.relative_to(repo_root).as_posix(),
    "files": {
        key: path.relative_to(repo_root).as_posix()
        for key, path in split_files.items()
    }
}

# Make sure the metadata directory exists before writing into it.
metadata_dir = repo_root / "metadata"
metadata_dir.mkdir(parents=True, exist_ok=True)

schema_path = metadata_dir / "prepared_schema.json"
schema_path.write_text(json.dumps(schema, indent=2), encoding="utf-8")
print("✅ Schema written:", schema_path)

# Hash the files that were actually written, keyed like schema["files"].
hashes = {key: sha256_file(path) for key, path in split_files.items()}

hash_path = metadata_dir / "prepared_hashes.json"
hash_path.write_text(json.dumps(hashes, indent=2), encoding="utf-8")
print("✅ Hashes written:", hash_path)

✅ Schema written: /home/097e80f6-6687-4e65-aab6-9abf7b887006/GreenInformationFactory_Prototype/metadata/prepared_schema.json
✅ Hashes written: /home/097e80f6-6687-4e65-aab6-9abf7b887006/GreenInformationFactory_Prototype/metadata/prepared_hashes.json
