In [0]:
# Load the curated bronze table and count the rows whose distance_cm is populated.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col  # previously missing: `col` raised NameError on a fresh kernel

# getOrCreate() hands back the already-running session when one exists, so this
# cell no longer depends on `spark` being created by a later cell.
spark = SparkSession.builder.getOrCreate()

df_bronze = spark.read.table("workspace.bronze.stedi_step_curated")
df_bronze.filter(col("distance_cm").isNotNull()).count()


In [0]:
# Show up to 50 distinct distance_cm values from the bronze table, ascending.
distinct_distances = (
    df_bronze
    .select("distance_cm")
    .distinct()
    .orderBy("distance_cm")
)
distinct_distances.show(50)


1. Import Libraries and Load Your Curated Dataset

In [0]:
from pyspark.sql import SparkSession
import pandas as pd

# Fully qualified name of the labeled silver table used for feature building.
LABELED_TABLE = "workspace.silver.labeled_step_test"

# Reuse the active Spark session (or start one), read the table, and pull it
# into pandas so scikit-learn can consume it.
spark = SparkSession.builder.getOrCreate()
df_spark = spark.table(LABELED_TABLE)
df = df_spark.toPandas()

# Preview the first rows as the cell output.
df.head()

Sanity Check

In [0]:
# Peek at five rows straight from the table to confirm it is readable via SQL.
sanity_query = "SELECT * FROM workspace.silver.labeled_step_test LIMIT 5"
spark.sql(sanity_query).show()

2. Define Your Feature Columns

In [0]:
# Column roles used by the split and preprocessing cells.
# "sensor_type" is no longer listed: the train/test-split cell redefines these
# same names using only columns present in the DataFrame, and this cell now
# agrees with it instead of advertising a column that is dropped right after.
feature_cols_numeric = ["distance_cm"]
feature_cols_categorical = ["device_id"]
label_col = "step_label"

3. Create a Train/Test Split

In [0]:
from sklearn.model_selection import train_test_split

# Column roles, limited to columns that exist in the DataFrame
# ("source_label" could be added to the categorical list if it proves useful).
feature_cols_numeric = ["distance_cm"]
feature_cols_categorical = ["device_id"]
label_col = "step_label"

# Feature matrix = numeric + categorical columns; target = the label column.
feature_cols = feature_cols_numeric + feature_cols_categorical
X = df[feature_cols]
y = df[label_col]

# 80/20 split with a fixed seed, stratified on the label.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

One-Hot Encoding with Unscaled Numeric Passthrough (a variant suited to tree-based models, which do not need feature scaling)

In [0]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_cols = feature_cols_categorical
numeric_cols = feature_cols_numeric

# Numeric columns pass through untouched; categorical columns are one-hot
# encoded with handle_unknown='ignore'.
# NOTE: a later cell rebinds `preprocessor` to a variant that scales the
# numeric columns.
passthrough_step = ("num", "passthrough", numeric_cols)
one_hot_step = ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[passthrough_step, one_hot_step])


4. Build Preprocessing Steps - Scale numeric columns

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Scaler applied to the numeric feature columns.
numeric_transformer = StandardScaler()

One-hot encode categorical columns

In [0]:
# Encoder applied to the categorical feature columns; configured to ignore
# unknown categories instead of raising.
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
)

Combine into a single transformer

In [0]:
# Route each column group to its transformer: the scaler for numeric columns,
# the one-hot encoder for categorical columns.
column_steps = [
    ("num", numeric_transformer, feature_cols_numeric),
    ("cat", categorical_transformer, feature_cols_categorical),
]
preprocessor = ColumnTransformer(transformers=column_steps)

5. Build a Scikit-Learn Pipeline

In [0]:
from sklearn.pipeline import Pipeline

# Single-step pipeline that wraps the column preprocessor.
pipeline_steps = [("preprocess", preprocessor)]
pipeline = Pipeline(steps=pipeline_steps)

6. Fit the Pipeline and Transform the Data

In [0]:
# Fit on the training split only, then apply the fitted pipeline to both splits.
X_train_transformed = pipeline.fit(X_train).transform(X_train)
X_test_transformed = pipeline.transform(X_test)

7. Save Your Processed Feature Set and Pipeline

In [0]:
# Build the feature matrix from the raw parquet file, split it, run it through
# the prebuilt feature pipeline, and persist the pipeline, transformed
# matrices, and targets under the repo's artifacts directory.
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

# =============================
# Paths
# =============================
# The repo root can be overridden with the STEDI_REPO_ROOT environment variable
# so the notebook also runs outside this one workspace; the default is the
# original hard-coded location.
REPO_ROOT = os.environ.get(
    "STEDI_REPO_ROOT",
    "/Workspace/Repos/win185@ensign.edu/Databricks",
)
data_dir = os.path.join(REPO_ROOT, "data")
artifacts_dir = os.path.join(REPO_ROOT, "artifacts")
os.makedirs(artifacts_dir, exist_ok=True)

# =============================
# Load Raw Data
# =============================
raw_data_path = os.path.join(data_dir, "rapid_step_tests.parquet")
assert os.path.exists(raw_data_path), f"File not found: {raw_data_path}"

# NOTE: this rebinds `df`, replacing any DataFrame of the same name created by
# earlier cells.
df = pd.read_parquet(raw_data_path)
df.columns = df.columns.str.strip()

# =============================
# Feature Engineering / Injection
# =============================
# Numeric feature: step_points * 10. If the column is absent, df.get() falls
# back to the scalar 0, so distance_cm becomes 0 for every row.
df["distance_cm"] = pd.to_numeric(df.get("step_points", 0), errors="coerce") * 10

# Categorical placeholders (required by pipeline); every row gets the same
# constant value.
df["step_label"] = "step"
df["source_label"] = "device"

# Ensure categorical column exists
if "device_id" not in df.columns:
    raise ValueError("device_id column missing from dataset")

# =============================
# Ensure Correct Data Types
# =============================
# Record which device IDs are missing BEFORE casting: astype(str) turns
# NaN/None into the literal strings "nan"/"None", which a later fillna()
# cannot detect. The recorded rows are then set to "unknown" explicitly.
device_id_missing = df["device_id"].isna()
df["device_id"] = df["device_id"].astype(str).mask(device_id_missing, "unknown")
df["step_label"] = df["step_label"].astype(str)
df["source_label"] = df["source_label"].astype(str)

# =============================
# Define Features and Target
# =============================
expected_features = [
    "device_id",
    "distance_cm",
    "step_label",
    "source_label"
]

# Create a copy to avoid SettingWithCopyWarning
X = df[expected_features].copy()
y = df["total_steps"]

# =============================
# HANDLE MISSING VALUES SAFELY
# =============================
# distance_cm can be NaN where step_points was missing or non-numeric
# (errors="coerce" above). The categorical fills are a safety net only: those
# columns were already made fully populated strings above.
X["distance_cm"] = X["distance_cm"].fillna(0)
X["device_id"] = X["device_id"].fillna("unknown")
X["step_label"] = X["step_label"].fillna("unknown")
X["source_label"] = X["source_label"].fillna("unknown")

# Sanity check
assert len(X) > 0, "Feature matrix X is empty after preprocessing"

# =============================
# Train / Test Split
# =============================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

print("Train/Test split complete")
print("X_train shape:", X_train.shape)

# =============================
# Load Prebuilt Pipeline
# =============================
pipeline_path = os.path.join(data_dir, "stedi_feature_pipeline.pkl")
assert os.path.exists(pipeline_path), f"Pipeline not found: {pipeline_path}"

# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load pipeline files from a trusted location.
# NOTE: this rebinds `pipeline`, replacing any pipeline object built by
# earlier cells.
pipeline = joblib.load(pipeline_path)

# =============================
# Fit and Transform
# =============================
# Fit on the training split only, then transform both splits.
pipeline.fit(X_train)
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

print("Pipeline fit and transform successful")

# =============================
# Save Artifacts
# =============================
joblib.dump(pipeline, os.path.join(artifacts_dir, "stedi_feature_pipeline.pkl"))

# NOTE(review): if the pipeline returns a SciPy sparse matrix, np.save stores it
# as a pickled object array rather than a plain numeric array, and readers need
# np.load(..., allow_pickle=True) — confirm what downstream consumers expect.
np.save(
    os.path.join(artifacts_dir, "X_train_transformed.npy"),
    X_train_transformed,
    allow_pickle=True
)

np.save(
    os.path.join(artifacts_dir, "X_test_transformed.npy"),
    X_test_transformed,
    allow_pickle=True
)

pd.to_pickle(y_train, os.path.join(artifacts_dir, "y_train.pkl"))
pd.to_pickle(y_test, os.path.join(artifacts_dir, "y_test.pkl"))

print("\nArtifacts saved to:", artifacts_dir)
print("Files:", os.listdir(artifacts_dir))


Ethics Reflection - Using a consistent and reproducible feature pipeline helps prevent unfairness or hidden bias by ensuring that all data is processed in the same way, regardless of when or from whom it is collected. When feature generation is inconsistent, subtle differences in preprocessing can disproportionately affect certain groups and lead to biased model outcomes that are hard to detect. Reproducibility also makes it easier to audit models, trace errors, and identify where bias may have been introduced. A spiritual principle that helps illuminate the importance of consistency and fairness is the idea of treating others with impartiality and integrity—acting with the same care and standards for everyone. This perspective reinforces that fairness in machine learning is not just a technical goal, but a moral responsibility to apply rules evenly and transparently.