1. Import Libraries and Load Your Curated Dataset

In [0]:
from pyspark.sql import SparkSession
import pandas as pd
# Attach to the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Read the curated table and collect it to the driver as a pandas DataFrame.
curated_table = "workspace.silver.labeled_step_test"
df_spark = spark.table(curated_table)
df = df_spark.toPandas()

# Preview the first rows of the collected frame.
df.head()

Sanity Check

In [0]:
# Query the same table through Spark SQL and print five rows as a quick check.
sanity_query = "SELECT * FROM workspace.silver.labeled_step_test LIMIT 5"
spark.sql(sanity_query).show()

2. Define Your Feature Columns

In [0]:
# Numeric model inputs.
feature_cols_numeric = ["distance_cm"]

# Categorical model inputs. Kept identical to the list used by the train/test
# split cell, which restricts the features to columns present in the DataFrame
# ("sensor_type" is not among them).
feature_cols_categorical = ["device_id"]

# Target column to predict.
label_col = "step_label"

3. Create a Train/Test Split

In [0]:
from sklearn.model_selection import train_test_split

# Feature/label column names, limited to columns that exist in the DataFrame.
feature_cols_numeric = ["distance_cm"]
feature_cols_categorical = ["device_id"]  # "source_label" could be added if useful
label_col = "step_label"

# Feature matrix (numeric columns first, then categorical) and label vector.
selected_features = feature_cols_numeric + feature_cols_categorical
X = df[selected_features]
y = df[label_col]

# 80/20 split with a fixed seed, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

One-Hot Encoding (Preferred for tree-based models)

In [0]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Aliases for the column lists defined earlier in the notebook.
numeric_cols = feature_cols_numeric
categorical_cols = feature_cols_categorical

# Numeric columns are passed through untouched; categorical columns are
# one-hot encoded, ignoring categories not seen during fit.
# Note: `preprocessor` is bound again in step 4 (scaled numeric columns).
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", one_hot, categorical_cols),
    ]
)


4. Build Preprocessing Steps - Scale numeric columns

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Standardise numeric features: centre on the mean and scale to unit variance
# (these are StandardScaler's defaults, spelled out here for readability).
numeric_transformer = StandardScaler(with_mean=True, with_std=True)

One-hot encode categorical columns

In [0]:
# One-hot encoder for the categorical features; categories that were not seen
# during fit are ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
)

Combine into a single transformer

In [0]:
# Route each column group to its transformer: scaled numeric features first,
# one-hot encoded categorical features second.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, feature_cols_numeric),
        ("cat", categorical_transformer, feature_cols_categorical),
    ]
)

5. Build a Scikit-Learn Pipeline

In [0]:
from sklearn.pipeline import Pipeline
# Wrap the column preprocessing in a Pipeline with a single "preprocess" step.
pipeline = Pipeline([("preprocess", preprocessor)])

6. Fit the Pipeline and Transform the Data

In [0]:
# Fit on the training features only, then apply the fitted pipeline to both
# splits (train first, then test).
pipeline.fit(X_train)
X_train_transformed, X_test_transformed = (
    pipeline.transform(split) for split in (X_train, X_test)
)

7. Save Your Processed Feature Set and Pipeline

In [0]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

# --- Paths ---------------------------------------------------------------
# The repo root defaults to the original workspace checkout, but can be
# redirected with the STEDI_REPO_ROOT environment variable so the cell is not
# tied to a single user's path.
REPO_ROOT = os.environ.get(
    "STEDI_REPO_ROOT", "/Workspace/Repos/win185@ensign.edu/Databricks"
)
data_dir = os.path.join(REPO_ROOT, "data")
artifacts_dir = os.path.join(REPO_ROOT, "artifacts")
os.makedirs(artifacts_dir, exist_ok=True)

# --- Load ----------------------------------------------------------------
# NOTE: from here on this cell rebinds `df`, `X`, `y`, the train/test splits
# and `pipeline`, replacing the objects built in the earlier cells.
raw_data_path = os.path.join(data_dir, "rapid_step_tests.parquet")
df = pd.read_parquet(raw_data_path)

# Optional: inspect columns
print("Columns in dataset:", df.columns.tolist())

# --- Features / label ------------------------------------------------------
# 'total_steps' is the target; every other column is treated as a feature.
X = df.drop(columns="total_steps")
y = df["total_steps"]

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Preprocessing pipeline ------------------------------------------------
# This cell relies on a pipeline pickle that already exists in the data
# directory; fail early with a clear message when it is missing.
pipeline_path = os.path.join(data_dir, "stedi_feature_pipeline.pkl")
if not os.path.isfile(pipeline_path):
    raise FileNotFoundError(
        f"Preprocessing pipeline not found at {pipeline_path}"
    )
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load pipeline files that come from a trusted source.
pipeline = joblib.load(pipeline_path)

# Fit on the training features only.
pipeline.fit(X_train)


def _as_dense(features):
    """Return ``features`` as a dense array when it is sparse, else unchanged.

    ``np.save`` on a SciPy sparse matrix stores a pickled 0-d object array
    rather than a numeric array, so sparse pipeline output is densified before
    it is written to ``.npy``.
    """
    return features.toarray() if hasattr(features, "toarray") else features


# Transform both splits with the fitted pipeline.
X_train_transformed = _as_dense(pipeline.transform(X_train))
X_test_transformed = _as_dense(pipeline.transform(X_test))

# --- Persist ---------------------------------------------------------------
joblib.dump(pipeline, os.path.join(artifacts_dir, "stedi_feature_pipeline.pkl"))
np.save(os.path.join(artifacts_dir, "X_train_transformed.npy"), X_train_transformed, allow_pickle=True)
np.save(os.path.join(artifacts_dir, "X_test_transformed.npy"), X_test_transformed, allow_pickle=True)
pd.to_pickle(y_train, os.path.join(artifacts_dir, "y_train.pkl"))
pd.to_pickle(y_test, os.path.join(artifacts_dir, "y_test.pkl"))

# Done
print(f"\n Artifacts saved to: {artifacts_dir}")
print("Saved files:", os.listdir(artifacts_dir))


Ethics Reflection - Using a consistent and reproducible feature pipeline helps prevent unfairness or hidden bias by ensuring that all data is processed in the same way, regardless of when or from whom it is collected. When feature generation is inconsistent, subtle differences in preprocessing can disproportionately affect certain groups and lead to biased model outcomes that are hard to detect. Reproducibility also makes it easier to audit models, trace errors, and identify where bias may have been introduced. A spiritual principle that helps illuminate the importance of consistency and fairness is the idea of treating others with impartiality and integrity—acting with the same care and standards for everyone. This perspective reinforces that fairness in machine learning is not just a technical goal, but a moral responsibility to apply rules evenly and transparently.