In [0]:
import os
import pandas as pd
import joblib
import shutil
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# ============================
# STEP 1: Load Data from Silver
# ============================
# Read the labeled silver table through Spark, then collect it into a pandas
# DataFrame for scikit-learn.
# NOTE(review): toPandas() pulls the whole table onto the driver — confirm the
# table is small enough for that.
df = spark.read.table("silver.labeled_step_test")
pdf = df.toPandas()

# ============================
# STEP 2: Feature / Target Split
# ============================
# The target is the "total_steps" column; every other column is a feature.
y = pdf["total_steps"]

# Optional: categorical feature columns (e.g., 'device_id') are converted with
# pd.get_dummies.
# NOTE(review): the table schema is not visible here — verify that no
# identifier/timestamp-like column ends up in the feature matrix.
X = pd.get_dummies(pdf.drop(columns=["total_steps"]))

# ============================
# STEP 3: Train/Test Split
# ============================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ============================
# STEP 4: Train Models
# ============================
# Both models are fitted on the training rows and scored with mean squared
# error on the held-out rows.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_mse = mean_squared_error(y_test, lin_reg.predict(X_test))

rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_mse = mean_squared_error(y_test, rf.predict(X_test))

# ============================
# STEP 5: Save Models + Metadata
# ============================
# Each run writes into its own timestamp-named directory under /tmp.
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
local_dir = f"/tmp/stedi_models/{run_id}"
os.makedirs(local_dir, exist_ok=True)

# Serialise the two fitted models next to each other in the run directory.
for model_file, fitted_model in (
    ("linear_regression.joblib", lin_reg),
    ("random_forest.joblib", rf),
):
    joblib.dump(fitted_model, f"{local_dir}/{model_file}")

# Record which table was used, how each model scored, and the exact feature
# columns the models were trained on.
metadata = {
    "run_id": run_id,
    "table_used": "silver.labeled_step_test",
    "linear_regression_mse": lin_mse,
    "random_forest_mse": rf_mse,
    "features_used": list(X.columns),
}
joblib.dump(metadata, f"{local_dir}/metadata.joblib")

# ============================
# STEP 6: Zip Everything
# ============================
# With local_dir as the base name the archive is written to "<local_dir>.zip",
# i.e. beside the run directory rather than inside it; root_dir makes the
# archive entries relative to the run directory.
zip_path = shutil.make_archive(local_dir, "zip", root_dir=local_dir)

# ============================
# STEP 7: Copy to GitHub Repo
# ============================
# NOTE(review): this path is tied to a single user's Repos folder — confirm it
# is the intended destination for everyone who runs the notebook.
repo_dir = "/Workspace/Repos/win185@ensign.edu/Databricks/models"

# Git does not track empty directories, so the models/ folder may be missing
# from a fresh checkout. Create it first so shutil.copy does not fail with
# FileNotFoundError; exist_ok=True makes this a no-op when it already exists.
os.makedirs(repo_dir, exist_ok=True)

# The archive is stored in the repo folder as "<run_id>.zip".
final_zip_path = os.path.join(repo_dir, f"{run_id}.zip")
shutil.copy(zip_path, final_zip_path)

print(f"Models and metadata exported to: {final_zip_path}")

In [0]:
import os
import shutil
from datetime import datetime

# Start a fresh run_id and stage files under /tmp (NOT /dbfs)
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
base_dir = f"/tmp/stedi_models/{run_id}"
os.makedirs(base_dir, exist_ok=True)

# Dummy example: put a placeholder file inside (delete this if models were already saved)
with open(f"{base_dir}/dummy.txt", "w") as dummy_file:
    dummy_file.write("This is just a test.")

# Zip the staged folder; the archive is written to /tmp/stedi_models_<run_id>.zip,
# outside the folder being archived
zip_base_name = f"/tmp/stedi_models_{run_id}"
zip_path = shutil.make_archive(zip_base_name, "zip", root_dir=base_dir)

# Copy to GitHub repo folder
# NOTE(review): this path is tied to a single user's Repos folder — confirm it
# is the intended destination for everyone who runs the notebook.
repo_dir = "/Workspace/Repos/win185@ensign.edu/Databricks/models"

# Git does not track empty directories, so the models/ folder may be missing
# from a fresh checkout. Create it first so shutil.copy does not fail with
# FileNotFoundError; exist_ok=True makes this a no-op when it already exists.
os.makedirs(repo_dir, exist_ok=True)

# Keep the archive's own file name when placing it in the repo folder.
final_zip_path = os.path.join(repo_dir, os.path.basename(zip_path))
shutil.copy(zip_path, final_zip_path)

print(f"ZIP created: {zip_path}")
print(f"Copied to GitHub repo: {final_zip_path}")