In [0]:
# 04_train_mlflow (fixed)
# Train a RandomForest on churn features, log to MLflow, and (attempt) register model.
# Use this corrected cell — it avoids restricted dbutils calls in CE.

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import joblib
import json
import os
import warnings
warnings.filterwarnings("ignore")

# --------------------------
# Config (safe for CE)
# --------------------------
# Name of the Spark table that holds the churn features; it is read with spark.table().
FEATURES_TABLE = "churn_mlo_mdb.features_churn"
# Use a simple, constant experiment name — avoid restricted dbutils calls
EXPERIMENT_NAME = "churn_experiment"
# Name used when registering / querying the model in the MLflow model registry.
MODEL_REGISTRY_NAME = "ChurnModel"
# NOTE(review): SAMPLE_IF_TOO_BIG is not read anywhere in the code visible here —
# spark_to_pandas_safe() samples whenever the table exceeds MAX_ROWS_FOR_PANDAS,
# regardless of this flag. Confirm whether it should be wired in or removed.
SAMPLE_IF_TOO_BIG = True         # if toPandas would be too large, fallback to sampling
# Default row limit used by spark_to_pandas_safe() before it switches to sampling.
MAX_ROWS_FOR_PANDAS = 200000     # safe threshold (CE limited), adjust down if memory errors

# --------------------------
# Load features table
# --------------------------
# Read the features table lazily through the notebook-provided Spark session,
# then report its size and schema before anything is pulled to the driver.
print(f"Reading feature table: {FEATURES_TABLE}")
spark_df = spark.table(FEATURES_TABLE)
count_rows = spark_df.count()
print(f"Rows in features table: {count_rows}")
spark_df.printSchema()

# Convert to pandas safely
def spark_to_pandas_safe(sdf, max_rows=MAX_ROWS_FOR_PANDAS):
    """Convert a Spark DataFrame to pandas, never returning more than ``max_rows`` rows.

    If the DataFrame has at most ``max_rows`` rows it is converted in full.
    Otherwise a random sample (without replacement, fixed seed 42) is taken
    and then hard-capped at ``max_rows`` rows.

    Args:
        sdf: Spark DataFrame to convert.
        max_rows: Maximum number of rows allowed in the returned pandas frame.

    Returns:
        A pandas DataFrame with at most ``max_rows`` rows.
    """
    n = sdf.count()
    if n <= max_rows:
        print(f"Loading full table as pandas ({n} rows)")
        return sdf.toPandas()

    print(f"Table too large ({n} rows). Sampling up to {max_rows} rows (approx).")
    frac = float(max_rows) / float(n)
    # approximate sample; for a more exact stratified sample we would compute fractions per class
    sampled = sdf.sample(withReplacement=False, fraction=frac, seed=42)
    # sample() only honours the fraction approximately and may return slightly
    # more than max_rows rows; limit() enforces the threshold as a hard cap so
    # the driver never receives more rows than the caller asked for.
    pdf = sampled.limit(max_rows).toPandas()
    print("Sampled rows:", len(pdf))
    return pdf

# Materialise the (possibly sampled) feature table on the driver as pandas.
pdf = spark_to_pandas_safe(spark_df)

# Quick sanity check of what was loaded
print(f"Pandas dataframe shape: {pdf.shape}")
print(f"Columns: {pdf.columns.tolist()}")

# --------------------------
# Prepare X, y
# --------------------------
# Ensure churn_label exists before doing any other work on the frame
if "churn_label" not in pdf.columns:
    raise RuntimeError("churn_label not found in feature table. Please ensure silver/features step created it.")

# Rows without a label cannot be used for supervised training. They must be
# removed *before* the blanket fillna(0) below, otherwise they would silently
# become label 0 (non-churn) and corrupt both training and evaluation.
pdf = pdf.copy()
missing_label = pdf["churn_label"].isna()
if missing_label.any():
    print(f"Dropping {int(missing_label.sum())} rows with missing churn_label")
    pdf = pdf.loc[~missing_label].copy()

# Keep the customer identifiers aside (aligned row-for-row with the remaining
# data) and remove the column so it is never used as a feature.
if "customerID" in pdf.columns:
    customer_ids = pdf["customerID"].astype(str).tolist()
    pdf = pdf.drop(columns=["customerID"])

# Fill missing feature values (simple strategy)
pdf = pdf.fillna(0)

# select numeric columns only for this simple demo
numeric_df = pdf.select_dtypes(include=[np.number]).copy()
X = numeric_df.drop(columns=["churn_label"], errors='ignore')
# Take the label from the full frame rather than the numeric-only view: a
# boolean (or other non-numeric) label column is excluded by select_dtypes and
# would otherwise raise a KeyError here.
y = pdf["churn_label"].astype(int)

print("Feature matrix shape:", X.shape, "Label shape:", y.shape)

# split (stratified so both partitions keep the same churn ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
print("Train/test sizes:", X_train.shape[0], X_test.shape[0])

# --------------------------
# MLflow experiment setup (safe)
# --------------------------
# Select the experiment by its constant name; on failure fall back to whatever
# experiment MLflow uses by default instead of aborting the cell.
try:
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as exc:
    print(f"Warning: could not set experiment via API; continuing with default. Error: {exc}")
else:
    print(f"MLflow experiment set to: {EXPERIMENT_NAME}")

# Low-level client used later for registry and run look-ups.
client = MlflowClient()

# --------------------------
# Hyperparameter search + training
# --------------------------
# Search space: 2 x 3 x 2 = 12 candidate forests.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[6, 8, 12],
    min_samples_leaf=[1, 2],
)

# Shuffled, seeded 3-fold stratified CV keeps the search reproducible.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Single-threaded estimator and search (n_jobs=1 on both).
rf = RandomForestClassifier(n_jobs=1, random_state=42)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=1,
    verbose=1,
)

# Train, evaluate and log inside one MLflow run so that every parameter,
# metric and artifact below is attached to the same run ID.
with mlflow.start_run() as run:
    run_id = run.info.run_id
    print("MLflow Run ID:", run_id)

    # Fit grid search and keep the estimator refit on the best parameters
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    print("Best params:", grid.best_params_)

    # Predict & evaluate on the held-out test split
    preds = best_model.predict(X_test)
    probs = best_model.predict_proba(X_test)[:, 1]  # probability of the positive class
    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)
    print(f"Test Accuracy: {acc:.4f}, ROC AUC: {auc:.4f}")
    print(classification_report(y_test, preds, digits=4))

    # Log parameters & metrics
    mlflow.log_param("grid_params", json.dumps(param_grid))
    mlflow.log_param("best_params", json.dumps(grid.best_params_))
    mlflow.log_metric("test_accuracy", float(acc))
    mlflow.log_metric("test_roc_auc", float(auc))
    mlflow.log_metric("train_rows", int(X_train.shape[0]))
    mlflow.log_metric("test_rows", int(X_test.shape[0]))

    # Save feature names as artifact
    feature_names = X_train.columns.tolist()
    with open("/tmp/feature_names.json", "w") as f:
        json.dump(feature_names, f)
    mlflow.log_artifact("/tmp/feature_names.json", artifact_path="features")

    # Log feature importances (sorted, most important first) as a CSV artifact
    importances = best_model.feature_importances_
    fi = pd.DataFrame({"feature": feature_names, "importance": importances})
    fi = fi.sort_values("importance", ascending=False)
    fi.to_csv("/tmp/feature_importances.csv", index=False)
    mlflow.log_artifact("/tmp/feature_importances.csv", artifact_path="features")

    # Log the model exactly once, then attempt registration as a separate
    # step. Combining both in one log_model(..., registered_model_name=...)
    # call and re-logging on failure would upload the model a second time
    # whenever only the registry step is rejected.
    model_info = mlflow.sklearn.log_model(best_model, artifact_path="model")
    # Prefer the URI reported by MLflow; fall back to the run-relative URI
    # when log_model does not return model info.
    model_uri = getattr(model_info, "model_uri", None) or f"runs:/{run_id}/model"
    try:
        mlflow.register_model(model_uri, MODEL_REGISTRY_NAME)
        print("Model logged and registration attempted for:", MODEL_REGISTRY_NAME)
    except Exception as e:
        # Registration is best-effort: the model artifact is already logged.
        print("Auto-register failed (CE sometimes restricts). Logging model artifact only. Error:", e)

    # Save a local copy (optional)
    joblib.dump(best_model, "/tmp/best_model.joblib")
    mlflow.log_artifact("/tmp/best_model.joblib", artifact_path="model_files")

    # Tag the run with the feature count and the comma-joined feature list
    # (useful for monitoring). Tagging is best-effort, but a failure is
    # reported instead of being swallowed silently.
    try:
        mlflow.set_tag("n_features", str(len(feature_names)))
        mlflow.set_tag("feature_list", ",".join(feature_names))
    except Exception as e:
        print("Warning: could not set feature tags on run. Error:", e)

    print("Run finished. Run ID:", run_id)

# --------------------------
# Post-run: check model registry / print registered versions
# --------------------------
# List whatever versions the registry currently reports for the model.
# Best-effort: the registry may be unavailable, so failures are only printed.
try:
    versions = client.get_latest_versions(MODEL_REGISTRY_NAME)
    if versions:
        print("Latest model registry versions for", MODEL_REGISTRY_NAME)
        for v in versions:
            print("Version:", v.version, "Stage:", v.current_stage, "RunID:", v.run_id)
    else:
        print("No versions found in registry (maybe auto-register failed).")
except Exception as e:
    print("Could not query model registry. Error:", e)

# Show artifact URI for the run
try:
    run_info = client.get_run(run_id)
    artifact_uri = run_info.info.artifact_uri
    print("Artifact URI for run:", artifact_uri)
except Exception as e:
    print("Could not fetch run details. Error:", e)

# Save a small metrics summary in /tmp and log it
try:
    metrics_summary = {
        "accuracy": float(acc),
        "roc_auc": float(auc),
        "n_train": int(X_train.shape[0]),
        "n_test": int(X_test.shape[0]),
    }
    with open("/tmp/metrics_summary.json", "w") as f:
        json.dump(metrics_summary, f)
    # The training run was closed when its `with` block exited. The fluent
    # mlflow.log_artifact() would implicitly start a brand-new run and attach
    # the file to that one, so log through the client against the original
    # run ID instead.
    client.log_artifact(run_id, "/tmp/metrics_summary.json", artifact_path="metrics")
except Exception as e:
    print("Could not save metrics artifact:", e)

print("Training cell completed. Check the MLflow experiment UI for run details and the Models UI for registry.")
