In [None]:
import pandas as pdimport numpy as npimport mlflowimport mlflow.sklearnimport matplotlib.pyplot as pltfrom sklearn.linear_model import LogisticRegressionfrom sklearn.tree import DecisionTreeClassifierfrom sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifierfrom sklearn.neural_network import MLPClassifierfrom sklearn.metrics import (    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,    confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay)

In [None]:
# Load the processed train/test splits.
# DATA_DIR is the single place to change when the CSVs live somewhere else.
DATA_DIR = "/tmp"

X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")
# .values.ravel() flattens the label frame to a 1-D array. It flattens *every*
# column, so a stray index column in the CSV would silently double the length;
# the assertions below catch that.
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv").values.ravel()
y_test = pd.read_csv(f"{DATA_DIR}/y_test.csv").values.ravel()

# Fail fast here rather than deep inside model.fit()/predict().
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)} labels"
)
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)} labels"
)
assert list(X_train.columns) == list(X_test.columns), (
    "X_train and X_test do not have the same columns in the same order"
)

print(f"✓ Data loaded")
print(f"  Training: {X_train.shape}")
print(f"  Test: {X_test.shape}")

In [None]:
# Select the MLflow experiment that the runs started below are recorded under.
EXPERIMENT_PATH = "/Users/shalindri20@gmail.com/Adult_Income_MLflow_Production"
mlflow.set_experiment(EXPERIMENT_PATH)

## Helper Function

In [None]:
def train_and_log_model(model_name, model, params_dict):
    """
    Fit ``model``, evaluate it on the test split and record the run in MLflow.

    The function reads the notebook-level globals ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``. Inside one MLflow run named ``model_name`` it
    logs three tags, the supplied parameters, five evaluation metrics, a
    confusion-matrix PNG, a ROC-curve PNG and the fitted model.

    Parameters
    ----------
    model_name : str
        Name of the MLflow run. With spaces replaced by underscores it is also
        used in the names of the PNG files written under ``/tmp``.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``. Scores for the
        ROC metrics come from ``predict_proba`` (column 1) when available,
        otherwise from ``decision_function``, otherwise from the hard
        predictions as a last resort.
    params_dict : dict
        Values to record. The ``"model_family"`` entry (default ``"unknown"``)
        is stored as a tag; every other entry is logged as a run parameter.

    Returns
    -------
    tuple
        ``(model, metrics)`` where ``model`` is the fitted estimator and
        ``metrics`` is a dict with the keys ``accuracy``, ``precision``,
        ``recall``, ``f1_score`` and ``roc_auc``.

    Notes
    -----
    Assumes a binary target whose positive class is the second class of the
    fitted model (labelled ``'>50K'`` in the confusion matrix) -- TODO confirm
    against the preprocessing notebook.
    """
    print(f"\n{'='*60}")
    print(f"Training: {model_name}")
    print(f"{'='*60}")

    # Computed once; used for both artifact file names.
    safe_name = model_name.replace(' ', '_')

    with mlflow.start_run(run_name=model_name):
        # Tags
        mlflow.set_tag("model_family", params_dict.get("model_family", "unknown"))
        mlflow.set_tag("user", "shalindri20@gmail.com")
        mlflow.set_tag("warehouse", "Serverless Starter Warehouse")

        # Log parameters ("model_family" is already recorded as a tag above)
        mlflow.log_params({k: v for k, v in params_dict.items() if k != "model_family"})

        # Train
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Continuous scores give a meaningful ROC curve / AUC. Hard labels
        # collapse the curve to a single threshold, so they are used only when
        # the estimator offers neither probabilities nor decision values.
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_prob = model.decision_function(X_test)
        else:
            y_prob = y_pred

        # Calculate metrics
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
            "f1_score": f1_score(y_test, y_pred, zero_division=0),
            "roc_auc": roc_auc_score(y_test, y_prob)
        }

        # Log metrics
        mlflow.log_metrics(metrics)

        # Confusion Matrix
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            cm = confusion_matrix(y_test, y_pred)
            cm_display = ConfusionMatrixDisplay(
                confusion_matrix=cm, display_labels=['<=50K', '>50K']
            )
            cm_display.plot(ax=ax, cmap='Blues')
            ax.set_title(f"Confusion Matrix - {model_name}")
            cm_path = f"/tmp/cm_{safe_name}.png"
            fig.savefig(cm_path)
            mlflow.log_artifact(cm_path)
        finally:
            # Close this specific figure even if saving/logging fails, so
            # repeated calls do not accumulate open figures.
            plt.close(fig)

        # ROC Curve
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            fpr, tpr, _ = roc_curve(y_test, y_prob)
            # Named roc_display so the notebook's built-in display() is not shadowed.
            roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=metrics["roc_auc"])
            roc_display.plot(ax=ax)
            ax.set_title(f"ROC Curve - {model_name}")
            roc_path = f"/tmp/roc_{safe_name}.png"
            fig.savefig(roc_path)
            mlflow.log_artifact(roc_path)
        finally:
            plt.close(fig)

        # Log model
        mlflow.sklearn.log_model(model, "model")

        # Print results
        print(f"\n✓ {model_name} Complete:")
        print(f"  Accuracy:  {metrics['accuracy']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall:    {metrics['recall']:.4f}")
        print(f"  F1-Score:  {metrics['f1_score']:.4f}")
        print(f"  ROC-AUC:   {metrics['roc_auc']:.4f}")
        print(f"{'='*60}\n")

        return model, metrics

## Model 1: Logistic Regression

In [None]:
# Hyperparameters are written once and shared by the estimator and the
# MLflow parameter record.
lr_hparams = {
    "max_iter": 1000,
    "class_weight": "balanced",
    "random_state": 42,
}
lr_model = LogisticRegression(**lr_hparams)

lr_params = {
    "model_family": "linear",
    "model_type": "Logistic Regression",
    **lr_hparams,
}

lr_model, lr_metrics = train_and_log_model("Logistic_Regression", lr_model, lr_params)

## Model 2: Decision Tree

In [None]:
# Hyperparameters are written once and shared by the estimator and the
# MLflow parameter record.
dt_hparams = {
    "max_depth": 10,
    "class_weight": "balanced",
    "random_state": 42,
}
dt_model = DecisionTreeClassifier(**dt_hparams)

dt_params = {
    "model_family": "tree-based",
    "model_type": "Decision Tree",
    **dt_hparams,
}

dt_model, dt_metrics = train_and_log_model("Decision_Tree", dt_model, dt_params)

## Model 3: Random Forest

In [None]:
# Hyperparameters shared by the estimator and the MLflow parameter record.
rf_hparams = {
    "n_estimators": 300,
    "max_depth": None,
    "class_weight": "balanced_subsample",
    "random_state": 42,
}
# n_jobs is passed to the estimator only; it is not part of the logged parameters.
rf_model = RandomForestClassifier(n_jobs=-1, **rf_hparams)

rf_params = {
    "model_family": "tree-based",
    "model_type": "Random Forest",
    **rf_hparams,
    # The unlimited depth is recorded as the literal string "None".
    "max_depth": "None",
}

rf_model, rf_metrics = train_and_log_model("Random_Forest", rf_model, rf_params)

## Model 4: Gradient Boosting

In [None]:
# Hyperparameters are written once and shared by the estimator and the
# MLflow parameter record.
gb_hparams = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,
}
gb_model = GradientBoostingClassifier(**gb_hparams)

gb_params = {
    "model_family": "tree-based",
    "model_type": "Gradient Boosting",
    **gb_hparams,
}

gb_model, gb_metrics = train_and_log_model("Gradient_Boosting", gb_model, gb_params)

## Model 5: Neural Network (MLP)

In [None]:
# Two hidden layers; the tuple feeds the estimator and the logged label.
mlp_layers = (128, 64)

# Settings that are both passed to the estimator and logged to MLflow.
mlp_hparams = {
    "activation": "relu",
    "solver": "adam",
    "learning_rate": "adaptive",
    "max_iter": 600,
    "early_stopping": True,
    "random_state": 42,
}

# learning_rate_init and verbose go to the estimator only; they are not logged.
mlp_model = MLPClassifier(
    hidden_layer_sizes=mlp_layers,
    learning_rate_init=0.001,
    verbose=False,
    **mlp_hparams,
)

mlp_params = {
    "model_family": "neural-network",
    "model_type": "Neural Network (MLP)",
    # Layer widths joined with "-", i.e. "128-64".
    "hidden_layers": "-".join(str(width) for width in mlp_layers),
    **mlp_hparams,
}

mlp_model, mlp_metrics = train_and_log_model("Neural_Network_MLP", mlp_model, mlp_params)

## Summary

In [None]:
# Pair each display label with the metrics dict returned for that model.
model_results = [
    ("Logistic Regression", lr_metrics),
    ("Decision Tree", dt_metrics),
    ("Random Forest", rf_metrics),
    ("Gradient Boosting", gb_metrics),
    ("Neural Network", mlp_metrics),
]

# Build the comparison table column by column, then rank by ROC-AUC (best first).
summary_data = {
    "Model": [label for label, _ in model_results],
    "Accuracy": [scores["accuracy"] for _, scores in model_results],
    "F1-Score": [scores["f1_score"] for _, scores in model_results],
    "ROC-AUC": [scores["roc_auc"] for _, scores in model_results],
}
summary_df = pd.DataFrame(summary_data).sort_values("ROC-AUC", ascending=False)

banner = "=" * 70
print("\n" + banner)
print("MODEL COMPARISON SUMMARY")
print(banner)
print(summary_df.to_string(index=False))
print(banner)

# After the descending sort the first row is the best model.
best_row = summary_df.iloc[0]
best_model_name = best_row["Model"]
best_roc_auc = best_row["ROC-AUC"]

print(f"\n✓ Best Model: {best_model_name}")
print(f"  ROC-AUC: {best_roc_auc:.4f}")

✅ All 5 Models Trained and Logged to MLflow!

**View Results:**

- Click "Experiments" in the right sidebar
- Or navigate to: Machine Learning > Experiments

**Next:** Run notebook `04_Model_Comparison.py`