
# 🚀 ML Experiment Tracking with MLflow

This project trains **5 machine learning models** on the Iris dataset and tracks them using **MLflow**.  
It automatically registers the **best-performing model** (by F1 score) into the **MLflow Model Registry**, promoting it to **Production**.

### ✅ Workflow
1. Run all cells in order.
2. Watch training logs appear — MLflow will automatically pick the best model.
3. The best model is automatically registered and promoted to *Production*.
4. A summary of all registered versions is displayed at the end.


In [1]:
!pip install -q mlflow scikit-learn pandas numpy matplotlib
print("✅ All packages installed successfully.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m752.6/752.6 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h✅ All packages installed successfully.


In [2]:
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

EXPERIMENT_NAME = "MLOps_Assignment_1"
mlflow.set_experiment(EXPERIMENT_NAME)
print(f"📘 MLflow experiment set to: {EXPERIMENT_NAME}")

2025/10/29 18:26:24 INFO mlflow.tracking.fluent: Experiment with name 'MLOps_Assignment_1' does not exist. Creating a new experiment.


📘 MLflow experiment set to: MLOps_Assignment_1


In [4]:
def train_and_register_best_model():
    # Load data
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    # Define column names as the dataset does not have a header
    col_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
    df = pd.read_csv(url, names=col_names)
    le = LabelEncoder()
    df['species'] = le.fit_transform(df['species'])
    X = df.drop('species', axis=1)
    y = df['species']

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    models = {
        "LogisticRegression": LogisticRegression(max_iter=200),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "DecisionTree": DecisionTreeClassifier(random_state=42),
        "SVM": SVC(kernel='rbf', probability=True),
        "KNN": KNeighborsClassifier(n_neighbors=5)
    }

    best_f1 = -1
    best_run_id = None
    best_model_name = None
    results = []

    for name, model in models.items():
        with mlflow.start_run(run_name=name) as run:
            run_id = run.info.run_id
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, average='macro')
            rec = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')

            mlflow.log_param("model_name", name)
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("precision", prec)
            mlflow.log_metric("recall", rec)
            mlflow.log_metric("f1_score", f1)
            mlflow.sklearn.log_model(model, artifact_path="model")

            results.append((name, acc, prec, rec, f1, run_id))
            print(f" {name} -> Acc: {acc:.3f}, F1: {f1:.3f}, Run ID: {run_id}")

            if f1 > best_f1:
                best_f1 = f1
                best_run_id = run_id
                best_model_name = name

    print("\n=== Summary ===")
    for r in results:
        print(f"Model: {r[0]} | Acc: {r[1]:.3f} | F1: {r[4]:.3f} | Run ID: {r[5]}")
    print(f"\n🏆 Best model: {best_model_name} | F1: {best_f1:.3f} | Run ID: {best_run_id}")

    # Auto-register
    MODEL_NAME = "MLOps_Assignment1_BestModel"
    client = MlflowClient()
    model_uri = f"runs:/{best_run_id}/model"
    print(f"\n📦 Registering model from {model_uri}...")
    registered = mlflow.register_model(model_uri, MODEL_NAME)

    import time
    time.sleep(3)
    version = registered.version

    # Promote
    client.transition_model_version_stage(MODEL_NAME, version, stage="Staging")
    print(f"✅ Model version {version} moved to Staging.")
    client.transition_model_version_stage(MODEL_NAME, version, stage="Production", archive_existing_versions=True)
    print(f"🚀 Model version {version} promoted to Production.")

    print("\n✅ Final Registered Model Info:")
    mv = client.get_model_version(MODEL_NAME, version)
    print(f"Name: {mv.name}, Version: {mv.version}, Stage: {mv.current_stage}")

train_and_register_best_model()



 LogisticRegression -> Acc: 1.000, F1: 1.000, Run ID: 5ed72086cb6e41f098d49bb552e9b4bf




 RandomForest -> Acc: 1.000, F1: 1.000, Run ID: 1840e5f571724a1d971833cad9cfb05d




 DecisionTree -> Acc: 1.000, F1: 1.000, Run ID: 42c340ca38284ab3bb11f5a069358b2d




 SVM -> Acc: 1.000, F1: 1.000, Run ID: c040539c3e674ea5bfbfce4f122a3cee


Successfully registered model 'MLOps_Assignment1_BestModel'.
Created version '1' of model 'MLOps_Assignment1_BestModel'.


 KNN -> Acc: 1.000, F1: 1.000, Run ID: 3e59337db21c407baa6c7e191cb56680

=== Summary ===
Model: LogisticRegression | Acc: 1.000 | F1: 1.000 | Run ID: 5ed72086cb6e41f098d49bb552e9b4bf
Model: RandomForest | Acc: 1.000 | F1: 1.000 | Run ID: 1840e5f571724a1d971833cad9cfb05d
Model: DecisionTree | Acc: 1.000 | F1: 1.000 | Run ID: 42c340ca38284ab3bb11f5a069358b2d
Model: SVM | Acc: 1.000 | F1: 1.000 | Run ID: c040539c3e674ea5bfbfce4f122a3cee
Model: KNN | Acc: 1.000 | F1: 1.000 | Run ID: 3e59337db21c407baa6c7e191cb56680

🏆 Best model: LogisticRegression | F1: 1.000 | Run ID: 5ed72086cb6e41f098d49bb552e9b4bf

📦 Registering model from runs:/5ed72086cb6e41f098d49bb552e9b4bf/model...
✅ Model version 1 moved to Staging.
🚀 Model version 1 promoted to Production.

✅ Final Registered Model Info:
Name: MLOps_Assignment1_BestModel, Version: 1, Stage: Production


  client.transition_model_version_stage(MODEL_NAME, version, stage="Staging")
  client.transition_model_version_stage(MODEL_NAME, version, stage="Production", archive_existing_versions=True)


In [5]:
client = MlflowClient()
MODEL_NAME = "MLOps_Assignment1_BestModel"

print("📊 Registered model versions:")
try:
    versions = client.get_latest_versions(MODEL_NAME)
    for v in versions:
        print(f"Version: {v.version} | Stage: {v.current_stage} | Run ID: {v.run_id}")
except Exception as e:
    print("Error listing versions:", e)

📊 Registered model versions:
Version: 1 | Stage: Production | Run ID: 5ed72086cb6e41f098d49bb552e9b4bf


  versions = client.get_latest_versions(MODEL_NAME)


In [6]:
# Optional cleanup
import shutil
shutil.rmtree('/content/mlruns', ignore_errors=True)
print('🧹 Cleared /content/mlruns')
print('Cleanup cell ready. Uncomment lines above to reset MLflow logs if needed.')

🧹 Cleared /content/mlruns
Cleanup cell ready. Uncomment lines above to reset MLflow logs if needed.
