In [None]:
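# Create a new directory for your project, create requirements.txt, and install packages.
# NOTE: this cell fails with the SyntaxError recorded below: only the `!mkdir ...` line is passed to the
# shell, so the heredoc body on the following lines is parsed as Python. The next cells do the same work
# with `!mkdir` + `%%writefile` + pip install.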
!mkdir mlflow-assignment-1 && cd mlflow-assignment-1 && cat > requirements.txt << EOF
mlflow>=2.10.0
scikit-learn>=1.2.0
pandas>=1.5.0
numpy>=1.21.0
matplotlib>=3.5.0
seaborn>=0.11.0
plotly>=5.13.0
jupyter>=1.0.0
notebook>=6.5.0
boto3>=1.26.0  # For AWS integration (optional)
azureml-core>=1.50.0  # For Azure integration (optional)
google-cloud-aiplatform>=1.25.0  # For GCP integration (optional)
EOF
!pip install -r mlflow-assignment-1/requirements.txt

SyntaxError: invalid syntax (ipython-input-495297892.py, line 3)

In [None]:
# Create a new directory for your project (-p: no error if it already exists, so the cell can be re-run)
!mkdir -p mlflow-assignment-1

In [None]:
%%writefile mlflow-assignment-1/requirements.txt
mlflow>=2.10.0
scikit-learn>=1.2.0
pandas>=1.5.0
numpy>=1.21.0
matplotlib>=3.5.0
seaborn>=0.11.0
plotly>=5.13.0
jupyter>=1.0.0
notebook>=6.5.0
boto3>=1.26.0  # For AWS integration (optional)
azureml-core>=1.50.0  # For Azure integration (optional)
google-cloud-aiplatform>=1.25.0  # For GCP integration (optional)

Writing mlflow-assignment-1/requirements.txt


In [None]:
# Install all required packages (%pip installs into the environment of the running kernel)
%pip install -r mlflow-assignment-1/requirements.txt

Collecting mlflow>=2.10.0 (from -r mlflow-assignment-1/requirements.txt (line 1))
  Downloading mlflow-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting jupyter>=1.0.0 (from -r mlflow-assignment-1/requirements.txt (line 8))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting boto3>=1.26.0 (from -r mlflow-assignment-1/requirements.txt (line 10))
  Downloading boto3-1.40.61-py3-none-any.whl.metadata (6.6 kB)
Collecting azureml-core>=1.50.0 (from -r mlflow-assignment-1/requirements.txt (line 11))
  Downloading azureml_core-1.60.0.post1-py3-none-any.whl.metadata (3.4 kB)
Collecting mlflow-skinny==3.5.1 (from mlflow>=2.10.0->-r mlflow-assignment-1/requirements.txt (line 1))
  Downloading mlflow_skinny-3.5.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.5.1 (from mlflow>=2.10.0->-r mlflow-assignment-1/requirements.txt (line 1))
  Downloading mlflow_tracing-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow>=2.10.0->-r mlfl

In [None]:
# data_preparation.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import fetch_openml
import warnings
warnings.filterwarnings('ignore')

class DataPreprocessor:
    """Load, explore and prepare the red-wine quality data for binary classification.

    Attributes:
        scaler: ``StandardScaler`` that ``preprocess_data`` fits on the training split.
        label_encoder: ``LabelEncoder`` created in ``__init__``; no method of this
            class uses it.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

    def load_data(self):
        """Load the wine quality dataset as a DataFrame.

        Tries ``fetch_openml`` first and, if anything in that path raises, falls
        back to reading the semicolon-delimited CSV from the UCI repository.

        Returns:
            pandas.DataFrame with the feature columns and a ``quality`` column.
        """
        print("Loading Wine Quality Dataset...")

        # Method 1: Using fetch_openml (recommended)
        try:
            wine = fetch_openml(name='wine-quality-red', version=1, as_frame=True)
            df = wine.frame
            df['quality'] = df['quality'].astype(int)
        except Exception as exc:
            # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
            # SystemExit propagate; the reason is printed so that a silently
            # failing primary source is visible in the cell output.
            print(f"fetch_openml path failed ({type(exc).__name__}: {exc}); "
                  "falling back to the UCI CSV download")
            # Method 2: Download from URL
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
            df = pd.read_csv(url, delimiter=';')

        print(f"Dataset loaded with {df.shape[0]} samples and {df.shape[1]} features")
        return df

    def explore_data(self, df):
        """Print shape, dtypes, missing-value counts, summary statistics and the
        distribution of the ``quality`` column.

        Args:
            df: DataFrame containing a ``quality`` column.

        Returns:
            The same ``df`` object, unchanged.
        """
        print("\n=== Data Exploration ===")
        print(f"Dataset shape: {df.shape}")
        print(f"\nColumn types:\n{df.dtypes}")
        print(f"\nMissing values:\n{df.isnull().sum()}")
        print(f"\nDataset description:\n{df.describe()}")

        # Check target distribution
        print(f"\nTarget distribution (quality):\n{df['quality'].value_counts().sort_index()}")

        return df

    def preprocess_data(self, df):
        """Build the binary target, split into train/test and standardise features.

        The target is 1 when ``quality >= 7`` and 0 otherwise. The input frame
        is not modified.

        Args:
            df: DataFrame containing a ``quality`` column; all other columns
                (except a pre-existing ``wine_quality`` column) are features.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, feature_names)``
            where the scaled matrices are the outputs of the fitted scaler and
            ``feature_names`` is ``X.columns``.

        Raises:
            KeyError: if ``df`` has no ``quality`` column.
        """
        print("\n=== Data Preprocessing ===")

        # Create a binary classification problem (good wine vs bad wine).
        # Computed as a standalone Series so the caller's DataFrame is not
        # given an extra column as a side effect.
        y = (df['quality'] >= 7).astype(int).rename('wine_quality')

        # Features: everything but the raw score (and a 'wine_quality' column
        # if an earlier run already added one to this frame).
        X = df.drop(columns=['quality', 'wine_quality'], errors='ignore')

        print(f"Features: {X.columns.tolist()}")
        print(f"Target distribution: {y.value_counts()}")
        print(f"Positive class ratio: {y.mean():.3f}")

        # Split the data (stratified so both splits keep the class ratio)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale numerical features: fit on the training split only, then apply
        # the same transform to the test split.
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        print(f"Training set: {X_train_scaled.shape}")
        print(f"Test set: {X_test_scaled.shape}")

        return X_train_scaled, X_test_scaled, y_train, y_test, X.columns

# Run the whole preparation pipeline. The recorded output below shows this branch
# executes when the cell is run in the notebook; df, X_train, X_test, y_train,
# y_test and feature_names are left behind as global names.
if __name__ == "__main__":
    preprocessor = DataPreprocessor()
    df = preprocessor.load_data()
    df = preprocessor.explore_data(df)
    X_train, X_test, y_train, y_test, feature_names = preprocessor.preprocess_data(df)

Loading Wine Quality Dataset...
Dataset loaded with 1599 samples and 12 features

=== Data Exploration ===
Dataset shape: (1599, 12)

Column types:
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

Missing values:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

Dataset description:
       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.00000

In [None]:
# mlflow_setup.py
import mlflow
import mlflow.sklearn
import os
from datetime import datetime
import tempfile

class MLflowSetup:
    """Point MLflow at a tracking backend, select the experiment and build run names.

    Constructing an instance immediately calls ``setup_tracking``.

    Args:
        experiment_name: name passed to ``mlflow.set_experiment``.
    """

    def __init__(self, experiment_name="Wine-Quality-Classification"):
        self.experiment_name = experiment_name
        self.setup_tracking()

    def setup_tracking(self):
        """Set the MLflow tracking URI and the active experiment, then print both."""
        from pathlib import Path

        # Option 1: Local File System (Default)
        # The former literal "file:///./mlruns" has an empty authority and the
        # path "/./mlruns", i.e. a directory at the filesystem root rather than
        # one next to the notebook. Build an absolute file URI for ./mlruns in
        # the current working directory instead.
        tracking_uri = Path("mlruns").resolve().as_uri()

        # Option 2: SQLite Backend (Uncomment to use)
        # tracking_uri = "sqlite:///mlflow.db"

        # Option 3: Remote Server (Uncomment and modify for your setup)
        # tracking_uri = "http://your-mlflow-server:5000"

        # Option 4: AWS S3 credentials (Uncomment for AWS deployment)
        # NOTE(review): these variables only supply S3 credentials/endpoint; they
        # do not change the tracking URI. Prefer setting them in the environment
        # outside the notebook instead of writing real keys here.
        # os.environ['AWS_ACCESS_KEY_ID'] = 'your-access-key'
        # os.environ['AWS_SECRET_ACCESS_KEY'] = 'your-secret-key'
        # os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'https://s3.amazonaws.com'

        mlflow.set_tracking_uri(tracking_uri)

        # Set experiment
        mlflow.set_experiment(self.experiment_name)

        print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
        print(f"Experiment: {self.experiment_name}")

    def create_run_name(self, model_type, run_description):
        """Return ``"<model_type>_<YYYYmmdd_HHMMSS>_<run_description>"`` using the current local time."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"{model_type}_{timestamp}_{run_description}"

# Initialize MLflow setup: the constructor calls setup_tracking(), so this line
# sets the tracking URI and experiment and prints both as soon as the cell runs.
mlflow_setup = MLflowSetup()

2025/10/29 07:48:31 INFO mlflow.tracking.fluent: Experiment with name 'Wine-Quality-Classification' does not exist. Creating a new experiment.


MLflow tracking URI: file:///./mlruns
Experiment: Wine-Quality-Classification


In [None]:
# model_training.py
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
import tempfile
import os

class ModelTrainer:
    """Train baseline classifiers and record params, metrics, plots and models in MLflow.

    Args:
        X_train, X_test: feature matrices for the training and test splits.
        y_train, y_test: matching binary targets.
        feature_names: iterable of feature names, in column order.

    Constructing an instance also constructs an ``MLflowSetup`` (defined in an
    earlier cell), which configures tracking.
    """

    def __init__(self, X_train, X_test, y_train, y_test, feature_names):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.feature_names = feature_names
        self.mlflow_setup = MLflowSetup()

    def evaluate_model(self, model, model_type):
        """Score a fitted model on the test split and cross-validate on the training split.

        Args:
            model: fitted estimator with ``predict`` (and optionally ``predict_proba``).
            model_type: label for the model; not used in the computation, kept
                for interface compatibility.

        Returns:
            ``(metrics, y_pred, y_pred_proba)``. ``metrics`` holds accuracy,
            precision, recall, f1_score, cv_f1_mean, cv_f1_std and, when the
            model exposes ``predict_proba``, roc_auc. ``y_pred_proba`` is the
            positive-class probability column or ``None``.
        """
        # Predictions
        y_pred = model.predict(self.X_test)
        y_pred_proba = model.predict_proba(self.X_test)[:, 1] if hasattr(model, 'predict_proba') else None

        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, zero_division=0),
            'recall': recall_score(self.y_test, y_pred, zero_division=0),
            'f1_score': f1_score(self.y_test, y_pred, zero_division=0),
        }

        if y_pred_proba is not None:
            metrics['roc_auc'] = roc_auc_score(self.y_test, y_pred_proba)

        # Cross-validation scores (5-fold F1 on the training split)
        cv_scores = cross_val_score(model, self.X_train, self.y_train, cv=5, scoring='f1')
        metrics['cv_f1_mean'] = cv_scores.mean()
        metrics['cv_f1_std'] = cv_scores.std()

        return metrics, y_pred, y_pred_proba

    @staticmethod
    def _save_and_close_figure(suffix):
        """Save the current matplotlib figure to a fresh temp file, close it, return the path."""
        # mkstemp creates the file atomically, unlike the race-prone mktemp,
        # which only returns a name. Only the path is needed here, so the
        # descriptor is closed straight away.
        fd, path = tempfile.mkstemp(suffix=suffix)
        os.close(fd)
        try:
            plt.savefig(path)
        except Exception:
            os.remove(path)
            raise
        finally:
            plt.close()
        return path

    def create_plots(self, model, y_pred, y_pred_proba, model_type):
        """Render evaluation plots to temporary PNG files.

        Always draws a confusion matrix; additionally draws the ten largest
        feature importances when the model has ``feature_importances_``.

        Args:
            model: fitted estimator.
            y_pred: predicted labels for the test split.
            y_pred_proba: not used by this method; kept for interface compatibility.
            model_type: label used in the plot titles.

        Returns:
            dict mapping plot name to the temp file path. The caller is
            responsible for deleting the files.
        """
        # Imported here because the import cell of this module does not import pandas.
        import pandas as pd

        plots = {}

        # Confusion Matrix
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(self.y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {model_type}')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plots['confusion_matrix'] = self._save_and_close_figure('_cm.png')

        # Feature Importance (for tree-based models)
        if hasattr(model, 'feature_importances_'):
            plt.figure(figsize=(10, 6))
            feature_imp = pd.DataFrame({
                'feature': self.feature_names,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            sns.barplot(data=feature_imp.head(10), x='importance', y='feature')
            plt.title(f'Feature Importance - {model_type}')
            plots['feature_importance'] = self._save_and_close_figure('_feature_importance.png')

        return plots

    def _train_and_log(self, model, params, model_label, display_name,
                       artifact_name, registered_name, run_description):
        """Fit ``model`` inside one MLflow run and log everything about it.

        Shared implementation for the public ``train_*`` methods, so every
        model type records the same set of params, metrics and artifacts.

        Returns:
            ``(metrics, model)`` with the metrics dict from ``evaluate_model``.
        """
        run_name = self.mlflow_setup.create_run_name(model_label, run_description)

        with mlflow.start_run(run_name=run_name):
            # Log parameters
            for param, value in params.items():
                mlflow.log_param(param, value)

            # Log dataset info
            mlflow.log_param("dataset_shape", f"{self.X_train.shape}")
            mlflow.log_param("feature_count", len(self.feature_names))

            # Train model
            print(f"Training {display_name}...")
            model.fit(self.X_train, self.y_train)

            # Evaluate
            metrics, y_pred, y_pred_proba = self.evaluate_model(model, model_label)

            # Log metrics
            for metric, value in metrics.items():
                mlflow.log_metric(metric, value)

            # Create and log plots; temp files are removed even if logging fails
            plots = self.create_plots(model, y_pred, y_pred_proba, model_label)
            try:
                for plot_path in plots.values():
                    mlflow.log_artifact(plot_path, "plots")
            finally:
                for plot_path in plots.values():
                    if os.path.exists(plot_path):
                        os.remove(plot_path)

            # Log model
            mlflow.sklearn.log_model(
                model,
                artifact_name,
                registered_model_name=registered_name
            )

            # Log feature names as artifact. The file is fully written and
            # closed before log_artifact copies it; the directory is removed
            # when the block exits.
            with tempfile.TemporaryDirectory() as tmp_dir:
                names_path = os.path.join(tmp_dir, "feature_names.json")
                with open(names_path, "w") as handle:
                    json.dump({'feature_names': list(self.feature_names)}, handle)
                mlflow.log_artifact(names_path, "metadata")

            print(f"{display_name} completed - F1 Score: {metrics['f1_score']:.4f}")

            return metrics, model

    def train_random_forest(self, run_description="baseline"):
        """Train Random Forest Classifier with MLflow tracking.

        Args:
            run_description: suffix appended to the run name.

        Returns:
            ``(metrics, model)``.
        """
        params = {
            'n_estimators': 100,
            'max_depth': 10,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'random_state': 42
        }
        return self._train_and_log(
            RandomForestClassifier(**params), params,
            "RandomForest", "Random Forest",
            "random_forest_model", "RandomForest_Wine_Quality",
            run_description,
        )

    def train_gradient_boosting(self, run_description="baseline"):
        """Train Gradient Boosting Classifier with MLflow tracking.

        Args:
            run_description: suffix appended to the run name.

        Returns:
            ``(metrics, model)``.
        """
        params = {
            'n_estimators': 100,
            'learning_rate': 0.1,
            'max_depth': 3,
            'random_state': 42
        }
        return self._train_and_log(
            GradientBoostingClassifier(**params), params,
            "GradientBoosting", "Gradient Boosting",
            "gradient_boosting_model", "GradientBoosting_Wine_Quality",
            run_description,
        )

    def train_logistic_regression(self, run_description="baseline"):
        """Train Logistic Regression with MLflow tracking.

        Args:
            run_description: suffix appended to the run name.

        Returns:
            ``(metrics, model)``.
        """
        params = {
            'C': 1.0,
            'max_iter': 1000,
            'random_state': 42,
            'solver': 'liblinear'
        }
        return self._train_and_log(
            LogisticRegression(**params), params,
            "LogisticRegression", "Logistic Regression",
            "logistic_regression_model", "LogisticRegression_Wine_Quality",
            run_description,
        )

In [None]:
# hyperparameter_tuning.py
import mlflow
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import pandas as pd

class HyperparameterTuner:
    """Grid-search hyperparameters for the tree ensembles and log each search as an MLflow run.

    Args:
        X_train, X_test: feature matrices for the training and test splits.
        y_train, y_test: matching binary targets.
        feature_names: iterable of feature names, in column order.
    """

    def __init__(self, X_train, X_test, y_train, y_test, feature_names):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.feature_names = feature_names
        self.trainer = ModelTrainer(X_train, X_test, y_train, y_test, feature_names)
        # ModelTrainer already builds an MLflowSetup; share it instead of
        # configuring tracking (and printing the banner) a second time.
        self.mlflow_setup = self.trainer.mlflow_setup

    def _run_grid_search(self, estimator, param_grid, model_label, display_name,
                         artifact_name, registered_name, cv_folds=5):
        """Run ``GridSearchCV`` (F1 scoring) inside one MLflow run and log the outcome.

        Logs the search configuration, the best parameters (prefixed ``best_``),
        the best cross-validation score, the test-split metrics of the refitted
        best estimator and the estimator itself.

        Returns:
            ``(metrics, best_model, best_params)``.
        """
        run_name = self.mlflow_setup.create_run_name(model_label, "hyperparameter_tuning")

        with mlflow.start_run(run_name=run_name):
            # Log tuning parameters
            mlflow.log_param("tuning_method", "GridSearchCV")
            mlflow.log_param("param_grid", str(param_grid))
            mlflow.log_param("cv_folds", cv_folds)

            # Perform grid search
            print(f"Performing GridSearch for {display_name}...")
            grid_search = GridSearchCV(
                estimator,
                param_grid,
                cv=cv_folds,
                scoring='f1',
                n_jobs=-1,
                verbose=1
            )
            grid_search.fit(self.X_train, self.y_train)

            # Log best parameters
            best_params = grid_search.best_params_
            for param, value in best_params.items():
                mlflow.log_param(f"best_{param}", value)

            mlflow.log_metric("best_cv_score", grid_search.best_score_)

            # Evaluate the best estimator found by the search on the test split
            best_model = grid_search.best_estimator_
            metrics, _y_pred, _y_pred_proba = self.trainer.evaluate_model(
                best_model, f"{model_label}_Tuned"
            )

            # Log metrics
            for metric, value in metrics.items():
                mlflow.log_metric(metric, value)

            # Log model
            mlflow.sklearn.log_model(
                best_model,
                artifact_name,
                registered_model_name=registered_name
            )

            print(f"{display_name} Tuning completed - Best F1 Score: {metrics['f1_score']:.4f}")

            return metrics, best_model, best_params

    def tune_random_forest(self):
        """Perform hyperparameter tuning for Random Forest.

        Returns:
            ``(metrics, best_model, best_params)``.
        """
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        return self._run_grid_search(
            RandomForestClassifier(random_state=42), param_grid,
            "RandomForest", "Random Forest",
            "random_forest_tuned_model", "RandomForest_Wine_Quality_Tuned",
        )

    def tune_gradient_boosting(self):
        """Perform hyperparameter tuning for Gradient Boosting.

        Returns:
            ``(metrics, best_model, best_params)``.
        """
        param_grid = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5],
            'subsample': [0.8, 0.9, 1.0]
        }
        return self._run_grid_search(
            GradientBoostingClassifier(random_state=42), param_grid,
            "GradientBoosting", "Gradient Boosting",
            "gradient_boosting_tuned_model", "GradientBoosting_Wine_Quality_Tuned",
        )

In [None]:
# model_selection.py
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
import numpy as np

class ModelSelector:
    """Collect the runs of the configured experiment and pick the best one by metric."""

    def __init__(self):
        # MLflowSetup runs first so the tracking URI is configured before the
        # client is constructed.
        self.mlflow_setup = MLflowSetup()
        self.client = MlflowClient()

    def get_all_runs(self):
        """Retrieve all runs from the current experiment.

        Returns:
            pandas.DataFrame with one row per run: ``run_id``, ``run_name``,
            ``model_type`` (the run name up to the first underscore),
            ``status``, ``start_time``, plus one ``param_<name>`` column per
            logged parameter and one ``metric_<name>`` column per logged
            metric. Empty when the experiment does not exist or has no runs.
        """
        experiment = mlflow.get_experiment_by_name(self.mlflow_setup.experiment_name)
        if experiment is None:
            return pd.DataFrame()

        runs = self.client.search_runs([experiment.experiment_id])

        run_data = []
        for run in runs:
            run_name = run.data.tags.get('mlflow.runName', '')
            run_info = {
                'run_id': run.info.run_id,
                'run_name': run_name,
                'model_type': run_name.split('_')[0],
                'status': run.info.status,
                'start_time': run.info.start_time,
            }

            # Add parameters
            for key, value in run.data.params.items():
                run_info[f'param_{key}'] = value

            # Add metrics
            for key, value in run.data.metrics.items():
                run_info[f'metric_{key}'] = value

            run_data.append(run_info)

        return pd.DataFrame(run_data)

    def select_best_model(self, primary_metric='metric_f1_score', secondary_metric='metric_roc_auc'):
        """Select the best model based on specified metrics.

        Among runs with status ``FINISHED`` that logged ``primary_metric``, the
        one with the highest primary metric wins; ties are broken by
        ``secondary_metric`` when at least one run logged it.

        Args:
            primary_metric: column name (``metric_<name>``) to maximise.
            secondary_metric: column name used as tie-breaker.

        Returns:
            The winning row as a pandas Series, or ``None`` when there is no
            eligible run.
        """
        runs_df = self.get_all_runs()

        if runs_df.empty:
            print("No runs found in the experiment")
            return None

        # Filter completed runs
        completed_runs = runs_df[runs_df['status'] == 'FINISHED']

        if completed_runs.empty:
            print("No completed runs found")
            return None

        # A metric column only exists if at least one run logged that metric.
        if primary_metric not in completed_runs.columns:
            print(f"No completed run logged {primary_metric}")
            return None

        candidates = completed_runs.dropna(subset=[primary_metric])
        if candidates.empty:
            print(f"No completed run logged {primary_metric}")
            return None

        # Sort by primary and (when available) secondary metrics
        sort_keys = [primary_metric]
        if secondary_metric != primary_metric and secondary_metric in candidates.columns:
            sort_keys.append(secondary_metric)

        best_run = candidates.sort_values(
            sort_keys,
            ascending=[False] * len(sort_keys)
        ).iloc[0]

        print("=== BEST MODEL SELECTION RESULTS ===")
        print(f"Best Run ID: {best_run['run_id']}")
        print(f"Best Run Name: {best_run['run_name']}")
        print(f"Primary Metric ({primary_metric}): {best_run[primary_metric]:.4f}")
        print(f"Secondary Metric ({secondary_metric}): {best_run.get(secondary_metric, 'N/A')}")

        # Display comparison table, restricted to the metrics that were logged
        wanted_metrics = [
            'metric_accuracy',
            'metric_precision',
            'metric_recall',
            'metric_f1_score',
            'metric_roc_auc',
        ]
        available_metrics = [col for col in wanted_metrics if col in completed_runs.columns]
        comparison_df = completed_runs.groupby('model_type')[available_metrics].mean().round(4)

        print("\n=== MODEL COMPARISON ===")
        print(comparison_df)

        return best_run

In [None]:
# model_registry.py
import mlflow
from mlflow.tracking import MlflowClient
import time

class ModelRegistryManager:
    """Thin helper around the MLflow Model Registry: register, change stage, list, archive.

    NOTE(review): recent MLflow releases appear to deprecate model-version
    stages in favour of aliases - confirm against the installed version.
    """

    def __init__(self):
        self.client = MlflowClient()

    def register_best_model(self, run_id, model_name="Best_Wine_Quality_Model", artifact_path="model"):
        """Register the best model in MLflow Model Registry.

        Args:
            run_id: ID of the run that logged the model.
            model_name: registry name to register under.
            artifact_path: name the model was logged under inside the run;
                combined with ``run_id`` into ``runs:/<run_id>/<artifact_path>``.
                NOTE(review): the trainers in this notebook log models under
                names such as "random_forest_model", so the default "model"
                will not match those runs - pass the matching name.

        Returns:
            The created model version, or ``None`` if registration failed (the
            error is printed).
        """

        # Construct model URI
        model_uri = f"runs:/{run_id}/{artifact_path}"

        try:
            # Register the model
            print(f"Registering model from run {run_id}...")
            mv = mlflow.register_model(model_uri, model_name)

            print(f"Model registered successfully!")
            print(f"Model Name: {mv.name}")
            print(f"Model Version: {mv.version}")
            print(f"Current Stage: {mv.current_stage}")

            return mv

        except Exception as e:
            print(f"Error registering model: {e}")
            return None

    def transition_model_stage(self, model_name, version, stage):
        """Transition model to different stages (Staging → Production).

        Args:
            model_name: registered model name.
            version: version identifier of the model version to move.
            stage: target stage name.

        Returns:
            The re-fetched model version, or ``None`` if the transition failed
            (the error is printed).
        """

        try:
            # Transition model stage
            self.client.transition_model_version_stage(
                name=model_name,
                version=version,
                stage=stage
            )

            print(f"Model {model_name} version {version} transitioned to {stage} stage")

            # The call above has already returned, so the version is re-read
            # immediately; the former fixed two-second sleep was removed.
            mv = self.client.get_model_version(model_name, version)
            print(f"Current stage: {mv.current_stage}")

            return mv

        except Exception as e:
            print(f"Error transitioning model stage: {e}")
            return None

    def list_registered_models(self):
        """Print every registered model with the version and stage of its latest versions."""
        models = self.client.search_registered_models()

        print("=== REGISTERED MODELS ===")
        for model in models:
            print(f"Model: {model.name}")
            for version in model.latest_versions:
                print(f"  Version {version.version}: {version.current_stage}")

    def archive_old_versions(self, model_name, keep_versions=3):
        """Archive old model versions to keep registry clean.

        Keeps the ``keep_versions`` highest-numbered versions untouched and
        moves every older version whose stage is ``"None"`` to ``"Archived"``.
        Errors are printed, not raised.

        Args:
            model_name: registered model name.
            keep_versions: number of newest versions to leave as they are.
        """
        try:
            # Get all versions
            versions = self.client.search_model_versions(f"name='{model_name}'")

            # Sort numerically: version identifiers may arrive as strings, and
            # a plain string sort would place "10" before "9".
            sorted_versions = sorted(versions, key=lambda mv: int(mv.version), reverse=True)
            old_versions = sorted_versions[keep_versions:]

            for version in old_versions:
                if version.current_stage == "None":
                    self.client.transition_model_version_stage(
                        name=model_name,
                        version=version.version,
                        stage="Archived"
                    )
                    print(f"Archived version {version.version}")

        except Exception as e:
            print(f"Error archiving old versions: {e}")

In [None]:
# main.py
import mlflow
from data_preparation import DataPreprocessor
from model_training import ModelTrainer
from hyperparameter_tuning import HyperparameterTuner
from model_selection import ModelSelector
from model_registry import ModelRegistryManager
import pandas as pd

def _print_banner(title):
    """Print ``title`` framed by a blank line and two 50-character ``=`` rules."""
    rule = "=" * 50
    print("\n" + rule)
    print(title)
    print(rule)


def main():
    """Run the whole assignment pipeline end to end.

    Steps: prepare data, train three baseline models, grid-search two of them,
    select the best finished run, register it and move it through the Staging
    and Production stages. The closing banner reports whether a model was
    actually registered.
    """
    print("🚀 Starting MLOps Assignment 1 - MLflow Experiment Tracking")

    # Step 1: Data Preparation
    _print_banner("STEP 1: Data Preparation")
    preprocessor = DataPreprocessor()
    df = preprocessor.load_data()
    df = preprocessor.explore_data(df)
    X_train, X_test, y_train, y_test, feature_names = preprocessor.preprocess_data(df)

    # Step 2: Baseline Model Training
    _print_banner("STEP 2: Baseline Model Training")
    trainer = ModelTrainer(X_train, X_test, y_train, y_test, feature_names)

    # Train multiple baseline models. The returned metrics/models are not
    # needed here: each run is recorded in MLflow and read back in step 4.
    trainer.train_random_forest("baseline")
    trainer.train_gradient_boosting("baseline")
    trainer.train_logistic_regression("baseline")

    # Step 3: Hyperparameter Tuning
    _print_banner("STEP 3: Hyperparameter Tuning")
    tuner = HyperparameterTuner(X_train, X_test, y_train, y_test, feature_names)

    tuner.tune_random_forest()
    tuner.tune_gradient_boosting()

    # Step 4: Model Selection
    _print_banner("STEP 4: Model Selection")
    selector = ModelSelector()
    best_run = selector.select_best_model()

    registered = False

    if best_run is not None:
        best_run_id = best_run['run_id']

        # Step 5: Model Registration
        _print_banner("STEP 5: Model Registration")
        registry_manager = ModelRegistryManager()

        # Register the best model (returns None when registration fails)
        model_version = registry_manager.register_best_model(best_run_id)

        if model_version:
            registered = True

            # Transition to Staging
            registry_manager.transition_model_stage(
                model_version.name,
                model_version.version,
                "Staging"
            )

            # Demonstrate stage transition (Staging → Production)
            _print_banner("Demonstrating Stage Transition: Staging → Production")
            registry_manager.transition_model_stage(
                model_version.name,
                model_version.version,
                "Production"
            )

        # List all registered models
        registry_manager.list_registered_models()

    # Only claim success when a model version was really created.
    if registered:
        _print_banner("🎉 ASSIGNMENT COMPLETED SUCCESSFULLY!")
    else:
        _print_banner("⚠️ PIPELINE FINISHED, BUT NO MODEL WAS REGISTERED - see the messages above")

    print("\nNext steps:")
    print("1. Start MLflow UI: mlflow ui")
    print("2. Open http://localhost:5000 in your browser")
    print("3. Explore experiments and model registry")
    print("4. Check the mlruns directory for artifacts")

if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'data_preparation'

In [None]:
!python mlflow-assignment-1/main.py

In [None]:
%%writefile mlflow-assignment-1/data_preparation.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import fetch_openml
import warnings
warnings.filterwarnings('ignore')

class DataPreprocessor:
    """Load, explore and prepare the red-wine quality data for binary classification.

    Attributes:
        scaler: ``StandardScaler`` that ``preprocess_data`` fits on the training split.
        label_encoder: ``LabelEncoder`` created in ``__init__``; no method of this
            class uses it.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

    def load_data(self):
        """Load the wine quality dataset as a DataFrame.

        Tries ``fetch_openml`` first and, if anything in that path raises, falls
        back to reading the semicolon-delimited CSV from the UCI repository.

        Returns:
            pandas.DataFrame with the feature columns and a ``quality`` column.
        """
        print("Loading Wine Quality Dataset...")

        # Method 1: Using fetch_openml (recommended)
        try:
            wine = fetch_openml(name='wine-quality-red', version=1, as_frame=True)
            df = wine.frame
            df['quality'] = df['quality'].astype(int)
        except Exception as exc:
            # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
            # SystemExit propagate; the reason is printed so that a silently
            # failing primary source is visible in the output.
            print(f"fetch_openml path failed ({type(exc).__name__}: {exc}); "
                  "falling back to the UCI CSV download")
            # Method 2: Download from URL
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
            df = pd.read_csv(url, delimiter=';')

        print(f"Dataset loaded with {df.shape[0]} samples and {df.shape[1]} features")
        return df

    def explore_data(self, df):
        """Print shape, dtypes, missing-value counts, summary statistics and the
        distribution of the ``quality`` column.

        Args:
            df: DataFrame containing a ``quality`` column.

        Returns:
            The same ``df`` object, unchanged.
        """
        print("\n=== Data Exploration ===")
        print(f"Dataset shape: {df.shape}")
        print(f"\nColumn types:\n{df.dtypes}")
        print(f"\nMissing values:\n{df.isnull().sum()}")
        print(f"\nDataset description:\n{df.describe()}")

        # Check target distribution
        print(f"\nTarget distribution (quality):\n{df['quality'].value_counts().sort_index()}")

        return df

    def preprocess_data(self, df):
        """Build the binary target, split into train/test and standardise features.

        The target is 1 when ``quality >= 7`` and 0 otherwise. The input frame
        is not modified.

        Args:
            df: DataFrame containing a ``quality`` column; all other columns
                (except a pre-existing ``wine_quality`` column) are features.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, feature_names)``
            where the scaled matrices are the outputs of the fitted scaler and
            ``feature_names`` is ``X.columns``.

        Raises:
            KeyError: if ``df`` has no ``quality`` column.
        """
        print("\n=== Data Preprocessing ===")

        # Create a binary classification problem (good wine vs bad wine).
        # Computed as a standalone Series so the caller's DataFrame is not
        # given an extra column as a side effect.
        y = (df['quality'] >= 7).astype(int).rename('wine_quality')

        # Features: everything but the raw score (and a 'wine_quality' column
        # if an earlier run already added one to this frame).
        X = df.drop(columns=['quality', 'wine_quality'], errors='ignore')

        print(f"Features: {X.columns.tolist()}")
        print(f"Target distribution: {y.value_counts()}")
        print(f"Positive class ratio: {y.mean():.3f}")

        # Split the data (stratified so both splits keep the class ratio)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale numerical features: fit on the training split only, then apply
        # the same transform to the test split.
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        print(f"Training set: {X_train_scaled.shape}")
        print(f"Test set: {X_test_scaled.shape}")

        return X_train_scaled, X_test_scaled, y_train, y_test, X.columns

# Script entry point: runs the preparation pipeline only when this file is
# executed directly, not when DataPreprocessor is imported from it.
if __name__ == "__main__":
    preprocessor = DataPreprocessor()
    df = preprocessor.load_data()
    df = preprocessor.explore_data(df)
    X_train, X_test, y_train, y_test, feature_names = preprocessor.preprocess_data(df)

Writing mlflow-assignment-1/data_preparation.py


In [None]:
%%writefile mlflow-assignment-1/mlflow_setup.py
import mlflow
import mlflow.sklearn
import os
from datetime import datetime
import tempfile

class MLflowSetup:
    """Configure MLflow tracking and the active experiment for this project.

    Creating an instance immediately sets the tracking URI and selects the
    experiment via ``mlflow.set_experiment``.

    Args:
        experiment_name: Name of the MLflow experiment to activate.
    """

    def __init__(self, experiment_name="Wine-Quality-Classification"):
        self.experiment_name = experiment_name
        self.setup_tracking()

    @staticmethod
    def _local_tracking_uri(directory="mlruns"):
        """Return an absolute ``file://`` URI for ``directory`` under the cwd.

        Args:
            directory: Directory name/path, resolved against the current
                working directory.

        Returns:
            The URI as a string, e.g. ``file:///home/user/project/mlruns``.
        """
        # Local import: this module does not import pathlib at file level.
        from pathlib import Path

        return Path(directory).resolve().as_uri()

    def setup_tracking(self):
        """Setup MLflow tracking with multiple backend options.

        Sets the tracking URI, activates ``self.experiment_name`` and prints
        both so the active configuration is visible in the output.
        """

        # Option 1: Local File System (Default)
        # NOTE: the literal "file:///./mlruns" is NOT relative to the working
        # directory -- the path component of that URI is "/./mlruns", i.e. a
        # directory at the filesystem root. Build the URI from the resolved
        # working directory so runs land in ./mlruns as intended.
        tracking_uri = self._local_tracking_uri()

        # Option 2: SQLite Backend (Uncomment to use)
        # tracking_uri = "sqlite:///mlflow.db"

        # Option 3: Remote Server (Uncomment and modify for your setup)
        # tracking_uri = "http://your-mlflow-server:5000"

        # Option 4: AWS S3 Backend (Uncomment for AWS deployment)
        # Prefer supplying real credentials through the environment rather
        # than hard-coding them here.
        # tracking_uri = self._local_tracking_uri()
        # os.environ['AWS_ACCESS_KEY_ID'] = 'your-access-key'
        # os.environ['AWS_SECRET_ACCESS_KEY'] = 'your-secret-key'
        # os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'https://s3.amazonaws.com'

        mlflow.set_tracking_uri(tracking_uri)

        # Set experiment
        mlflow.set_experiment(self.experiment_name)

        print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
        print(f"Experiment: {self.experiment_name}")

    def create_run_name(self, model_type, run_description):
        """Create meaningful run names.

        Args:
            model_type: Short model label placed first in the name.
            run_description: Free-text suffix describing the run.

        Returns:
            ``"<model_type>_<YYYYmmdd_HHMMSS>_<run_description>"`` using the
            current local time.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"{model_type}_{timestamp}_{run_description}"

# Initialize MLflow setup.
# This runs at import time: importing this module constructs MLflowSetup,
# whose __init__ calls setup_tracking(), so the tracking URI and experiment
# are configured (and printed) as a side effect of the import.
mlflow_setup = MLflowSetup()

Writing mlflow-assignment-1/mlflow_setup.py


In [None]:
%%writefile mlflow-assignment-1/model_training.py
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
import tempfile
import os
import pandas as pd
from mlflow_setup import MLflowSetup # Import MLflowSetup

class ModelTrainer:
    """Train, evaluate and log the baseline classifiers to MLflow.

    Each ``train_*`` method opens its own MLflow run, logs the
    hyperparameters, fits the estimator on the training split, logs the
    evaluation metrics (plus plots where applicable) and logs the fitted model
    under a registered model name.
    """

    def __init__(self, X_train, X_test, y_train, y_test, feature_names):
        """Store the data splits and configure MLflow tracking.

        Args:
            X_train: Training features.
            X_test: Test features.
            y_train: Training labels.
            y_test: Test labels.
            feature_names: Names of the feature columns, in column order.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.feature_names = feature_names
        self.mlflow_setup = MLflowSetup()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _reserve_temp_path(suffix):
        """Create an empty temporary file and return its path.

        ``tempfile.mkstemp`` creates the file itself, unlike the deprecated
        ``tempfile.mktemp`` which only returns a name that something else
        could claim before it is used.
        """
        fd, path = tempfile.mkstemp(suffix=suffix)
        os.close(fd)
        return path

    def _write_feature_names_file(self):
        """Write the feature names to a temporary JSON file and return its path.

        The file is fully written and closed before the path is returned, so
        it is safe to copy or upload it afterwards. The caller removes it.
        """
        names = self.feature_names
        names = names.tolist() if hasattr(names, 'tolist') else list(names)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump({'feature_names': names}, f)
        return f.name

    @staticmethod
    def _log_params(params):
        """Log every entry of ``params`` as an MLflow parameter."""
        for param, value in params.items():
            mlflow.log_param(param, value)

    @staticmethod
    def _log_metrics(metrics):
        """Log every entry of ``metrics`` as an MLflow metric."""
        for metric, value in metrics.items():
            mlflow.log_metric(metric, value)

    @staticmethod
    def _log_plots(plots):
        """Log the plot files under 'plots' and always delete the temp files."""
        try:
            for plot_path in plots.values():
                mlflow.log_artifact(plot_path, "plots")
        finally:
            for plot_path in plots.values():
                if os.path.exists(plot_path):
                    os.remove(plot_path)  # Clean up temp file

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def evaluate_model(self, model, model_type):
        """Comprehensive model evaluation.

        Args:
            model: Fitted estimator exposing ``predict`` (and optionally
                ``predict_proba``).
            model_type: Label of the model; accepted for interface
                compatibility and not used in the computation.

        Returns:
            Tuple ``(metrics, y_pred, y_pred_proba)``. ``metrics`` holds
            accuracy, precision, recall and f1_score on the test split,
            ``roc_auc`` when the model provides probabilities, and the mean /
            std of a 5-fold cross-validated F1 on the training split.
            ``y_pred_proba`` is ``None`` when the model has no
            ``predict_proba``.
        """
        # Predictions
        y_pred = model.predict(self.X_test)
        y_pred_proba = model.predict_proba(self.X_test)[:, 1] if hasattr(model, 'predict_proba') else None

        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(self.y_test, y_pred),
            'precision': precision_score(self.y_test, y_pred, zero_division=0),
            'recall': recall_score(self.y_test, y_pred, zero_division=0),
            'f1_score': f1_score(self.y_test, y_pred, zero_division=0),
        }

        if y_pred_proba is not None:
            metrics['roc_auc'] = roc_auc_score(self.y_test, y_pred_proba)

        # Cross-validation scores
        cv_scores = cross_val_score(model, self.X_train, self.y_train, cv=5, scoring='f1')
        metrics['cv_f1_mean'] = cv_scores.mean()
        metrics['cv_f1_std'] = cv_scores.std()

        return metrics, y_pred, y_pred_proba

    def create_plots(self, model, y_pred, y_pred_proba, model_type):
        """Create evaluation plots.

        Args:
            model: Fitted estimator; a feature-importance chart is added when
                it exposes ``feature_importances_``.
            y_pred: Predicted labels for the test split.
            y_pred_proba: Predicted probabilities; accepted for interface
                compatibility and not used here.
            model_type: Label used in the plot titles.

        Returns:
            Dict mapping a plot name ('confusion_matrix' and optionally
            'feature_importance') to the path of a temporary PNG file. The
            caller is responsible for deleting these files.
        """
        plots = {}

        # Confusion Matrix
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(self.y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {model_type}')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')

        cm_path = self._reserve_temp_path('_cm.png')
        plt.savefig(cm_path)
        plt.close()
        plots['confusion_matrix'] = cm_path

        # Feature Importance (for tree-based models)
        if hasattr(model, 'feature_importances_'):
            plt.figure(figsize=(10, 6))
            feature_imp = pd.DataFrame({
                'feature': self.feature_names,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            sns.barplot(data=feature_imp.head(10), x='importance', y='feature')
            plt.title(f'Feature Importance - {model_type}')

            fi_path = self._reserve_temp_path('_feature_importance.png')
            plt.savefig(fi_path)
            plt.close()
            plots['feature_importance'] = fi_path

        return plots

    # ------------------------------------------------------------------
    # Training entry points
    # ------------------------------------------------------------------
    def train_random_forest(self, run_description="baseline"):
        """Train Random Forest Classifier with MLflow tracking.

        Args:
            run_description: Suffix appended to the generated run name.

        Returns:
            Tuple ``(metrics, model)`` with the evaluation metrics dict and
            the fitted classifier.
        """
        run_name = self.mlflow_setup.create_run_name("RandomForest", run_description)

        with mlflow.start_run(run_name=run_name):
            # Define parameters
            params = {
                'n_estimators': 100,
                'max_depth': 10,
                'min_samples_split': 2,
                'min_samples_leaf': 1,
                'random_state': 42
            }

            # Log parameters
            self._log_params(params)

            # Log dataset info
            mlflow.log_param("dataset_shape", f"{self.X_train.shape}")
            mlflow.log_param("feature_count", len(self.feature_names))

            # Train model
            print("Training Random Forest...")
            model = RandomForestClassifier(**params)
            model.fit(self.X_train, self.y_train)

            # Evaluate
            metrics, y_pred, y_pred_proba = self.evaluate_model(model, "RandomForest")

            # Log metrics
            self._log_metrics(metrics)

            # Create and log plots
            plots = self.create_plots(model, y_pred, y_pred_proba, "RandomForest")
            self._log_plots(plots)

            # Log model
            mlflow.sklearn.log_model(
                model,
                "random_forest_model",
                registered_model_name="RandomForest_Wine_Quality"
            )

            # Log feature names as artifact. The JSON file must be closed
            # (and therefore flushed) before it is handed to log_artifact,
            # otherwise the logged copy can be empty or truncated.
            feature_names_path = self._write_feature_names_file()
            try:
                mlflow.log_artifact(feature_names_path, "metadata")
            finally:
                os.remove(feature_names_path)

            print(f"Random Forest completed - F1 Score: {metrics['f1_score']:.4f}")

            return metrics, model

    def train_gradient_boosting(self, run_description="baseline"):
        """Train Gradient Boosting Classifier with MLflow tracking.

        Args:
            run_description: Suffix appended to the generated run name.

        Returns:
            Tuple ``(metrics, model)`` with the evaluation metrics dict and
            the fitted classifier.
        """
        run_name = self.mlflow_setup.create_run_name("GradientBoosting", run_description)

        with mlflow.start_run(run_name=run_name):
            # Define parameters
            params = {
                'n_estimators': 100,
                'learning_rate': 0.1,
                'max_depth': 3,
                'random_state': 42
            }

            # Log parameters
            self._log_params(params)

            # Train model
            print("Training Gradient Boosting...")
            model = GradientBoostingClassifier(**params)
            model.fit(self.X_train, self.y_train)

            # Evaluate
            metrics, y_pred, y_pred_proba = self.evaluate_model(model, "GradientBoosting")

            # Log metrics
            self._log_metrics(metrics)

            # Create and log plots
            plots = self.create_plots(model, y_pred, y_pred_proba, "GradientBoosting")
            self._log_plots(plots)

            # Log model
            mlflow.sklearn.log_model(
                model,
                "gradient_boosting_model",
                registered_model_name="GradientBoosting_Wine_Quality"
            )

            print(f"Gradient Boosting completed - F1 Score: {metrics['f1_score']:.4f}")

            return metrics, model

    def train_logistic_regression(self, run_description="baseline"):
        """Train Logistic Regression with MLflow tracking.

        No plots are logged for this model.

        Args:
            run_description: Suffix appended to the generated run name.

        Returns:
            Tuple ``(metrics, model)`` with the evaluation metrics dict and
            the fitted classifier.
        """
        run_name = self.mlflow_setup.create_run_name("LogisticRegression", run_description)

        with mlflow.start_run(run_name=run_name):
            # Define parameters
            params = {
                'C': 1.0,
                'max_iter': 1000,
                'random_state': 42,
                'solver': 'liblinear'
            }

            # Log parameters
            self._log_params(params)

            # Train model
            print("Training Logistic Regression...")
            model = LogisticRegression(**params)
            model.fit(self.X_train, self.y_train)

            # Evaluate
            metrics, y_pred, y_pred_proba = self.evaluate_model(model, "LogisticRegression")

            # Log metrics
            self._log_metrics(metrics)

            # Log model
            mlflow.sklearn.log_model(
                model,
                "logistic_regression_model",
                registered_model_name="LogisticRegression_Wine_Quality"
            )

            print(f"Logistic Regression completed - F1 Score: {metrics['f1_score']:.4f}")

            return metrics, model

Writing mlflow-assignment-1/model_training.py


In [None]:
%%writefile mlflow-assignment-1/hyperparameter_tuning.py
import mlflow
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import pandas as pd
from mlflow_setup import MLflowSetup # Import MLflowSetup
from model_training import ModelTrainer # Import ModelTrainer

class HyperparameterTuner:
    """Grid-search hyperparameter tuning with MLflow tracking.

    Both public ``tune_*`` methods share one implementation (``_tune``) so
    that every tuning run logs the same set of parameters and metrics.
    """

    # Cross-validation settings shared by every search so runs are comparable.
    CV_FOLDS = 5
    SCORING = 'f1'

    def __init__(self, X_train, X_test, y_train, y_test, feature_names):
        """Store the data splits and build the helpers used during tuning.

        Args:
            X_train: Training features.
            X_test: Test features.
            y_train: Training labels.
            y_test: Test labels.
            feature_names: Names of the feature columns.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.feature_names = feature_names
        self.mlflow_setup = MLflowSetup()
        # Reused only for its evaluate_model() implementation.
        self.trainer = ModelTrainer(X_train, X_test, y_train, y_test, feature_names)

    def _tune(self, estimator, param_grid, run_prefix, display_name,
              artifact_path, registered_model_name):
        """Run one grid search inside its own MLflow run.

        Args:
            estimator: Unfitted estimator to tune.
            param_grid: Parameter grid passed to ``GridSearchCV``.
            run_prefix: Model label used for the run name and, with a
                ``_Tuned`` suffix, as the evaluation label.
            display_name: Human-readable model name used in console messages.
            artifact_path: Artifact path under which the best model is logged.
            registered_model_name: Registered model name for the best model.

        Returns:
            Tuple ``(metrics, best_model, best_params)``.
        """
        run_name = self.mlflow_setup.create_run_name(run_prefix, "hyperparameter_tuning")

        with mlflow.start_run(run_name=run_name):
            # Log tuning parameters
            mlflow.log_param("tuning_method", "GridSearchCV")
            mlflow.log_param("param_grid", str(param_grid))
            mlflow.log_param("cv_folds", self.CV_FOLDS)

            # Perform grid search
            print(f"Performing GridSearch for {display_name}...")
            grid_search = GridSearchCV(
                estimator,
                param_grid,
                cv=self.CV_FOLDS,
                scoring=self.SCORING,
                n_jobs=-1,
                verbose=1
            )

            grid_search.fit(self.X_train, self.y_train)

            # Log best parameters
            best_params = grid_search.best_params_
            for param, value in best_params.items():
                mlflow.log_param(f"best_{param}", value)

            mlflow.log_metric("best_cv_score", grid_search.best_score_)

            # Evaluate the estimator GridSearchCV kept for the best parameters
            best_model = grid_search.best_estimator_
            metrics, _y_pred, _y_pred_proba = self.trainer.evaluate_model(
                best_model, f"{run_prefix}_Tuned"
            )

            # Log metrics
            for metric, value in metrics.items():
                mlflow.log_metric(metric, value)

            # Log model
            mlflow.sklearn.log_model(
                best_model,
                artifact_path,
                registered_model_name=registered_model_name
            )

            # NOTE: the score printed here is the test-split F1 from
            # evaluate_model, not the cross-validation score.
            print(f"{display_name} Tuning completed - Best F1 Score: {metrics['f1_score']:.4f}")

            return metrics, best_model, best_params

    def tune_random_forest(self):
        """Perform hyperparameter tuning for Random Forest.

        Returns:
            Tuple ``(metrics, best_model, best_params)``.
        """
        # Define parameter grid
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        return self._tune(
            RandomForestClassifier(random_state=42),
            param_grid,
            run_prefix="RandomForest",
            display_name="Random Forest",
            artifact_path="random_forest_tuned_model",
            registered_model_name="RandomForest_Wine_Quality_Tuned",
        )

    def tune_gradient_boosting(self):
        """Perform hyperparameter tuning for Gradient Boosting.

        Returns:
            Tuple ``(metrics, best_model, best_params)``.
        """
        # Define parameter grid
        param_grid = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5],
            'subsample': [0.8, 0.9, 1.0]
        }
        return self._tune(
            GradientBoostingClassifier(random_state=42),
            param_grid,
            run_prefix="GradientBoosting",
            display_name="Gradient Boosting",
            artifact_path="gradient_boosting_tuned_model",
            registered_model_name="GradientBoosting_Wine_Quality_Tuned",
        )

Writing mlflow-assignment-1/hyperparameter_tuning.py


In [None]:
%%writefile mlflow-assignment-1/model_selection.py
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
import numpy as np
from mlflow_setup import MLflowSetup # Import MLflowSetup

class ModelSelector:
    """Compare the runs of the project's experiment and pick the best one."""

    # Metric columns shown in the per-model comparison table (when logged).
    COMPARISON_METRICS = (
        'metric_accuracy',
        'metric_precision',
        'metric_recall',
        'metric_f1_score',
        'metric_roc_auc',
    )

    def __init__(self):
        # Configure tracking before creating the client so the client is
        # built against the project's tracking URI.
        self.mlflow_setup = MLflowSetup()
        self.client = MlflowClient()

    def get_all_runs(self):
        """Retrieve all runs from the current experiment.

        Returns:
            DataFrame with one row per run: run_id, run_name, model_type (the
            part of the run name before the first underscore), status,
            start_time, plus one ``param_<name>`` column per logged parameter
            and one ``metric_<name>`` column per logged metric. The frame is
            empty when the experiment does not exist or has no runs.
        """
        experiment = mlflow.get_experiment_by_name(self.mlflow_setup.experiment_name)
        if experiment is None:
            return pd.DataFrame()

        runs = self.client.search_runs([experiment.experiment_id])

        run_data = []
        for run in runs:
            run_name = run.data.tags.get('mlflow.runName', '')
            run_info = {
                'run_id': run.info.run_id,
                'run_name': run_name,
                'model_type': run_name.split('_')[0],
                'status': run.info.status,
                'start_time': run.info.start_time,
            }

            # Add parameters
            for key, value in run.data.params.items():
                run_info[f'param_{key}'] = value

            # Add metrics
            for key, value in run.data.metrics.items():
                run_info[f'metric_{key}'] = value

            run_data.append(run_info)

        return pd.DataFrame(run_data)

    def select_best_model(self, primary_metric='metric_f1_score', secondary_metric='metric_roc_auc'):
        """Select the best model based on specified metrics.

        Only FINISHED runs that logged ``primary_metric`` are ranked. Runs are
        ordered by ``primary_metric`` and, when that column exists, by
        ``secondary_metric`` as a tie-breaker (both descending).

        Args:
            primary_metric: Column name of the main ranking metric.
            secondary_metric: Column name of the tie-breaking metric; it is
                ignored when no run logged it.

        Returns:
            The best run as a pandas Series, or ``None`` when there is no
            eligible run.
        """
        runs_df = self.get_all_runs()

        if runs_df.empty:
            print("No runs found in the experiment")
            return None

        # Filter completed runs
        completed_runs = runs_df[runs_df['status'] == 'FINISHED']

        if completed_runs.empty:
            print("No completed runs found")
            return None

        # A metric column only exists if at least one run logged it.
        if primary_metric not in completed_runs.columns:
            print(f"No completed run logged the primary metric '{primary_metric}'")
            return None

        ranked_runs = completed_runs.dropna(subset=[primary_metric])
        if ranked_runs.empty:
            print(f"No completed run logged the primary metric '{primary_metric}'")
            return None

        # Sort by primary and (when available) secondary metrics
        sort_keys = [primary_metric]
        if secondary_metric in ranked_runs.columns and secondary_metric != primary_metric:
            sort_keys.append(secondary_metric)

        best_run = ranked_runs.sort_values(
            sort_keys,
            ascending=[False] * len(sort_keys)
        ).iloc[0]

        print("=== BEST MODEL SELECTION RESULTS ===")
        print(f"Best Run ID: {best_run['run_id']}")
        print(f"Best Run Name: {best_run['run_name']}")
        print(f"Primary Metric ({primary_metric}): {best_run[primary_metric]:.4f}")
        print(f"Secondary Metric ({secondary_metric}): {best_run.get(secondary_metric, 'N/A')}")

        # Display comparison table, restricted to the metrics actually logged
        available_metrics = [
            metric for metric in self.COMPARISON_METRICS
            if metric in completed_runs.columns
        ]
        if available_metrics:
            comparison_df = completed_runs.groupby('model_type').agg(
                {metric: 'mean' for metric in available_metrics}
            ).round(4)

            print("\n=== MODEL COMPARISON ===")
            print(comparison_df)

        return best_run

Writing mlflow-assignment-1/model_selection.py


In [None]:
%%writefile mlflow-assignment-1/model_registry.py
import mlflow
from mlflow.tracking import MlflowClient
import time

class ModelRegistryManager:
    """Thin wrapper around the MLflow Model Registry operations used here."""

    def __init__(self):
        self.client = MlflowClient()

    def _resolve_model_uri(self, run_id, artifact_path=None):
        """Work out the URI of the model logged by ``run_id``.

        Models in this project are logged under model-specific artifact paths
        (for example "random_forest_model"), so a fixed "model" path does not
        match them. Resolution order:

        1. ``artifact_path`` when the caller supplies it.
        2. The last entry of the run's "mlflow.log-model.history" tag
           (assumption: MLflow 2.x records logged models there -- verify
           against the installed version).
        3. The last logged-model id in ``run.outputs.model_outputs``
           (assumption: MLflow 3.x exposes logged models this way -- verify
           against the installed version).
        4. The historical default ``runs:/<run_id>/model``.

        Args:
            run_id: Id of the run that logged the model.
            artifact_path: Explicit artifact path of the model, if known.

        Returns:
            A model URI string suitable for ``mlflow.register_model``.
        """
        # Local import: this module does not import json at file level.
        import json

        if artifact_path:
            return f"runs:/{run_id}/{artifact_path}"

        run = self.client.get_run(run_id)

        history = run.data.tags.get("mlflow.log-model.history")
        if history:
            try:
                entries = json.loads(history)
            except (TypeError, ValueError):
                entries = []
            logged_paths = [
                entry["artifact_path"]
                for entry in entries
                if isinstance(entry, dict) and entry.get("artifact_path")
            ]
            if logged_paths:
                return f"runs:/{run_id}/{logged_paths[-1]}"

        outputs = getattr(run, "outputs", None)
        model_outputs = getattr(outputs, "model_outputs", None) or []
        for model_output in reversed(list(model_outputs)):
            model_id = getattr(model_output, "model_id", None)
            if model_id:
                return f"models:/{model_id}"

        return f"runs:/{run_id}/model"

    def register_best_model(self, run_id, model_name="Best_Wine_Quality_Model", artifact_path=None):
        """Register the best model in MLflow Model Registry.

        Args:
            run_id: Id of the run whose model should be registered.
            model_name: Name of the registered model to create or extend.
            artifact_path: Artifact path the model was logged under. When
                omitted, it is looked up from the run (see
                ``_resolve_model_uri``).

        Returns:
            The created model version, or ``None`` if registration failed
            (the error is printed).
        """
        try:
            # Construct model URI
            model_uri = self._resolve_model_uri(run_id, artifact_path)

            # Register the model
            print(f"Registering model from run {run_id}...")
            mv = mlflow.register_model(model_uri, model_name)

            print(f"Model registered successfully!")
            print(f"Model Name: {mv.name}")
            print(f"Model Version: {mv.version}")
            print(f"Current Stage: {mv.current_stage}")

            return mv

        except Exception as e:
            print(f"Error registering model: {e}")
            return None

    def transition_model_stage(self, model_name, version, stage):
        """Transition model to different stages (Staging → Production).

        Args:
            model_name: Registered model name.
            version: Version to transition.
            stage: Target stage name.

        Returns:
            The refreshed model version, or ``None`` if the transition failed
            (the error is printed).
        """

        try:
            # Transition model stage
            self.client.transition_model_version_stage(
                name=model_name,
                version=version,
                stage=stage
            )

            print(f"Model {model_name} version {version} transitioned to {stage} stage")

            # Wait for transition to complete
            time.sleep(2)

            # Get updated model version
            mv = self.client.get_model_version(model_name, version)
            print(f"Current stage: {mv.current_stage}")

            return mv

        except Exception as e:
            print(f"Error transitioning model stage: {e}")
            return None

    def list_registered_models(self):
        """List all registered models with their latest versions and stages."""
        models = self.client.search_registered_models()

        print("=== REGISTERED MODELS ===")
        for model in models:
            print(f"Model: {model.name}")
            for version in model.latest_versions:
                print(f"  Version {version.version}: {version.current_stage}")

    def archive_old_versions(self, model_name, keep_versions=3):
        """Archive old model versions to keep registry clean.

        The newest ``keep_versions`` versions are left alone; older versions
        that are still in the "None" stage are moved to "Archived". Errors are
        printed rather than raised.

        Args:
            model_name: Registered model name.
            keep_versions: Number of most recent versions to keep untouched.
        """
        try:
            # Get all versions
            versions = self.client.search_model_versions(f"name='{model_name}'")

            # Sort by version number and get old versions. Versions are
            # converted to int so that e.g. "12" ranks above "9" even when
            # the registry returns them as strings.
            sorted_versions = sorted(versions, key=lambda x: int(x.version), reverse=True)
            old_versions = sorted_versions[keep_versions:]

            for version in old_versions:
                if version.current_stage == "None":
                    self.client.transition_model_version_stage(
                        name=model_name,
                        version=version.version,
                        stage="Archived"
                    )
                    print(f"Archived version {version.version}")

        except Exception as e:
            print(f"Error archiving old versions: {e}")

Writing mlflow-assignment-1/model_registry.py


In [None]:
%%writefile mlflow-assignment-1/main.py
import mlflow
from data_preparation import DataPreprocessor
from model_training import ModelTrainer
from hyperparameter_tuning import HyperparameterTuner
from model_selection import ModelSelector
from model_registry import ModelRegistryManager
import pandas as pd

def _print_banner(title):
    """Print a blank line followed by ``title`` framed by two 50-char '=' rules."""
    print("\n" + "="*50)
    print(title)
    print("="*50)


def main():
    """Run the whole assignment pipeline end to end.

    Steps: data preparation, baseline training, hyperparameter tuning, model
    selection and, when a best run is found, model registration followed by
    Staging and Production stage transitions. The closing banner reports
    success only when a model version was actually registered.
    """
    print("🚀 Starting MLOps Assignment 1 - MLflow Experiment Tracking")

    # Step 1: Data Preparation
    _print_banner("STEP 1: Data Preparation")
    preprocessor = DataPreprocessor()
    df = preprocessor.load_data()
    df = preprocessor.explore_data(df)
    X_train, X_test, y_train, y_test, feature_names = preprocessor.preprocess_data(df)

    # Step 2: Baseline Model Training
    _print_banner("STEP 2: Baseline Model Training")
    trainer = ModelTrainer(X_train, X_test, y_train, y_test, feature_names)

    # Train multiple baseline models. The returned metrics/models are not
    # needed here: model selection below reads the results back from MLflow.
    trainer.train_random_forest("baseline")
    trainer.train_gradient_boosting("baseline")
    trainer.train_logistic_regression("baseline")

    # Step 3: Hyperparameter Tuning
    _print_banner("STEP 3: Hyperparameter Tuning")
    tuner = HyperparameterTuner(X_train, X_test, y_train, y_test, feature_names)

    tuner.tune_random_forest()
    tuner.tune_gradient_boosting()

    # Step 4: Model Selection
    _print_banner("STEP 4: Model Selection")
    selector = ModelSelector()
    best_run = selector.select_best_model()

    registered = False
    if best_run is not None:
        best_run_id = best_run['run_id']

        # Step 5: Model Registration
        _print_banner("STEP 5: Model Registration")
        registry_manager = ModelRegistryManager()

        # Register the best model
        model_version = registry_manager.register_best_model(best_run_id)

        if model_version is not None:
            registered = True

            # Transition to Staging
            registry_manager.transition_model_stage(
                model_version.name,
                model_version.version,
                "Staging"
            )

            # Demonstrate stage transition (Staging → Production)
            _print_banner("Demonstrating Stage Transition: Staging → Production")
            registry_manager.transition_model_stage(
                model_version.name,
                model_version.version,
                "Production"
            )

        # List all registered models
        registry_manager.list_registered_models()

    # Only claim success when the final step really produced a model version.
    if registered:
        _print_banner("🎉 ASSIGNMENT COMPLETED SUCCESSFULLY!")
    else:
        _print_banner("⚠️ Pipeline finished, but no model was registered (see messages above)")
    print("\nNext steps:")
    print("1. Start MLflow UI: mlflow ui")
    print("2. Open http://localhost:5000 in your browser")
    print("3. Explore experiments and model registry")
    print("4. Check the mlruns directory for artifacts")

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Writing mlflow-assignment-1/main.py
