In [1]:
import itertools
import multiprocessing
import os

import mlflow
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Tracking store location. An absolute path inside one user's home directory
# only works on that machine, so MLFLOW_TRACKING_URI takes precedence when it
# is set; the original local store remains the default.
mlflow.set_tracking_uri(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        'file:///Users/zhanghq/Learning/ml_workflow/mlrun',
    )
)

# All runs started below are grouped under this experiment.
mlflow.set_experiment("Iris_Classification")
# Load the Iris feature matrix and class labels.
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the samples; the seed keeps the partition identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define models and their hyperparameters
# `models` maps a display name to an estimator class; `params` maps the same
# name to a grid of constructor-argument values.
models = {
    "RandomForest": RandomForestClassifier,
    # "SVM": SVC
}
params = {
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [10, 20],
        "n_jobs": [6],
        # Fixed seed: the forest is randomized, so without it the logged
        # scores change on every rerun even though the data split is seeded.
        "random_state": [42],
    },
    # "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
}

for model_name, model_class in models.items():
    # Expand the grid: one dict of constructor arguments per combination of values.
    grid = params[model_name]
    param_combinations = [
        dict(zip(grid, values)) for values in itertools.product(*grid.values())
    ]

    for param_combination in param_combinations:
        # One MLflow run per (model, hyperparameter combination).
        with mlflow.start_run(run_name=f"{model_name}_{param_combination}"):
            # Initialize the model with the hyperparameters
            model = model_class(**param_combination)

            # Log every hyperparameter of this run in a single batched call
            mlflow.log_params(param_combination)

            # Perform cross-validation (2 folds, training split only)
            cv_scores = cross_val_score(model, X_train, y_train, cv=2, n_jobs=6)

            # Log cross-validation scores
            mlflow.log_metric("cv_accuracy_mean", np.mean(cv_scores))
            mlflow.log_metric("cv_accuracy_std", np.std(cv_scores))

            # Fit on the full training data so the logged artifact is a trained estimator
            model.fit(X_train, y_train)

            # Log the trained model
            mlflow.sklearn.log_model(model, model_name)


2024/07/18 01:33:00 INFO mlflow.tracking.fluent: Experiment with name 'Iris_Classification' does not exist. Creating a new experiment.


In [None]:
# Load the Iris feature matrix and class labels.
data = load_iris()
X, y = data.data, data.target

# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define models and their hyperparameters
# Each entry is a ready-to-fit estimator instance keyed by its run name.
# random_state is fixed so each forest, and therefore its logged scores,
# is repeatable across reruns.
models = {
    "RandomForest_1": RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42),
    "RandomForest_2": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
}

def run_experiment(model_name, model):
    """Cross-validate, fit and log one model as its own MLflow run.

    Reads the module-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    model_name : str
        Used as the MLflow run name and as the artifact path of the logged model.
    model : estimator
        scikit-learn estimator instance; it is fitted in place.

    Returns
    -------
    None
    """
    with mlflow.start_run(run_name=model_name):
        # Record the estimator's constructor settings so runs can be compared
        # by configuration and not only by name.
        mlflow.log_params(model.get_params(deep=False))

        # Perform 5-fold cross-validation on the training split
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)

        # Log cross-validation scores
        mlflow.log_metric("cv_accuracy_mean", np.mean(cv_scores))
        mlflow.log_metric("cv_accuracy_std", np.std(cv_scores))

        # Fit on the full training data so the logged artifact is a trained estimator
        model.fit(X_train, y_train)

        # Log the trained model
        mlflow.sklearn.log_model(model, model_name)

# Create processes for each model
# run_experiment, the training data and the MLflow configuration exist only in
# this kernel's memory, so the children have to inherit them through "fork".
# With "spawn"/"forkserver" a child must re-import the target from __main__,
# which does not work for functions defined in a notebook. Fall back to the
# platform default where fork is unavailable (e.g. Windows).
# NOTE(review): the Python docs flag fork as potentially unsafe on macOS — the
# exit-code check below surfaces a crashed child instead of hiding it.
start_methods = multiprocessing.get_all_start_methods()
mp_context = multiprocessing.get_context("fork" if "fork" in start_methods else None)

processes = []
for model_name, model in models.items():
    process = mp_context.Process(
        target=run_experiment,
        args=(model_name, model),
        name=model_name,  # lets a failure be reported by model name
    )
    processes.append(process)
    process.start()

# Wait for all processes to complete
for process in processes:
    process.join()

# A child that raised or was killed exits with a non-zero code; report it
# rather than announcing success unconditionally.
failed_runs = [process.name for process in processes if process.exitcode != 0]
if failed_runs:
    raise RuntimeError(f"Experiments failed in child processes: {failed_runs}")

print("All experiments completed.")
