In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import mlflow

In [10]:
# 1. Fetch the scikit-learn breast-cancer dataset and unpack its feature
#    array (`data.data`) and label array (`data.target`)
data = load_breast_cancer()
X, y = data.data, data.target

# 2. Reserve 20% of the samples as a test set; the fixed seed keeps the
#    partition identical on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# 3. Base estimator whose hyperparameters are tuned below (seed fixed at 42)
rf = RandomForestClassifier(random_state=42)

# 4. Candidate values per hyperparameter; the search tries every combination
#    (3 * 3 * 2 * 2 = 36 candidates)
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [12]:
# 5. Exhaustive search over `param_grid` around `rf`: candidates are scored by
#    accuracy under 5-fold cross-validation, n_jobs=-1 requests all available
#    cores, and verbose=2 prints progress while fitting
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)


In [13]:
# 6. Select the MLflow experiment that the runs started below are recorded under
mlflow.set_experiment(experiment_name="RandomForest_BreastCancer")


<Experiment: artifact_location='file:///c:/💕Coding 💕/YT_MLOPS_EXPERIMENT_WITH_MLFLOW/src/mlruns/4', creation_time=1767405898849, experiment_id='4', last_update_time=1767405898849, lifecycle_stage='active', name='RandomForest_BreastCancer', tags={}>

In [14]:
# Parent run: fits the grid search, records one nested child run per
# hyperparameter candidate, then logs the winning configuration and model.
with mlflow.start_run(run_name="RF_GridSearchCV") as parent:
    grid_search.fit(X_train, y_train)

    # cv_results_ stores parallel sequences, one entry per candidate; walk
    # them together instead of indexing each one separately
    cv_results = grid_search.cv_results_
    candidates = zip(
        cv_results['params'],
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
    )
    for idx, (candidate_params, cv_mean, cv_std) in enumerate(candidates):
        # nested=True attaches the child run to the parent run opened above
        with mlflow.start_run(run_name=f"RF_Params_{idx}", nested=True):
            mlflow.log_params(candidate_params)
            mlflow.log_metric("mean_cv_accuracy", cv_mean)
            mlflow.log_metric("std_cv_accuracy", cv_std)

    # Winning configuration and its mean cross-validated accuracy
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Recorded on the parent run itself
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_accuracy", best_score)

    # Persist the refitted best estimator as a run artifact
    mlflow.sklearn.log_model(grid_search.best_estimator_, "random_forest_model")

    print("Best Parameters:", best_params)
    print("Best CV Accuracy:", best_score)

Fitting 5 folds for each of 36 candidates, totalling 180 fits




Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Best CV Accuracy: 0.9626373626373625


In [15]:
# 7. Evaluate the tuned model on the held-out test set.
# This cell reuses the search fitted in the previous cell. Rebinding
# `grid_search` to a fresh GridSearchCV here would replace the fitted object
# (losing best_estimator_, best_params_ and cv_results_) with an unfitted one.
best_model = grid_search.best_estimator_

# For a classifier, `score` returns the mean accuracy on the given data;
# X_test / y_test were never seen during the cross-validated search.
test_accuracy = best_model.score(X_test, y_test)

# Resume the parent run by id (it was closed when its `with` block exited) so
# the test metric is stored next to the cross-validation results rather than
# in a new, unrelated run. The context manager ends the run again on exit.
with mlflow.start_run(run_id=parent.info.run_id):
    mlflow.log_metric("test_accuracy", test_accuracy)

print("Test Set Accuracy:", test_accuracy)