In [1]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import joblib

In [2]:
# Fetch the Iris data and separate it into features (X) and class labels (y)
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Pipeline definition and creation

In [3]:
# Chain feature standardization and an SVM classifier into a single estimator.
# The step names ('scaler', 'svm') are the prefixes used later to address
# each step's hyperparameters, e.g. 'svm__C'.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', SVC(random_state=42)),
    ]
)

## Grid Search cross-validation

In [4]:
# Define the parameter grid for GridSearchCV.
# One sub-grid per kernel: 'gamma' is not used by the linear kernel, so
# crossing it with kernel='linear' would refit the very same linear model
# once per gamma value without exploring anything new. Splitting the grid
# keeps every distinct candidate (16 rbf + 4 linear) and drops the duplicates.
param_grid = [
    {
        'svm__kernel': ['rbf'],
        'svm__C': [0.1, 1, 10, 100],
        'svm__gamma': [1, 0.1, 0.01, 0.001],
    },
    {
        'svm__kernel': ['linear'],
        'svm__C': [0.1, 1, 10, 100],
    },
]

In [5]:
# Evaluate every candidate in param_grid with 5-fold cross-validation on the
# training set, then report the best combination and its cross-validation score
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=0,
).fit(X_train, y_train)
print(f"GridSearchCV Best parameters: {grid_search.best_params_}")
print(f"GridSearchCV Best cross-validation score: {grid_search.best_score_:.3f}")

GridSearchCV Best parameters: {'svm__C': 100, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
GridSearchCV Best cross-validation score: 0.967


## Randomized Search cross-validation

In [6]:
# Sampling distributions for RandomizedSearchCV: C is drawn from a reciprocal
# (log-uniform) distribution on [0.1, 100], gamma from an exponential
# distribution with scale 1.0
param_distribution = dict(
    svm__C=reciprocal(0.1, 100),
    svm__gamma=expon(scale=1.0),
)

In [7]:
# Draw 50 candidates from param_distribution, score each with 5-fold
# cross-validation on the training set, and report the best one
# (the seed fixes which candidates are sampled)
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distribution,
    n_iter=50,
    cv=5,
    verbose=0,
    random_state=42,
).fit(X_train, y_train)
print(f"RandomizedSearchCV Best parameters: {random_search.best_params_}")
print(f"RandomizedSearchCV Best cross-validation score: {random_search.best_score_:.3f}")

RandomizedSearchCV Best parameters: {'svm__C': 20.59733535743719, 'svm__gamma': 0.07692926551379635}
RandomizedSearchCV Best cross-validation score: 0.967


## Saving and loading trained estimators

As we will see in the next two batches, a trained estimator can be saved to disk and loaded again later, whenever we want to use it. The code below uses a Python library called *joblib* to do this. Don't worry about the details yet: we will cover them later, and the code below is only a first look.

In [8]:
# Persist the best pipeline found by GridSearchCV
joblib.dump(
    value=grid_search.best_estimator_,
    filename='grid_search_best_pipeline.joblib',
)

# Persist the best pipeline found by RandomizedSearchCV
joblib.dump(
    value=random_search.best_estimator_,
    filename='random_search_best_pipeline.joblib',
)

['random_search_best_pipeline.joblib']

In [10]:
# Reload both persisted pipelines from disk
loaded_grid_model = joblib.load('grid_search_best_pipeline.joblib')
loaded_random_model = joblib.load('random_search_best_pipeline.joblib')

# Score each reloaded pipeline on the held-out test set (unseen during the searches)
grid_test_score = loaded_grid_model.score(X_test, y_test)
random_test_score = loaded_random_model.score(X_test, y_test)

# Report the GridSearchCV result first, then the RandomizedSearchCV result
print(f"Loaded GridSearchCV estimator Test set score: {grid_test_score:.3f}")
print(f"Loaded RandomizedSearchCV estimator Test set score: {random_test_score:.3f}")

Loaded GridSearchCV estimator Test set score: 0.967
