# Hyperparameter tuning
Hyperparameter tuning can be a computationally expensive task, but it can be parallelized to speed up the process. Parallelization can consume a lot of CPU/GPU resources, so be sure to monitor your system's resource usage to avoid overloading your hardware.

### Grid Search

Grid Search is a systematic approach where all possible combinations of hyperparameters are evaluated.

- When to Use:
  - When the hyperparameter space is small and computationally feasible to explore exhaustively.
  - When you want to ensure that you find the best possible combination of hyperparameters.
- Advantages:
  - Exhaustive: Guarantees finding the best combination within the specified grid.
  - Easy to implement and understand.
- Disadvantages:
  - Computationally expensive, especially with a large number of hyperparameters or a wide range of values.
  - Inefficient for high-dimensional hyperparameter spaces.
  - Suffers from the "curse of dimensionality".
  - Inefficient when some parameters are more important than others.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Iris data: feature matrix in X, class labels in y.
data = load_iris()
X = data.data
y = data.target

# Estimator whose hyperparameters are being tuned.
model = RandomForestClassifier()

# Every combination of these values is evaluated (3 * 3 * 3 = 27 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search using 5-fold cross-validation, ranked by accuracy.
# Tip: additionally passing n_jobs=-1 would run the cross-validation fits
# on all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X, y)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Iris data: feature matrix in X, class labels in y.
data = load_iris()
X = data.data
y = data.target

# Candidate estimators, each paired with the grid of values to try for it.
models = dict(
    RandomForest=dict(
        model=RandomForestClassifier(),
        params=dict(
            n_estimators=[50, 100, 200],
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5, 10],
        ),
    ),
    SVM=dict(
        model=SVC(),
        params=dict(
            C=[0.1, 1, 10],
            kernel=['linear', 'rbf'],
        ),
    ),
    GradientBoosting=dict(
        model=GradientBoostingClassifier(),
        params=dict(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 1],
            max_depth=[3, 5, 10],
        ),
    ),
)

# Run one exhaustive 5-fold search per candidate and keep its best result.
best_models = {}
for model_name, model_info in models.items():
    grid_search = GridSearchCV(
        estimator=model_info['model'],
        param_grid=model_info['params'],
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X, y)
    best_models[model_name] = dict(
        best_model=grid_search.best_estimator_,
        best_score=grid_search.best_score_,
        best_params=grid_search.best_params_,
    )

# The overall winner is the candidate with the highest cross-validated score.
best_model_name, best_entry = max(
    best_models.items(),
    key=lambda item: item[1]['best_score'],
)
print(f"Best Model: {best_model_name}")
print(f"Best Parameters: {best_entry['best_params']}")
print(f"Best Score: {best_entry['best_score']}")

### Random Search
Random Search randomly samples hyperparameters from a specified distribution or range.

- When to Use:
  - When the hyperparameter space is large, and Grid Search is computationally infeasible.
  - When you want to explore a wide range of hyperparameters efficiently.
- Advantages:
  - More efficient than Grid Search for large hyperparameter spaces.
  - Can find good hyperparameters with fewer iterations.
- Disadvantages:
  - Does not guarantee finding the best combination of hyperparameters.
  - May miss optimal hyperparameters if the search space is not well-defined.


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from scipy.stats import randint

# Load dataset
data = load_iris()
X, y = data.data, data.target

# Define model
model = RandomForestClassifier()

# Define hyperparameter distribution.
# scipy's randint(low, high) samples integers from [low, high): the upper
# bound is exclusive, so it is written as "largest wanted value + 1".
param_dist = {
    'n_estimators': randint(50, 201),       # 50..200 inclusive
    'max_depth': [None, 10, 20],            # lists are sampled uniformly
    'min_samples_split': randint(2, 11)     # 2..10 inclusive
}

# Perform Random Search: 10 sampled candidates, each scored by 5-fold
# cross-validated accuracy. random_state fixes which candidates are drawn.
random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X, y)

# Best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from scipy.stats import randint, uniform

# Load dataset
data = load_iris()
X, y = data.data, data.target

# Define models and parameter distributions.
# Two scipy.stats conventions matter here:
#   * randint(low, high) samples integers from [low, high) - high is exclusive.
#   * uniform(loc, scale) samples from [loc, loc + scale] - the second
#     argument is the WIDTH of the interval, not its upper bound.
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': randint(50, 201),       # 50..200 inclusive
            'max_depth': [None, 10, 20],
            'min_samples_split': randint(2, 11)     # 2..10 inclusive
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': uniform(0.1, 9.9),                 # [0.1, 10]
            'kernel': ['linear', 'rbf']
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': randint(50, 201),       # 50..200 inclusive
            'learning_rate': uniform(0.01, 0.99),   # [0.01, 1]
            'max_depth': randint(3, 11)             # 3..10 inclusive
        }
    }
}

# Perform Random Search for each model: 10 sampled candidates per model,
# each scored by 5-fold cross-validated accuracy.
best_models = {}
for model_name, model_info in models.items():
    random_search = RandomizedSearchCV(model_info['model'], model_info['params'], n_iter=10, cv=5, scoring='accuracy', random_state=42)
    random_search.fit(X, y)
    best_models[model_name] = {
        'best_model': random_search.best_estimator_,
        'best_score': random_search.best_score_,
        'best_params': random_search.best_params_
    }

# Select the best model (highest mean cross-validated accuracy)
best_model_name = max(best_models, key=lambda x: best_models[x]['best_score'])
print(f"Best Model: {best_model_name}")
print(f"Best Parameters: {best_models[best_model_name]['best_params']}")
print(f"Best Score: {best_models[best_model_name]['best_score']}")

###  Bayesian Optimization
Bayesian Optimization uses probabilistic models (e.g., Gaussian Processes) to find the optimal hyperparameters by modeling the performance of the model as a function of the hyperparameters.

- When to Use:
  - When the hyperparameter space is large, and you want to find the optimal hyperparameters with fewer evaluations.
  - When the objective function (e.g., model performance) is expensive to evaluate.
- Advantages:
  - Efficient: Requires fewer evaluations compared to Grid Search and Random Search.
  - Balances exploration and exploitation.
- Disadvantages:
  - More complex to implement and understand.
  - Might get stuck in local optima.
  - Requires a good probabilistic model to guide the search.
  - Less parallelizable than Grid/Random Search.




In [None]:
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Iris data: feature matrix in X, class labels in y.
data = load_iris()
X = data.data
y = data.target

# Estimator whose hyperparameters are being tuned.
model = RandomForestClassifier()

# Search space: each (low, high) tuple gives the bounds of one dimension.
param_space = dict(
    n_estimators=(50, 200),
    max_depth=(1, 20),
    min_samples_split=(2, 10),
)

# Bayesian optimisation: 10 candidate evaluations, each scored by 5-fold
# cross-validated accuracy.
bayes_search = BayesSearchCV(
    model,
    param_space,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
bayes_search.fit(X, y)

# Report the winning configuration and its cross-validated accuracy.
print("Best Parameters:", bayes_search.best_params_)
print("Best Score:", bayes_search.best_score_)

In [None]:
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris

# Iris data: feature matrix in X, class labels in y.
data = load_iris()
X = data.data
y = data.target

# Candidate estimators with their search spaces: (low, high) tuples are
# bounds of a numeric dimension, lists enumerate categorical choices.
models = dict(
    RandomForest=dict(
        model=RandomForestClassifier(),
        params=dict(
            n_estimators=(50, 200),
            max_depth=(1, 20),
            min_samples_split=(2, 10),
        ),
    ),
    SVM=dict(
        model=SVC(),
        params=dict(
            C=(0.1, 10),
            kernel=['linear', 'rbf'],
        ),
    ),
    GradientBoosting=dict(
        model=GradientBoostingClassifier(),
        params=dict(
            n_estimators=(50, 200),
            learning_rate=(0.01, 1),
            max_depth=(3, 10),
        ),
    ),
)

# One Bayesian search per candidate: 10 evaluations, 5-fold CV, accuracy.
best_models = {}
for model_name, model_info in models.items():
    bayes_search = BayesSearchCV(
        model_info['model'],
        model_info['params'],
        n_iter=10,
        scoring='accuracy',
        cv=5,
        random_state=42,
    )
    bayes_search.fit(X, y)
    best_models[model_name] = dict(
        best_model=bayes_search.best_estimator_,
        best_score=bayes_search.best_score_,
        best_params=bayes_search.best_params_,
    )

# The overall winner is the candidate with the highest cross-validated score.
best_model_name, best_entry = max(
    best_models.items(),
    key=lambda item: item[1]['best_score'],
)
print(f"Best Model: {best_model_name}")
print(f"Best Parameters: {best_entry['best_params']}")
print(f"Best Score: {best_entry['best_score']}")

###  Hyperband
Hyperband is a bandit-based optimization technique that speeds up hyperparameter tuning by stopping poorly performing configurations early.

- When to Use:
  - When you have a large hyperparameter space and want to quickly eliminate poor configurations.
  - When the model training process is time-consuming.
- Advantages:
  - Efficient: Reduces the number of evaluations by early stopping.
  - Works well with large hyperparameter spaces.
- Disadvantages:
  - Requires the model to support early stopping.
  - More complex to implement compared to Grid Search and Random Search.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hyperband import HyperbandSearchCV  # Requires external library (scikit-hyperband)

# Load dataset
data = load_iris()
X, y = data.data, data.target

# Split data: the held-out 20% is used only for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define model
model = RandomForestClassifier()

# Define hyperparameter search space.
# 'n_estimators' is deliberately NOT listed here: it is the resource that
# Hyperband allocates (see resource_param below). Hyperband sets it itself for
# every candidate in every round, so any values listed for it in the search
# space would be overwritten and never used.
param_space = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Perform Hyperband Search.
# resource_param names the estimator parameter used as the training budget;
# max_iter is the largest budget (here: number of trees) given to a candidate.
hyperband_search = HyperbandSearchCV(model, param_space, resource_param='n_estimators', max_iter=81, cv=5, scoring='accuracy', random_state=42)
hyperband_search.fit(X_train, y_train)

# Best parameters and score (best_params_ includes the chosen n_estimators budget)
print("Best Parameters:", hyperband_search.best_params_)
print("Best Score:", hyperband_search.best_score_)

# Evaluate on test set
y_pred = hyperband_search.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hyperband import HyperbandSearchCV  # Requires external library (scikit-hyperband)

# Load dataset
data = load_iris()
X, y = data.data, data.target

# Split data: the held-out 20% is used only for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models, parameter search spaces and the Hyperband resource.
# 'resource' names the estimator parameter that Hyperband uses as the training
# budget. It must be a real parameter of that estimator: SVC has no
# 'n_estimators', so it uses 'max_iter' (solver iterations) instead.
# The resource is set by Hyperband itself, so it is not listed in 'params'.
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'resource': 'n_estimators',
        'params': {
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'SVM': {
        'model': SVC(),
        # Small max_iter budgets may trigger ConvergenceWarning; that is
        # expected for the cheap early rounds.
        'resource': 'max_iter',
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'resource': 'n_estimators',
        'params': {
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': [3, 5, 10]
        }
    }
}

# Perform Hyperband Search for each model
best_models = {}
for model_name, model_info in models.items():
    hyperband_search = HyperbandSearchCV(
        model_info['model'],
        model_info['params'],
        resource_param=model_info['resource'],
        max_iter=81,
        cv=5,
        scoring='accuracy',
        random_state=42
    )
    hyperband_search.fit(X_train, y_train)
    best_models[model_name] = {
        'best_model': hyperband_search.best_estimator_,
        'best_score': hyperband_search.best_score_,
        'best_params': hyperband_search.best_params_
    }

# Select the best model (highest cross-validated accuracy on the training split)
best_model_name = max(best_models, key=lambda x: best_models[x]['best_score'])
print(f"Best Model: {best_model_name}")
print(f"Best Parameters: {best_models[best_model_name]['best_params']}")
print(f"Best Score: {best_models[best_model_name]['best_score']}")

# Evaluate on test set
best_model = best_models[best_model_name]['best_model']
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))