# Import required packages and load dataset

In [0]:
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials

import mlflow

In [0]:
# Load the California housing data as a (features, target) pair.
X, y = fetch_california_housing(return_X_y=True)

- The California housing dataset is a widely used dataset in machine learning and is available in the scikit-learn library.
- It contains information about housing prices in various districts of California. The dataset is often used for regression tasks to predict the median house value in a given district based on several features.

- **The California housing dataset provides the following information for each district:**

1) MedInc: Median income of households in the district.
2) HouseAge: Median age of houses in the district.
3) AveRooms: Average number of rooms per house.
4) AveBedrms: Average number of bedrooms per house.
5) Population: Total population in the district.
6) AveOccup: Average number of occupants per house.
7) Latitude: Latitude of the district's location.
8) Longitude: Longitude of the district's location.
9) MedHouseVal: Median value of houses in the district, expressed in units of $100,000 (the target variable).

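- The goal of using this dataset is typically to build a regression model that can predict the median house value based on the given features. In this notebook, however, the target is converted into two classes (below the median vs. at or above it) so that classification models can be tuned.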

# Feature Engineering

## Scale the features


In [0]:
# Means along axis 0 of the raw X, shown for comparison with the values after scaling.
X.mean(axis=0)

In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
# StandardScaler instance used to standardize the features
# (note: the variable is spelled `scalar`, not `scaler`).
scalar = StandardScaler()

In [0]:
# Fit the scaler on X and overwrite X with the scaled values.
# NOTE(review): the scaler is fitted on the full dataset, so the later
# cross-validation on this X reuses statistics computed from its own
# validation folds.
X = scalar.fit_transform(X)

In [0]:
# Means along axis 0 after scaling; these should now be approximately 0.
X.mean(axis=0)

## Convert the numeric target column to discrete values

In [0]:
# Binarize the target: 0 where y is below the median of y, 1 otherwise.
y_discrete = np.where(y < np.median(y), 0, 1)

In [0]:
# Show the binarized labels as a quick sanity check.
print(y_discrete)

# Hyperopt workflow

## Define the function to minimize

In [0]:
def objective(params):
    """Score one sampled configuration by its cross-validated accuracy.

    Args:
        params: Mapping drawn from the search space. It must contain a
            'type' key ('svm', 'rf' or 'logreg'); every other entry is
            forwarded as a keyword argument to the matching scikit-learn
            classifier. The mapping is not modified.

    Returns:
        A dict with 'loss' (the negated mean cross-validation accuracy on
        the notebook-level ``X`` / ``y_discrete``) and 'status' set to
        ``STATUS_OK``. For an unrecognised 'type' the bare number 0 is
        returned instead.

    Raises:
        KeyError: If ``params`` has no 'type' entry.
    """
    # Work on a copy: deleting 'type' from the caller's dict would make a
    # second call with the same params fail with KeyError.
    clf_kwargs = dict(params)
    classifier_type = clf_kwargs.pop('type')

    # hp.quniform produces floats (e.g. 3.0), while scikit-learn expects an
    # integer max_depth, so normalise it before building the estimator.
    if clf_kwargs.get('max_depth') is not None:
        clf_kwargs['max_depth'] = int(clf_kwargs['max_depth'])

    if classifier_type == 'svm':
        clf = SVC(**clf_kwargs)
    elif classifier_type == 'rf':
        clf = RandomForestClassifier(**clf_kwargs)
    elif classifier_type == 'logreg':
        clf = LogisticRegression(**clf_kwargs)
    else:
        # Unknown classifier type: keep the existing fallback of a plain 0.
        return 0
    accuracy = cross_val_score(clf, X, y_discrete).mean()

    # Because fmin() tries to minimize the objective, this function must return the negative accuracy.
    return {'loss': -accuracy, 'status': STATUS_OK}

## Define the search space over hyperparameters

In [0]:
# One branch per model family: hp.choice first picks a branch, then that
# branch's own hyperparameters are sampled. The 'type' entry tells the
# objective which classifier to build; the remaining keys are passed to it.
search_space = hp.choice(
    'classifier_type',
    [
        # Support vector machine
        dict(
            type='svm',
            C=hp.lognormal('SVM_C', 0, 1.0),
            kernel=hp.choice('kernel', ['linear', 'rbf']),
        ),
        # Random forest
        dict(
            type='rf',
            max_depth=hp.quniform('max_depth', 2, 5, 1),
            criterion=hp.choice('criterion', ['gini', 'entropy']),
        ),
        # Logistic regression
        dict(
            type='logreg',
            C=hp.lognormal('LR_C', 0, 1.0),
            solver=hp.choice('solver', ['liblinear', 'lbfgs']),
        ),
    ],
)

## Select the search algorithm

The two main choices are:
* `hyperopt.tpe.suggest`: Tree of Parzen Estimators, a Bayesian approach that iteratively and adaptively selects new hyperparameter settings to explore based on previous results
* `hyperopt.rand.suggest`: Random search, a non-adaptive approach that samples over the search space

In [0]:
# Use the Tree of Parzen Estimators strategy to propose new trials.
algo = tpe.suggest

## Run the tuning algorithm with Hyperopt fmin()


To distribute tuning, pass one more argument to `fmin()`: `trials`, set to an instance of the `SparkTrials` class (used in place of the default `Trials` class).

`SparkTrials` takes 2 optional arguments:  
* `parallelism`: Number of models to fit and evaluate concurrently. The default is the number of available Spark task slots.
* `timeout`: Maximum time (in seconds) that `fmin()` can run. The default is no maximum time limit.

In [0]:
from hyperopt import SparkTrials

In [0]:
# Trials object that distributes evaluations with Spark; no arguments are
# passed, so the default parallelism and no timeout apply.
spark_trials = SparkTrials()

In [0]:
# Run the whole search inside a single MLflow run.
with mlflow.start_run():
    # At most 32 configurations are evaluated, proposed by `algo` and
    # recorded in `spark_trials`.
    best_results = fmin(
        objective,
        search_space,
        algo=algo,
        max_evals=32,
        trials=spark_trials,
    )

## Print the hyperparameters that produced the best result

In [0]:
import hyperopt

In [0]:
# Map the raw fmin() result back onto the search space to show the
# hyperparameter values of the best trial.
print(hyperopt.space_eval(search_space, best_results))