In [1]:
import io
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hyperopt import hp
from hyperopt.pyll.stochastic import sample
from hyperopt import rand, tpe
from hyperopt import Trials
from hyperopt import fmin

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import DistanceMetric
from math import sqrt



In [24]:
url="https://raw.githubusercontent.com/S-Mann/hyperparameter_optimization/master/dataset/dataset.csv"
s=requests.get(url).content
dataset=pd.read_csv(io.StringIO(s.decode('utf-8')))

excluded_columns = [x not in ['model', 'msrp', 'model_year', 'popularity', 'profit_per_unit', 'units_sold'] for x in dataset.columns]

X = dataset.iloc[:,excluded_columns].values
y = dataset.iloc[:,dataset.columns == 'msrp'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=123)

# Create our imputer to replace missing values with the mean e.g.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)
X_train = imp.transform(X_train)
imp = imp.fit(X_test)
X_test = imp.transform(X_test)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [25]:
def knn_obj_func(hyperparams):    
    classifier = KNeighborsClassifier(n_neighbors=int(hyperparams[1]), metric=hyperparams[0])
    classifier.fit(X_train, np.ravel(y_train, order='C'))
    
    y_pred = classifier.predict(X_test)
    
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    return rmse



In [62]:
metric_types = ['minkowski','euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis']
# This is a random k value and its RMSE
random_hyperparam = [metric_types[0], 3]
random_loss = knn_obj_func(random_hyperparam)
random_loss

knn_obj_func(['manhattan',5])

8785.074998772658

In [5]:
space = hp.choice('a',
    [
        (metric_types[0], hp.uniform('c1', 1, 100)),
        (metric_types[1], hp.uniform('c2', 1, 100)),
        (metric_types[2], hp.uniform('c3', 1, 100)),
        (metric_types[3], hp.uniform('c4', 1, 100)),
        (metric_types[4], hp.uniform('c5', 1, 100)),
        (metric_types[5], hp.uniform('c6', 1, 100))
    ])
samples = []

for _ in range(5000):
    samples.append(sample(space))



In [6]:
# Create objects for the algorithms
tpe_algo = tpe.suggest # Tree Structure Parzen Estimator
rand_algo = rand.suggest # Random Forest

# Create two trials objects to record your estimations
tpe_trials = Trials()
rand_trials = Trials()



In [7]:
# Run 2000 trials with the tpe algorithm with our objective function and domain space
tpe_best = fmin(fn=knn_obj_func, space=space, algo=tpe_algo, trials=tpe_trials, max_evals=200, rstate= np.random.RandomState(50))

# Run 2000 trials with the random algorithm with our objective function and domain space
rand_best = fmin(fn=knn_obj_func, space=space, algo=rand_algo, trials=rand_trials, max_evals=200, rstate= np.random.RandomState(50))







  0%|          | 0/200 [00:00<?, ?it/s, best loss: ?]  0%|          | 1/200 [00:01<05:27,  1.64s/it, best loss: 25212.752048099115]  1%|          | 2/200 [00:02<05:00,  1.52s/it, best loss: 24105.911826717595]  2%|▏         | 3/200 [00:04<05:24,  1.65s/it, best loss: 24105.911826717595]  2%|▏         | 4/200 [00:07<06:10,  1.89s/it, best loss: 24105.911826717595]  2%|▎         | 5/200 [00:08<05:42,  1.76s/it, best loss: 23183.704573326686]  3%|▎         | 6/200 [00:10<05:54,  1.83s/it, best loss: 23183.704573326686]  4%|▎         | 7/200 [00:11<04:48,  1.50s/it, best loss: 17129.55939882829]   4%|▍         | 8/200 [00:12<04:41,  1.46s/it, best loss: 17129.55939882829]  4%|▍         | 9/200 [00:13<04:20,  1.36s/it, best loss: 17129.55939882829]  5%|▌         | 10/200 [00:14<03:30,  1.11s/it, best loss: 16095.577856231403]  6%|▌         | 11/200 [00:16<04:25,  1.41s/it, best loss: 16095.577856231403]  6%|▌         | 12/200 [00:18<04:44,  1.51s/it, best loss: 16095.5778562314

In [8]:
tpe_best

{'a': 2, 'c3': 1.2096469331563569}