In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from scipy.stats import norm

import random as rand
import matplotlib.pyplot as plt

import itertools


### optimising a decision tree

Optimising a decision tree model using Bayesian optimisation.

Bayesian optimisation was picked because there were too many hyperparameter combinations to evaluate with a grid search.


In [2]:
# Load the house-price table from a CSV file in the working directory
data = pd.read_csv('london_house_prices_ajusted.csv')

# Show the column names of the loaded table
data.columns


Index(['bedrooms', 'bathrooms', 'garden', 'street', 'price_pounds',
       'nearest_station_name', 'nearest_station_miles', 'postcode_outer',
       'tenure_freehold', 'tenure_leasehold'],
      dtype='object')

In [3]:

#Runs k-fold validation
#Use KFold() to generate k train/test splits
#Iterate through the folds and find the mean_absolute_percentage_error of each fold
#Then average the mean_absolute_percentage_error values over all folds and return the result


def run_k_fold_validation(k, model, data, random_state=None):
    """Score a model with shuffled k-fold cross-validation.

    The rows of ``data`` are split into ``k`` shuffled folds. For each fold
    the model is fitted on the remaining rows and its mean absolute
    percentage error (MAPE) is measured on the held-out rows, using the
    ``price_pounds`` column as the target and every other column as a feature.

    Parameters
    ----------
    k : int
        Number of folds, passed to ``KFold(n_splits=...)``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called on
        this same object once per fold.
    data : pandas.DataFrame
        Table containing a ``price_pounds`` column plus the feature columns.
    random_state : int, RandomState instance or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous behaviour
        (a different shuffle on every call); pass an int for repeatable scores.

    Returns
    -------
    float
        Mean of the per-fold MAPE values.
    """
    kfold = KFold(n_splits=k, shuffle=True, random_state=random_state)

    target = "price_pounds"
    abs_errors = []

    for train_idx, test_idx in kfold.split(data):
        train = data.iloc[train_idx]
        test = data.iloc[test_idx]

        model.fit(train.drop(target, axis='columns'), train[target])
        preds = model.predict(test.drop(target, axis='columns'))

        abs_errors.append(
            mean_absolute_percentage_error(y_true=test[target], y_pred=preds)
        )

    # Average over the folds actually evaluated rather than trusting k's type.
    return sum(abs_errors) / len(abs_errors)
    

# generate every combination of hyperparameters     
def get_hypergrid_decision_tree():
    """Build the full grid of candidate hyperparameter combinations.

    Returns a DataFrame with one row per combination and the columns
    ``k``, ``min_samples_split``, ``min_samples_leaf``,
    ``min_impurity_decrease`` (0.0 to 0.4 in steps of 0.1) followed by four
    boolean one-hot columns for the split criterion: ``squared_error``,
    ``friedman_mse``, ``absolute_error`` and ``poisson``.
    """
    # Integer axes of the search space, in the order they are combined.
    axes = {
        "k": range(2, 16),
        "criterion": range(1, 5),
        "min_samples_split": range(2, 6),
        "min_samples_leaf": range(1, 5),
        "min_impurity_decrease": range(0, 5),
    }
    # Criterion codes 1..4 map onto these names, in this order.
    criterion_names = ["squared_error", "friedman_mse", "absolute_error", "poisson"]

    table = pd.DataFrame(list(itertools.product(*axes.values())), columns=list(axes))

    # The impurity axis is enumerated as 0..4 and rescaled to 0.0..0.4.
    table["min_impurity_decrease"] = table["min_impurity_decrease"] / 10

    # One-hot encode the criterion code, then discard the code itself.
    for code, name in enumerate(criterion_names, start=1):
        table[name] = table["criterion"] == code

    return table.drop(columns="criterion")


#input a table of hyper parameters and generate a result column for those parameters 
def generate_results(hypers_table):
    """Evaluate every hyperparameter row and store the score in a ``result`` column.

    For each row a ``DecisionTreeRegressor`` is built from the row's
    hyperparameters and scored with ``run_k_fold_validation`` on the house
    price CSV, which is read from disk on every call.

    Parameters
    ----------
    hypers_table : pandas.DataFrame
        Must contain the columns ``k``, ``min_samples_split``,
        ``min_samples_leaf``, ``min_impurity_decrease`` and the boolean
        criterion flags ``squared_error``, ``friedman_mse`` and
        ``absolute_error``. When none of those three flags is set the
        criterion falls back to ``"poisson"``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``result`` column is added or
        overwritten in place.
    """
    data = pd.read_csv('london_house_prices_ajusted.csv')

    # Checked in this order; anything else is treated as "poisson".
    flagged_criteria = ("squared_error", "friedman_mse", "absolute_error")

    results = []

    for _, row in hypers_table.iterrows():
        # Columns are read by name: positional row[0]..row[6] is deprecated
        # for label-indexed Series and breaks if the column order changes.
        cri = next((name for name in flagged_criteria if row[name]), "poisson")

        # iterrows() may upcast values; sklearn reads a float
        # min_samples_split / min_samples_leaf as a fraction of the samples,
        # so the integer hyperparameters are cast back to int.
        model = DecisionTreeRegressor(
            min_samples_split=int(row["min_samples_split"]),
            min_samples_leaf=int(row["min_samples_leaf"]),
            min_impurity_decrease=float(row["min_impurity_decrease"]),
            criterion=cri,
        )
        results.append(run_k_fold_validation(int(row["k"]), model, data))

    hypers_table["result"] = results

    return hypers_table

#sample n random rows from the hyperparameter grid and generate their results
def generate_random_hyperparameters(n_rows, random_state=None):
    """Sample random hyperparameter combinations and evaluate them.

    Parameters
    ----------
    n_rows : int
        Number of rows to draw, without replacement, from the grid returned
        by ``get_hypergrid_decision_tree``.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the previous
        behaviour (a different sample on every call); pass an int to draw the
        same starting rows on every run.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original grid index, with the
        ``result`` column filled in by ``generate_results``.
    """
    sampled = get_hypergrid_decision_tree().sample(n=n_rows, random_state=random_state)

    return generate_results(sampled)






In [4]:

#Create surrogate model
surg_model = GaussianProcessRegressor()

#Generate some starting data (random configurations plus their measured error)
hypersdata = generate_random_hyperparameters(30)

#exploration-exploitation trade off parameter
xi = 0.01

#columns the surrogate is trained on, in the order the grid produces them
feature_cols = ["k", "min_samples_split", "min_samples_leaf", "min_impurity_decrease",
                "squared_error", "friedman_mse", "absolute_error", "poisson"]

#the candidate grid never changes, so build it once rather than on every pass
full_grid = get_hypergrid_decision_tree()

for i in range(1,15):

    #fit the surrogate to every configuration evaluated so far
    surg_model.fit(hypersdata[feature_cols], hypersdata["result"])

    #only configurations that have not been evaluated yet are candidates
    grid = full_grid.drop(index=hypersdata.index)

    #surrogate mean and standard deviation for each candidate
    #(`var` holds the standard deviation returned by return_std=True)
    preds, var = surg_model.predict(grid[feature_cols], return_std=True)

    #best (lowest) error observed so far
    best_result = np.min(hypersdata["result"])

    #predicted improvement over the best result, less the exploration margin
    imp = best_result - preds - xi

    #expected improvement; stays 0 where the surrogate reports zero spread,
    #which also avoids dividing by zero
    ei = np.zeros_like(imp)
    has_spread = var > 0.0
    z = imp[has_spread] / var[has_spread]
    ei[has_spread] = imp[has_spread] * norm.cdf(z) + var[has_spread] * norm.pdf(z)

    #candidate with the highest expected improvement
    best_index = pd.Series(ei, index=grid.index).idxmax()

    #evaluate only the new candidate and append it; earlier results are kept
    #instead of re-running cross-validation for every stored row each pass
    new_point = generate_results(grid.loc[[best_index], feature_cols].copy())
    hypersdata = pd.concat([hypersdata, new_point])

    #tell me the progress of the optimisation
    print("Loop number:",i)

print("done")


Loop number: 1
Loop number: 2
Loop number: 3
Loop number: 4
Loop number: 5
Loop number: 6
Loop number: 7
Loop number: 8
Loop number: 9
Loop number: 10
Loop number: 11
Loop number: 12
Loop number: 13
Loop number: 14
done


In [6]:
# Rank the evaluated configurations by their result, lowest first
hypersdata.sort_values("result")

Unnamed: 0,k,min_samples_split,min_samples_leaf,min_impurity_decrease,squared_error,friedman_mse,absolute_error,poisson,result
4320,15,2,1,0.0,False,False,True,False,0.283174
4283,15,4,1,0.3,False,True,False,False,0.294113
1862,7,3,1,0.2,False,False,False,True,0.30767
3141,11,3,1,0.1,False,False,False,True,0.308462
3604,13,2,1,0.4,False,True,False,False,0.312587
2502,9,3,1,0.2,False,False,False,True,0.318675
2720,10,2,1,0.0,False,False,True,False,0.319686
1820,7,5,1,0.0,False,False,True,False,0.321166
1293,6,2,3,0.3,True,False,False,False,0.3228
2704,10,5,1,0.4,False,True,False,False,0.323569
