In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from scipy.stats import norm

import random as rand
import matplotlib.pyplot as plt

import itertools

## optimising a random forest 

Bayesian optimisation was picked because there were too many hyperparameter combinations to evaluate with a grid-search method.


In [2]:

#Runs k-fold validation
#Use KFold() to generate k sets of folds
#Iterate through the folds and find the mean_absolute_percentage_error of each fold
#Then average the per-fold mean_absolute_percentage_error values and return the result


def run_k_fold_validation(k, model, data, target_col="price_pounds", random_state=None):
    """Return the mean MAPE of ``model`` over a shuffled k-fold split of ``data``.

    Parameters
    ----------
    k : int
        Number of folds. Cast with ``int()`` so integer-valued numpy or float
        values coming out of a DataFrame row are accepted by ``KFold``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    data : pandas.DataFrame
        Table holding the feature columns plus the target column.
    target_col : str, default "price_pounds"
        Name of the column to predict; every other column is used as a feature.
    random_state : int or None, default None
        Forwarded to ``KFold`` so the shuffle can be made reproducible.

    Returns
    -------
    float
        Average of the per-fold ``mean_absolute_percentage_error`` values.
    """
    kfold = KFold(n_splits=int(k), shuffle=True, random_state=random_state)

    # Split features/target once instead of re-dropping the column per fold.
    features = data.drop(columns=target_col)
    target = data[target_col]

    fold_errors = []
    for train_idx, test_idx in kfold.split(data):
        model.fit(features.iloc[train_idx], target.iloc[train_idx])
        preds = model.predict(features.iloc[test_idx])
        fold_errors.append(
            mean_absolute_percentage_error(y_true=target.iloc[test_idx], y_pred=preds)
        )

    return float(np.mean(fold_errors))
    

# generate every combination of hyperparameters
# one-hot encode the categorical variables
def get_hypergrid():
    """Build the full hyperparameter search grid as a one-hot encoded DataFrame.

    Every combination of ``k``, ``n_estimatorsint``, ``max_depth``, a criterion
    code (1-4) and a max_features code (1-6) is enumerated. The two coded
    columns are then replaced by one boolean column per option, so the returned
    frame has the columns ``k, n_estimatorsint, max_depth``, the four criterion
    flags and the six max_features flags, in that order.
    """
    axes = {
        "k": range(2, 16),
        "n_estimatorsint": range(100, 150),
        "criterion": range(1, 5),
        "max_depth": range(2, 10),
        "max_features": range(1, 7),
    }
    frame = pd.DataFrame(list(itertools.product(*axes.values())), columns=list(axes))

    # (coded column, flag column for code 1, code 2, ...)
    encodings = (
        ("criterion", ("squared_error", "friedman_mse", "absolute_error", "poisson")),
        ("max_features", ("sqrt", "log2", "None", "1", "2", "3")),
    )
    for coded_col, flag_names in encodings:
        for code, flag in enumerate(flag_names, start=1):
            frame[flag] = frame[coded_col] == code
        # The integer code is redundant once its flags exist.
        frame = frame.drop(columns=coded_col)

    return frame


#sample n random rows from the hyperparameter grid and score them

def generate_random_hyperparameters(n_rows, random_state=None):
    """Sample ``n_rows`` grid points at random and score each of them.

    Parameters
    ----------
    n_rows : int
        Number of distinct rows to draw (without replacement) from the grid
        returned by ``get_hypergrid()``.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample`` so the starting design can be
        reproduced; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows (original grid index preserved) as returned by
        ``generate_results``, i.e. with a ``result`` column added.
    """
    sampled = get_hypergrid().sample(n=n_rows, random_state=random_state)
    return generate_results(sampled)




#input a table of hyper parameters and generate a result column for those parameters 

def generate_results(hypers_table, data=None):
    """Score every hyperparameter row and store the scores in a ``result`` column.

    Each row is decoded from its one-hot flags into ``RandomForestRegressor``
    arguments, the model is evaluated with ``run_k_fold_validation`` using the
    row's ``k``, and the returned score is recorded.

    Parameters
    ----------
    hypers_table : pandas.DataFrame
        One row per configuration with the columns ``k``, ``n_estimatorsint``,
        ``max_depth`` plus the criterion and max_features flag columns.
        Columns are looked up by label, so their order does not matter.
        The ``result`` column is written onto this frame in place.
    data : pandas.DataFrame or None, default None
        Training table passed to ``run_k_fold_validation``. When ``None`` the
        file ``london_house_prices_ajusted.csv`` is read.

    Returns
    -------
    pandas.DataFrame
        ``hypers_table`` itself, with the ``result`` column set.
    """
    if data is None:
        data = pd.read_csv('london_house_prices_ajusted.csv')

    # Flags are tested in this order; if none is set the last option applies.
    criterion_flags = ("squared_error", "friedman_mse", "absolute_error")
    feature_flags = (("sqrt", "sqrt"), ("log2", "log2"), ("None", None), ("1", 1), ("2", 2))

    scores = []
    for _, row in hypers_table.iterrows():
        # Label-based lookups: integer keys on a string-labelled Series are
        # deprecated positional access and break if columns are reordered.
        cri = next((name for name in criterion_flags if row[name]), "poisson")
        fea = next((value for flag, value in feature_flags if row[flag]), 3)

        # iterrows() yields mixed-dtype rows; cast so the estimator and KFold
        # always receive real integers.
        model = RandomForestRegressor(n_estimators=int(row["n_estimatorsint"]),
                                      criterion=cri,
                                      max_depth=int(row["max_depth"]),
                                      max_features=fea)

        scores.append(run_k_fold_validation(int(row["k"]), model, data))

    hypers_table["result"] = scores

    return hypers_table






In [3]:
# Bayesian optimisation of the random-forest hyperparameters:
# a Gaussian-process surrogate is fitted to the configurations scored so far,
# expected improvement (EI) picks the next configuration from the grid, and
# only that new configuration is cross-validated and appended.

#Columns the surrogate model is trained on (everything except "result")
feature_cols = ["k","n_estimatorsint","max_depth","squared_error","friedman_mse","absolute_error","poisson","sqrt","log2","None","1","2","3"]

#Create surrogate model
surg_model = GaussianProcessRegressor()

#Generate some starting data
hypersdata = generate_random_hyperparameters(30)

#exploration-exploitation trade off parameter
xi = 0.01

for i in range(1,15):

    #Generate a grid and keep only the points that have not been evaluated yet
    grid = get_hypergrid().drop(index=hypersdata.index)

    #fit data to the surrogate model
    surg_model.fit(hypersdata[feature_cols], hypersdata["result"])

    #Get mean and standard deviation from the surrogate model
    preds, std = surg_model.predict(grid[feature_cols], return_std=True)

    #get optimal (lowest) result so far
    best_result = np.min(hypersdata["result"])

    #calculate improvement over the current best
    imp = best_result - preds - xi

    #calculate expected improvement; z is left at 0 where std == 0 so no
    #division by zero happens, and EI is forced to 0 at those points
    z = np.divide(imp, std, out=np.zeros_like(imp), where=std > 0)
    ei = imp * norm.cdf(z) + std * norm.pdf(z)
    ei[std == 0.0] = 0.0

    #evaluate only the candidate with the best expected improvement;
    #previously scored rows keep their results instead of being re-run
    best_index = grid.index[int(np.argmax(ei))]
    new_point = generate_results(grid.loc[[best_index]].copy())

    #add the newly scored point to the observed data
    hypersdata = pd.concat([hypersdata, new_point])

    #tell me the progress of the optimisation
    print("Loop number:",i)


print("done")

Loop number: 1
Loop number: 2
Loop number: 3
Loop number: 4
Loop number: 5
Loop number: 6
Loop number: 7
Loop number: 8
Loop number: 9
Loop number: 10
Loop number: 11
Loop number: 12
Loop number: 13
Loop number: 14
done


In [4]:
# Rank every evaluated configuration, lowest cross-validated error first
hypersdata.sort_values("result")

Unnamed: 0,k,n_estimatorsint,max_depth,squared_error,friedman_mse,absolute_error,poisson,sqrt,log2,None,1,2,3,result
65324,8,140,9,True,False,False,False,False,False,True,False,False,False,0.303546
129354,15,123,9,False,False,True,False,True,False,False,False,False,False,0.313296
54714,7,134,9,False,False,False,True,True,False,False,False,False,False,0.33163
38971,6,102,9,False,False,False,True,False,True,False,False,False,False,0.334298
90212,11,119,5,False,False,False,True,False,False,True,False,False,False,0.341998
30810,5,110,9,False,True,False,False,True,False,False,False,False,False,0.345554
131951,15,137,9,True,False,False,False,False,False,False,False,False,True,0.347143
47797,6,148,8,False,False,False,True,False,True,False,False,False,False,0.348138
43290,6,125,9,False,True,False,False,True,False,False,False,False,False,0.348415
79630,10,114,9,False,False,True,False,False,False,False,False,True,False,0.348435
