In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from scipy.stats import norm

import random as rand
import matplotlib.pyplot as plt

import itertools


### Optimising linear regression

This model is optimised using a grid-search method. This method was chosen because there aren't very many hyperparameters to tune.




In [2]:

#Runs K fold Validation
#Use KFold() to generate k sets of folds
#Itterate through the folds and find the mean_absolute_percentage_error of each fold
#Then find an average for all the mean_absolute_percentage_error's and return it

def run_k_fold_validation(k, model, data):
    
    
    kfold = KFold(n_splits=k, shuffle=True)

    abs_errors = []

    for train, test in kfold.split(data):

        train = data.iloc[train]
        test = data.iloc[test]


        input_x = train.drop("price_pounds", axis='columns')
        input_y = train.price_pounds


        model.fit(input_x, input_y)
        
        preds = model.predict(test.drop("price_pounds", axis='columns'))

        abs_error = mean_absolute_percentage_error(y_true=test.price_pounds, y_pred=preds )

        abs_errors.append(abs_error)

    return(sum(abs_errors)/k)
    

# generate every combination of hyperparameters 
def get_hypergrid():
    
    k_range = range(2, 21)
    fit_intercept = range(0, 2)
    positive = range(0, 2)
    combos = list(itertools.product(k_range, fit_intercept, positive))

    
    combos_dataframe = pd.DataFrame(combos, columns=["k", "fit_intercept", "positive"])
    
    return combos_dataframe
    
#input a table of hyper parameters and generate a result column for those parameters 
def generate_results(hypers_table):
    
    data = pd.read_csv('london_house_prices_ajusted.csv')
    
    y = []
    
    for index, row in hypers_table.iterrows(): 
        
        model = LinearRegression(fit_intercept=bool(row[1]),positive=bool(row[2]))
        y.append(run_k_fold_validation(row[0],model,data))


    
    hypers_table["result"] = y

    
    return hypers_table

In [3]:
# Build the table of candidate settings, then score each row with k-fold validation.
# generate_results adds the "result" column to the table it is given and returns
# that same object, so `grid` and `results` refer to one DataFrame from here on.
grid = get_hypergrid()
results = generate_results(grid)

In [4]:
# Show the scored grid: one row per (k, fit_intercept, positive) combination.
results

Unnamed: 0,k,fit_intercept,positive,result
0,2,0,0,0.465767
1,2,0,1,0.465388
2,2,1,0,0.487736
3,2,1,1,0.418679
4,3,0,0,0.472000
...,...,...,...,...
71,19,1,1,0.412558
72,20,0,0,0.468860
73,20,0,1,0.468990
74,20,1,0,0.461744


In [5]:
# Rank the combinations by their averaged error, lowest first.
results.sort_values("result")

Unnamed: 0,k,fit_intercept,positive,result
71,19,1,1,0.412558
43,12,1,1,0.413863
55,15,1,1,0.415070
67,18,1,1,0.415142
27,8,1,1,0.415148
...,...,...,...,...
12,5,0,0,0.482389
36,11,0,0,0.485710
2,2,1,0,0.487736
70,19,1,0,0.493186
