Connect to Drive for Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Load Files

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [3]:
ml_house_data_set = pd.read_csv('/content/drive/My Drive/Datasets/ml_house_data_set.csv')
ml_house_data_set.head(10)

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,house_number,street_name,unit_number,city,zip_code,sale_price
0,1978,1,4,1,1,1689,1859,attached,508,0,True,False,True,True,42670,Lopez Crossing,,Hallfort,10907,270897.0
1,1958,1,3,1,1,1984,2002,attached,462,0,True,False,True,True,5194,Gardner Park,,Hallfort,10907,302404.0
2,2002,1,3,2,0,1581,1578,none,0,625,False,False,True,True,4366,Harding Islands,,Lake Christinaport,11203,2519996.0
3,2004,1,4,2,0,1829,2277,attached,479,0,True,False,True,True,3302,Michelle Highway,,Lake Christinaport,11203,197193.0
4,2006,1,4,2,0,1580,1749,attached,430,0,True,False,True,True,582,Jacob Cape,,Lake Christinaport,11203,207897.0
5,2005,1,3,2,0,1621,1672,attached,430,0,True,False,True,True,78445,Michelle Highway,,Lake Christinaport,11203,196559.0
6,1979,1,3,2,1,2285,2365,detached,532,0,True,False,True,True,246,Harris Estates,,Morrisport,10924,434697.0
7,1958,1,5,2,0,1745,1741,none,0,0,False,False,False,False,35725,Jessica Isle,,Lake Christinaport,11203,64887.0
8,1958,1,5,2,0,1747,1745,none,0,0,False,False,False,False,35725,Jessica Isle,,Lake Christinaport,11203,143636.0
9,1961,1,1,1,0,998,1161,none,0,242,False,False,False,False,73327,Kurt Crescent,,Lake Christinaport,11203,81896.0


In [4]:
# Remove the fields from the data set that we don't want to include in our model
del ml_house_data_set['house_number']
del ml_house_data_set['unit_number']
del ml_house_data_set['street_name']
del ml_house_data_set['zip_code']

In [6]:
# Replace categorical data with one-hot encoded data
features_ml_house_data_set = pd.get_dummies(ml_house_data_set, columns=['garage_type', 'city'])

In [7]:
# Remove the sale price from the feature data
del features_ml_house_data_set['sale_price']

In [13]:
# Create the X and y arrays
X = features_ml_house_data_set.to_numpy() 
y = ml_house_data_set['sale_price'].to_numpy() 

In [17]:
# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [23]:
# Create the model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_samples_leaf=9,
    max_features=0.1,
    loss='huber'
)

In [24]:
model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=6, max_features=0.1, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=9, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [19]:
# Parameters we want to try
#param_grid = {
#    'n_estimators': [500, 1000, 3000],
#    'max_depth': [4, 6],
#    'min_samples_leaf': [3, 5, 9, 17],
#    'learning_rate': [0.1, 0.05, 0.02, 0.01],
#    'max_features': [1.0, 0.3, 0.1],
#    'loss': ['ls', 'lad', 'huber']
#}

In [None]:
# Define the grid search we want to run. Run it with four cpus in parallel.
#gs_cv = GridSearchCV(model, param_grid, n_jobs=4)

# Run the grid search - on only the training data!
#gs_cv.fit(X_train, y_train)

# Print the parameters that gave us the best result!
#print(gs_cv.best_params_)

In [None]:
# After running a .....long..... time, the output will be something like
# {'loss': 'huber', 'learning_rate': 0.1, 'min_samples_leaf': 9, 'n_estimators': 3000, 'max_features': 0.1, 'max_depth': 6}

# That is the combination that worked best.

In [26]:
# Find the error rate on the training set using the best parameters
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

Training Set Mean Absolute Error: 49056.7464


In [27]:
# Find the error rate on the test set using the best parameters
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)

Test Set Mean Absolute Error: 58928.6450


In [28]:
# These are the feature labels from our data set
feature_labels = np.array(['year_built', 'stories', 'num_bedrooms', 'full_bathrooms', 'half_bathrooms', 'livable_sqft', 'total_sqft', 'garage_sqft', 'carport_sqft', 'has_fireplace', 'has_pool', 'has_central_heating', 'has_central_cooling', 'garage_type_attached', 'garage_type_detached', 'garage_type_none', 'city_Amystad', 'city_Brownport', 'city_Chadstad', 'city_Clarkberg', 'city_Coletown', 'city_Davidfort', 'city_Davidtown', 'city_East Amychester', 'city_East Janiceville', 'city_East Justin', 'city_East Lucas', 'city_Fosterberg', 'city_Hallfort', 'city_Jeffreyhaven', 'city_Jenniferberg', 'city_Joshuafurt', 'city_Julieberg', 'city_Justinport', 'city_Lake Carolyn', 'city_Lake Christinaport', 'city_Lake Dariusborough', 'city_Lake Jack', 'city_Lake Jennifer', 'city_Leahview', 'city_Lewishaven', 'city_Martinezfort', 'city_Morrisport', 'city_New Michele', 'city_New Robinton', 'city_North Erinville', 'city_Port Adamtown', 'city_Port Andrealand', 'city_Port Daniel', 'city_Port Jonathanborough', 'city_Richardport', 'city_Rickytown', 'city_Scottberg', 'city_South Anthony', 'city_South Stevenfurt', 'city_Toddshire', 'city_Wendybury', 'city_West Ann', 'city_West Brittanyview', 'city_West Gerald', 'city_West Gregoryview', 'city_West Lydia', 'city_West Terrence'])

**Feature Selection**

In [29]:
# Create a numpy array based on the model's feature importances
importance = model.feature_importances_

# Sort the feature labels based on the feature importance rankings from the model
feauture_indexes_by_importance = importance.argsort()

# Print each feature label, from most important to least important (reverse order)
for index in feauture_indexes_by_importance:
    print("{} - {:.2f}%".format(feature_labels[index], (importance[index] * 100.0)))

city_New Robinton - 0.00%
city_New Michele - 0.00%
city_Martinezfort - 0.00%
city_Julieberg - 0.00%
city_Davidtown - 0.00%
city_West Terrence - 0.00%
city_Lake Jennifer - 0.01%
city_Rickytown - 0.01%
city_Fosterberg - 0.01%
city_West Brittanyview - 0.01%
city_East Justin - 0.01%
city_South Stevenfurt - 0.02%
city_Leahview - 0.02%
city_Joshuafurt - 0.03%
city_Toddshire - 0.03%
city_Port Adamtown - 0.03%
city_East Janiceville - 0.04%
city_Brownport - 0.04%
city_Amystad - 0.04%
city_Port Daniel - 0.05%
city_Wendybury - 0.05%
city_Clarkberg - 0.09%
has_central_cooling - 0.11%
city_Port Jonathanborough - 0.12%
has_central_heating - 0.12%
city_Davidfort - 0.14%
city_West Lydia - 0.15%
city_North Erinville - 0.17%
city_West Gerald - 0.18%
city_Jenniferberg - 0.18%
city_East Amychester - 0.23%
city_Lewishaven - 0.23%
city_Morrisport - 0.26%
city_Richardport - 0.30%
city_West Gregoryview - 0.33%
city_Lake Dariusborough - 0.38%
city_East Lucas - 0.38%
city_South Anthony - 0.43%
city_West Ann - 0

**Price Estimation**

In [30]:
# For the house we want to value, we need to provide the features in the exact same
# arrangement as our training data set.
house_to_value = [
    # House features
    2006,   # year_built
    1,      # stories
    4,      # num_bedrooms
    3,      # full_bathrooms
    0,      # half_bathrooms 
    2200,   # livable_sqft
    2350,   # total_sqft
    0,      # garage_sqft
    0,      # carport_sqft
    True,   # has_fireplace
    False,  # has_pool
    True,   # has_central_heating
    True,   # has_central_cooling
    
    # Garage type: Choose only one
    0,      # attached
    0,      # detached
    1,      # none
    
    # City: Choose only one
    0,      # Amystad
    1,      # Brownport
    0,      # Chadstad
    0,      # Clarkberg
    0,      # Coletown
    0,      # Davidfort
    0,      # Davidtown
    0,      # East Amychester
    0,      # East Janiceville
    0,      # East Justin
    0,      # East Lucas
    0,      # Fosterberg
    0,      # Hallfort
    0,      # Jeffreyhaven
    0,      # Jenniferberg
    0,      # Joshuafurt
    0,      # Julieberg
    0,      # Justinport
    0,      # Lake Carolyn
    0,      # Lake Christinaport
    0,      # Lake Dariusborough
    0,      # Lake Jack
    0,      # Lake Jennifer
    0,      # Leahview
    0,      # Lewishaven
    0,      # Martinezfort
    0,      # Morrisport
    0,      # New Michele
    0,      # New Robinton
    0,      # North Erinville
    0,      # Port Adamtown
    0,      # Port Andrealand
    0,      # Port Daniel
    0,      # Port Jonathanborough
    0,      # Richardport
    0,      # Rickytown
    0,      # Scottberg
    0,      # South Anthony
    0,      # South Stevenfurt
    0,      # Toddshire
    0,      # Wendybury
    0,      # West Ann
    0,      # West Brittanyview
    0,      # West Gerald
    0,      # West Gregoryview
    0,      # West Lydia
    0       # West Terrence
]

In [31]:
# scikit-learn assumes you want to predict the values for lots of houses at once, so it expects an array.
# We just want to look at a single house, so it will be the only item in our array.
homes_to_value = [
    house_to_value
]

# Run the model and make a prediction for each house in the homes_to_value array
predicted_home_values = model.predict(homes_to_value)

# Since we are only predicting the price of one house, just look at the first prediction returned
predicted_value = predicted_home_values[0]

print("This house has an estimated value of ${:,.2f}".format(predicted_value))

This house has an estimated value of $603,876.31
