In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn import ensemble
from sklearn import datasets
from sklearn import linear_model
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


In [2]:
boston_data = datasets.load_boston()

In [3]:
X, y = boston_data.data, boston_data.target
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle=True)

In [4]:
print(boston_data.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [5]:
lin_reg = linear_model.LinearRegression()

In [6]:
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Linear regression RMSE score: {}'.format(rmse))

Linear regression RMSE score: 4.935949409211373


In [7]:
print('Linear regression coefficient R^2: {}'.format(lin_reg.score(X_test, y_test)))

Linear regression coefficient R^2: 0.7312663806353211


In [8]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
gb_reg = ensemble.GradientBoostingRegressor(**params)

In [9]:
gb_reg.fit(X_train, y_train)
y_pred = gb_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Gradient boost RMSE score: {}'.format(rmse))

Gradient boost MSE score: 3.021149331021201


In [10]:
print('Gradient boost R^2 score: {}'.format(gb_reg.score(X_test, y_test)))

Gradient boost MSE score: 0.899324223834112


In [11]:
print('Feature importances for Gradient Boost: {}'.format(gb_reg.feature_importances_))

Feature importances for Gradient Boost: [0.05727073 0.0037277  0.0253999  0.00652419 0.03254813 0.25586954
 0.07429848 0.13833774 0.01491821 0.05917194 0.06138606 0.04219165
 0.22835572]


In [12]:
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [13]:
gbm_params = {
    'colsample_bytree': np.linspace(0.5, 0.9, 5),
     'n_estimators':[100, 200, 300],
     'max_depth': [5, 10, 15, 20, 25, 30]
}

In [14]:
gbm = xgb.XGBRegressor()

In [15]:
grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_params, scoring='neg_mean_squared_error', cv=10, verbose=1)

In [16]:
grid_mse.fit(X_train, y_train)
print("Best parameters found: ",grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 10 folds for each of 90 candidates, totalling 900 fits
Best parameters found:  {'colsample_bytree': 0.8, 'max_depth': 5, 'n_estimators': 200}
Lowest RMSE found:  3.0136812442445815


[Parallel(n_jobs=1)]: Done 900 out of 900 | elapsed:  1.5min finished


In [17]:
pred = grid_mse.predict(X_test)
print("Root mean square error for test dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_test, pred)), 2)))

Root mean square error for test dataset: 3.07


In [18]:
print('Feature importances for XGradient Boost: {}'.format(gb_reg.feature_importances_))

Feature importances for XGradient Boost: [0.05727073 0.0037277  0.0253999  0.00652419 0.03254813 0.25586954
 0.07429848 0.13833774 0.01491821 0.05917194 0.06138606 0.04219165
 0.22835572]
