# Exploring Ensemble Methods

In [1]:
# Import the necessary packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the house price dataset from its CSV file into a DataFrame
HOUSE_DATA_CSV = 'kc_house_data.csv'
houses = pd.read_csv(HOUSE_DATA_CSV)

In [3]:
# Discard every row that has at least one missing value so that only
# complete records are kept
houses = houses.dropna(axis=0, how='any')

In [4]:
# Let's look at which variables (columns) are available
houses.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

### Selecting features


In [5]:
# Column the model is asked to predict
target = 'price'

# Columns used as model inputs
features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

## Split data into training and test sets

In [6]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    houses[features],
    houses[target],
    test_size=0.3,
    random_state=1,
)

## Gradient boosted Regressor

In [48]:
# Create the gradient boosted ensemble regression model.
# The loss is left at its default, which is least squares in every
# scikit-learn version: the old spelling loss='ls' was deprecated in 1.0 and
# removed in 1.2, so passing it explicitly fails on current releases.
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same fit.
gradient_regressor = GradientBoostingRegressor(max_leaf_nodes=25, random_state=1)

# Train the model on the training split. fit() returns the fitted estimator
# itself, so both names refer to the same object.
gradient_boosted_ensemble_reg = gradient_regressor.fit(x_train, y_train)

In [49]:
# Predict prices for the training split and the test split, in that order
model_predict_train, model_predict_test = (
    gradient_boosted_ensemble_reg.predict(split_features)
    for split_features in (x_train, x_test)
)

In [51]:
# Evaluate the gradient boosted regression model on the test split.
# The labels name what is actually computed: np.mean of the squared residuals
# is the mean squared error (not a residual *sum* of squares), and its square
# root is the root mean squared error.
test_mse = np.mean((y_test - model_predict_test) ** 2)
print("Mean squared error:", test_mse)
print("Root mean squared error:", np.sqrt(test_mse))
print("R-squared of the gradient boosted regression:", gradient_boosted_ensemble_reg.score(x_test, y_test))

Residual  sum of squares: 41638662821.48775
Mean squares: 204055.53857096785
R-squared of the gradient boosted regression: 0.7332057579442701
