In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
boston = load_boston()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Explore keys of the directory
print(boston.keys())

In [4]:
# List all the independent variables
print(boston.feature_names)

In [5]:
# Print description of those variables
print(boston)

In [6]:
# Create dataframe
df = pd.DataFrame(boston.data)
df.columns = boston.feature_names
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
# Add target variable 'price' to dataframe
df['PRICE'] = boston.target
df.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,11.9


Analysis on dataset

In [9]:
# Get info on dataset
df.info()

In [10]:
# Transpose description
df.describe().T

Data Visualization

In [11]:
# slice to use all except the last column (which is 'price'):
column_list = df.columns[:-1]
sns.pairplot(df.loc[:,column_list], height=5)

In [12]:
# correlations between the selected columns
correlations = df.loc[:,column_list].corr()
correlations

In [13]:
# produce heatmap using seaborn
sns.set(font_scale=1)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlations, cmap='coolwarm', annot=True, fmt=".2f")

In [None]:
# Separate the target variable and rest of the variables using .iloc to subset the data.
X, Y = df.iloc[:,:-1], df.iloc[:,-1]
# Convert the dataset into an optimized data structure called Dmatrix that XGBoost supports 
# and gives it acclaimed performance and efficiency gains.
xgboost_data_dmatrix = xgb.DMatrix(data=X,label=Y)
# Create the train and test set for cross-validation of the results using the train_test_split 
# function from sklearn's model_selection using test_size 20%
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=True, test_size=0.2, random_state=0)

In [15]:
# Instantiate the XGBoost regressor object by calling the XGBRegressor() class from the XGBoost library 
# with the hyper-parameters passed as arguments.
# xgboost_reg = xgb.XGBRegressor(objective ='reg:linear', verbosity = 0, colsample_bytree = 0.3, learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 10)
xgboost_reg = xgb.XGBRegressor(colsample_bytree = 1.0, learning_rate = 0.1, max_depth = 10, subsample = 1.0)

In [20]:
# Fit the regressor to the training set and make predictions on the test set using the familiar .fit() and .predict() methods.
xgboost_reg.fit(X_train, y_train)
xgboost_train_score = xgboost_reg.score(X_train, y_train)  
print("Training score: ", xgboost_train_score)

# Kfold cross validation
xgboost_kf_cv_scores = cross_val_score(xgboost_reg, X_train, y_train, cv=10)
print(xgboost_kf_cv_scores)
print("K-fold CV mean: %.2f" % (xgboost_kf_cv_scores.mean()*100))
print("K-fold CV standard deviation: %.2f" % (xgboost_kf_cv_scores.std()*100))

# Compute prediction on the test set
xgboost_preds = xgboost_reg.predict(X_test)
# Compute the root-mean-square error (rmse) by invoking the mean_sqaured_error function from sklearn's metrics module.
xgboost_rmse = np.sqrt(mean_squared_error(y_test, xgboost_preds))
print("RMSE: %f" % (xgboost_rmse))

Training score:  0.9999864973339783
[0.90330177 0.86309502 0.94272749 0.72337767 0.93227713 0.93714686
 0.83822993 0.94267741 0.95303905 0.93001073]
K-fold CV mean: 89.66
K-fold CV standard deviation: 6.79
RMSE: 4.582927


Random Forest

In [53]:
# Create random forest regressor and fit the data
rf_reg = RandomForestRegressor(max_samples = 1.0, max_depth = 10)
rf_reg.fit(X_train, y_train)
rf_train_score = rf_reg.score(X_train, y_train)
print("Training score: ", rf_train_score)

# Kfold cross validation
rf_kf_cv_scores = cross_val_score(rf_reg, X_train, y_train, cv=10)
print(rf_kf_cv_scores)
print("Random forest K-fold CV mean: %.2f" % (rf_kf_cv_scores.mean()*100))
print("Random forest K-fold CV standard deviation: %.2f" % (rf_kf_cv_scores.std()*100))

# Compute predictions on test data
rf_preds = rf_reg.predict(X_test)
# Calculate Root mean-squared-error
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
print("RMSE: %f" % (rf_rmse))

Training score:  0.9821002556544126
[0.88085079 0.89885238 0.93412414 0.63604271 0.92784284 0.89863662
 0.74321062 0.92028281 0.89107389 0.91349335]
Random forest K-fold CV mean: 86.44
Random forest K-fold CV standard deviation: 9.20
RMSE: 4.285347


Linear regression

In [54]:
# Create linear regressor model and fit on the data
linear_regressor = LinearRegression()  # create object for the class
linear_regressor.fit(X_train, y_train)  # perform linear regression

lr_train_score = rf_reg.score(X_train, y_train)
print("Training score: ", lr_train_score)

# Kfold cross validation
lr_kf_cv_scores = cross_val_score(linear_regressor, X_train, y_train, cv=10)
print(lr_kf_cv_scores)
print("linear regressor K-fold CV mean: %.2f" % (lr_kf_cv_scores.mean()*100))
print("linear regressor K-fold CV standard deviation: %.2f" % (lr_kf_cv_scores.std()*100))

# Compute predictions on test data
Y_pred = linear_regressor.predict(X_test)
# Calculate Root mean-squared-error
lr_rmse = np.sqrt(mean_squared_error(y_test, Y_pred))
print("RMSE: %f" % (lr_rmse))

Training score:  0.9821002556544126
[0.82792704 0.68862114 0.77639361 0.55893611 0.76232517 0.77427527
 0.51262036 0.77173959 0.83136542 0.80592263]
linear regressor K-fold CV mean: 73.10
linear regressor K-fold CV standard deviation: 10.52
RMSE: 5.783509


Feature Importance

In [59]:

# Function defination to plot feature importance
def plot_feature_importance(importance, names, model_type):

    #Create arrays from feature importance and feature names
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names, 'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)

    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Searborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    #Add chart labels
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FRACTIONAL CONTRIBUTION')
    plt.ylabel('FEATURE NAMES')

# Code coppied from online source
# https://www.analyseup.com/learn-python-for-data-science/python-random-forest-feature-importance-plot.html

In [None]:
# Function call to plot xgboost feature importance
plot_feature_importance(xgboost_reg.feature_importances_, X_train.columns,'XGBOOST') 

In [None]:
# Function call to plot random forest feature importance
plot_feature_importance(rf_reg.feature_importances_, X_train.columns,'RANDOM FOREST') 

In [None]:
# Function call to plot linear regressor feature importance or Coefficient
plot_feature_importance(linear_regressor.coef_, X_train.columns,'LINEAR REGRESSION') 