In [9]:
import numpy as np
import pandas as pd
from ISLP import load_data
from ISLP.models import ModelSpec as MS
import sklearn.model_selection as skm
from sklearn.ensemble import (RandomForestRegressor as RF, GradientBoostingRegressor as GBR)
from matplotlib.pyplot import subplots
from sklearn.model_selection import cross_val_score

In [10]:
# Initalizing variables
Carseats = load_data('Carseats')
Carseats.head()

model = MS(Carseats.columns.drop('Sales'), intercept=False)
D = model.fit_transform(Carseats)
feature_names = list(D.columns)
X = np.asarray(D)

In [11]:
X_train, X_test, y_train, y_test = skm.train_test_split(X,
                                                        Carseats['Sales'],
                                                        test_size=0.3,
                                                        random_state=0)

In [12]:
# Fit the regression tree
reg = RF(max_features=X_train.shape[1], random_state=0)
reg.fit(X_train, y_train)

In [13]:
# Test MSE
np.mean((reg.predict(X_test)-y_test)**2)

2.0077445197500015

In [26]:
np.mean(cross_val_score(reg, X_train, y_train, cv=5))

0.6731099769096562

In [17]:
# Cross validation another way
validation = skm.ShuffleSplit(n_splits=5,
                              test_size=200,
                              random_state=0)
results = skm.cross_validate(reg,
                             D,
                             Carseats['Sales'],
                             cv=validation)
np.mean(results['test_score'])

0.6511126043799367

In [30]:
# Finding optimal level of tree complexity
cv_scores = []
for max_features in range(1,X_train.shape[1]):
    reg = RF(max_features=max_features, random_state=0)
    reg.fit(X_train, y_train)
    cv_scores.append(np.mean(cross_val_score(reg, X_train, y_train, cv=5)))

[0.4473326233403595,
 0.5264725193540191,
 0.574762478684291,
 0.618712631635532,
 0.6385676098952737,
 0.6389580100295555,
 0.6608047369291623,
 0.6463536045848741,
 0.6555641124737942,
 0.6676613043176157]

In [31]:
np.max(cv_scores)
# The best cv score was obtained with full model, so pruning doesn't really help

0.6676613043176157

In [36]:
# Creating dataframe to display
feature_imp = pd.DataFrame(
    {'importance': reg.feature_importances_},
    index=feature_names)
feature_imp.sort_values(by ='importance', ascending=False)

Unnamed: 0,importance
Price,0.280943
ShelveLoc[Good],0.216506
Age,0.109185
CompPrice,0.097009
ShelveLoc[Medium],0.07754
Advertising,0.074665
Income,0.055659
Population,0.043964
Education,0.031115
US[Yes],0.006771


In [None]:

# Bayesian Additive Regression Trees (BART) using ISLP
bart_carseats = BART(random_state=0, burnin=5, ndraw=15)
bart_bo.fit(X_train, y_train)