Recap: last time we discussed random forests. We saw that if we train a bunch of unrestricted trees in the same way, their errors are correlated, so a majority vote (or, for regression, an average) over the whole bunch is not much better than one tree by itself.

We made the errors much less correlated by training each tree on a different random subset of the feature set, which got us accuracy comparable to the sklearn implementation of random forests.

In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# the data comes from https://archive.ics.uci.edu/dataset/186/wine+quality
# Both files are ';'-separated. A binary `color` column (1 = red, 0 = white)
# records which file each row came from before the two tables are stacked.
df_red = pd.read_csv('data/winequality-red.csv', sep=';')
df_red['color'] = 1
df_white = pd.read_csv('data/winequality-white.csv', sep=';')
df_white['color'] = 0
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the red and white row labels overlap, so a label-based row lookup (e.g.
# `.loc` with sampled index values) would match more than one row per label.
df = pd.concat([df_red, df_white], ignore_index=True)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


In [3]:
# correlation heatmap plot
# fig,ax = plt.subplots(figsize=(10, 7))
# ax = sns.heatmap(df.corr(), vmin=-1, vmax=1,cmap='vlag',annot=True)
# ax.set_xticklabels(ax.get_xticklabels(),rotation =90)
# ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
# ax.set_title('wine', pad = 25);

We'll predict the target, `quality`, using all of the other columns as features.

In [4]:
# Predictor columns: the eleven measurement columns from the CSV files plus
# the `color` indicator column.
features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'color',
]
# The column to predict, kept as a one-element list of column names.
target = ['quality']

In [5]:
# Two-stage split: first hold out 20% of all rows as the test set, then
# divide the remaining rows evenly between training and validation.
x_tr, x_te, y_tr, y_te = train_test_split(
    df[features], df[target], test_size=0.2,
)
x_tr, x_va, y_tr, y_va = train_test_split(
    x_tr, y_tr, test_size=0.5,
)

In [6]:
# dropping random features
# Each tree is trained on its own random, non-empty subset of the columns.
num_trees = 100
fss = [] # a list to hold the list of feature subsets
for i in range(num_trees):
    # Draw a random keep/drop mask with one entry per feature, redrawing
    # until at least one feature is kept. The validated mask itself must be
    # the one used for indexing: drawing a fresh mask at that point could
    # select no columns at all and hand the tree an empty feature matrix.
    mask = np.random.randint(0, 2, len(features)).astype(bool)
    while not mask.any():
        mask = np.random.randint(0, 2, len(features)).astype(bool)
    fs = np.array(features)[mask]
    fss.append(df[fs].columns)

trees = []
for i in range(num_trees):
    # every tree gets its own random seed and its own feature subset fss[i]
    tr = DecisionTreeRegressor(random_state = np.random.randint(10**7))
    x_tr_subset = x_tr[fss[i]]
    tr.fit(x_tr_subset,y_tr)
    trees.append(tr)

def forest_pred(forest,inputs,fss):
    '''
    accepts: forest: an iterable of fitted models, each with a predict method
             inputs: the data to predict on; it is indexed as inputs[fs]
                     to select the columns each model was trained on
             fss: the feature subsets, paired in order with forest
    returns: the average of the models' predictions
    '''
    per_tree = [member.predict(inputs[cols]) for member, cols in zip(forest, fss)]
    return sum(per_tree) / len(per_tree)

def score(preds,actual):
    '''
    accepts: preds: an iterable of predictions, one per row of actual
             actual: the actual values, either 1-d or a single column
                     (e.g. a one-column DataFrame or an (n,1) array)
    returns: the R^2 score of the model which generated the predictions,
             i.e. 1 - SSE/SSE_null, where SSE_null is the squared error of
             always predicting the mean of actual
    '''
    # np.asarray lets preds be any iterable (list, tuple, Series), not only
    # an ndarray that has a .reshape method.
    preds = np.asarray(preds)
    # Give preds the same dimensionality as actual. Subtracting an (n,1)
    # column from a 1-d actual would broadcast to an n-by-n matrix and
    # silently produce a meaningless score.
    if np.ndim(actual) == 1:
        preds = preds.reshape(-1)
    else:
        preds = preds.reshape(-1,1)
    resid = actual - preds
    sse = np.sum(resid**2)
    resid_null = actual - np.mean(actual)
    sse_null = np.sum(resid_null**2)
    r2 = 1-sse/sse_null
    return(r2)

# average the forest's predictions on the validation split and score them
preds = forest_pred(forest=trees, inputs=x_va, fss=fss)
score(preds=preds, actual=y_va)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


quality    0.462691
dtype: float64

In [7]:
# sklearn built-in random forest
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
# Flatten the target to 1-d before fitting: passing a single-column
# DataFrame makes fit() warn that a column vector was given where a 1-d
# array was expected.
rfr.fit(x_tr, y_tr.to_numpy().ravel())
rfr.score(x_va,y_va)

  rfr.fit(x_tr,y_tr)


0.47845143650304056

In [8]:
# our attempt using random subspace projection
num_trees = 100
trees = []
fss = []
n_features = x_tr.shape[1] # dimension of the feature space (no hard-coded 12)
for i in range(num_trees):
    k = np.random.randint(1, n_features + 1) # number of dimensions kept
    fs = np.random.uniform(-100, 100, (n_features, k))
    fs,_ = np.linalg.qr(fs) # orthonormalize the columns of fs (QR-factorization)
    np.random.shuffle(fs.T) # in case the columns aren't random, shuffle them.
    fss.append(fs)
    # here, fs is an n_features-by-k matrix with orthonormal columns.
    # multiplying (features)*fs gives each row's coordinates in the
    # k-dimensional subspace spanned by the cols of fs
    # The seed bound stays below 2**31 so the draw also works where NumPy's
    # default integer type is 32-bit; randint(2**32) raises there.
    tr = DecisionTreeRegressor(random_state = np.random.randint(2**31 - 1))
    tr.fit(np.matmul(x_tr,fs),y_tr)
    trees.append(tr)

def forest_pred(forest,inputs,fss):
    '''
    accepts: forest: an iterable of fitted models, each with a predict method
             inputs: the data to predict on, in the original feature space
             fss: the projection matrices, paired in order with forest;
                  inputs is multiplied by each one before predicting
    returns: the average of the models' predictions
    '''
    per_tree = [member.predict(np.matmul(inputs, basis)) for member, basis in zip(forest, fss)]
    return sum(per_tree) / len(per_tree)

# average the forest's predictions on the validation split and score them
preds = forest_pred(forest=trees, inputs=x_va, fss=fss)
score(preds=preds, actual=y_va)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


quality    0.323588
dtype: float64

In [34]:
# we could try skewing things a little higher on the number of kept features
num_trees = 100
trees = []
fss = []
n_features = x_tr.shape[1] # dimension of the feature space (no hard-coded 12)
for i in range(num_trees):
    # k = n_features minus a chi-square(3) draw, rounded and floored at 1,
    # so most trees keep nearly all of the dimensions
    k = max(round(n_features - np.random.chisquare(3)), 1)
    fs = np.random.uniform(-100, 100, (n_features, k))
    fs,_ = np.linalg.qr(fs) # orthonormalize the columns of fs (QR-factorization)
    np.random.shuffle(fs.T) # in case the columns aren't random, shuffle them.
    fss.append(fs)
    # The seed bound stays below 2**31 so the draw also works where NumPy's
    # default integer type is 32-bit; randint(2**32) raises there.
    tr = DecisionTreeRegressor(random_state = np.random.randint(2**31 - 1))
    tr.fit(np.matmul(x_tr,fs),y_tr)
    trees.append(tr)

def forest_pred(forest,inputs,fss):
    '''
    accepts: forest: an iterable of fitted models, each with a predict method
             inputs: the data to predict on, in the original feature space
             fss: the projection matrices, paired in order with forest;
                  inputs is multiplied by each one before predicting
    returns: the average of the models' predictions
    '''
    per_tree = [member.predict(np.matmul(inputs, basis)) for member, basis in zip(forest, fss)]
    return sum(per_tree) / len(per_tree)

# average the forest's predictions on the validation split and score them
preds = forest_pred(forest=trees, inputs=x_va, fss=fss)
score(preds=preds, actual=y_va)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


quality    0.3427
dtype: float64

In [36]:
# using normalization, PCA, and random subspace projection
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
# NOTE(review): sklearn's normalize rescales each ROW (sample) to unit norm
# by default; it does not standardize the columns. Confirm this is the
# intended "normalization" -- any change must be mirrored wherever new
# inputs are transformed before prediction.
x_tr_n = normalize(x_tr) # computed once and reused for fit and transform
pca = PCA()
pca.fit(x_tr_n)
x_tr_p = pca.transform(x_tr_n)

num_trees = 100
trees = []
fss = []
n_features = x_tr_p.shape[1] # dimension of the PCA space (no hard-coded 12)

for i in range(num_trees):
    # k = n_features minus a chi-square(4) draw, rounded and floored at 1
    k = max(round(n_features - np.random.chisquare(4)), 1)
    fs = np.random.uniform(-100, 100, (n_features, k))
    fs,_ = np.linalg.qr(fs) # orthonormalize the columns of fs (QR-factorization)
    np.random.shuffle(fs.T) # in case the columns aren't random, shuffle them.
    fss.append(fs)
    # The seed bound stays below 2**31 so the draw also works where NumPy's
    # default integer type is 32-bit; randint(2**32) raises there.
    tr = DecisionTreeRegressor(random_state = np.random.randint(2**31 - 1))
    tr.fit(np.matmul(x_tr_p,fs),y_tr)
    trees.append(tr)

def forest_pred(forest,inputs,fss):
    '''
    accepts: forest: an iterable of fitted models, each with a predict method
             inputs: the data to predict on, in the original feature space;
                     it is passed through normalize and then the fitted
                     module-level pca before being projected
             fss: the projection matrices, paired in order with forest
    returns: the average of the models' predictions
    '''
    transformed = pca.transform(normalize(inputs))
    per_tree = [member.predict(np.matmul(transformed, basis)) for member, basis in zip(forest, fss)]
    return sum(per_tree) / len(per_tree)

# average the forest's predictions on the validation split and score them
preds = forest_pred(forest=trees, inputs=x_va, fss=fss)
score(preds=preds, actual=y_va)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


quality    0.351074
dtype: float64

BTW your homework assignment is to try normalization, then PCA, then dropping random columns.

Another intervention is to vary the rows involved in training by using bootstrapping.

In [38]:
# dropping random features, and bootstrapping the training rows
num_trees = 100
fss = [] # a list to hold the list of feature subsets
for i in range(num_trees):
    # Draw a random keep/drop mask with one entry per feature, redrawing
    # until at least one feature is kept. The validated mask itself must be
    # the one used for indexing: drawing a fresh mask at that point could
    # select no columns at all and hand the tree an empty feature matrix.
    mask = np.random.randint(0, 2, len(features)).astype(bool)
    while not mask.any():
        mask = np.random.randint(0, 2, len(features)).astype(bool)
    fs = np.array(features)[mask]
    fss.append(df[fs].columns)

trees = []
for i in range(num_trees):
    tr = DecisionTreeRegressor(random_state = np.random.randint(10**7))
    # Bootstrap: sample row POSITIONS with replacement and select them with
    # .iloc. Positions are used rather than index labels so that every draw
    # picks exactly one row even if x_tr's index contains repeated labels
    # (.loc would return every row sharing a sampled label).
    # NOTE(review): the sample size 5000 is hard-coded; a conventional
    # bootstrap draws len(x_tr) rows -- confirm which is intended.
    bootstrap_index = np.random.randint(0, len(x_tr), 5000)
    x_tr_subset = x_tr[fss[i]]
    tr.fit(x_tr_subset.iloc[bootstrap_index],y_tr.iloc[bootstrap_index])
    trees.append(tr)
    
def forest_pred(forest,inputs,fss):
    '''
    accepts: forest: an iterable of fitted models, each with a predict method
             inputs: the data to predict on; it is indexed as inputs[fs]
                     to select the columns each model was trained on
             fss: the feature subsets, paired in order with forest
    returns: the average of the models' predictions
    '''
    per_tree = [member.predict(inputs[cols]) for member, cols in zip(forest, fss)]
    return sum(per_tree) / len(per_tree)

def score(preds,actual):
    '''
    accepts: preds: an iterable of predictions, one per row of actual
             actual: the actual values, either 1-d or a single column
                     (e.g. a one-column DataFrame or an (n,1) array)
    returns: the R^2 score of the model which generated the predictions,
             i.e. 1 - SSE/SSE_null, where SSE_null is the squared error of
             always predicting the mean of actual
    '''
    # np.asarray lets preds be any iterable (list, tuple, Series), not only
    # an ndarray that has a .reshape method.
    preds = np.asarray(preds)
    # Give preds the same dimensionality as actual. Subtracting an (n,1)
    # column from a 1-d actual would broadcast to an n-by-n matrix and
    # silently produce a meaningless score.
    if np.ndim(actual) == 1:
        preds = preds.reshape(-1)
    else:
        preds = preds.reshape(-1,1)
    resid = actual - preds
    sse = np.sum(resid**2)
    resid_null = actual - np.mean(actual)
    sse_null = np.sum(resid_null**2)
    r2 = 1-sse/sse_null
    return(r2)

# average the forest's predictions on the validation split and score them
preds = forest_pred(forest=trees, inputs=x_va, fss=fss)
score(preds=preds, actual=y_va)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


quality    0.465712
dtype: float64

In [40]:
# btw xgboost
from xgboost import XGBRegressor

# default-settings XGBoost regressor: fit on the training split, then
# scored on the validation split for comparison with the forests above
xgb = XGBRegressor().fit(x_tr, y_tr)
xgb.score(x_va, y_va)

0.3967527803842493

In [41]:
from sklearn.ensemble import GradientBoostingRegressor

# NOTE(review): the variable is still called `xgb`, but the model here is
# sklearn's GradientBoostingRegressor, not XGBoost. The name is kept so that
# any later code referring to `xgb` keeps working.
xgb = GradientBoostingRegressor()
# Flatten the target to 1-d before fitting: passing a single-column
# DataFrame makes fit() warn that a column vector was given where a 1-d
# array was expected.
xgb.fit(x_tr, y_tr.to_numpy().ravel())
xgb.score(x_va,y_va)

  y = column_or_1d(y, warn=True)


0.41082322171484587