# Tree Based Methods and Ensemble Methods
Sterling Hayden  

## Decision Tree
1. 'Hitters' Data is a dataset with baseball player salaries and numbers of features
associated with the players. We will split the data into 80-20 % training and validation
data.  
2. Fit a Decision Tree with Number of leaf nodes 5, 10, 15, 20, .... 100. For each model
compute training error and validation errror (MSE). For each Decision Tree with different
numbers of leaf nodes plot numbers of lead nodes vs MSE for training and validation
dataset.


In [3]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

In [8]:
#load Dataset and view some
df=pd.read_csv("Hitters.csv",index_col=0).dropna()
df.index.name = 'Player'
df.head()

Unnamed: 0_level_0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
-Andre Dawson,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
-Andres Galarraga,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
-Alfredo Griffin,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [5]:
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])

#Split feature and response and replace categorical features with numerical dum
y = df.Salary
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float')
# Define the feature set X.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
print(X.shape)


(263, 19)


In [10]:
#Split the data into 80-20 % training and validation data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=88)

### Fit a Decision Tree with Number of leaf nodes = 5, 10, 15, 20, ... 100


In [20]:
num_leaf = np.linspace(5,100, 25).astype(int)
train_MSE = []
test_MSE = []
for i in num_leaf: 
    regressor = DecisionTreeRegressor(random_state=0,max_leaf_nodes=i)
    regressor.fit(X_train,y_train)
    
    y_pred_train = regressor.predict(X_train)
    train_error = mean_squared_error(y_train,y_pred_train)
    train_MSE.append(train_error)
    y_pred_test = regressor.predict(X_test)
    test_error = mean_squared_error(y_test, y_pred_test)
    test_MSE.append(test_error)
    print(f"num_leaf: {i}, train_MSE: {train_error}, test_MSE: {test_error}")


num_leaf: 5, train_MSE: 83487.78103289659, test_MSE: 65264.8917396763
num_leaf: 8, train_MSE: 59178.41722101653, test_MSE: 55049.53294288881
num_leaf: 12, train_MSE: 42123.62142117608, test_MSE: 55884.30184611296
num_leaf: 16, train_MSE: 32103.00520555323, test_MSE: 59834.87706192988
num_leaf: 20, train_MSE: 23030.276059594475, test_MSE: 83123.09752191976
num_leaf: 24, train_MSE: 18123.63521510096, test_MSE: 85400.81806379478
num_leaf: 28, train_MSE: 14412.47919722019, test_MSE: 84581.25038135794
num_leaf: 32, train_MSE: 10820.12864793139, test_MSE: 78860.85438093558
num_leaf: 36, train_MSE: 8291.327600527131, test_MSE: 75869.61616616652
num_leaf: 40, train_MSE: 6430.743829992893, test_MSE: 73480.92570757102
num_leaf: 44, train_MSE: 5015.097584447165, test_MSE: 75213.81087687763
num_leaf: 48, train_MSE: 4045.6460659693194, test_MSE: 76126.41913450633
num_leaf: 52, train_MSE: 3127.363187750255, test_MSE: 76364.28142053106
num_leaf: 56, train_MSE: 2501.5409841228793, test_MSE: 76276.7181

In [22]:
best_num_leaf = num_leaf[np.argmin(test_MSE)]
best_test_MSE = min(test_MSE)
# best leaf # based on test mse
print(f"Best num_leaf: {best_num_leaf}, Best test_MSE: {best_test_MSE}")

Best num_leaf: 8, Best test_MSE: 55049.53294288881


## Bagging, Random Forest, and Boosting for Regression Task

In [4]:
#loading dataset
df=pd.read_csv("boston.csv").dropna()
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7


In [5]:
#split dataset
X=df.iloc[:,:-1].values
y=df.MEDV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=88)
X.shape

(394, 13)

### Bagging

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor

In [7]:
#RandomsearchCV Parameters
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [41]:
# In Bagging one of the hyperparameter we can tune are numbers of trees (n_esti
# Numbers of samples to use in bootstraping (max_samples), etc. 
br = BaggingRegressor(random_state = 0)
br_params = {
 'max_samples': [0.5, 0.7], # Fraction of Data to use in Bootst
 'n_estimators': [2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 100], #
 }
# Create a GridSearch object
br_gs = GridSearchCV(br, param_grid = br_params, cv = 3, verbose = 1, n_jobs = 1)

In [42]:
# Fit the Bagging Regressor for all parameter combinations we declared
br_gs.fit(X_train,y_train)


Fitting 3 folds for each of 22 candidates, totalling 66 fits


In [43]:
# Print Best HyperParameters for Bagging and use these params
br_gs.best_params_

{'max_samples': 0.7, 'n_estimators': 80}

In [45]:
# Create Bagging Regressor with Best params and fit with training data
best_br = BaggingRegressor(n_estimators = 80, max_samples = 0.7, oob_score=True)
best_br.fit(X_train,y_train)
# Predict on Train and Test Data
y_pred_train = best_br.predict(X_train)
y_pred_test = best_br.predict(X_test)
# Compute Train and Test and OOS Error
train_MSE = mean_squared_error(y_train, y_pred_train)
test_MSE = mean_squared_error(y_test, y_pred_test)
OOS_error = best_br.oob_score_
print("Training error is :",train_MSE)
print("Validation error is :",test_MSE)
print("Out of sample error:",OOS_error)

Training error is : 3.1358573214285728
Validation error is : 13.72869204905064
Out of sample error: 0.8388234522720248


**Validation Error:** This is the error calculated on a separate dataset, known as the validation set. It is used during the model development process to tune hyperparameters and assess model performance. The validation set is different from the training set but is still part of the same dataset. A validation error helps in selecting the best hyperparameters and detecting early signs of overfitting.  
  
**Out of Sample Error (Test Error):** The OOB score is typically a value between 0 and 1, and it indicates the proportion of correctly predicted instances in the out-of-bag samples. Higher OOB scores, closer to 1, suggest that your model is doing a good job of generalizing from the training data to unseen data. Conversely, lower OOB scores indicate that the model is not generalizing as well.

### Random Forest

In [46]:
from sklearn.ensemble import RandomForestRegressor
# define the rf estimator
rf = RandomForestRegressor(random_state = 0)

In [49]:
# RandomsearchCV Parameters
from sklearn.model_selection import RandomizedSearchCV
# number of trees (estimators/weak models) in Bagging
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 200)]
# number of features to consider at every split
max_features = ['sqrt', 'log2']
# minimum number of samples at leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10]
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
# Create the random grid
random_params = {'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth,
 'min_samples_leaf': min_samples_leaf}

In [51]:
rf_gs = RandomizedSearchCV(estimator = rf, param_distributions = random_params,
 n_iter = 100, cv = 3, verbose = 1, random_state=88)

In [52]:
# Fit the random search model
rf_gs.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [53]:
# best hyper parameters
rf_gs.best_params_

{'n_estimators': 300,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 30}

In [59]:
#Fit final model using best parameters from above RandomGridSearch
best_rf = RandomForestRegressor(n_estimators=300,min_samples_leaf=1,max_features='log2', max_depth=30,
 oob_score=True)
best_rf.fit(X_train,y_train)


In [60]:
#training error and other errors
y_pred_train = best_rf.predict(X_train)
y_pred_test = best_rf.predict(X_test)
train_error=mean_squared_error(y_train, y_pred_train)
test_error=mean_squared_error(y_test, y_pred_test)
OOS_error = best_rf.oob_score_
print("Training error is :",train_error)
print("Validation error is :",test_error)
print("Out of sample error:",OOS_error)

Training error is : 1.6631721446208165
Validation error is : 18.072438170182803
Out of sample error: 0.8506874107617161


### XGboost (Improved Version of Boosting Method)

In [8]:
import xgboost as xgb
# Note we can do hyperparameter tuning RandomsearchCV or GridSearch for XGboost
# But here, I am showing only 1 model for specific paramters. 
xg_reg = xgb.XGBRegressor(objective="reg:linear", n_estimators = 200, subsample = .7)
xg_reg.fit(X_train, y_train)



In [9]:
#training error
y_pred_train=xg_reg.predict(X_train)
y_pred_test=xg_reg.predict(X_test)
training_error=mean_squared_error(y_train, y_pred_train)
testing_error=mean_squared_error(y_test, y_pred_test)
print("Training error is :",training_error)
print("Testing error is :",testing_error)

Training error is : 2.97780448269632e-07
Testing error is : 15.44284607491354


## Bagging, Random Forest and Boosting for Classification Task

In [10]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/'
 'machine-learning-databases/wine/wine.data',
header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
 'Alcalinity of ash', 'Magnesium', 'Total phenols',
 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
 'Proline']

In [128]:
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [11]:
# Drop one of the classes so we have only two classes: Class 2 and Class 3.
# Converting to Binary Classification for Simplicity! 
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
#X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
X = df_wine.iloc[:,1:].values
print(X.shape)

(119, 13)


In [12]:
# Change (Encode) Class Labels to 0/1 from 2/3
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
le = LabelEncoder()
y = le.fit_transform(y)
#print(y)
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

### Bagging

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier

In [14]:
#RandomsearchCV Parameters
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [15]:
# In Bagging one of the hyperparameter we can tune are numbers of trees (n_estimators)
# Numbers of samples to use in bootstraping (max_samples), etc. 
br = BaggingClassifier(random_state = 0)
br_params = {
 'max_samples': [0.5, 0.6, 0.7, 0.8], # Fraction of Data to use in Bootstraping
 'n_estimators': [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 100], #
 }
# Create a GridSearch object
br_gs = GridSearchCV(br, param_grid = br_params, cv = 3, verbose = 1, n_jobs = 1)

In [134]:
# Fit the Bagging Regressor for all parameter combinations we declared
br_gs.fit(X_train,y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


In [135]:
# Print Best HyperParameters for Bagging and use these params
br_gs.best_params_

{'max_samples': 0.5, 'n_estimators': 15}

In [136]:
# Create Bagging Classifier with Best params and fit with training data
best_br = BaggingClassifier(n_estimators = 15, max_samples = 0.5, oob_score=True)
best_br.fit(X_train,y_train)
# Predict on Train and Test Data
y_pred_train = best_br.predict(X_train)
y_pred_test = best_br.predict(X_test)
# Compute Train and Test and OOS Error
train_MSE = mean_squared_error(y_train, y_pred_train)
test_MSE = mean_squared_error(y_test, y_pred_test)
OOS_error = best_br.oob_score_
print("Training error is :",train_MSE)
print("Validation error is :",test_MSE)
print("Out of sample error:",OOS_error)

Training error is : 0.0
Validation error is : 0.08333333333333333
Out of sample error: 0.9578947368421052


### Random Forest

In [137]:
from sklearn.ensemble import RandomForestClassifier
# define the rf estimator
rf = RandomForestClassifier(random_state = 0)

In [138]:
# RandomsearchCV Parameters
from sklearn.model_selection import RandomizedSearchCV
# number of trees (estimators/weak models) in Bagging
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 100)]
# number of features to consider at every split
max_features = ['sqrt', 'log2']
# minimum number of samples at leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10]
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# Create the random grid
random_params = {'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth,
 'min_samples_leaf': min_samples_leaf}

In [139]:
rf_gs = RandomizedSearchCV(estimator = rf, param_distributions = random_params,
 n_iter = 100, cv = 3, verbose = 1, random_state=88)

In [140]:
# Fit the random search model
rf_gs.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [141]:
# best hyper parameters
rf_gs.best_params_

{'n_estimators': 330,
 'min_samples_leaf': 8,
 'max_features': 'sqrt',
 'max_depth': 60}

In [142]:
#Fit final model using best parameters from above RandomGridSearch
best_rf = RandomForestClassifier(n_estimators=330,min_samples_leaf=8,max_features='sqrt', max_depth=60,
 oob_score=True)
best_rf.fit(X_train,y_train)

In [143]:
#training error and other errors
y_pred_train = best_rf.predict(X_train)
y_pred_test = best_rf.predict(X_test)
train_error=mean_squared_error(y_train, y_pred_train)
test_error=mean_squared_error(y_test, y_pred_test)
OOS_error = best_rf.oob_score_
print("Training error is :",train_error)
print("Validation error is :",test_error)
print("Out of sample error:",OOS_error)

Training error is : 0.0
Validation error is : 0.125
Out of sample error: 1.0


### XGboost

In [16]:
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(random_state = 0)

In [17]:
# Note this is a version of hyperparameter tuning RandomsearchCV or GridSearch for XGboost

# RandomsearchCV Parameters
from sklearn.model_selection import RandomizedSearchCV
# number of trees (estimators/weak models) in Bagging
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 200)]
# subsmple 
subsample = [.5, .6, .7, .8]
# Create the random grid
random_params = {
 'n_estimators': n_estimators,
 'subsample': subsample
}

In [18]:
xgb_rs = RandomizedSearchCV(estimator = xgb_clf, param_distributions = random_params,
 n_iter = 100, cv = 3, verbose = 1, random_state=88)

In [19]:
# Fit the random search model
xgb_rs.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [20]:
# best hyper parameters
xgb_rs.best_params_

{'subsample': 0.6, 'n_estimators': 600}

In [21]:
best_xbg_cls = xgb.XGBClassifier(objective="binary:logistic", n_estimators = 600, subsample = .6)
best_xbg_cls.fit(X_train,y_train)

In [23]:
#training error
y_pred_train=best_xbg_cls.predict(X_train)
y_pred_test=best_xbg_cls.predict(X_test)
training_error=mean_squared_error(y_train, y_pred_train)
testing_error=mean_squared_error(y_test, y_pred_test)
print("Training error is :",training_error)
print("Testing error is :",testing_error)

Training error is : 0.0
Testing error is : 0.125


**I belive XGBoost to be the best model for the classidfication case.**