# Assignment 3: Regression & Ensembles

In [1]:
# enter your name and UFL email address
name = 'Yang Bai'
email = 'baiyang94@ufl.edu'

In [2]:
# Refuse to run until the placeholder identity has been replaced.
# An explicit raise keeps the AssertionError of the original `assert False`,
# but still fires when Python runs with -O (which strips assert statements).
if name == 'enter your name' or email == 'enter your email':
    raise AssertionError('Enter your name & email first!')

print('Assignment 3 -- name: {}, email: {}\n'.format(name, email))

# Load packages we need
import sys
import os
import time

import numpy as np
import pandas as pd
import sklearn

from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 16})

# Let's check our software versions
print('### Python version: ' + sys.version)
print('### NumPy version: ' + np.__version__)
print('### Scikit-learn version: ' + sklearn.__version__)
print('------------')


# load our packages / code
# Only register the helper directory once so re-running this cell does not
# keep growing sys.path with duplicate entries.
if '../common/' not in sys.path:
    sys.path.insert(1, '../common/')
import utils
import plots

Assignment 3 -- name: Yang Bai, email: baiyang94@ufl.edu

### Python version: 3.8.3 (default, Jul  2 2020, 11:26:31) 
[Clang 10.0.0 ]
### NumPy version: 1.19.5
### Scikit-learn version: 0.23.1
------------


In [3]:
# global parameters to control behavior of the pre-processing, ML, analysis, etc.
# Single seed shared by NumPy's global RNG and by the calls below that accept
# a 'seed' / 'random_state' argument.
seed = 42

# deterministic seed for reproducibility
##rng = np.random.default_rng(seed)  # best practice but not fully implemented in scikit-learn
np.random.seed(seed)

# Relative split sizes handed to utils.train_test_val_split, whose results are
# unpacked in train / test / val order. 14:3:3 is consistent with the
# 12166 / 2607 / 2606 rows reported after the first split (about 70/15/15 %).
# NOTE(review): the exact semantics live in utils.train_test_val_split -- confirm there.
prop_vec = [14, 3, 3]

## Part 1: Loading and Pre-processing Data

### For this assignment we'll load the Bike Sharing dataset (hourly)
### This dataset contains features of users bike sharing/rental on an hourly basis.
### The task is to predict how many users are sharing/renting a bike.

### Loading data

In [4]:
### Note: this dataset has missing values (artificially introduced), which you'll need to fill in before you can train a model
df = pd.read_csv('../data/bikesharehour.csv.gz', compression='gzip', header=0, na_values='?')

# Check that we loaded the data as expected
df_expected_shape = (17379, 15)

assert df.shape == df_expected_shape, 'Unexpected shape of df!'

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      16320 non-null  float64
 1   year        16231 non-null  float64
 2   month       16304 non-null  float64
 3   hour        16254 non-null  float64
 4   holiday     16277 non-null  float64
 5   weekday     16282 non-null  float64
 6   workingday  16297 non-null  float64
 7   weathersit  16324 non-null  float64
 8   temp        16242 non-null  float64
 9   atemp       16271 non-null  float64
 10  hum         16252 non-null  float64
 11  windspeed   16281 non-null  float64
 12  registered  16244 non-null  float64
 13  nsqrtc      16263 non-null  float64
 14  count       17379 non-null  int64  
dtypes: float64(14), int64(1)
memory usage: 2.0 MB


In [5]:
## what does the data look like?
df.head()

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,registered,nsqrtc,count
0,1.0,0.0,,0.0,0.0,6.0,0.0,1.0,,0.0,0.0,0.0,13.0,-5.0,16
1,1.0,0.0,,1.0,0.0,6.0,0.0,1.0,,0.0,0.0,0.0,32.0,-8.0,40
2,1.0,0.0,1.0,2.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,27.0,-7.0,32
3,1.0,0.0,1.0,3.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0,-5.0,13
4,1.0,0.0,1.0,4.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1


### There are some NaNs which we'll have to impute!

In [6]:
# Column layout of the frame: every column but the last is a feature,
# the last one is the prediction target.
col_names = list(df.columns)
*features, target = col_names

# Dense numeric matrix holding the features and the target side by side
all_xy = df.to_numpy()

In [7]:
print('features: {} --- target: {}'.format(features, target))

features: ['season', 'year', 'month', 'hour', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'registered', 'nsqrtc'] --- target: count


In [8]:
# how many NaNs in each column?
np.sum(np.isnan(all_xy), axis=0)

array([1059, 1148, 1075, 1125, 1102, 1097, 1082, 1055, 1137, 1108, 1127,
       1098, 1135, 1116,    0])

### Observe: no NaNs in the target/value column
### About 1000+ NaNs in each feature

In [9]:
# split into x and y
all_x_nan = all_xy[:,:-1]
all_y = all_xy[:,-1]

In [10]:
print(all_x_nan)

[[  1.   0.  nan ...   0.  13.  -5.]
 [  1.   0.  nan ...   0.  32.  -8.]
 [  1.   0.   1. ...   0.  27.  -7.]
 ...
 [  1.   1.  12. ...   3.  83.  -8.]
 [ nan   1.  12. ...   2.  48. -11.]
 [  1.   1.  12. ...   2.  37.  nan]]


## [Task 1] (10 points) Let's impute the missing values! Use Scikit-learn's SimpleImputer to replace all NaNs in 'all_x_nan' with the *most frequent* value in each column. Use copy=True and store the results in 'all_x' 

In [11]:
from sklearn.impute import SimpleImputer

###* put your code here (~2-3 lines) *###
# Learn the per-column mode and fill the NaNs in one step.
# copy=True leaves 'all_x_nan' untouched and returns a new array.
imp_mf = SimpleImputer(strategy='most_frequent', missing_values=np.nan, copy=True)
all_x = imp_mf.fit_transform(all_x_nan)
print(all_x)

[[  1.   0.   7. ...   0.  13.  -5.]
 [  1.   0.   7. ...   0.  32.  -8.]
 [  1.   0.   1. ...   0.  27.  -7.]
 ...
 [  1.   1.  12. ...   3.  83.  -8.]
 [  3.   1.  12. ...   2.  48. -11.]
 [  1.   1.  12. ...   2.  37.   0.]]


In [12]:
# check that the shape is correct
assert all_x.shape == (17379, 14)

# check that there are no more NaNs
assert np.sum(np.sum(np.isnan(all_x), axis=0)) == 0

### Rescale the features

In [13]:
# We'll min-max normalize the features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(copy=True)
scaler.fit(all_x) 

scaled_all_x = scaler.transform(all_x)

### Let's split the data

In [14]:
# split the data into train, test, val
train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(scaled_all_x, all_y, prop_vec, shuffle=True, seed=seed)

# sanity check shapes
train_x.shape, train_y.shape, test_x.shape, test_y.shape, val_x.shape, val_y.shape

((12166, 14), (12166,), (2607, 14), (2607,), (2606, 14), (2606,))

## [Task 2] (30 points) Let's train linear models!

### [Task 2a] (2 points) Train a Linear Regression model using the default hyperparameters. 

In [15]:
from sklearn.linear_model import LinearRegression

### Train a linear regression model on the training data (train_x, train_y)
### Call the resulting trained model 'lrmodel'
###* put your code here (~1 line) *###

lrmodel = LinearRegression().fit(train_x, train_y)

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def r2_mse_mae_eval(model, pref=''):
    """Print R^2, MSE and MAE of a fitted model on the training and validation data.

    The data is read from the notebook-level variables train_x, train_y, val_x
    and val_y at call time, i.e. the model is scored on whatever split is
    currently bound to those names.

    Parameters
    ----------
    model : fitted estimator
        Must provide score(x, y) and predict(x).
    pref : str, optional
        Text prepended to every printed line (e.g. a tab for indentation).

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # R^2 the coefficient of determination
    r2_train = model.score(train_x, train_y)
    r2_val = model.score(val_x, val_y)

    print('{}Train R^2: {:.3f}, Val  R^2: {:.3f}'.format(pref, r2_train, r2_val))

    train_pred = model.predict(train_x)
    val_pred = model.predict(val_x)

    # scikit-learn metrics are declared as metric(y_true, y_pred). MSE and MAE
    # are symmetric, so the values are unchanged, but following the documented
    # order keeps this safe if an asymmetric metric is ever added here.
    train_mse = mean_squared_error(train_y, train_pred)
    val_mse = mean_squared_error(val_y, val_pred)

    print('{}Train MSE: {:.3f}, Val MSE: {:.3f}'.format(pref, train_mse, val_mse))

    train_mae = mean_absolute_error(train_y, train_pred)
    val_mae = mean_absolute_error(val_y, val_pred)

    print('{}Train MAE: {:.3f}, Val MAE: {:.3f}'.format(pref, train_mae, val_mae))
    
r2_mse_mae_eval(lrmodel)

Train R^2: 0.847, Val  R^2: 0.833
Train MSE: 5033.098, Val MSE: 5532.778
Train MAE: 39.039, Val MAE: 39.742


### [Task 2b] (3 points) How good is that model? (A few sentences is fine.)

In [17]:
###* put your answer as comment here *###
#

# Baseline: a constant model that always predicts the mean of the training targets.
meanv = np.mean(train_y)
# Within this cell the linear model's predictions only serve as a template
# (shape/dtype) for the constant prediction vector built on the next line.
train_pred = lrmodel.predict(train_x)
baseline_pred = np.ones_like(train_pred) * meanv
print('Baseline prediction mean: {}'.format(meanv))

# Error of the constant baseline on the training targets; 'baseline_error'
# first holds the MSE and is then reused for the MAE.
baseline_error = mean_squared_error(baseline_pred, train_y)
print('Baseline error (MSE): {:.3f}'.format(baseline_error))

baseline_error = mean_absolute_error(baseline_pred, train_y)
print('Baseline error (MAE): {:.3f}'.format(baseline_error))

# Answer:
# By comparing the results over the training dataset and the validation dataset, we can see this model 
# generalizes well.
# But to evaluate whether the model works well or not, we need a baseline to compare with.
# Say, we compare it with a simple baseline model which always predicts the mean.
# By comparing the MAE between these two models, we can say the linear regression model works pretty well.
#

Baseline prediction mean: 189.30100279467368
Baseline error (MSE): 32846.529
Baseline error (MAE): 142.710


### Let's setup some functions so we can tune hyperparameters

In [18]:
## some code to do a grid search and automatically train & evaluate the model with the best hyperparams.
from sklearn.model_selection import GridSearchCV

def do_grid_search(model, param_grid, x, y):
    """Run a cross-validated grid search and return the best parameter combination.

    Candidates from 'param_grid' are ranked by negated mean squared error
    (scikit-learn scorers follow a "higher is better" convention) using
    GridSearchCV's default cross-validation settings.

    Returns
    -------
    dict
        The 'best_params_' of the fitted search.
    """
    search = GridSearchCV(estimator=model, param_grid=param_grid,
                          scoring='neg_mean_squared_error')
    search.fit(x, y)
    return search.best_params_


def search_train_eval(model, param_grid, tr_x=train_x, tr_y=train_y, v_x=val_x, v_y=val_y):
    """Grid-search hyperparameters, refit the best configuration and print its scores.

    Steps: (1) cross-validated grid search over the concatenation of the
    training and validation data, (2) refit a fresh copy of 'model' with the
    winning hyperparameters on the training data only, (3) print the class
    name, the chosen hyperparameters and the R^2 / MSE / MAE report.

    Parameters
    ----------
    model : estimator instance
        Template estimator. Its constructor arguments that are not part of
        'param_grid' (e.g. random_state, max_iter) are preserved in the
        returned model.
    param_grid : dict
        Grid passed on to do_grid_search().
    tr_x, tr_y, v_x, v_y : arrays, optional
        Training and validation data. NOTE: the defaults are bound when this
        function is *defined*, so they keep pointing at the split that existed
        at that moment even if train_x / val_x are reassigned later. In
        contrast, r2_mse_mae_eval() scores against the notebook-level
        train/val variables at call time; pass data explicitly (and keep the
        globals in sync) when working with a different split.

    Returns
    -------
    estimator
        The refitted model.
    """
    # Local import: 'clone' is not among the notebook-level imports.
    from sklearn.base import clone

    # since we do CV for the grid search, let's concatenate the train and val sets for it
    search_x = np.r_[tr_x, v_x]
    search_y = np.r_[tr_y, v_y]

    hyperparams = do_grid_search(model, param_grid, search_x, search_y)

    # Clone the template instead of calling type(model)(**hyperparams):
    # re-instantiating from the class alone silently discarded every
    # constructor argument outside the grid (e.g. RandomForestRegressor's
    # random_state), which made the final model non-reproducible.
    m = clone(model).set_params(**hyperparams).fit(tr_x, tr_y)

    cn = type(model).__name__
    print('{}({})'.format(cn, hyperparams))

    r2_mse_mae_eval(m, pref='\t')

    return m

### [Task 2c] (5 points) Do a grid search to tune hyperparameters and train an ElasticNet model. You can choose the values of hyperparameters you search over, but you must search over 'alpha' and 'l1_ratio'. Ensure that during the search, the training of models converges in all cases (you may need to increase 'max_iter' based on your chosen values).

In [19]:
### Hint: you should define a parameter grid dictionary and call search_train_eval() to do the actual search
### Note: you only need to pass a model instance (model) and a parameter grid (param_grid)
### Call the output of search_train_eval(): 'enmodel'
###* put your code here (~3 lines) *###

param_grid = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0], 'l1_ratio': np.arange(0.01, 1, 0.01)}
model = sklearn.linear_model.ElasticNet()
enmodel = search_train_eval(model, param_grid)

ElasticNet({'alpha': 0.001, 'l1_ratio': 0.99})
	Train R^2: 0.847, Val  R^2: 0.833
	Train MSE: 5033.116, Val MSE: 5532.434
	Train MAE: 39.048, Val MAE: 39.749


### [Task 2d] (2 points) Do a grid search to tune hyperparameters and train a Ridge Regression model. You can choose the values of hyperparameters you search over, but you must search over 'alpha'. Ensure that during the search, the training of models converges in all cases (you may need to increase 'max_iter' based on your chosen values).

In [20]:
### Call the output of search_train_eval(): 'rmodel'
###* put your code here (~3 lines) *###

param_grid = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]}
model = sklearn.linear_model.Ridge()
rmodel = search_train_eval(model, param_grid)

Ridge({'alpha': 0.1})
	Train R^2: 0.847, Val  R^2: 0.833
	Train MSE: 5033.101, Val MSE: 5532.577
	Train MAE: 39.046, Val MAE: 39.749


### [Task 2e] (3 points) Print the parameter values (w and b) of the ElasticNet and Ridge Regression models.

In [21]:
# Print the weights and bias for both models
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

### Make sure you print the weights and bias for both models and that it is clear which is which.
print('ElasticNet: \n---w: {} \n--- b: {}'.format(enmodel.coef_, enmodel.intercept_))
print('\nRidge Regression: \n---w: {} \n--- b: {}'.format(rmodel.coef_, rmodel.intercept_))

ElasticNet: 
---w: [ 26.397  14.278 -10.928  52.874 -8.286  6.622 -30.461 -30.814  0.000
 -5.692 -13.158  28.819  926.691 -14.575] 
--- b: 20.647297762732535

Ridge Regression: 
---w: [ 26.431  14.271 -10.972  52.865 -8.324  6.631 -30.472 -30.828  1.476
 -16.757 -13.215  28.873  926.780 -14.613] 
--- b: 20.661251354260088


### [Task 2f] (2 points) How similar are the parameter values of the two models?

In [22]:
###* put your answer as comment here *###
#
parameter_error = mean_absolute_error(np.append(enmodel.coef_, enmodel.intercept_), 
                                     np.append(rmodel.coef_, rmodel.intercept_))
print("MAE between the two set of parameters: {}".format(parameter_error))
# 
# Answer:
# To me, these two sets of parameters are very similar in the sense of MAE.
#

MAE between the two set of parameters: 0.8637771652792111


### [Task 2g] (8 points) For each of the two models, display the three most important features along with their coefficients. Are these the same across both models?
### What are the coefficients? Which feature is the most important?

In [23]:
### Hint: don't forget that coefficients can be positive as well as negative.
###* put your code here *###
# Rank features by the magnitude of their coefficient; the sign only tells the
# direction of the effect. Names are looked up in 'features' rather than
# 'col_names' (which also contains the target column), so a coefficient index
# always maps onto the list that has exactly one entry per coefficient.
mwfidx_Elastic = np.argsort(-np.abs(enmodel.coef_))[:3]
most_important_features_Elastic = [features[i] for i in mwfidx_Elastic]
print('The three Most important features of the ElasticNet model: {} \n(weight: {})'.format(
    most_important_features_Elastic, enmodel.coef_[mwfidx_Elastic]))

mwfidx_Ridge = np.argsort(-np.abs(rmodel.coef_))[:3]
most_important_features_Ridge = [features[i] for i in mwfidx_Ridge]
print('The three Most important features of the Ridge Regression model: {} \n(weight: {})'.format(
    most_important_features_Ridge, rmodel.coef_[mwfidx_Ridge]))


###* put your answer as comment here *###
# 
# Answer: 
# The three most important features are the same across both models.
# The coefficients are printed below.
# The feature 'registered' is the most important one.
#

The three Most important features of the ElasticNet model: ['registered', 'hour', 'weathersit'] 
(weight: [ 926.691  52.874 -30.814])
The three Most important features of the Ridge Regression model: ['registered', 'hour', 'weathersit'] 
(weight: [ 926.780  52.865 -30.828])


### [Task 2h] (5 points) Take a look at the code of search_train_eval() and do_grid_search(). Answer the following questions: 
### 1. Why is the scoring function for the grid search 'neg_mean_squared_error' (as opposed to 'mean_squared_error')? 
### 2. Why is it okay to do the search over search_x and search_y which are the concatenation of the training and validation sets? 

In [24]:
### Hint: take a look at the documentation of scikit-learn for GridSearchCV and related classes.
###* put your answer as comment here *###
#
# 1. Because, all the scorer objects in the sklearn follow the convention that higher return values 
#    are better than lower return values. Hence, it has to be 'neg_mean_squared_error' 
#    instead of 'mean_squared_error'.
#
# 2. Because GridSearchCV does a cross-validated grid-search over a parameter grid, 
#    hence, the validation set is no longer needed.
#

## [Task 3] (30 points) Let's train polynomial regression models!

### [Task 3a] (5 points) Use PolynomialFeatures to create a version of the data with all features of degree 2.

In [25]:
from sklearn.preprocessing import PolynomialFeatures

### Use PolynomialFeatures to create a version of the data with all features of degree 2. Make sure to allow interactions (interaction_only=False) and set include_bias=False.
### Store the result in 'all_x_polyf'. Ensure that you make a copy of the original data and you use the scaled features ('scaled_all_x')!
###* put your code here (~2 lines) *###
polyf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
all_x_polyf = polyf.fit_transform(scaled_all_x.copy())

assert all_x_polyf.shape == (17379, 119)

# split the data into train, test, val
train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(all_x_polyf, all_y, prop_vec, shuffle=True, seed=seed)

### Let's train a LinearRegression model and a Ridge model on our polynomial features.

In [26]:
from sklearn.linear_model import Ridge

# Train a linear regression model
pf_lrmodel = LinearRegression().fit(train_x, train_y)
r2_mse_mae_eval(pf_lrmodel)

print()

# Train a Ridge regression model
pf_ridgemodel = Ridge(alpha=0.5).fit(train_x, train_y)
r2_mse_mae_eval(pf_ridgemodel)

Train R^2: 0.927, Val  R^2: 0.923
Train MSE: 2410.398, Val MSE: 2553.171
Train MAE: 24.883, Val MAE: 24.807

Train R^2: 0.926, Val  R^2: 0.921
Train MSE: 2444.731, Val MSE: 2634.107
Train MAE: 24.644, Val MAE: 24.661


### [Task 3b] (5 points) What is the difference between LinearRegression and Ridge? (A sentence or two is fine.)

In [27]:
###* put your answer as comment here *###
#
# Ridge is a regularized version of LinearRegression.
# Ridge penalizes the model for the sum of squared value of the weights.
#

### [Task 3c] (5 points) Look at (e.g., print) the parameters of both the LinearRegression model ('pf_lrmodel') and the Ridge model ('pf_ridgemodel'). What do you notice? Explain what is going on.

In [28]:
###* put your code here *###
print('LinearRegression: \n---w: {} \n--- b: {}'.format(pf_lrmodel.coef_, pf_lrmodel.intercept_))
print('\nRidge Regression: \n---w: {} \n--- b: {}'.format(pf_ridgemodel.coef_, pf_ridgemodel.intercept_))

###  What do you notice? Explain what is going on.
###* put your answer as comment here *###
#
# I notice that the parameters of the Ridge model are much smaller in magnitude than those of the LinearRegression model.
# Ridge penalizes the model for the sum of squared value of the weights.
#

LinearRegression: 
---w: [ 5.326  30521076005237.777  45.602  130.619  26943418922304.711  30.151
 -9980426199437.012  32.620  27106435058197.676 -59664970501572.828
  13937610703289.674 -15.169  714.621 -1777.331 -15.881  14.484  24.562
  36.437  6.538 -3.561  1.582 -7.538  3430595177194.502
  12908014073732.291 -7.685  12.806 -81.200 -18.620 -30521076005228.848
 -13.176  41.264 -4.891  1.611  5.390 -10.917 -7571619403674.456
  16594786022.967 -4.121  16.353 -113.504 -14.633 -41.108 -0.335 -6.680
  3.458 -2.974  2.181 -3960308083550.017  7834099707978.211  2.426 -11.298
  4.105 -0.968 -114.819 -1.390  8.415  27.144 -47.840 -5199916544932.583
  7453194668229.455 -15.876  45.100 -213.870 -14.104 -26943418922321.574
 -9.536 -13.706  5.559 -1616411.592 -1004507.483 -401834.677  42.514
 -0.721  18.139 -26.896  3.822  2.090 -7480638434452.571
  10444543480497.895 -9.198 -28.013 -25.604  6.392  9980426199429.299
 -10.121  0.000  13589086047201.746 -3.090  10.087 -45.943 -1.166 -28.786
  0.00

### [Task 3d] (5 points) Focus on the Ridge model. What are the three most important features? 

In [29]:
### Print the three most important features alongside with their weights.
### Remember: weights can be positive as well as negative.
### Hint: you can use the get_feature_names() method of PolynomialFeatures to relate polynomial features to the original features.
###* put your code here *###
# Indices of the three coefficients with the largest magnitude
mwfidx_Ridge = (-np.abs(pf_ridgemodel.coef_)).argsort()[:3]

# Translate column indices of the polynomial design matrix back into readable
# terms built from the original feature names. The indices refer to the 119
# polynomial columns, so they cannot be looked up in 'col_names' directly.
# get_feature_names_out() is used where available; older scikit-learn
# releases only provide get_feature_names().
if hasattr(polyf, 'get_feature_names_out'):
    polyf_feature_names = polyf.get_feature_names_out(features)
else:
    polyf_feature_names = polyf.get_feature_names(features)
most_important_features_Ridge = [polyf_feature_names[i] for i in mwfidx_Ridge]

print('The indices of the three Most important features of the Ridge Regression model: {} \n(weight: {})'.format(
    mwfidx_Ridge, pf_ridgemodel.coef_[mwfidx_Ridge]))
print('The corresponding polynomial features: {}'.format(most_important_features_Ridge))


The indices of the three Most important features of the Ridge Regression model: [118  13  12] 
(weight: [ 1513.202 -1469.418  717.231])


### [Task 3e] (5 points) Let's use only these three most important features. Extract the three features from the polynomial features to create a new feature matrix with three columns.

In [30]:
### Extract the three features from 'all_x_polyf' and store the results in 'all_x_3most'
###* put your code here *###
all_x_3most = all_x_polyf[:, mwfidx_Ridge]

assert all_x_3most.shape == (17379, 3)

# split the data into train, test, val
train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(all_x_3most, all_y, prop_vec, shuffle=True, seed=seed)

### [Task 3f] (2 points) Now train a LinearRegression model (default hyperparams) on the training data from 'all_x_3most'. What do you observe about the performance of this model? What is your conclusion?

In [31]:
### Call the model 'threemost_model' and evaluate it using r2_mse_mae_eval()
###* put your code here *###

# The task asks for the model to be called 'threemost_model'.
threemost_model = LinearRegression().fit(train_x, train_y)
# Previous name kept as an alias so any code still referring to it keeps working.
pf_lrmodel_3most = threemost_model
r2_mse_mae_eval(threemost_model)

Train R^2: 0.912, Val  R^2: 0.904
Train MSE: 2903.069, Val MSE: 3181.395
Train MAE: 23.434, Val MAE: 23.624


### [Task 3g] (3 points) How good is that model? What do you conclude?

In [32]:
###* put your answer as comment here *###
#
# Answer:
# The model trained with the three most important features does not perform as well as the models trained on all
# polynomial features, but the performances are very close.
#
#

## [Task 4] (30 points) Trees, More Trees, lots of Trees!

### We need to reset the data to the original form (before polynomial features)

In [33]:
# let's do some cleanup
del train_x, train_y, test_x, test_y, val_x, val_y

# split the data into train, test, val
train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(scaled_all_x, all_y, prop_vec, shuffle=True, seed=seed)

# sanity check shapes
train_x.shape, train_y.shape, test_x.shape, test_y.shape, val_x.shape, val_y.shape
assert train_x.shape == (12166, 14)

### Let's train a decision tree!

In [34]:
from sklearn.tree import DecisionTreeRegressor, plot_tree

dtmodel = DecisionTreeRegressor(random_state=seed).fit(train_x, train_y)

### Uncomment the code in the cell below if you have some time to wait around and want to visualize the tree, otherwise skip it!

In [35]:
### This will take a long time (10-20 minutes); skip if you are in a hurry
# let's plot what the tree looks like
# plt.figure(figsize=(16,12))

# plot_tree(dtmodel, feature_names=features, filled=True, label='all', rounded=True)

# plt.show()

### [Task 4a] (10 points) Answer some questions about the structure of our tree (dtmodel)

#### 1. Can the tree be visualized easily?
#### 2. How deep is the tree?
#### 3. How many nodes does it contain?
#### 4. How many total splits are there?
#### 5. What is the impurity of the last 2 nodes?
#### Hint: lookup the scikit-learn documentation to know how to manipulate the 'tree_' attribute of Decision Trees.

In [36]:
###* put your answer as comment here *###
#
# 1. No. Because, first, it takes long time to run; second, when the size(depth) of the graph is large, it is hard to visualize the details of each node.
print("Depth of the tree: ", dtmodel.get_depth())
# 2. The depth of the tree is 35.
print("Number of nodes in the tree: ", dtmodel.tree_.node_count)
# 3. There are 18157 nodes in the tree.
print("Number of leaves: ", dtmodel.tree_.n_leaves)
print("Number of total splits = Number of nodes - Number of leaves = ", dtmodel.tree_.node_count - dtmodel.tree_.n_leaves)
# 4. Number of total splits is 9078.
print("The impurity of the last 2 nodes: ", dtmodel.tree_.impurity[-2:])
# 5. The impurity of the last two nodes are both 0.000
#

Depth of the tree:  35
Number of nodes in the tree:  18157
Number of leaves:  9079
Number of total splits = Number of nodes - Number of leaves =  9078
The impurity of the last 2 nodes:  [ 0.000  0.000]


In [37]:
help(sklearn.tree._tree.Tree)

Help on class Tree in module sklearn.tree._tree:

class Tree(builtins.object)
 |  Array-based representation of a binary decision tree.
 |  
 |  The binary tree is represented as a number of parallel arrays. The i-th
 |  element of each array holds information about the node `i`. Node 0 is the
 |  tree's root. You can find a detailed description of all arrays in
 |  `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split
 |  nodes, resp. In this case the values of nodes of the other type are
 |  arbitrary!
 |  
 |  Attributes
 |  ----------
 |  node_count : int
 |      The number of nodes (internal nodes + leaves) in the tree.
 |  
 |  capacity : int
 |      The current capacity (i.e., size) of the arrays, which is at least as
 |      great as `node_count`.
 |  
 |  max_depth : int
 |      The depth of the tree, i.e. the maximum depth of its leaves.
 |  
 |  children_left : array of int, shape [node_count]
 |      children_left[i] holds the node id of the left child 

#### Let's evaluate the decision tree model.

In [38]:
r2_mse_mae_eval(dtmodel)

Train R^2: 1.000, Val  R^2: 0.968
Train MSE: 0.000, Val MSE: 1076.791
Train MAE: 0.000, Val MAE: 10.691


### [Task 4b] (5 points) Is it a good model? Is it overfitted? Is it better than the models trained in Tasks 2 and 3? (A few sentences suffice.)

In [39]:
###* put your answer as comment here *###
#
# Answer:
# 1. Yes, this is a good model with a low bias and a low variance.
# 2. Yes, this model is a little bit overfitted, since the performance over the validation set is 
#    little bit worse than the training set.
# 3. Yes, this model is better than the models trained in Tasks 2 and 3, because its performance over the 
#    validation dataset is better than the others.
#

### [Task 4c] (5 points) Train another decision tree but this time regularize it. Can you obtain a model with similar performance to 'dtmodel' but not (or at least less) overfitted? 

In [40]:
### Call your new model 'dtregmodel'
###* put your code here *###

dtregmodel = DecisionTreeRegressor(ccp_alpha=110,random_state=seed).fit(train_x, train_y)

r2_mse_mae_eval(dtregmodel)

Train R^2: 0.916, Val  R^2: 0.917
Train MSE: 2760.888, Val MSE: 2761.336
Train MAE: 32.212, Val MAE: 31.264


### [Task 4d] (5 points) Now let's train a random forest and see if we can train an even better model. Use search_train_eval() to do a grid search over hyperparameters. You are free to pick whatever hyperparameters & values you want, but you should try to avoid badly overfitting.

In [41]:
from sklearn.ensemble import RandomForestRegressor

### Call your random forest model 'rfmodel'
### Make sure to set random_state=seed for reproducibility!
###* put your code here *###

param_grid = {'max_depth':np.arange(5, 40, 5), 'n_estimators': np.arange(1, 100, 20)}
model = RandomForestRegressor(random_state=seed)
rfmodel = search_train_eval(model, param_grid)

RandomForestRegressor({'max_depth': 30, 'n_estimators': 81})
	Train R^2: 0.997, Val  R^2: 0.983
	Train MSE: 86.011, Val MSE: 547.458
	Train MAE: 2.859, Val MAE: 7.583


### [Task 4e] (5 points) Is your RF model better than the decision tree you trained for Task 4c? Justify your answer. What can you conclude about ensembles/random forests?

In [42]:
###* put your answer as comment here *###
#
# Answer:
# 1. Yes, my RF model is better than the decision tree I trained for Task 4c, because its performance on the
# validation dataset is better than that of the decision tree in 4c in the sense of all three metrics: R^2, MSE, and MAE.
# 2. I can conclude that, compared to a single decision tree, ensembles/random forests have lower variance
# and similar bias, thanks to the greater diversity of trees. Overall, ensembles/random forests are better.
#

## [CIS6930 Additional Task -- Task 5] (25 points): Stacking Meta Model

### For this task we'll use stacking to create a meta model or blender model to predict the target using predictions from 6 other models from Tasks 1 - 4 as features!

### [Task 5a] (10 points) Fill in the code below.

In [43]:
from sklearn.preprocessing import StandardScaler
    
# these are the models we'll use from previous tasks
# (this is why it's important that you named the models as instructed in Tasks 2-4)
regressors = [('lr', lrmodel), ('elasticnet', enmodel), ('ridge', rmodel), 
             ('dt', dtmodel), ('dtreg', dtregmodel), ('rf', rfmodel)]

# this will return predictions for all of our regressors on matrix x
def regressors_preds(x):
    """Collect the predictions of all base regressors on 'x'.

    Parameters
    ----------
    x : array with one row per example
        Feature matrix handed to every model's predict().

    Returns
    -------
    numpy array of shape (x.shape[0], len(regressors))
        Column j holds the predictions of the j-th (name, model) entry of the
        notebook-level 'regressors' list.
    """
    # One prediction vector per regressor, stacked as columns in a single
    # allocation (instead of growing the matrix with np.c_ on every iteration
    # and trimming a dummy first column afterwards).
    predictions = np.column_stack([reg.predict(x) for _, reg in regressors])

    assert predictions.shape == (x.shape[0], len(regressors))
    return predictions

def stacking_train_eval(model_name, model, standardize=False):
    """Fit a meta (blender) model on the base regressors' predictions and report its scores.

    The meta model is trained on the base predictions for the validation split
    (val_x, val_y) and evaluated on the base predictions for the test split
    (test_x, test_y); both splits are read from notebook-level variables.

    Parameters
    ----------
    model_name : printable
        Label shown in the report header.
    model : estimator
        Unfitted meta model providing fit(), predict() and score(); it is
        fitted in place.
    standardize : bool, optional
        If True, z-score the meta features. The scaler is fitted on the meta
        training features only and then applied to the meta test features.

    Returns
    -------
    float
        model.score() on the *test* meta features (reported as R^2).
        NOTE(review): using this value to choose hyperparameters selects on
        the test set; the resulting score is then an optimistic estimate.
    """
    ### Create a new training dataset 'meta_train_x' and 'meta_train_y'
    ### For this use the validation data (val_x, val_y) alongside with regressors_preds()
    meta_train_x = regressors_preds(val_x)
    meta_train_y = val_y

    # Shapes are derived from the data instead of hard-coded row counts, so the
    # checks remain valid for any split proportions.
    assert meta_train_x.shape == (val_x.shape[0], len(regressors))
    assert meta_train_x.shape[0] == meta_train_y.shape[0]

    ### Create our new test dataset 'meta_test_x' and 'meta_test_y'
    ### For this we use the test data (test_x, test_y) alongside with regressors_preds()
    meta_test_x = regressors_preds(test_x)
    meta_test_y = test_y

    assert meta_test_x.shape == (test_x.shape[0], len(regressors))
    assert meta_test_x.shape[0] == meta_test_y.shape[0]

    # zscore normalize the features if standardize = True
    if standardize:
        scaler = StandardScaler()
        meta_train_x = scaler.fit_transform(meta_train_x)
        meta_test_x = scaler.transform(meta_test_x)

    # train the meta model
    model.fit(meta_train_x, meta_train_y)

    # make predictions & eval
    train_pred = model.predict(meta_train_x)
    test_pred = model.predict(meta_test_x)

    r2_train = model.score(meta_train_x, meta_train_y)
    r2_test = model.score(meta_test_x, meta_test_y)

    # scikit-learn metrics are declared as metric(y_true, y_pred)
    train_mse = mean_squared_error(meta_train_y, train_pred)
    test_mse = mean_squared_error(meta_test_y, test_pred)

    train_mae = mean_absolute_error(meta_train_y, train_pred)
    # compare against meta_test_y (the targets paired with meta_test_x), not the global test_y
    test_mae = mean_absolute_error(meta_test_y, test_pred)

    print('Stacking (Meta model: {})'.format(model_name))
    # the second score of every line is measured on the test split, so label it as such
    print('\tTrain R^2: {:.3f}, Test R^2: {:.3f}'.format(r2_train, r2_test))
    print('\tTrain MSE: {:.3f}, Test MSE: {:.3f}'.format(train_mse, test_mse))
    print('\tTrain MAE: {:.3f}, Test MAE: {:.3f}'.format(train_mae, test_mae))

    return r2_test

### [Task 5b] (3 points) Train a SVM regression model with a linear kernel and C=100. Use stacking_train_eval(). (You can set standardize=True to z-score normalize the features.)

In [44]:
from sklearn.svm import SVR
### Train a SVM regressor with a linear kernel and C=100
### Note: the training will take a few minutes
###* put your code here (~2-3 lines) *###

svm_regressor = SVR(C=100, kernel='linear')
# Names of the base regressors feeding the meta model. Kept under the name
# 'model_name' because a later cell reads this variable.
model_name = [model[0] for model in regressors]
# The first argument labels the *meta* model in the printed report, so describe
# the SVR itself rather than passing the list of base regressors.
stacking_train_eval('SVR(kernel=linear, C=100)', svm_regressor, standardize=True)

Stacking (Meta model: ['lr', 'elasticnet', 'ridge', 'dt', 'dtreg', 'rf'])
	Train R^2: 0.984, Val  R^2: 0.978
	Train MSE: 540.267, Test MSE: 736.985
	Train MAE: 7.516, Test MAE: 7.742


0.9775845275741305

### [Task 5c] (2 points) How good is this model? (A few sentences suffice.)

In [45]:
###* put your answer as comment here *###
#
# This model is not as good as the rf model but better than other models.
#

### [Task 5d] (5 points) Now train a SVM regression model with any other kernel (i.e., not linear) and combination of hyperparameters of your choice. Can you train a better stacking model than for Task 5b?

In [46]:
from sklearn.model_selection import ParameterGrid
### Train a SVM regressor with any non-linear kernel and hyperparameters you want 
### You can do a hyperparameter search if you want
###* put your code here (~2-3 lines) *###

hyperparams_vals = {'C':[1e-1, 1.0, 10.0, 100.0, 1000, 3000, 4000], 
                    "gamma": [0.1, 0.01, 0.001],
                    "kernel": ['rbf']}
grid = ParameterGrid(hyperparams_vals)

# NOTE(review): stacking_train_eval() returns the R^2 measured on the test
# split, so the winner below is selected on test data and its reported test
# score is optimistic. A cleaner protocol would select on a held-out part of
# the meta training data (or via cross-validation) and touch the test split once.

# R^2 has no lower bound, so start from -inf (a start value of -1 would leave
# 'best_hyperparams' undefined if every candidate scored at or below -1).
best_r2_score = -np.inf
best_hyperparams = None
for hyperparams in grid:
    svm_regressor = SVR(**hyperparams)
    # label every report with the meta model's own configuration
    test_r2 = stacking_train_eval('SVR({})'.format(hyperparams), svm_regressor, standardize=True)
    if test_r2 > best_r2_score:
        best_r2_score = test_r2
        best_hyperparams = hyperparams
print('\n#######################################')
print("best_hyperparams: ", best_hyperparams)
best_model = SVR(**best_hyperparams)
stacking_train_eval('SVR({})'.format(best_hyperparams), best_model, standardize=True)


Stacking (Meta model: ['lr', 'elasticnet', 'ridge', 'dt', 'dtreg', 'rf'])
	Train R^2: 0.325, Val  R^2: 0.328
	Train MSE: 22380.015, Test MSE: 22092.261
	Train MAE: 93.449, Test MAE: 92.096
Stacking (Meta model: ['lr', 'elasticnet', 'ridge', 'dt', 'dtreg', 'rf'])
	Train R^2: 0.107, Val  R^2: 0.116
	Train MSE: 29613.160, Test MSE: 29050.514
	Train MAE: 125.226, Test MAE: 123.313
Stacking (Meta model: ['lr', 'elasticnet', 'ridge', 'dt', 'dtreg', 'rf'])
	Train R^2: -0.041, Val  R^2: -0.028
	Train MSE: 34506.024, Test MSE: 33803.396
	Train MAE: 136.537, Test MAE: 134.421
Stacking (Meta model: ['lr', 'elasticnet', 'ridge', 'dt', 'dtreg', 'rf'])
	Train R^2: 0.823, Val  R^2: 0.819
	Train MSE: 5866.709, Test MSE: 5954.561
	Train MAE: 29.864, Test MAE: 30.703
Stacking (Meta model: ['lr', 'elasticnet', 'ridge', 'dt', 'dtreg', 'rf'])
	Train R^2: 0.859, Val  R^2: 0.852
	Train MSE: 4675.185, Test MSE: 4853.965
	Train MAE: 34.729, Test MAE: 35.068
Stacking (Meta model: ['lr', 'elasticnet', 'ridge', '

0.9781491059792333

### [Task 5e] (5 points) What do you conclude? Provide a plausible explanation why non-linear kernels do not seem to improve the result.

In [47]:
###* put your answer as comment here *###
#
# Answer:
# 1. My conclusion is that: non-linear kernels can improve the performance (see above result).
# 2. My results from 5d show that the premise of this question is incorrect.
#