# Assignment 3: Regression & Ensembles

In [None]:
# enter your name and UFL email address
# (the setup cell that follows refuses to run while either placeholder is left unchanged)
name = 'enter your name'
email = 'enter your email'

In [None]:
# Refuse to run until the name/email placeholders have been replaced.
# Raise explicitly instead of using `assert False`: asserts are stripped under
# `python -O`, which would turn the check into a no-op.
if name == 'enter your name' or email == 'enter your email':
    raise AssertionError('Enter your name & email first!')

print('Assignment 3 -- name: {}, email: {}\n'.format(name, email))

# Load packages we need
import sys
import os
import time

import numpy as np
import pandas as pd
import sklearn

from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 16})

# Let's check our software versions
print('### Python version: ' + sys.version)
print('### NumPy version: ' + np.__version__)
print('### Scikit-learn version: ' + sklearn.__version__)
print('------------')


# load our packages / code (helper modules live in ../common/)
sys.path.insert(1, '../common/')
import utils
import plots

In [None]:
# global parameters to control behavior of the pre-processing, ML, analysis, etc.
# `seed` is reused for NumPy's global RNG, for every train/test/val split, and for
# the random_state of the decision tree trained later in this notebook.
seed = 42

# deterministic seed for reproducibility
##rng = np.random.default_rng(seed)  # best practice but not fully implemented in scikit-learn
np.random.seed(seed)

# relative split proportions handed to utils.train_test_val_split();
# presumably train : test : val = 14 : 3 : 3 (70% / 15% / 15%) -- TODO confirm against utils
prop_vec = [14, 3, 3]

## Part 1: Loading and Pre-processing Data

### For this assignment we'll load the Bike Sharing dataset (hourly)
### This dataset contains features of users bike sharing/rental on an hourly basis.
### The task is to predict how many users are sharing/renting a bike.

### Loading data

In [None]:
### Note: the dataset contains artificially introduced missing values, encoded as '?'.
### They are parsed as NaN here and must be filled in before a model can be trained.
df_expected_shape = (17379, 15)  # shape the assignment expects for the loaded frame

df = pd.read_csv(
    '../data/bikesharehour.csv.gz',
    compression='gzip',
    header=0,
    na_values='?',
)

# Fail fast if the file on disk is not the one the assignment expects
assert df.shape == df_expected_shape, 'Unexpected shape of df!'

df.info()

In [None]:
## what does the data look like?
df.head()

### There are some NaNs which we'll have to impute!

In [None]:
# the complete table (features + target) as one numpy matrix
all_xy = df.to_numpy()

# the last column is the target; every column before it is a feature
col_names = list(df.columns)
features, target = col_names[:-1], col_names[-1]

In [None]:
print('features: {} --- target: {}'.format(features, target))

In [None]:
# number of missing entries per column (same column order as df.columns)
np.isnan(all_xy).sum(axis=0)

### Observe: no NaNs in the target/value column
### About 1000+ NaNs in each feature

In [None]:
# separate the feature columns (still containing NaNs) from the target column
all_x_nan, all_y = all_xy[:, :-1], all_xy[:, -1]

## [Task 1] (10 points) Let's impute the missing values! Use Scikit-learn's SimpleImputer to replace all NaNs in 'all_x_nan' with the *most frequent* value in each column. Use copy=True and store the results in 'all_x' 

In [None]:
from sklearn.impute import SimpleImputer

###* put your code here (~2-3 lines) *###



In [None]:
# the imputed matrix must keep every row and all 14 feature columns
assert all_x.shape == (17379, 14)

# ... and must not contain a single NaN any more
assert not np.isnan(all_x).any()

### Rescale the features

In [None]:
# We'll min-max normalize the features
from sklearn.preprocessing import MinMaxScaler

# Learn per-column min/max and rescale every feature with them.
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test/val split, so statistics of held-out rows influence the preprocessing.
scaler = MinMaxScaler(copy=True).fit(all_x)
scaled_all_x = scaler.transform(all_x)

### Let's split the data

In [None]:
# partition the scaled data into train / test / val subsets (proportions: prop_vec)
(train_x, train_y,
 test_x, test_y,
 val_x, val_y) = utils.train_test_val_split(scaled_all_x, all_y, prop_vec, shuffle=True, seed=seed)

# sanity check: show the shape of every subset
tuple(part.shape for part in (train_x, train_y, test_x, test_y, val_x, val_y))

## [Task 2] (30 points) Let's train linear models!

### [Task 2a] (2 points) Train a Linear Regression model using the default hyperparameters. 

In [None]:
from sklearn.linear_model import LinearRegression

### Train a linear regression model on the training data (train_x, train_y)
### Call the resulting trained model 'lrmodel'
###* put your code here (~1 line) *###



In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def r2_mse_mae_eval(model, pref=''):
    """Print R^2, MSE and MAE of `model` on the train and validation splits.

    The data is NOT passed in: the function reads the notebook-level variables
    `train_x`, `train_y`, `val_x` and `val_y` at call time, so the report always
    refers to whichever split was created most recently.

    Parameters
    ----------
    model : fitted estimator
        Must expose `score(x, y)` and `predict(x)`.
    pref : str
        Prefix (e.g. a tab) prepended to every printed line.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # R^2 the coefficient of determination
    r2_train = model.score(train_x, train_y)
    r2_val = model.score(val_x, val_y)

    print('{}Train R^2: {:.3f}, Val  R^2: {:.3f}'.format(pref, r2_train, r2_val))

    train_pred = model.predict(train_x)
    val_pred = model.predict(val_x)

    # scikit-learn metrics take (y_true, y_pred) in that order. MSE and MAE are
    # symmetric, but keeping the documented order avoids a silent error if an
    # asymmetric metric is ever added here.
    train_mse = mean_squared_error(train_y, train_pred)
    val_mse = mean_squared_error(val_y, val_pred)

    print('{}Train MSE: {:.3f}, Val MSE: {:.3f}'.format(pref, train_mse, val_mse))

    train_mae = mean_absolute_error(train_y, train_pred)
    val_mae = mean_absolute_error(val_y, val_pred)

    print('{}Train MAE: {:.3f}, Val MAE: {:.3f}'.format(pref, train_mae, val_mae))
    
r2_mse_mae_eval(lrmodel)

### [Task 2b] (3 points) How good is that model? (A few sentences is fine.)

In [None]:
###* put your answer as comment here *###
#
#
# 
#

### Let's setup some functions so we can tune hyperparameters

In [None]:
## some code to do a grid search and automatically train & evaluate the model with the best hyperparams.
from sklearn.model_selection import GridSearchCV

def do_grid_search(model, param_grid, x, y):
    """Run a cross-validated grid search and return the best hyperparameters.

    Every combination in `param_grid` is evaluated for `model` on (x, y) with
    GridSearchCV's default cross-validation and the 'neg_mean_squared_error'
    scorer.

    Returns
    -------
    dict
        The `best_params_` of the fitted search.
    """
    searcher = GridSearchCV(estimator=model, param_grid=param_grid,
                            scoring='neg_mean_squared_error')
    searcher.fit(x, y)
    return searcher.best_params_


def search_train_eval(model, param_grid, tr_x=None, tr_y=None, v_x=None, v_y=None):
    """Grid-search hyperparameters for `model`, refit on the training data, print metrics.

    Parameters
    ----------
    model : scikit-learn estimator instance
        Template estimator. Constructor arguments that are not part of
        `param_grid` (e.g. `random_state`, `max_iter`) are kept in the
        returned model.
    param_grid : dict
        Hyperparameter grid handed to do_grid_search().
    tr_x, tr_y, v_x, v_y : array-like, optional
        Training / validation data. Each one defaults to the notebook-level
        `train_x`, `train_y`, `val_x`, `val_y` as they are *when the function
        is called*.

    Returns
    -------
    estimator
        A fresh copy of `model` with the best hyperparameters, fitted on
        (tr_x, tr_y).

    Notes
    -----
    The printed metrics come from r2_mse_mae_eval(), which always reads the
    notebook-level splits, regardless of the arrays passed in here.
    """
    from sklearn.base import clone

    # Resolve the defaults at call time. Binding `tr_x=train_x` in the signature
    # would freeze the arrays that existed when this cell ran, and they go stale
    # as soon as the notebook re-splits the data.
    tr_x = train_x if tr_x is None else tr_x
    tr_y = train_y if tr_y is None else tr_y
    v_x = val_x if v_x is None else v_x
    v_y = val_y if v_y is None else v_y

    # since we do CV for the grid search, let's concatenate the train and val sets for it
    search_x = np.r_[tr_x, v_x]
    search_y = np.r_[tr_y, v_y]

    hyperparams = do_grid_search(model, param_grid, search_x, search_y)

    # clone() keeps every constructor argument of `model`; set_params() then
    # overlays the searched values. Re-instantiating via type(model)(**hyperparams)
    # would silently reset everything else (random_state, max_iter, ...) to the
    # library defaults, so the final model would differ from what the search evaluated.
    m = clone(model).set_params(**hyperparams).fit(tr_x, tr_y)

    cn = type(model).__name__
    print('{}({})'.format(cn, hyperparams))

    r2_mse_mae_eval(m, pref='\t')

    return m

### [Task 2c] (5 points) Do a grid search to tune hyperparameters and train an ElasticNet model. You can choose the values of the hyperparameters you search over, but you must search over 'alpha' and 'l1_ratio'. Ensure that during the search, the training of models converges in all cases (you may need to increase 'max_iter' based on your chosen values).

In [None]:
### Hint: you should define a parameter grid dictionary and call search_train_eval() to do the actual search
### Note: you only need to pass a model instance (model) and a parameter grid (param_grid)
### Call the output of search_train_eval(): 'enmodel'
###* put your code here (~3 lines) *###



### [Task 2d] (2 points) Do a grid search to tune hyperparameters and train a Ridge Regression model. You can choose the values of the hyperparameters you search over, but you must search over 'alpha'. Ensure that during the search, the training of models converges in all cases (you may need to increase 'max_iter' based on your chosen values).

In [None]:
### Call the output of search_train_eval(): 'rmodel'
###* put your code here (~3 lines) *###



### [Task 2e] (3 points) Print the parameter values (w and b) of the ElasticNet and Ridge Regression models.

In [None]:
# Print the weights and bias for both models
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

### Make sure you print the weights and bias for both models and that it is clear which is which.
###* put your code here (~2 lines) *###



### [Task 2f] (2 points) How similar are the parameter values of the two models?

In [None]:
###* put your answer as comment here *###
#
# 
#
#


### [Task 2g] (8 points) For each of the two models, display the three most important features along with their coefficients. Are these the same across both models?
### What are the coefficients? Which feature is the most important?

In [None]:
### Hint: don't forget that coefficients can be positive as well as negative.
###* put your code here *###



###* put your answer as comment here *###
# 
# 
#

### [Task 2h] (5 points) Take a look at the code of search_train_eval() and do_grid_search(). Answer the following questions: 
### 1. Why is the scoring function for the grid search 'neg_mean_squared_error' (as opposed to 'mean_squared_error')? 
### 2. Why is it okay to do the search over search_x and search_y which are the concatenation of the training and validation sets? 

In [None]:
### Hint: take a look at the documentation of scikit-learn for GridSearchCV and related classes.
###* put your answer as comment here *###
#
# 1.
#
#
# 2. 
#
#

## [Task 3] (30 points) Let's train polynomial regression models!

### [Task 3a] (5 points) Use PolynomialFeatures to create a version of the data with all features of degree 2.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

### Use PolynomialFeatures to create a version of the data with all features of degree 2. Make sure to allow interactions (interaction_only=False) and set include_bias=False.
### Store the result in 'all_x_polyf'. Ensure that you make a copy of the original data and you use the scaled features ('scaled_all_x')!
###* put your code here (~2 lines) *###



# 14 original features expanded to degree 2 without a bias column:
# 14 linear + 14 squared + 91 pairwise-interaction terms = 119 columns
assert all_x_polyf.shape == (17379, 119)

# split the data into train, test, val
# NOTE: this rebinds the notebook-level train_x/val_x/... that r2_mse_mae_eval() reads,
# so evaluations from here on use the polynomial features
train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(all_x_polyf, all_y, prop_vec, shuffle=True, seed=seed)

### Let's train a LinearRegression model and a Ridge model on our polynomial features.

In [None]:
# Ridge is not imported by any of the provided cells; import it here so this
# cell does not depend on how (or whether) Task 2d imported it
from sklearn.linear_model import Ridge

# Train a linear regression model
pf_lrmodel = LinearRegression().fit(train_x, train_y)
r2_mse_mae_eval(pf_lrmodel)

print()

# Train a Ridge regression model
pf_ridgemodel = Ridge(alpha=0.5).fit(train_x, train_y)
r2_mse_mae_eval(pf_ridgemodel)

### [Task 3b] (5 points) What is the difference between LinearRegression and Ridge? (A sentence or two is fine.)

In [None]:
###* put your answer as comment here *###
#
# 
#
#

### [Task 3c] (5 points) Look at (e.g., print) the parameters of both the LinearRegression model ('pf_lrmodel') and the Ridge model ('pf_ridgemodel'). What do you notice? Explain what is going on.

In [None]:
###* put your code here *###


###  What do you notice? Explain what is going on.
###* put your answer as comment here *###
#
# 
#
#

### [Task 3d] (5 points) Focus on the Ridge model. What are the three most important features? 

In [None]:
### Print the three most important features along with their weights.
### Remember: weights can be positive as well as negative.
### Hint: you can use the get_feature_names_out() method of PolynomialFeatures (named get_feature_names() in scikit-learn versions before 1.0) to relate polynomial features to the original features.
###* put your code here *###



### [Task 3e] (5 points) Let's use only these three most important features. Extract the three features from the polynomial features to create a new feature matrix with three columns.

In [None]:
### Extract the three features from 'all_x_polyf' and store the results in 'all_x_3most'
###* put your code here *###



# exactly three columns must have been extracted, with no rows lost
assert all_x_3most.shape == (17379, 3)

# split the data into train, test, val
# (same seed and proportions as the earlier splits, now on the three selected columns;
#  this again rebinds the notebook-level train_x/val_x/... that r2_mse_mae_eval() reads)
train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(all_x_3most, all_y, prop_vec, shuffle=True, seed=seed)

### [Task 3f] (2 points) Now train a LinearRegression model (default hyperparams) on the training data from 'all_x_3most'. What do you observe about the performance of this model? What is your conclusion?

In [None]:
### Call the model 'threemost_model' and evaluate it using r2_mse_mae_eval()
###* put your code here *###



### [Task 3g] (3 points) How good is that model? What do you conclude?

In [None]:
###* put your answer as comment here *###
#
# 
#
#

## [Task 4] (30 points) Trees, More Trees, lots of Trees!

### We need to reset the data to the original form (before polynomial features)

In [None]:
# let's do some cleanup: drop the splits left over from the previous task
del train_x, train_y, test_x, test_y, val_x, val_y

# split the data into train, test, val (back on the 14 scaled original features)
train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(scaled_all_x, all_y, prop_vec, shuffle=True, seed=seed)

# sanity check shapes
# (a bare expression is only displayed when it is the last statement of a
#  notebook cell, so print explicitly to make the check visible)
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape, val_x.shape, val_y.shape)
assert train_x.shape == (12166, 14)

### Let's train a decision tree!

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree

# default hyperparameters; random_state is fixed so repeated runs build the same tree
dtmodel = DecisionTreeRegressor(random_state=seed).fit(train_x, train_y)

### Uncomment the code in the cell below if you have some time to wait around and want to visualize the tree, otherwise skip it!

In [None]:
### This will take a long time (10-20 minutes); skip if you are in a hurry
# let's plot what the tree looks like
#plt.figure(figsize=(16,12))
#
#plot_tree(dtmodel, feature_names=features, filled=True, label='all', rounded=True)
#
#plt.show()

### [Task 4a] (10 points) Answer some questions about the structure of our tree (dtmodel)

#### 1. Can the tree be visualized easily?
#### 2. How deep is the tree?
#### 3. How many nodes does it contain?
#### 4. How many total splits are there?
#### 5. What is the impurity of the last 2 nodes?
#### Hint: look up the scikit-learn documentation to learn how to work with the 'tree_' attribute of Decision Trees.

In [None]:
###* put your answer as comment here *###
#
# 
#
#

#### Let's evaluate the decision tree model.

In [None]:
r2_mse_mae_eval(dtmodel)

### [Task 4b] (5 points) Is it a good model? Is it overfitted? Is it better than the models trained in Tasks 2 and 3? (A few sentences suffice.)

In [None]:
###* put your answer as comment here *###
#
# 
#
#

### [Task 4c] (5 points) Train another decision tree but this time regularize it. Can you obtain a model with similar performance to 'dtmodel' but not (or at least less) overfitted? 

In [None]:
### Call your new model 'dtregmodel'
###* put your code here *###



r2_mse_mae_eval(dtregmodel)

### [Task 4d] (5 points) Now let's train a random forest and see if we can train an even better model. Use search_train_eval() to do a grid search over hyperparameters. You are free to pick whatever hyperparameters & values you want, but you should try to avoid badly overfitting.

In [None]:
from sklearn.ensemble import RandomForestRegressor

### Call your random forest model 'rfmodel'
### Make sure to set random_state=seed for reproducibility!
###* put your code here *###



### [Task 4e] (5 points) Is your RF model better than the decision tree you trained for Task 4c? Justify your answer. What can you conclude about ensembles/random forests?

In [None]:
###* put your answer as comment here *###
#
# 
#
#

## [CIS6930 Additional Task -- Task 5] (25 points): Stacking Meta Model

### For this task we'll use stacking to create a meta model (or blender model) that predicts the target using the predictions of 6 other models from Tasks 2 - 4 as features!

### [Task 5a] (10 points) Fill in the code below.

In [None]:
from sklearn.preprocessing import StandardScaler
    
# these are the models we'll use from previous tasks
# (this is why it's important that you named the models as instructed in Tasks 2-4)
regressors = [('lr', lrmodel), ('elasticnet', enmodel), ('ridge', rmodel), 
             ('dt', dtmodel), ('dtreg', dtregmodel), ('rf', rfmodel)]

# this will return predictions for all of our regressors on matrix x
def regressors_preds(x):
    """Return the predictions of every model in `regressors` for the rows of `x`.

    Template stub: the body below still has to be completed. According to the
    task description the result should be a numpy array of shape
    (x.shape[0], num_regs), with one column per regressor.
    """
    num_regs = len(regressors)
    
    ### Create an array to contain the predictions from the regressors over all examples in 'x'
    ### Each regressor will correspond to one feature (i.e., one column)
    ### The numpy array you return should have shape (x.shape[0], num_regs)
    ###* put your code here (~4-6 lines) *###

    

def stacking_train_eval(model_name, model, standardize=False):
    """Fit the stacking meta model `model` and print its train/test MSE and MAE.

    The meta features are the base regressors' predictions (regressors_preds()).
    The meta model is trained on the validation split and evaluated on the
    test split.

    Parameters
    ----------
    model_name : str
        Label used in the printed report.
    model : estimator exposing `fit(x, y)` and `predict(x)`
        Fitted in place.
    standardize : bool
        If True, z-score normalize the meta features. The scaler is fit on the
        meta training set only and then applied to the meta test set.

    Returns
    -------
    None
        The metrics are only printed.
    """
    ### Create a new training dataset 'meta_train_x' and 'meta_train_y'
    ### For this use the validation data (val_x, val_y) along with regressors_preds()
    ###* put your code here (~2 lines) *###


    assert meta_train_x.shape == (2606, 6) and meta_train_x.shape[0] == meta_train_y.shape[0]


    ### Create our new test dataset 'meta_test_x' and 'meta_test_y'
    ### For this we use the test data (test_x, test_y) along with regressors_preds()
    ###* put your code here (~2 lines) *###



    assert meta_test_x.shape == (2607, 6) and meta_test_x.shape[0] == meta_test_y.shape[0]

    # zscore normalize the features if standardize = True
    # (local name chosen so it cannot be confused with the notebook-level MinMaxScaler `scaler`)
    if standardize:
        meta_scaler = StandardScaler()
        meta_train_x = meta_scaler.fit_transform(meta_train_x)
        meta_test_x = meta_scaler.transform(meta_test_x)


    # train the meta model
    model.fit(meta_train_x, meta_train_y)

    # make predictions & eval
    train_pred = model.predict(meta_train_x)
    test_pred = model.predict(meta_test_x)

    # scikit-learn metrics take (y_true, y_pred). Score against the meta targets
    # built above so features and targets always come from the same dataset.
    train_mse = mean_squared_error(meta_train_y, train_pred)
    test_mse = mean_squared_error(meta_test_y, test_pred)

    print('Stacking (Meta model: {})'.format(model_name))
    print('\tTrain MSE: {:.3f}, Test MSE: {:.3f}'.format(train_mse, test_mse))

    train_mae = mean_absolute_error(meta_train_y, train_pred)
    test_mae = mean_absolute_error(meta_test_y, test_pred)

    print('\tTrain MAE: {:.3f}, Test MAE: {:.3f}'.format(train_mae, test_mae))

### [Task 5b] (3 points) Train a SVM regression model with a linear kernel and C=100. Use stacking_train_eval(). (You can set standardize=True to z-score normalize the features.)

In [None]:
### Train a SVM regressor with a linear kernel and C=100
### Note: the training will take a few minutes
###* put your code here (~2-3 lines) *###



### [Task 5c] (2 points) How good is this model? (A few sentences suffice.)

In [None]:
###* put your answer as comment here *###
#
#
#

### [Task 5d] (5 points) Now train a SVM regression model with any other kernel (i.e., not linear) and combination of hyperparameters of your choice. Can you train a better stacking model than for Task 5b?

In [None]:
### Train a SVM regressor with any non-linear kernel and hyperparameters you want 
### You can do a hyperparameter search if you want
###* put your code here (~2-3 lines) *###




### [Task 5e] (5 points) What do you conclude? Provide a plausible explanation why non-linear kernels do not seem to improve the result.

In [None]:
###* put your answer as comment here *###
#
# 
#
#