<a id=top></a>
# DEV: Atlas Construction

This notebook served the purpose of model selection and hyperparameter optimization for atlas prediction across channels.

## Table of Contents

----

1. [Preparations](#prep)
    1. [Imports & Data Loading](#prep_imports_loading)
    2. [Preprocessing](#prep_preprocessing)
2. [Evaluation of Regressors](#regressors)
    1. [Baseline: Random Assignment (Dummy)](#reg_random)
    2. [k Nearest Neighbors](#reg_knn)
    3. [RBF MO-SVR](#reg_svr)
    4. [MT-Lasso](#reg_lasso)
    5. [MT-ElasticNet](#reg_enet)
    6. [Random Forest](#reg_forest)
    7. [MultiLayer Perceptron](#reg_mlp)
    8. [Bagging Ensemble (SVR)](#reg_bag)
    9. [Gradient Boosting Regressor (sklearn)](#sk_reg_boost)
    10. [Gradient Boosting Regressor (xgboost)](#xg_reg_boost)
3. [Comparative Assessment](#comp)
    1. [Current Run](#assess_this_run)
    2. [All Runs](#assess_all_runs)

----

<a id=prep></a>

## 1. Preparations

----

### Imports & Data Loading <a id=prep_imports_loading></a>

In [None]:
### Import modules

# External, general
from __future__ import division
import os, sys, pickle
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline

# External, specific
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing as sklearn_prep
from sklearn.decomposition import PCA
from sklearn import model_selection, metrics, base, dummy
from sklearn import neighbors, svm, multioutput, tree, linear_model, ensemble, neural_network
import xgboost
from scipy.stats import spearmanr

# Internal
import katachi.utilities.loading as ld
import katachi.utilities.atlas_helpers as atlas

In [None]:
### Configuration
# All user-tunable settings of this notebook. Where several alternatives are
# listed, exactly one of them should be left uncommented.

# Target channel: name of the secondary channel whose feature space is predicted.
# It is used both to find the relevant samples and to build the measurement file name.
channel_2  = "NLStdTomato"
#channel_2 =  "tagRFPtUtrCH"
#channel_2 = "mKate2GM130"

# Source feature space: which shape feature space(s) to load as regression input.
# "TFOR+CFOR" loads both and concatenates them feature-wise.
space_type_source = "TFOR"
#space_type_source = "CFOR"
#space_type_source = "TFOR+CFOR"

# Target feature space: which feature space of the target channel to predict
space_type_target = "TFOR"
#space_type_target = "CFOR"

# Preprocessing steps: representation used for the source (shape_type) and the
# target (sec_type) space; one of "raw", "z_normed" or "pca".
#shape_type    = "raw"
#shape_type    = "z_normed"
shape_type    = "pca"
#sec_type      = "raw"
#sec_type      = "z_normed"
sec_type      = "pca"
# Whether to standardize the PCA-transformed spaces once more after the PCA
restd_shape   = False
restd_sec     = False

# Number of leading principal components kept in the PCA-transformed spaces
# (only has an effect on X / y if the respective *_type is "pca")
shape_num_PCs = 20
sec_num_PCs   = 20

# Cross-validation: number of shuffle-split iterations (also used as the number
# of parallel jobs by some of the cross-validation runs)
num_CVs = 3

# Hyperparameter optimization
score_weighted = True  # Whether to weigh the multioutput scores by variance when aggregating them

In [None]:
### Function to parse relevant IDs from IDR bulk data

def parse_from_IDR(dir_path, target):
    """Return the sample IDs in `dir_path` that have measurements for `target`.

    A sample is a sub-directory of `dir_path` whose name is exactly 10
    characters long. It counts as relevant if it contains at least one file
    whose name starts with the sample's own name, ends with 'measured.tsv'
    and contains `target`.

    Parameters
    ----------
    dir_path : str
        Directory holding one sub-directory per sample.
    target : str
        Substring to look for in the measurement file names.

    Returns
    -------
    list of str
        Names of the matching sample directories, in `os.listdir` order.
    """

    def has_target_file(sample):
        # True as soon as one measurement file of this sample mentions the target
        for fname in os.listdir(os.path.join(dir_path, sample)):
            if (fname.startswith(sample) and fname.endswith('measured.tsv')
                    and target in fname):
                return True
        return False

    relevant_samples = []
    for entry in os.listdir(dir_path):

        # Sample directories are recognized by their 10-character name
        if len(entry) != 10:
            continue
        if not os.path.isdir(os.path.join(dir_path, entry)):
            continue

        if has_target_file(entry):
            relevant_samples.append(entry)

    return relevant_samples

In [None]:
### Load data

# Target dir and target IDs
dirpath = r'data/experimentA/extracted_measurements/'

# Target IDs
prim_IDs = parse_from_IDR(dirpath, channel_2)
print "Found %i training IDs!" % len(prim_IDs)

# Prep loader
loader = ld.DataLoaderIDR(dirpath, recurse=True, verbose=True)

# Load shape space from IDR
if space_type_source == "TFOR":
    shape_fspace, _, fspace_idx = loader.load_dataset("shape_TFOR_raw_measured.tsv", IDs=prim_IDs)
elif space_type_source == "CFOR":
    shape_fspace, _, fspace_idx = loader.load_dataset("shape_CFOR_raw_measured.tsv", IDs=prim_IDs)
elif space_type_source == "TFOR+CFOR":
    shape_fspace_TFOR, _, fspace_idx = loader.load_dataset("shape_TFOR_raw_measured.tsv", IDs=prim_IDs)
    shape_fspace_CFOR, _, _ = loader.load_dataset("shape_CFOR_raw_measured.tsv", IDs=prim_IDs)
    shape_fspace = np.concatenate([shape_fspace_TFOR, shape_fspace_CFOR], axis=1)
else:
    raise IOError("Invalid space_type_source.")

# Load target channel space from IDR
if space_type_target == "TFOR":
    sec_fspace, _, _ = loader.load_dataset(channel_2+"_TFOR_raw_measured.tsv")
elif space_type_target == "CFOR":
    sec_fspace, _, _ = loader.load_dataset(channel_2+"_CFOR_raw_measured.tsv")

# Report
print "shape_fspace.shape:", shape_fspace.shape
print "sec_fspace.shape:", sec_fspace.shape

### Preprocessing <a id=prep_preprocessing></a>

In [None]:
### Outlier Removal

# NOTE: This is a rather simplistic approach but it gets rid of the few really weird cases.

# Identify outliers based on extreme absolute values across the standardized sec_fspace
outlier_fspace = np.abs(sklearn_prep.StandardScaler().fit_transform(sec_fspace))
outliers_sec = outlier_fspace.sum(axis=1) > np.percentile(outlier_fspace.sum(axis=1), 95)

# Identify outliers based on extreme absolute values across the standardized shape_fspace
outlier_fspace = np.abs(sklearn_prep.StandardScaler().fit_transform(shape_fspace))
outliers_shape = outlier_fspace.sum(axis=1) > np.percentile(outlier_fspace.sum(axis=1), 95)

# Merge outlier mask
outliers = outliers_sec | outliers_shape

# Remove outliers from all imported datasets
print "Removing", np.sum(outliers), "outliers!"
shape_fspace = shape_fspace[~outliers]
sec_fspace   = sec_fspace[~outliers]
fspace_idx   = fspace_idx[~outliers]

In [None]:
### Standardization & PCA
# Produces a z-scored and a PCA-transformed version of both feature spaces.
# The PCA keeps all components here; cropping happens during feature selection.

# Shape space
shape_scaler     = sklearn_prep.StandardScaler()
shape_fspace_z   = shape_scaler.fit_transform(shape_fspace)
shape_pca        = PCA()
shape_fspace_pca = shape_pca.fit_transform(shape_fspace_z)
if restd_shape:
    # Optionally bring every principal component back to unit variance
    shape_fspace_pca = sklearn_prep.StandardScaler().fit_transform(shape_fspace_pca)

# Secondary channel space
sec_scaler     = sklearn_prep.StandardScaler()
sec_fspace_z   = sec_scaler.fit_transform(sec_fspace)
sec_pca        = PCA()
sec_fspace_pca = sec_pca.fit_transform(sec_fspace_z)
if restd_sec:
    # Optionally bring every principal component back to unit variance
    sec_fspace_pca = sklearn_prep.StandardScaler().fit_transform(sec_fspace_pca)

### Feature Selection & Splitting <a id=prep_ml></a>

In [None]:
### Select features
# Picks the representation of the source space (X) and of the target space (y)
# according to `shape_type` and `sec_type` from the configuration.

# Crop the PCA-transformed spaces to the configured number of leading PCs
shape_fspace_pca = shape_fspace_pca[:, :shape_num_PCs]
sec_fspace_pca   = sec_fspace_pca[:, :sec_num_PCs]

# Decide which shape space to use
shape_spaces = {"raw"      : shape_fspace,
                "z_normed" : shape_fspace_z,
                "pca"      : shape_fspace_pca}
if shape_type not in shape_spaces:
    raise IOError("Invalid shape_type. Must be 'raw', 'z_normed' or 'pca'.")
X = shape_spaces[shape_type]

# Decide which secondary space to use
sec_spaces = {"raw"      : sec_fspace,
              "z_normed" : sec_fspace_z,
              "pca"      : sec_fspace_pca}
if sec_type not in sec_spaces:
    raise IOError("Invalid sec_type. Must be 'raw', 'z_normed' or 'pca'.")
y = sec_spaces[sec_type]

In [None]:
### Split data for cross-validation and evaluation

# Shuffle split
cv_sets = model_selection.ShuffleSplit(n_splits=num_CVs, test_size=0.3, random_state=42)

# Prepare generic cross validation scorers
scoring = {'eplained_variance'  : metrics.make_scorer(metrics.explained_variance_score),
           'mean_squared_error' : metrics.make_scorer(metrics.mean_squared_error),
           'r2_score'           : metrics.make_scorer(metrics.r2_score, multioutput='uniform_average'),
           'r2_weighted'        : metrics.make_scorer(metrics.r2_score, multioutput='variance_weighted'),
           'spearman_r_score'   : metrics.make_scorer(atlas.spearman_r_score),
           'spearman_p_bonf'    : metrics.make_scorer(atlas.spearman_p_bonf)}

# Whether to use the relevant-only r2 scorer
if score_weighted:
    hyperopt_scoring = metrics.make_scorer(metrics.r2_score, multioutput='variance_weighted')
else:
    hyperopt_scoring = metrics.make_scorer(metrics.r2_score, multioutput='uniform_average')
    
# Prepare a single train-test split for visualization
out = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = out

# Prepare dict for result collection
result_dict = {}

# Report
print "Preparations complete!"
print "  Final source fspace (full, train, test):", X.shape, X_train.shape, X_test.shape
print "  Final target fspace (full, train, test):", y.shape, y_train.shape, y_test.shape

<a id=regressors></a>

## 2. Evaluation of Regressors

### Baseline: Random Assignment (Dummy) <a id=reg_random></a>

In [None]:
### Baseline: random assignment
print "BASELINE: RANDOM ASSIGNMENT"

# Create dummy regressor that randomly assigns stuff
class RandomRegressor(base.BaseEstimator, base.RegressorMixin):
    """Dummy classifier that returns random samples of fit target values as predictions."""
    
    def __init__(self, random_state=None):
        if random_state:
            np.random.seed(random_state)
    
    def fit(self, X, y):
        self.y = y
        return self

    def predict(self, X, y=None):
        choice = np.random.choice(np.arange(self.y.shape[0]), 
                                  size=X.shape[0])
        return self.y[choice,:]

# Prepare "regressor"
baseline_random = RandomRegressor(random_state=42)

# Perform cross-validation
print "\nPerforming cross validation..."
scores = model_selection.cross_validate(baseline_random, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True)
atlas.report_cv_scores(scores)
result_dict["Random"] = scores

# Plot result of an example fit
report = atlas.visualize_regression(baseline_random, "Random Baseline",
                                    X_train, X_test, y_train, y_test)

### k Nearest Neighbors <a id=reg_knn></a>

In [None]:
### Simple kNN regression

# Prepare regressor
knn_reg = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance', n_jobs=10)

# Perform cross-validation
print "\nPerforming cross validation..."
scores = model_selection.cross_validate(knn_reg, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True)
atlas.report_cv_scores(scores)
result_dict["kNN"] = scores

# Plot result of an example fit
report = atlas.visualize_regression(knn_reg, "k Nearest Neighbors",
                                    X_train, X_test, y_train, y_test)

### RBF MO-SVR <a id=reg_svr></a>

In [None]:
### Hyperparam screening for SVR

# Param grid
gd = 1.0 / X_test.shape[1]
print 'gd:', gd, '\n'
param_grid = [{'estimator__C': [0.1, 1.0, 10.0, 20.0], 
               'estimator__epsilon': [0.01, 0.1, 0.5], 
               'estimator__gamma': [gd, gd*0.1, gd*0.01]}]

# Prep regressor
svr = svm.SVR(kernel='rbf')
multi_svr = multioutput.MultiOutputRegressor(svr)

# Run grid search
clf = model_selection.GridSearchCV(multi_svr, param_grid, scoring=hyperopt_scoring, n_jobs=12, verbose=2)
clf.fit(X, y)

# Available outputs
print "\nOutputs:"
print sorted(clf.cv_results_.keys())

# key results
print "\nResults:"
print clf.best_estimator_
print clf.best_score_

In [None]:
### Multivariate-Multioutput (separate!) regression with RBF SVR

## Get optimized regressor
# NOTE: `clf` must be the grid search from the SVR screening cell. The name `clf`
#       is reused by the screening cells of the other regressors, so run this
#       cell directly after the SVR screening.
multi_svr = clf.best_estimator_

# Manually prepare regressor
#svr       = svm.SVR(kernel='rbf', C=10.0, epsilon=0.1,   # raw-NLS-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.1)
#svr       = svm.SVR(kernel='rbf', C=1.0, epsilon=0.5,    # pca-NLS-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.1)
#svr       = svm.SVR(kernel='rbf', C=10.0, epsilon=0.5,   # pcaW-NLS-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.1)
#svr       = svm.SVR(kernel='rbf', C=20.0, epsilon=0.01,  # raw-NLS-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=10.0, epsilon=0.5,   # pca-NLS-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=1.0, epsilon=0.5,    # pcaW-NLS-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=10.0, epsilon=0.1,   # raw-UtrCH-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.1)
#svr       = svm.SVR(kernel='rbf', C=1.0, epsilon=0.5,    # pca-UtrCH-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.1)
#svr       = svm.SVR(kernel='rbf', C=20.0, epsilon=0.5,    # pcaW-UtrCH-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=20.0, epsilon=0.1,   # raw-UtrCH-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=1.0, epsilon=0.5,    # pca-UtrCH-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.001)
#svr       = svm.SVR(kernel='rbf', C=1.0, epsilon=0.01,   # pcaW-UtrCH-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.001)
#svr       = svm.SVR(kernel='rbf', C=10.0, epsilon=0.1,   # raw-GM130-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.1)
#svr       = svm.SVR(kernel='rbf', C=20.0, epsilon=0.1,   # pca-GM130-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=20.0, epsilon=0.5,   # pcaW-GM130-TFOR-TFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=20.0, epsilon=0.1,   # raw-GM130-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=1.0, epsilon=0.5,    # pca-GM130-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.01)
#svr       = svm.SVR(kernel='rbf', C=1.0, epsilon=0.5,    # pcaW-GM130-CFOR-CFOR
#                    gamma=1.0 / X_test.shape[1] * 0.1)
#multi_svr = multioutput.MultiOutputRegressor(svr, n_jobs=y_train.shape[1])

# Perform cross-validation
print "\nPerforming cross validation..."
scores = model_selection.cross_validate(multi_svr, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True, 
                                        n_jobs=num_CVs)
atlas.report_cv_scores(scores)
result_dict["SVR"] = scores

# Plot result of an example fit
report = atlas.visualize_regression(multi_svr, "MO-SVR (RBF)",
                                    X_train, X_test, y_train, y_test)

In [None]:
### Plot of predictability over target dimensions for SVR for publication

# Refit on the fixed train-test split and predict both partitions
multi_svr.fit(X_train, y_train)
y_train_pred = multi_svr.predict(X_train)
y_test_pred  = multi_svr.predict(X_test)

# r-squared score of every target dimension separately (columns of y)
num_dims = y_train.shape[1]
score_train = [metrics.r2_score(true_col, pred_col)
               for true_col, pred_col in zip(y_train.T, y_train_pred.T)]
score_test  = [metrics.r2_score(true_col, pred_col)
               for true_col, pred_col in zip(y_test.T, y_test_pred.T)]

# Prep
fig, ax = plt.subplots(figsize=(12, 3))

# Training scores (faded) and test scores share the same axes
ax.plot(score_train, 'k.-', alpha=0.25, label='train')
ax.plot(score_test, 'k.-', label='test')

# Cosmetics: one tick per target dimension, fixed y-range for comparability
ax.legend(loc='upper right')
ax.set_xticks(np.arange(num_dims))
ax.set_xlim(-0.2, num_dims - 0.8)
ax.set_ylim(-0.1, 1.1)
ax.set_xlabel('target dimensions', fontsize=16)
ax.set_ylabel('r-squared score', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.tight_layout()

# Done
plt.show()

### MT-Lasso <a id=reg_lasso></a>

In [None]:
### Multivariate-Multivariable Linear Regression by Multi-Task Lasso

# Prepare regressor
#multi_lasso = linear_model.MultiTaskLasso(alpha=1/(10e4*X_train.shape[0]), random_state=42)
multi_lasso = linear_model.MultiTaskLassoCV(random_state=42, n_jobs=10)  # CV determines hyperparam alpha

# Perform cross-validation
print "\nPerforming cross validation..."
scores = model_selection.cross_validate(multi_lasso, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True)
atlas.report_cv_scores(scores)
result_dict["Lasso"] = scores

# Plot result of an example fit
report = atlas.visualize_regression(multi_lasso, "MT-Lasso CV",
                                    X_train, X_test, y_train, y_test)

### MT-ElasticNet <a id=reg_enet></a>

In [None]:
### Multivariate-Multivariable Linear Regression by Multi-Task Elastic Net

# Prepare regressor
#multi_enet = linear_model.MultiTaskElasticNet(alpha=1/10e4, l1_ratio=0.5, random_state=52)
multi_enet = linear_model.MultiTaskElasticNetCV(random_state=52, n_jobs=10)  # CV determines hyperparam alpha

# Perform cross-validation
print "\nPerforming cross validation..."
scores = model_selection.cross_validate(multi_enet, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True)
atlas.report_cv_scores(scores)
result_dict["eNet"] = scores

# Plot result of an example fit
report = atlas.visualize_regression(multi_enet, "MT-ElasticNet CV",
                                    X_train, X_test, y_train, y_test)

### Random Forest <a id=reg_forest></a>

In [None]:
### Hyperparam screening for Random Forest

# Param grid
param_grid = [{'bootstrap' : [True, False],
               'n_estimators' : [10, 100, 500, 1000, 1500], 
               'max_depth' : [10, 50, 100, None],
               'max_features' : ['auto', 'sqrt']}]

# Prep regressor
random_forest = ensemble.RandomForestRegressor(random_state=42)

# Run grid search
clf = model_selection.GridSearchCV(random_forest, param_grid, n_jobs=12, verbose=2, scoring=hyperopt_scoring)
clf.fit(X, y)

# Available outputs
print "\nOutputs:"
print sorted(clf.cv_results_.keys())

# key results
print "\nResults:"
print clf.best_estimator_
print clf.best_score_

In [None]:
### Multivariate-Multivariable Random Forest Regression

# Prepare regressor
# NOTE: `clf` must be the grid search from the Random Forest screening cell. The
#       name `clf` is reused by the screening cells of the other regressors, so
#       run this cell directly after the Random Forest screening.
random_forest = clf.best_estimator_
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=50,       # raw-NLS-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1000, max_depth=50,       # pca-NLS-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1000, max_depth=50,       # pcaW-NLS-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=10,       # raw-NLS-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1000, max_depth=50,       # pca-NLS-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1000, max_depth=50,       # pcaW-NLS-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=50,       # raw-UtrCH-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=50,       # pca-UtrCH-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=50,       # pcaW-UtrCH-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=50,       # raw-UtrCH-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=10,       # pca-UtrCH-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=10,       # pcaW-UtrCH-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=50,       # raw-GM130-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=50,       # pca-GM130-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=50,       # pcaW-GM130-TFOR-TFOR
#                                               max_features='sqrt', bootstrap=False,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1000, max_depth=10,       # raw-GM130-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=10,       # pca-GM130-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)
#random_forest = ensemble.RandomForestRegressor(n_estimators=1500, max_depth=10,       # pcaW-GM130-CFOR-CFOR
#                                               max_features='sqrt', bootstrap=True,
#                                               random_state=42, n_jobs=10)

# Perform cross-validation
print "\nPerforming cross validation..."
scores = model_selection.cross_validate(random_forest, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True)
atlas.report_cv_scores(scores)
result_dict["Forest"] = scores

# Plot result of an example fit
report = atlas.visualize_regression(random_forest, "Random Forest",
                                    X_train, X_test, y_train, y_test)

### MultiLayer Perceptron <a id=reg_mlp></a>

In [None]:
### Hyperparam screening for MLP

# Param grid
param_grid = [{'activation' : ['logistic', 'tanh', 'relu'],
               'solver'     : ['lbfgs', 'adam'],
               'alpha'      : [0.001, 0.0001, 0.00001]}]
# Note: This does not optimize the  hidden_layer_sizes but some preliminary 
#       testing showed that it's no use; it just increases overfitting

# Prep regressor
perceptron = neural_network.MLPRegressor(random_state=42, max_iter=1000)

# Run grid search
clf = model_selection.GridSearchCV(perceptron, param_grid, n_jobs=12, verbose=1, scoring=hyperopt_scoring)
clf.fit(X, y)

# Available outputs
print "\nOutputs:"
print sorted(clf.cv_results_.keys())

# key results
print "\nResults:"
print clf.best_estimator_
print clf.best_score_

In [None]:
### Multivariate-Multivariable Regression by Multi-Layer-Perceptron

# Prepare regressor
# NOTE: `clf` must be the grid search from the MLP screening cell. The name `clf`
#       is reused by the screening cells of the other regressors, so run this
#       cell directly after the MLP screening.
perceptron = clf.best_estimator_
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-5,    # raw-NLS-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='relu', alpha=1e-4,        # pca-NLS-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='relu', alpha=1e-4,        # pcaW-NLS-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-4,    # raw-NLS-CFOR-CFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-3,    # pca-NLS-CFOR-CFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-3,    # pcaW-NLS-CFOR-CFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='tanh', alpha=1e-3,        # raw-UtrCH-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='relu', alpha=1e-5,        # pca-UtrCH-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='relu', alpha=1e-5,        # pcaW-UtrCH-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-5,    # raw-UtrCH-CFOR-CFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-3,    # pca-UtrCH-CFOR-CFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-3,    # pcaW-UtrCH-CFOR-CFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='relu', alpha=1e-3,        # raw-GM130-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-3,    # pca-GM130-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-3,    # pcaW-GM130-TFOR-TFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-5,    # raw-GM130-CFOR-CFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-3,    # pca-GM130-CFOR-CFOR
#                                         solver='adam', random_state=42)
#perceptron = neural_network.MLPRegressor(activation='logistic', alpha=1e-3,    # pcaW-GM130-CFOR-CFOR
#                                         solver='adam', random_state=42)

# Perform cross-validation
print "\nPerforming cross validation..."
scores = model_selection.cross_validate(perceptron, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True)
atlas.report_cv_scores(scores)
result_dict["MLP"] = scores

# Plot result of an example fit
report = atlas.visualize_regression(perceptron, "MultiLayer Perceptron",
                                    X_train, X_test, y_train, y_test)

### Bagging Ensemble (SVR) <a id=reg_bag></a>

### Gradient Boosting Regressor (sklearn) <a id=sk_reg_boost></a>

In [None]:
### Hyperparam screening for GBR

# Param grid
param_grid = [{'estimator__learning_rate'  : [0.001, 0.01, 0.1], 
               'estimator__n_estimators'   : [100, 300, 600],
               'estimator__subsample'      : [0.1, 0.5, 1.0],
               'estimator__max_depth'      : [3, 5, 7]}]

# Prep regressor
boost       = ensemble.GradientBoostingRegressor(random_state=42)
multi_boost = multioutput.MultiOutputRegressor(boost, n_jobs=y_train.shape[1])

# Run grid search
clf = model_selection.GridSearchCV(multi_boost, param_grid, n_jobs=14, verbose=2, scoring=hyperopt_scoring)
clf.fit(X, y)

# Available outputs
print "\nOutputs:"
print sorted(clf.cv_results_.keys())

# key results
print "\nResults:"
print clf.best_estimator_
print clf.best_score_

In [None]:
### Multivariate-Multioutput (separate!) regression with Gradient Boosting Regressor

# Grab optimized regressor
# NOTE: `clf` must be the grid search from the GBR screening cell. The name `clf`
#       is reused by the screening cells of the other regressors, so run this
#       cell directly after the GBR screening.
multi_boost = clf.best_estimator_

## Manually prepare regressor
#boost       = ensemble.GradientBoostingRegressor(loss = 'huber', learning_rate=0.01,  # raw-NLS-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=5, random_state = 42)
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pca-NLS-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=5, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pcaW-NLS-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=5, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # raw-NLS-CFOR-CFOR
#                                                 n_estimators=300, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pca-NLS-CFOR-CFOR
#                                                 n_estimators=100, subsample=0.5,
#                                                 max_depth=5, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pcaW-NLS-CFOR-CFOR
#                                                 n_estimators=300, subsample=0.5,
#                                                 max_depth=5, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # raw-UtrCH-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pca-UtrCH-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pcaW-UtrCH-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # raw-UtrCH-CFOR-CFOR
#                                                 n_estimators=300, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pca-UtrCH-CFOR-CFOR
#                                                 n_estimators=100, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pcaW-UtrCH-CFOR-CFOR
#                                                 n_estimators=300, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # raw-GM130-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pca-GM130-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pcaW-GM130-TFOR-TFOR
#                                                 n_estimators=600, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # raw-GM130-CFOR-CFOR
#                                                 n_estimators=100, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pca-GM130-CFOR-CFOR
#                                                 n_estimators=100, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#boost       = ensemble.GradientBoostingRegressor(loss = 'ls', learning_rate=0.01,     # pcaW-GM130-CFOR-CFOR
#                                                 n_estimators=100, subsample=0.5,
#                                                 max_depth=3, random_state = 42) 
#multi_boost = multioutput.MultiOutputRegressor(boost, n_jobs=y_train.shape[1])

# Perform cross-validation
# NOTE(review): the only `multi_boost` assignment visible directly above this
#               point is commented out. Confirm that `multi_boost` is built
#               earlier in this cell; otherwise these lines depend on leftover
#               kernel state and will not survive "Restart & Run All".
print "\nPerforming cross validation..."
# Train scores are requested in addition to test scores (return_train_score),
# and the CV splits are evaluated in `num_CVs` parallel jobs.
scores = model_selection.cross_validate(multi_boost, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True, 
                                        n_jobs=num_CVs)
atlas.report_cv_scores(scores)
# Keep the scores under the key "boost" for the comparative assessment below
result_dict["boost"] = scores

# Plot result of an example fit
# (uses the X_train/X_test/y_train/y_test split rather than the CV sets)
report = atlas.visualize_regression(multi_boost, "MO-GBR",
                                    X_train, X_test, y_train, y_test)

### Gradient Boosting Regressor (xgboost) <a id=xg_reg_boost></a>

In [None]:
### Multivariate-Multioutput (separate!) regression with XGBoost

# Prepare regressor
# Exactly one of the alternative `xgb` definitions in this cell is meant to be
# left uncommented; the others are kept as a record of tuned hyperparameters.
# The trailing tag on each alternative presumably names the run configuration
# it was tuned for, in the form
#     <shape type>[W]-<target channel>-<source space>-<target space>
# (NLS / UtrCH / GM130 look like short forms of the `channel_2` options in the
# configuration cell) -- TODO confirm.
#xgb = xgboost.XGBRegressor(random_state = 42)      # Default hyperparams
#xgb = xgboost.XGBRegressor(random_state   = 42,     # raw-NLS-TFOR-TFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 600,
#                           subsample      = 0.5,
#                           max_depth      = 5)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pca-NLS-TFOR-TFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 600,
#                           subsample      = 0.5,
#                           max_depth      = 5)
# Currently active configuration:
xgb = xgboost.XGBRegressor(random_state   = 42,     # pcaW-NLS-TFOR-TFOR
                           learning_rate  = 0.01,
                           n_estimators   = 600,
                           subsample      = 0.5,
                           max_depth      = 5)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # raw-NLS-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 300,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pca-NLS-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 100,
#                           subsample      = 0.5,
#                           max_depth      = 5)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pcaW-NLS-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 300,
#                           subsample      = 0.5,
#                           max_depth      = 5)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # raw-UtrCH-TFOR-TFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 600,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pca-UtrCH-TFOR-TFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 600,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pcaW-UtrCH-TFOR-TFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 600,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # raw-UtrCH-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 300,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pca-UtrCH-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 100,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pcaW-UtrCH-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 300,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # raw-GM130-TFOR-TFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 600,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pca-GM130-TFOR-TFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 600,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pcaW-GM130-TFOR-TFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 600,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # raw-GM130-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 100,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pca-GM130-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 100,
#                           subsample      = 0.5,
#                           max_depth      = 3)
#xgb = xgboost.XGBRegressor(random_state   = 42,     # pcaW-GM130-CFOR-CFOR
#                           learning_rate  = 0.01,
#                           n_estimators   = 100,
#                           subsample      = 0.5,
#                           max_depth      = 3)

multi_xgb = multioutput.MultiOutputRegressor(xgb, n_jobs=y_train.shape[1])

# Perform cross-validation
print "\nPerforming cross validation..."
scores = model_selection.cross_validate(multi_xgb, X, y, scoring=scoring,
                                        cv=cv_sets, return_train_score=True, 
                                        n_jobs=num_CVs)
atlas.report_cv_scores(scores)
result_dict["xgb"] = scores

# Plot result of an example fit
report = atlas.visualize_regression(multi_boost, "MO-XGB",
                                    X_train, X_test, y_train, y_test)

<a id=comp></a>

## 3. Comparative Assessment

----

### Current Run <a id=assess_this_run></a>

In [None]:
### Plot cross-validation scores of all regressors (default: test R^2)

# Interactive choice of score
from ipywidgets import interact

# The selectable scores are the keys recorded for the first entry of
# result_dict; all entries are assumed to share the same set of keys.
@interact(score=sorted(next(iter(result_dict.values())).keys()))
def plot_scores(score="test_r2_score"):
    """Compare all regressors stored in `result_dict` on a single score.

    One bar is drawn per regressor, showing the mean of the selected score
    over the cross-validation folds; bars are sorted by ascending mean. The
    per-fold values are overlaid as dots, except for "score_time".
    The y-axis starts at 0, so negative values are not visible.

    Parameters
    ----------
    score : str
        Key of the score to plot; must be present in every scores dict
        held in `result_dict`.
    """

    # Prep
    plt.figure(figsize=(6,2))
    width = 0.75

    # Get relevant values (names and values are read from the same key list
    # so that they are guaranteed to stay paired)
    names       = list(result_dict.keys())
    score_vals  = np.array([result_dict[name][score] for name in names])
    score_means = np.array([np.mean(result_dict[name][score]) for name in names])
    score_names = np.array(names)

    # Sort by mean
    order       = np.argsort(score_means)
    score_vals  = score_vals[order, :]
    score_names = score_names[order]
    score_means = score_means[order]

    # Plot bars
    # Bars are explicitly left-aligned: the ticks, dots and x-limits below all
    # assume that bar i spans [i, i+width]. This used to be matplotlib's
    # default but is 'center' from matplotlib 2.0 onwards.
    plt.bar(range(len(score_means)), score_means, align='edge',
            width=width, color='skyblue', edgecolor='none', zorder=0)

    # Add scatter (one dot per CV fold, centered on the respective bar)
    if score != "score_time":
        for i, fold_vals in enumerate(score_vals):
            plt.scatter([i + width/2] * len(fold_vals), fold_vals,
                        color='0.3', alpha=0.5, edgecolor='none', s=10)

    # Cosmetics
    # Upper y-limit: at least 1.1, or 10% above the largest single value
    top_val = score_vals.max()
    plt.xlim([-width/2, len(score_means)-1+width+width/2])
    plt.ylim([0.0, max([1.1, top_val+0.1*top_val])])
    plt.xticks(np.arange(len(score_means))+width/2, score_names,
               ha="right", fontsize=8, rotation=45)
    plt.ylabel(score)
    plt.title("Performance Comparison")

    # Show
    plt.show()

In [None]:
### Save result

# Assemble the output path from the run configuration:
#   other/AtlasScores_<shape_type>[W]_<channel_2>_<source>_to_<target>.pkl
# where the 'W' suffix is appended only if `score_weighted` is set.
scores_path = "".join(["other/AtlasScores_",
                       shape_type, ('W' if score_weighted else ''), "_",
                       channel_2, "_",
                       space_type_source, '_to_',
                       space_type_target,
                       ".pkl"])

# Save (binary mode, as required by the pickle protocol used)
with open(scores_path, "wb") as outfile:
    pickle.dump(result_dict, outfile, protocol=pickle.HIGHEST_PROTOCOL)

### All Runs <a id=assess_all_runs></a>

----

[Back to Top](#top)