# Importing Header and SoilPrep

In [27]:
import import_ipynb 
from Header import *

In [28]:
import import_ipynb
from SoilPrep import * 

## Loading Data and MetaData

In [29]:
import import_ipynb
from LoadDataMetaData import *

In [30]:
ml_methods


['mult', 'cubist', 'svr', 'ridge', 'plsr', 'randomforest', 'gbrt']

# Building Model Tree (Mtree)

In [31]:
import os
# Audible "started" notification issued through the shell `say` command.
# NOTE(review): `say` is presumably only available on macOS; the return code of
# os.system is ignored here, so a missing command does not stop the notebook.
os.system('say "your Meta Tree started building"')
import time

In [32]:
# 0. Available machine learning regression models --------------------------------- (7)
#ml_methods = ['mult', 'plsr', 'randomforest', 'svr', 'ridge', 'gbrt']
#ml_methods = ['mult', 'plsr', 'cubist', 'randomforest', 'ridge', 'gbrt', 'svr']

In [33]:
# prepare_spec = ['none', 'fod2', 'cr', 'log']
# prepare_target = ['none', 'minmax']

def find_X(p, n):
    """Pick the spectra collection for preprocessing ``p`` and index it with ``n``.

    Mapping of ``p`` to the module-level collection that is read:

    * ``'fod2'`` -> ``fod_sampled``
    * ``'cr'``   -> ``sampled_cr``
    * ``'log'``  -> ``sampled_log``
    * anything else (e.g. ``'none'``) -> ``sampled_spec``

    Returns the element stored under key/index ``n`` of the chosen collection.
    """
    if p == 'fod2':
        return fod_sampled[n]
    if p == 'cr':
        return sampled_cr[n]
    if p == 'log':
        return sampled_log[n]
    # No recognised preprocessing name: fall back to sampled_spec.
    return sampled_spec[n]

def find_spec(p, n, m):
    """Return the spectra to train method ``m`` on, for preprocessing ``p`` and band key ``n``.

    This is ``find_X(p, n)`` with one exception: when ``n == 0`` and ``m`` is
    ``'randomforest'``, ``'cubist'`` or ``'gbrt'``, the lookup uses key ``100``
    instead of ``0``.
    # NOTE(review): presumably n == 0 denotes the unsampled spectrum, which is
    # swapped for the 100-band version for these three methods — confirm.
    """
    bands = 100 if n == 0 and m in ('randomforest', 'cubist', 'gbrt') else n
    return find_X(p, bands)


def find_y(t):
    """Return the entry of ``T`` located at the position of ``t`` in ``target_names``.

    Raises ``ValueError`` (from ``list.index``) when ``t`` is not in ``target_names``.
    """
    return T[target_names.index(t)]
        

In [34]:
def build_tree_for (method_name):
    """Fit ``method_name`` for every target / target-prep / spectra-prep / band combination.

    The function iterates over the module-level ``target_names``,
    ``prepare_target``, ``prepare_spec`` and ``nbands_sampling`` and, for each
    combination, splits the data with ``train_test_split`` (``test_size=tst_siz``),
    fits a fresh model and records predictions and scores.

    Parameters
    ----------
    method_name : str
        One of ``'plsr'``, ``'mult'``, ``'randomforest'``, ``'cubist'``,
        ``'svr'``, ``'ridge'`` or ``'gbrt'``.

    Returns
    -------
    dict
        Nested dict ``tree[target][target_prep][spec_prep][n_bands]`` whose
        leaves hold the keys ``'test'``, ``'testP'``, ``'train'``, ``'trainP'``,
        ``'iqrp_test'``, ``'r2_test'``, ``'rpd_test'``, ``'rmse_test'`` and
        ``'r2_train'`` (plus ``'n_comp'`` for ``'plsr'``).

    Raises
    ------
    ValueError
        If ``method_name`` is not one of the supported names. Previously any
        unrecognised name (including typos) silently trained a
        ``GradientBoostingRegressor``.
    """
    supported = ('plsr', 'mult', 'randomforest', 'cubist', 'svr', 'ridge', 'gbrt')
    # Fail fast, before any expensive fitting starts.
    if method_name not in supported:
        raise ValueError(
            'Unknown method_name {!r}; expected one of {}'.format(method_name, supported))

    m = method_name
    tree = {}
    start = time.time()
    #-- code to build tree----
    for t in target_names:
        print('tree for: '+ m +' ------> running on: ' + t)
        t_idx = target_names.index(t)
        tree[t] = {}
        for tp in prepare_target:
            tree[t][tp] = {}

            #---- target selection and normalization ---
            # Depends only on (t, tp), so it is resolved once per target-prep
            # rather than once per spectra/band combination.
            if tp == 'none':
                y = T[t_idx]
                rand_n = rand_t[t_idx]    #-- seed for the train-test split
            else:
                y = NT[t_idx]
                rand_n = rand_nt[t_idx]   #-- seed for the train-test split

            for p in prepare_spec:
                tree[t][tp][p] = {}
                for n in nbands_sampling:
                    Y = {}
                    tree[t][tp][p][n] = Y

                    #------ setting spec to appropriate (sampled) spectra----
                    spec = find_spec(p, n, m)

                    #---- performing train-test split----------------------
                    X_train, X_test, y_train, y_test = train_test_split(
                        spec, y, test_size=tst_siz, random_state=rand_n)

                    #------ INITIATING the appropriate model ----------------
                    if m == 'plsr':
                        # Number of components is chosen on the training split
                        # by best_param_PLSR, capped by the global max_n_comp.
                        n_com = best_param_PLSR (X_train, y_train, rand_n, max_n_comp)
                        Y['n_comp'] = n_com
                        Model = PLSRegression(n_components=n_com, scale=True)
                    elif m == 'mult':
                        Model = linear_model.LinearRegression()
                    elif m == 'randomforest':
                        Model = RandomForestRegressor(random_state= 23)
                    elif m == 'cubist':
                        Model = Cubist(n_rules = 50, n_committees = 5, random_state = 42)
                    elif m == 'svr':
                        Model = SVR()
                    elif m == 'ridge':
                        # Note: the 'ridge' method is a KernelRidge with default settings.
                        Model = KernelRidge()
                    else:
                        # Only 'gbrt' can reach this branch thanks to the check above.
                        Model = GradientBoostingRegressor()

                    #------ fitting and prediction ---------------------------
                    Model.fit(X_train, y_train)
                    y_pred = Model.predict(X_test)
                    yhat_pred = Model.predict(X_train)

                    Y['test'] = y_test
                    Y['testP'] = y_pred
                    Y['train'] = y_train
                    Y['trainP'] = yhat_pred
                    Y['iqrp_test'] = find_iqrp(y_pred, y_test)
                    Y['r2_test'] = find_r2(y_pred, y_test)
                    Y['rpd_test'] = find_rpd(y_pred, y_test)
                    Y['rmse_test'] = find_rmse(y_pred, y_test)

                    Y['r2_train'] = find_r2(yhat_pred, y_train)

    end = time.time()
    os.system('say "your meta tree finished for one more method"')

    print('End time - Start time =', (end-start))

    # `tree` is local and freshly built, so it is returned directly.
    return tree


## Loading the saved Mtree

In [35]:
# Restore a previously saved Mtree from 'Mtree.pickle' in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load a file that was produced by a trusted run of this notebook.
with open ('Mtree.pickle', 'rb') as file:
    Mtree = pickle.load(file)

## Mtree Building (run below code only once)

In [36]:
#max_n_comp = 2
# NOTE(review): build_tree_for('plsr') reads the global max_n_comp, which is
# only assigned in commented-out lines of this notebook — confirm it is defined
# by one of the imported notebooks before rebuilding the PLSR tree.
# tst_siz is passed as test_size to train_test_split inside build_tree_for.
tst_siz = 0.20

In [37]:
#Mtree ={}

In [38]:
# Mtree['mult'] = build_tree_for ('mult')
# Mtree['svr'] = build_tree_for ('svr')


# Mtree['cubist'] = build_tree_for ('cubist')

# Mtree['gbrt'] = build_tree_for ('gbrt')

In [39]:
#Mtree['randomforest'] = build_tree_for ('randomforest')

In [40]:
# max_n_comp = 6
# Mtree['plsr'] = build_tree_for ('plsr')

In [41]:
#Mtree['ridge'] = build_tree_for ('ridge')

## Saving the Mtree

In [42]:
# with open ('Mtree.pickle', 'wb') as file:
#     pickle.dump(Mtree, file)

## Best of all worlds

In [43]:
def best_score_on (Mtree, target, method, scorer):
    """Find the best-scoring configuration of ``method`` for ``target``.

    Every ``Mtree[method][target][tp][p][n]`` leaf (over the module-level
    ``prepare_target``, ``prepare_spec`` and ``nbands_sampling``) is a
    candidate. A candidate only qualifies when its training R2 is at least
    its test R2 (``r2_train >= r2_test``); among the qualifying ones the
    highest test score wins (the first one encountered on ties).

    Parameters
    ----------
    Mtree : dict
        Nested results as produced by ``build_tree_for`` for each method.
    target : str
        Target name used as key in ``Mtree[method]``.
    method : str
        Method name used as key in ``Mtree``.
    scorer : str
        ``'iqrp'`` ranks by ``'iqrp_test'``, ``'rpd'`` by ``'rpd_test'``;
        any other value ranks by ``'r2_test'``.

    Returns
    -------
    list
        ``[scorer, rounded_best_score, 'Spec:', best_p, 'bands:', best_n]``.
        When no candidate qualifies the score is ``-1`` and ``best_p`` /
        ``best_n`` are ``'NA'``.
    """
    if scorer == 'iqrp':
        score_key = 'iqrp_test'
    elif scorer == 'rpd':
        score_key = 'rpd_test'
    else:
        score_key = 'r2_test'

    # Start below every real score: the previous starting value of -1 made it
    # impossible for a qualifying candidate scoring <= -1 to ever be selected.
    no_candidate = float('-inf')
    best_score = no_candidate
    best_p = 'NA'
    best_n = 'NA'

    for tp in prepare_target:
        for p in prepare_spec:
            for n in nbands_sampling:
                Y = Mtree[method][target][tp][p][n]
                r2_train = Y['r2_train']
                r2_test = Y['r2_test']
                cur_score = Y[score_key]

                if cur_score > best_score and r2_train >= r2_test:
                    best_score = cur_score
                    best_n = n
                    best_p = p

    if best_score == no_candidate:
        # Keep the historical placeholder for "nothing qualified".
        best_score = -1

    return [scorer, np.round(best_score, 2), 'Spec:', best_p, 'bands:', best_n]
    

In [44]:
def best_score_for (Mtree, target, scorer):
    """Print, for each entry of ``ml_methods``, the result of ``best_score_on``.

    One line is printed per method, in the form
    ``For:<target>-> <best_score_on list> :<method>``. Nothing is returned.
    """
    for name in ml_methods:
        summary = best_score_on (Mtree, target, name, scorer)
        print('For:'+target+'->', summary, ':'+name)

In [45]:
ml_methods

['mult', 'cubist', 'svr', 'ridge', 'plsr', 'randomforest', 'gbrt']

In [46]:
best_score_for(Mtree, 'Sand', 'r2')

For:Sand-> ['r2', 0.63, 'Spec:', 'none', 'bands:', 9] :mult
For:Sand-> ['r2', 0.69, 'Spec:', 'fod2', 'bands:', 23] :cubist
For:Sand-> ['r2', 0.1, 'Spec:', 'log', 'bands:', 90] :svr
For:Sand-> ['r2', 0.29, 'Spec:', 'log', 'bands:', 0] :ridge
For:Sand-> ['r2', 0.67, 'Spec:', 'fod2', 'bands:', 30] :plsr
For:Sand-> ['r2', 0.64, 'Spec:', 'fod2', 'bands:', 25] :randomforest
For:Sand-> ['r2', 0.64, 'Spec:', 'fod2', 'bands:', 20] :gbrt


In [47]:
best_score_for(Mtree, 'Silt', 'r2')

For:Silt-> ['r2', 0.24, 'Spec:', 'fod2', 'bands:', 7] :mult
For:Silt-> ['r2', 0.47, 'Spec:', 'fod2', 'bands:', 27] :cubist
For:Silt-> ['r2', 0.04, 'Spec:', 'cr', 'bands:', 11] :svr
For:Silt-> ['r2', 0.29, 'Spec:', 'log', 'bands:', 0] :ridge
For:Silt-> ['r2', 0.34, 'Spec:', 'none', 'bands:', 10] :plsr
For:Silt-> ['r2', 0.42, 'Spec:', 'fod2', 'bands:', 11] :randomforest
For:Silt-> ['r2', 0.43, 'Spec:', 'fod2', 'bands:', 27] :gbrt


In [48]:
best_score_for(Mtree, 'Clay', 'r2')

For:Clay-> ['r2', 0.77, 'Spec:', 'log', 'bands:', 15] :mult
For:Clay-> ['r2', 0.72, 'Spec:', 'fod2', 'bands:', 27] :cubist
For:Clay-> ['r2', 0.26, 'Spec:', 'fod2', 'bands:', 7] :svr
For:Clay-> ['r2', 0.47, 'Spec:', 'log', 'bands:', 0] :ridge
For:Clay-> ['r2', 0.57, 'Spec:', 'cr', 'bands:', 21] :plsr
For:Clay-> ['r2', 0.62, 'Spec:', 'cr', 'bands:', 23] :randomforest
For:Clay-> ['r2', 0.77, 'Spec:', 'cr', 'bands:', 23] :gbrt


In [49]:
best_score_for(Mtree, 'TOC', 'r2')

For:TOC-> ['r2', 0.51, 'Spec:', 'none', 'bands:', 19] :mult
For:TOC-> ['r2', 0.69, 'Spec:', 'log', 'bands:', 35] :cubist
For:TOC-> ['r2', 0.2, 'Spec:', 'log', 'bands:', 10] :svr
For:TOC-> ['r2', 0.45, 'Spec:', 'log', 'bands:', 0] :ridge
For:TOC-> ['r2', 0.46, 'Spec:', 'cr', 'bands:', 20] :plsr
For:TOC-> ['r2', 0.72, 'Spec:', 'cr', 'bands:', 33] :randomforest
For:TOC-> ['r2', 0.75, 'Spec:', 'cr', 'bands:', 33] :gbrt


In [50]:
best_score_for(Mtree, 'CaCO3', 'r2')

For:CaCO3-> ['r2', 0.51, 'Spec:', 'log', 'bands:', 5] :mult
For:CaCO3-> ['r2', 0.52, 'Spec:', 'log', 'bands:', 5] :cubist
For:CaCO3-> ['r2', 0.1, 'Spec:', 'log', 'bands:', 90] :svr
For:CaCO3-> ['r2', 0.52, 'Spec:', 'log', 'bands:', 0] :ridge
For:CaCO3-> ['r2', 0.56, 'Spec:', 'cr', 'bands:', 20] :plsr
For:CaCO3-> ['r2', 0.54, 'Spec:', 'fod2', 'bands:', 5] :randomforest
For:CaCO3-> ['r2', 0.57, 'Spec:', 'none', 'bands:', 3] :gbrt


## Plotting Model Accuracy (ipywidgets)

In [51]:
def plot_model_acc (method, target, spec_preprocessing, n_bands):
    """Draw actual-vs-predicted scatter plots (test left, training right) for one Mtree leaf.

    The leaf read is ``Mtree[method][target]['none'][spec_preprocessing][n_bands]``;
    the target-preparation key is fixed to ``'none'``. Each panel shows the
    scatter, a degree-1 ``np.polyfit`` line (blue), the identity line (green)
    and the IQRP / RPD / R2 values. Test metrics are read from the leaf;
    training metrics are recomputed with ``find_iqrp`` / ``find_r2`` /
    ``find_rpd``. For ``'plsr'`` the first column of the stored predictions is
    used and the number of components is printed on the test panel.
    """
    idx = target_names.index(target)

    record = Mtree[method][target]['none'][spec_preprocessing][n_bands]

    obs_test = record['test']
    pred_test = record['testP']
    obs_train = record['train']
    pred_train = record['trainP']

    if method == 'plsr':
        n_com = record['n_comp']
        # PLSR predictions are stored 2-D here; keep the first column only.
        pred_test = pred_test[:,0]
        pred_train = pred_train[:,0]

    iqrp_test = record['iqrp_test']
    r2_test = record['r2_test']
    rpd_test = record['rpd_test']

    iqrp_train = find_iqrp(pred_train, obs_train)
    r2_train = find_r2(pred_train, obs_train)
    rpd_train = find_rpd(pred_train, obs_train)

    test_frame = pd.DataFrame({'actual':obs_test.values, 'predic': pred_test})
    test_fit = np.polyfit(obs_test, pred_test, 1)

    train_frame = pd.DataFrame({'actual':obs_train.values, 'predic': pred_train})
    train_fit = np.polyfit(obs_train, pred_train, 1)

    fig, axes = plt.subplots(1,2, figsize=(18,8))

    def draw_panel(ax, frame, observed, fit, caption, iqrp, rpd, r2):
        # Scatter + fitted line + identity line + metric annotations for one panel.
        frame.plot.scatter(ax=ax, x="actual", y="predic", alpha=0.8, color=clr[idx], edgecolors='k')
        ax.plot(observed, np.polyval(fit, observed), c='blue', linewidth=1)
        ax.plot(observed, observed, color='green', linewidth=1)
        ax.tick_params(axis='both', labelsize=10)
        ax.text(0.05, 0.95, target_names[idx]+caption, transform=ax.transAxes,
                fontsize=20, color=clr[idx])
        ax.text(0.05, 0.90, 'IQRP ={:.2f}'.format(iqrp), transform=ax.transAxes, fontsize=16)
        ax.text(0.05, 0.85, 'RPD ={:.2f}'.format(rpd), transform=ax.transAxes, fontsize=16)
        ax.text(0.05, 0.80, 'R2 ={:.2f}'.format(np.round(r2,3)), transform=ax.transAxes, fontsize=16)
        ax.text(0.95, 0.15, 'Method: '+method, transform=ax.transAxes,
                horizontalalignment='right', fontsize=20)

    #---- test-prediction panel ----
    draw_panel(axes[0], test_frame, obs_test, test_fit, ' (Test Data)',
               iqrp_test, rpd_test, r2_test)

    if method == 'plsr':
        axes[0].text(0.95, 0.05, 'n_component={:.2f}'.format(n_com), transform=axes[0].transAxes,
                     horizontalalignment='right', fontsize=12)

    #---- train-prediction panel ----
    draw_panel(axes[1], train_frame, obs_train, train_fit, ' (Training Data)',
               iqrp_train, rpd_train, r2_train)

In [52]:
# Interactive viewer: ipywidgets.interact calls plot_model_acc with the values
# chosen for each keyword (method, target, spec_preprocessing, n_bands), taken
# from ml_methods, target_names, prepare_spec and nbands_sampling respectively.
ipywidgets.interact( plot_model_acc, target = target_names, method = ml_methods, \
                    spec_preprocessing = prepare_spec,  \
                    n_bands = nbands_sampling )

interactive(children=(Dropdown(description='method', options=('mult', 'cubist', 'svr', 'ridge', 'plsr', 'rando…

<function __main__.plot_model_acc(method, target, spec_preprocessing, n_bands)>