In [1]:
# !pip install openpyxl
# !pip install imbalanced-learn
# !pip3 install ipympl
# !pip install import-ipynb
# !pip install shapely
# !pip install SciencePlots 
# !pip install seaborn
# !pip install tqdm
# !pip install ipywidgets

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots
import os, sys
from numpy import nan
import re
import ipympl
# from IPython.core.display import display, HTML
import ipywidgets
import json
from os import listdir
import glob
import math
from IPython.display import Image, display, HTML
from shapely.geometry import mapping
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedKFold, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error, precision_score, recall_score, mean_absolute_error, make_scorer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from cubist import Cubist
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, LeaveOneOut, cross_validate
from sklearn.impute import SimpleImputer
from tqdm.notebook import tqdm
from sklearn.svm import SVC
from tqdm import tqdm
import pickle
from imblearn.over_sampling import SMOTE
from scipy.stats import ks_2samp
from scipy import stats
np.seterr(divide='ignore', invalid='ignore')
pd.options.display.max_columns = 100
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline
from scipy.signal import savgol_filter
from scipy.spatial import ConvexHull
from scipy.interpolate import interp1d

In [3]:
import import_ipynb
from SoilPrep import * 

importing Jupyter notebook from SoilPrep.ipynb


# Step 0: Setting up decision parameters (Data Tree)

In [4]:
# Decision parameters: every list below is one axis of the search space
# ("data tree") that the later steps iterate over.

# (1) Smoothing filter identifiers (savgol order 1 and savgol order 2).
sg_filters = ['sg1', 'sg2']

# (2) Candidate window lengths for the smoothing filter: 0, 1, then 11..101 in steps of 10.
window_lengths = [0, 1] + list(range(11, 102, 10))

# (3) Preprocessing options for the spectral data.
prepare_spec = ['none', 'fod', 'continuum']

# (4) Band counts available when resampling spectra.
nbands_sampling = [0, 5, 10, 20, 30, 40, 50, 100, 200, 500]

# (5) Column names of the target variables in the dataframe.
target_names = ['sand', 'silt', 'clay', 'TOC']

# (6) Preprocessing options for the target data.
prepare_target = ['none', 'minmax']

# (7) Machine learning regression models.
ml_methods = ['mult', 'plsr', 'randomforest', 'cubist', 'svr', 'ridge', 'gbrt']

# (8) Keys under which test/train observations and predictions are recorded.
test_train_predict = ['test', 'testP', 'train', 'trainP']

# Step 1a: Obtaining Spectra  (Noise and Outliers removal)

In [5]:
# Colour scheme definition
# NOTE(review): the same three hex codes appear, in a different order, in the
# `clr` list of the target-isolation cell; these names themselves are not
# referenced in the visible part of the notebook - confirm they are still needed.
kado = '#8B7355'
mati = '#A52A2A'
balu = '#F4A460'

In [6]:
# ------------------- Reading lab data and data containing contaminated samples ------------------------
df = pd.read_csv('uae.csv')
df_faulty = pd.read_csv('oil.csv')
# oil.csv is stored "sideways": after transposing, the values of its first
# column become the column names, the old header labels are discarded by
# reset_index(drop=True), and the row that held the names is dropped.
df_faulty = df_faulty.T
df_faulty.columns = df_faulty.iloc[0,:].copy()
df_faulty = df_faulty.reset_index(drop=True)
df_faulty = df_faulty.iloc[1:, :].copy()

# -------------------- Renaming and matching column names of lab data and data containing contaminated samples ------------------

# NOTE(review): both renames mutate the frames in place.
df.rename(columns = {'Lon': 'long', 'Lat':'lat'}, inplace = True)
df_faulty.rename(columns = {'Lon': 'long', 'Lat':'lat'}, inplace = True)

# obtaining faulty rows of df using the data frame df_faulty ---------------------------------- 

# A lab row counts as faulty when some row of df_faulty has exactly the same
# 'lat' AND 'long' values (plain equality, no tolerance).
# `i` is used as a label in df.loc and later as a position in df.index[...];
# both agree because read_csv gives df a default 0..n-1 index.
faulty_rows =[]
(row, col) = df.shape
for i in range (0, row):
    lat = df.loc[i,'lat']
    long = df.loc[i,'long']
    temp = df_faulty.loc[:,:][(df_faulty.loc[:,'lat'] == lat) & (df_faulty.loc[:,'long'] == long)]
    (r, c) = temp.shape
    if r != 0:
        faulty_rows.append(i)
        
# removing faulty rows from lab data --------------------------------------------

# clean_lab keeps the original row labels (the index is not reset after the drop).
clean_lab = df.drop(df.index[faulty_rows]).copy()

# Columns 7..2157 (by position) hold the spectra; positions 150..2100 of that
# slice are kept. In the recorded output below these columns are labelled
# 500 ... 2450.
temp_spec = clean_lab.iloc[:, 7:2158].copy()
spectra = temp_spec.iloc[:,150:2101].copy()
spectra.head(5)

Unnamed: 0,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,...,2401,2402,2403,2404,2405,2406,2407,2408,2409,2410,2411,2412,2413,2414,2415,2416,2417,2418,2419,2420,2421,2422,2423,2424,2425,2426,2427,2428,2429,2430,2431,2432,2433,2434,2435,2436,2437,2438,2439,2440,2441,2442,2443,2444,2445,2446,2447,2448,2449,2450
0,0.219696,0.220718,0.221682,0.222762,0.223911,0.225072,0.225983,0.227195,0.228434,0.229447,0.230477,0.231651,0.232982,0.234067,0.235246,0.236547,0.237816,0.238973,0.240065,0.2412,0.242471,0.243782,0.245019,0.246231,0.247492,0.248815,0.25014,0.251416,0.252683,0.253977,0.255289,0.256609,0.257936,0.259318,0.260675,0.262001,0.263489,0.264948,0.266372,0.267808,0.269245,0.270721,0.272237,0.273741,0.275277,0.27685,0.278365,0.279914,0.281544,0.283125,...,0.553845,0.553308,0.552715,0.552262,0.551709,0.551014,0.550466,0.549654,0.548693,0.548122,0.547419,0.546803,0.54653,0.545801,0.544873,0.544079,0.543125,0.542145,0.541152,0.540185,0.539289,0.538305,0.537223,0.536029,0.534775,0.533649,0.532742,0.531715,0.530519,0.529449,0.528385,0.527379,0.526438,0.525538,0.52447,0.523266,0.522147,0.5207,0.519286,0.518121,0.516985,0.516036,0.51501,0.513881,0.51277,0.511402,0.510095,0.508928,0.507499,0.506154
1,0.187783,0.188788,0.189796,0.190691,0.191741,0.19286,0.193714,0.194845,0.195998,0.196942,0.197932,0.199039,0.200258,0.201246,0.202282,0.203435,0.204649,0.205728,0.206735,0.20781,0.209025,0.210258,0.211392,0.212488,0.213674,0.21493,0.216065,0.217232,0.21844,0.2196,0.220764,0.221956,0.22319,0.224454,0.22565,0.226782,0.22811,0.229408,0.230653,0.231906,0.23321,0.234532,0.235775,0.237029,0.238316,0.239613,0.2409,0.242202,0.243545,0.244887,...,0.504261,0.503778,0.503295,0.502756,0.502049,0.501413,0.500776,0.499971,0.499396,0.498757,0.498094,0.497683,0.497013,0.496031,0.49516,0.494134,0.492911,0.492027,0.491128,0.489963,0.488914,0.48784,0.486646,0.485485,0.484422,0.48324,0.481823,0.480533,0.479247,0.478024,0.476905,0.475752,0.474714,0.473333,0.471867,0.470726,0.469224,0.467688,0.466564,0.465216,0.463884,0.462804,0.461737,0.460651,0.4593,0.45802,0.456631,0.454874,0.453601,0.452144
2,0.165603,0.166606,0.167606,0.168545,0.169584,0.170685,0.171606,0.172728,0.173862,0.174827,0.175771,0.176913,0.178225,0.179122,0.180132,0.181309,0.182467,0.18355,0.184579,0.185635,0.186868,0.188132,0.189288,0.190384,0.191511,0.192711,0.193982,0.19514,0.19624,0.197401,0.198585,0.199772,0.200951,0.202163,0.203339,0.204486,0.205816,0.207072,0.208248,0.209476,0.210717,0.211974,0.213223,0.214438,0.215646,0.216856,0.218078,0.219319,0.220595,0.221881,...,0.306349,0.305574,0.304779,0.304093,0.303291,0.302305,0.301416,0.300412,0.299417,0.298638,0.297758,0.29685,0.29612,0.295239,0.294198,0.293308,0.292365,0.29144,0.290649,0.289555,0.288282,0.286841,0.285386,0.284332,0.283407,0.282481,0.281426,0.280206,0.278896,0.277767,0.276756,0.275625,0.274746,0.273805,0.27269,0.271665,0.270596,0.269414,0.268186,0.266977,0.265609,0.264296,0.263194,0.262063,0.26092,0.2597,0.258399,0.257115,0.255891,0.254916
3,0.258894,0.260033,0.261232,0.262371,0.263599,0.264844,0.265754,0.267021,0.268392,0.269578,0.270728,0.271973,0.273376,0.274612,0.275852,0.27716,0.278517,0.279843,0.281097,0.282317,0.283717,0.285161,0.286502,0.287831,0.289182,0.290585,0.292071,0.293412,0.294704,0.296116,0.297506,0.298905,0.300361,0.301824,0.303245,0.304631,0.306135,0.307635,0.309125,0.310584,0.312084,0.313623,0.315119,0.316595,0.318116,0.319678,0.321119,0.322629,0.324289,0.325896,...,0.530152,0.529644,0.528968,0.528384,0.527843,0.527236,0.526813,0.526146,0.525249,0.524549,0.523706,0.523134,0.522848,0.52213,0.521406,0.520417,0.519197,0.51822,0.517079,0.516214,0.515437,0.514247,0.513102,0.511791,0.510344,0.509186,0.508097,0.506785,0.505418,0.504128,0.502665,0.501399,0.500381,0.499255,0.498308,0.497345,0.496185,0.494824,0.493243,0.491827,0.490462,0.489007,0.487607,0.486217,0.485016,0.484023,0.483059,0.481886,0.480314,0.478795
4,0.094636,0.095097,0.09551,0.096142,0.09669,0.097173,0.097607,0.098273,0.098904,0.099322,0.099843,0.100478,0.101165,0.10159,0.102151,0.102836,0.103426,0.104015,0.104552,0.105025,0.105758,0.106476,0.107006,0.107597,0.108214,0.108849,0.109514,0.110105,0.11067,0.111302,0.111929,0.112549,0.113169,0.113786,0.114366,0.114948,0.115763,0.116463,0.11704,0.117727,0.118449,0.119169,0.119845,0.120538,0.121241,0.121925,0.122585,0.123269,0.124012,0.124778,...,0.399756,0.399185,0.398585,0.397928,0.397205,0.396555,0.395924,0.394999,0.394011,0.393144,0.392205,0.391567,0.391154,0.390397,0.389535,0.388471,0.387181,0.385958,0.384697,0.383455,0.382334,0.38124,0.380206,0.379344,0.378419,0.377441,0.376288,0.374928,0.373538,0.372178,0.371044,0.369877,0.368783,0.367857,0.366759,0.365569,0.364414,0.363116,0.36174,0.36075,0.359669,0.35821,0.35693,0.355704,0.354706,0.354042,0.35322,0.352019,0.350563,0.349028


In [7]:
# df = pd.read_csv('uae.csv')
# spectra = df.iloc[:, 7:2158].copy()
# temp_spec = spectra.copy()
# spectra = temp_spec.iloc[:,150:2101].copy()
# spectra.head(5)

In [8]:
# for i in range (0,5,1):
#     spectra.iloc[i,:].plot()

# Step 1b: Obtaining Targets (Outliers removal and Normalization)

In [22]:
# Rebind `df` to the cleaned lab data so the targets below exclude the faulty rows.
# NOTE(review): this shadows the raw frame read from 'uae.csv'; `df` and
# `clean_lab` are the same object from here on (no copy is made).
df = clean_lab
# ------------- Target Isolation ----------------------

# One plot colour per target; plot_corr indexes this list with the target's
# position in target_names.
clr = ['#F4A460', '#8B7355', '#A52A2A', 'green']

def isolate_targets(df, target_names):
    """Return ``df[name]`` for every name in ``target_names``, as a list in the same order."""
    return [df[name] for name in target_names]
    
# Raw target columns of the cleaned lab data, one entry per name in target_names (same order).
T = isolate_targets(df,target_names) 


def normalize_targets(T):
    """Apply min_max_normal to a copy of every entry of ``T`` and return the results as a list.

    The entries of ``T`` themselves are not passed to min_max_normal; each one
    is copied first.
    """
    return [min_max_normal(target.copy()) for target in T]

# Normalised counterparts of T (same order), used when the target preprocessing is not 'none'.
NT = normalize_targets(T)


# Step 1c: Spectra Preprocessing (Smooth, FOD/Contin, and Resample)

## Savgol smoothing (order 1 and order 2)

In [9]:
# Smoothed spectra keyed by window length:
# spec1 is filtered with 'sg1' (savgol order 1), spec2 with 'sg2' (savgol order 2).
spec1 = {window: filt_sg(spectra, window, 'sg1') for window in window_lengths}
spec2 = {window: filt_sg(spectra, window, 'sg2') for window in window_lengths}

# Separately smoothed spectra; this is the input of the first-order derivative step.
smth_spec = sgsmooth (spectra, 3)

## First Order Derivative

In [10]:
# First-order derivative of the smoothed spectra `smth_spec`
# (fod is presumably provided by the SoilPrep star import - confirm).
fod_spec = fod(smth_spec)

# for i in range (0,5,1):
#     fod_spec.iloc[i,:].plot()


## Continuum Removal

In [11]:
# Continuum removal applied to the 'sg2'-filtered spectra with window length 51.
cr_spec = continuum_removed(spec2[51])

# for i in range (0,5,1):
#     cr_spec.iloc[i,:].plot()
    

## Resampling (n_bands)

### 1. Sampled Original (sampled_spec: resampled from the smoothed spectra spec2[51])

In [12]:
# Smoothed spectra (spec2[51]) resampled to every candidate band count.
sampled_spec = {n_bands: resample_spectra (spec2[51], n_bands) for n_bands in nbands_sampling}

In [13]:
# for i in range (0,5,1):
#     sampled_spec[200].iloc[i,:].plot()

### 2. Sampled Continuum Removed  (sampled_cr)

In [14]:
# Continuum-removed spectra resampled to every candidate band count.
sampled_cr = {n_bands: resample_spectra (cr_spec, n_bands) for n_bands in nbands_sampling}

In [15]:
# for i in range (0,5,1):
#     sampled_cr[200].iloc[i,:].plot()

### 3. Sampled FOD  (sampled_fod)

In [16]:
# First-order-derivative spectra resampled to every candidate band count.
sampled_fod = {n_bands: resample_spectra (fod_spec, n_bands) for n_bands in nbands_sampling}

In [17]:
# for i in range (0,10,1):
#     sampled_fod[200].iloc[i,:].plot()

## Visualizing Processed Spectrum (variable samples)

In [18]:
def plot_spec (sample, process):
    """Plot one smoothed spectrum together with its processed version.

    sample  -- row position of the sample to draw
    process -- 'continuum' overlays the continuum-removed spectrum; any other
               value overlays the first-order derivative multiplied by 100
    """
    spec2[51].iloc[sample,:].plot()
    overlay = cr_spec.iloc[sample,:] if process == 'continuum' else fod_spec.iloc[sample,:]*100
    overlay.plot()
    plt.ylim([-0.6, 0.8])

# The slider's upper bound follows the number of rows in spec2[51] instead of
# a fixed 293, so it stays valid when the set of cleaned samples changes.
ipywidgets.interact(plot_spec, sample = (0, max(spec2[51].shape[0] - 1, 0), 1), process = ['fod', 'continuum'])

interactive(children=(IntSlider(value=146, description='sample', max=293), Dropdown(description='process', opt…

<function __main__.plot_spec(sample, process)>

## Correlation between wavelengths and Targets

In [23]:
# Apply the 'science', 'notebook' and 'grid' style sheets to figures drawn from here on
# (presumably registered by the `scienceplots` import - confirm).
plt.style.use(['science','notebook','grid'])

def plot_corr (target, spec_cr_fod, n_bands):
    """Plot the first row of the r-values returned by find_rpval for one target.

    target      -- name listed in target_names; selects both T[...] and the colour
    spec_cr_fod -- 'spec' uses spec2[51], 'cr' uses cr_spec, anything else uses fod_spec
    n_bands     -- band count handed to resample_spectra before correlating
    """
    idx = target_names.index(target)

    if spec_cr_fod == 'spec':
        source = spec2[51]
    elif spec_cr_fod == 'cr':
        source = cr_spec
    else:
        source = fod_spec

    r_values, _p_values = find_rpval (resample_spectra(source, n_bands), T[idx])
    r_values.iloc[0,:].plot(color = clr[idx])

    plt.ylim([-0.7, 0.7])

# Interactive explorer: pick the target, the spectra variant and the band count for plot_corr.
ipywidgets.interact(plot_corr, target = target_names, spec_cr_fod = ['spec', 'cr','fod'], n_bands = nbands_sampling)



interactive(children=(Dropdown(description='target', options=('sand', 'silt', 'clay', 'TOC'), value='sand'), D…

<function __main__.plot_corr(target, spec_cr_fod, n_bands)>

# Step 2:  Parameters for Best Train-Test Split  

In [24]:
# For every target, ask best_split for a train/test split and remember the
# random state and "min bin error" it returns - once for the raw targets (T)
# and once for the normalised targets (NT).

tst_siz = 0.20


def _search_best_splits (targets):
    """Run best_split for each target, print one line per target, and return (states, errors).

    Both returned lists are aligned with ``targets`` / target_names.
    """
    states, errors = [], []
    for name, target in zip(target_names, targets):
        state, error = best_split(spectra.copy(), target, tst_siz)
        states.append(state)
        errors.append(error)
        print ('For '+ name + ' :test size =', tst_siz, '\t min bin error=', error, '\t at randome state =', state)
    return states, errors


print('Without Normalization:')
rand_t, err_t = _search_best_splits(T)

print('After Normalization:')
rand_nt, err_nt = _search_best_splits(NT)
    
    

Without Normalization:
For sand :test size = 0.2 	 min bin error= 15.0 	 at randome state = 2
For silt :test size = 0.2 	 min bin error= 15.0 	 at randome state = 23
For clay :test size = 0.2 	 min bin error= 22.5 	 at randome state = 18
For TOC :test size = 0.2 	 min bin error= 15.0 	 at randome state = 20
After Normalization:
For sand :test size = 0.2 	 min bin error= 30.0 	 at randome state = 3
For silt :test size = 0.2 	 min bin error= 25.0 	 at randome state = 2
For clay :test size = 0.2 	 min bin error= 20.0 	 at randome state = 18
For TOC :test size = 0.2 	 min bin error= 30.0 	 at randome state = 10


# Step 3: Parameters for Best Model Fit 

## PLSR

In [25]:
from sklearn import linear_model

def best_param_PLSR (X_train, X_test, y_train, y_test, n_comp):
    """Return the PLSR component count with the highest IQRP on the supplied test data.

    A PLSRegression(scale=True) model is fitted on (X_train, y_train) for every
    component count from 1 up to n_comp - 1, and each fit is scored with
    find_iqrp(y_pred, y_test).

    Parameters:
        X_train, y_train -- data the candidate models are fitted on
        X_test, y_test   -- data the candidate models are scored on
        n_comp           -- exclusive upper bound of the component counts tried

    Returns:
        int -- the smallest component count that reaches the maximum score.

    Raises:
        ValueError -- if n_comp is below 2, i.e. there is nothing to try.

    NOTE(review): the component count is chosen on the same X_test / y_test
    that build_tree_for afterwards reports its test metrics on, so those
    metrics are optimistic for PLSR. Consider selecting on a validation split
    or by cross-validation on the training data.
    NOTE(review): the caller passes a value named `max_n_comp`, but that value
    itself is never tried (range is exclusive) - confirm this is intended.
    """
    if n_comp < 2:
        raise ValueError('n_comp must be at least 2 so that one or more component counts are tried')

    scores = []
    for n in range(1, n_comp):
        model = PLSRegression(n_components=n, scale=True)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test, copy=True)
        scores.append(find_iqrp(y_pred, y_test))

    # list.index returns the first position of the maximum; counts start at 1.
    return scores.index(max(scores)) + 1
        
 

# Step 4: Building Model Tree (Mtree)

In [26]:
import os
import shutil
import time

# Spoken "finished" cue through the `say` command-line tool. The call is
# skipped when no `say` executable is on PATH, so the cell does not spawn a
# failing shell command on systems without it.
if shutil.which('say'):
    os.system('say "your program has finished"')

In [27]:
# 0. Available machine learning regression models --------------------------------- (7)
# NOTE(review): same values as the `ml_methods` list defined in Step 0; this
# cell only re-binds the name. Keep a single definition to avoid drift.
ml_methods = ['mult', 'plsr', 'randomforest', 'cubist', 'svr', 'ridge', 'gbrt']

In [30]:
# Bound handed to best_param_PLSR when choosing the PLSR component count.
max_n_comp = 5

# Method names build_tree_for knows how to instantiate.
_SUPPORTED_METHODS = ('mult', 'plsr', 'randomforest', 'cubist', 'svr', 'ridge', 'gbrt')

# Methods for which "no resampling" (n == 0) is replaced by the 100-band spectra.
_TREE_BASED_METHODS = ('randomforest', 'cubist', 'gbrt')


def _select_spectra (m, p, n):
    """Return the predictor table for method ``m``, spectral preprocessing ``p`` and band count ``n``."""
    if p == 'none':
        full, sampled = spec2[51], sampled_spec
    elif p == 'fod':
        full, sampled = fod_spec, sampled_fod
    else:
        # any other preprocessing name is treated as continuum removal
        full, sampled = cr_spec, sampled_cr

    if n != 0:
        return sampled[n]
    if m in _TREE_BASED_METHODS:
        return sampled[100]
    return full


def _make_model (m, n_comp=None):
    """Create the unfitted regressor for method name ``m`` (``n_comp`` is only used by 'plsr')."""
    if m == 'plsr':
        return PLSRegression(n_components=n_comp, scale=True)
    if m == 'mult':
        return linear_model.LinearRegression()
    if m == 'randomforest':
        return RandomForestRegressor(random_state= 23)
    if m == 'cubist':
        return Cubist(n_rules = 50, n_committees = 5, random_state = 42)
    if m == 'svr':
        return SVR()
    if m == 'ridge':
        return KernelRidge()
    if m == 'gbrt':
        # Seeded like the other ensemble models so repeated runs give the same tree.
        return GradientBoostingRegressor(random_state=23)
    raise ValueError('unknown method name: ' + repr(m))


def build_tree_for (method_name):
    """Fit one regression method over the whole decision grid and return the results.

    The grid is target_names x prepare_target x nbands_sampling x prepare_spec.
    For every combination the matching spectra and target are split with
    train_test_split (test size ``tst_siz``, random state taken from
    rand_t / rand_nt), the model is fitted on the training part and evaluated
    on the test part.

    Parameters:
        method_name -- one of 'mult', 'plsr', 'randomforest', 'cubist', 'svr',
                       'ridge', 'gbrt'

    Returns:
        dict -- tree[target][target_prep][n_bands][spec_prep] is a dict with
        the keys 'test', 'testP', 'train', 'trainP', 'iqrp_test', 'r2_test',
        'rpd_test', 'rmse_test', plus 'n_comp' for 'plsr'.

    Raises:
        ValueError -- if method_name is not a supported method. Unknown names
        used to fall through to gradient boosting silently.

    Notes:
        - A band count of 0 means "no resampling", except for the tree-based
          methods, which then use the 100-band spectra.
        - NOTE(review): for 'plsr' the component count is selected by
          best_param_PLSR on the same test split the metrics are reported on.
    """
    import shutil  # local import: only needed for the optional audio cue below

    m = method_name
    if m not in _SUPPORTED_METHODS:
        raise ValueError('unknown method name ' + repr(m) + '; expected one of ' + ', '.join(_SUPPORTED_METHODS))

    tree ={}
    start = time.time()
    for t in target_names:
        print('tree for: '+ m +' ------> running on: ' + t)
        t_idx = target_names.index(t)
        tree[t] ={}
        for tp in prepare_target:
            tree[t][tp] ={}

            #---- target selection and matching random state for the split ---
            if tp == 'none':
                y, rand_n = T[t_idx], rand_t[t_idx]
            else:
                y, rand_n = NT[t_idx], rand_nt[t_idx]

            for n in nbands_sampling:
                tree[t][tp][n] ={}
                for p in prepare_spec:
                    Y = tree[t][tp][n][p] = {}

                    spec = _select_spectra(m, p, n)
                    X_train, X_test, y_train, y_test = train_test_split(spec, y, test_size= tst_siz, random_state=rand_n)

                    if m == 'plsr':
                        n_com = best_param_PLSR (X_train, X_test, y_train, y_test, max_n_comp)
                        Y['n_comp'] = n_com
                        Model = _make_model(m, n_com)
                    else:
                        Model = _make_model(m)

                    Model.fit(X_train, y_train)
                    y_pred = Model.predict(X_test)
                    yhat_pred = Model.predict(X_train)

                    Y['test'] = y_test
                    Y['testP'] = y_pred
                    Y['train'] = y_train
                    Y['trainP'] = yhat_pred
                    Y['iqrp_test'] = find_iqrp(y_pred, y_test)
                    Y['r2_test'] = find_r2(y_pred, y_test)
                    Y['rpd_test'] = find_rpd(y_pred, y_test)
                    Y['rmse_test'] = find_rmse(y_pred, y_test)

    end = time.time()
    # Spoken cue; skipped when the `say` executable is not available.
    if shutil.which('say'):
        os.system('say "your program has finished"')

    print('End time - Start time =', (end-start))

    return tree


## Mtree initialisation (do not re-run the cell below every time: it resets Mtree to an empty dict and discards the branches already built)

In [31]:
# Start from an empty model tree; re-running this discards every branch stored in Mtree so far.
Mtree ={}

## Creating different branches of Mtree (for separate methods)

### PLSR Branch 

In [32]:
# Build and store the PLSR branch (about 30 s in the recorded run).
Mtree['plsr'] = build_tree_for ('plsr')

tree for: plsr ------> running on: sand
tree for: plsr ------> running on: silt
tree for: plsr ------> running on: clay
tree for: plsr ------> running on: TOC
End time - Start time = 30.054133653640747


### Multiple Linear Regression Branch

In [33]:
# Build and store the multiple-linear-regression branch (about 10 s in the recorded run).
Mtree['mult'] = build_tree_for ('mult')

tree for: mult ------> running on: sand
tree for: mult ------> running on: silt
tree for: mult ------> running on: clay
tree for: mult ------> running on: TOC
End time - Start time = 9.595892667770386



### Random Forest Branch

In [34]:
# Build and store the random-forest branch. Slow: about 863 s (~14 min) in the recorded run.
Mtree['randomforest'] = build_tree_for ('randomforest')

tree for: randomforest ------> running on: sand
tree for: randomforest ------> running on: silt
tree for: randomforest ------> running on: clay
tree for: randomforest ------> running on: TOC
End time - Start time = 863.304366350174


### SVM Branch

In [35]:
# Build and store the support-vector-regression branch (about 15 s in the recorded run).
Mtree['svr'] = build_tree_for ('svr')

tree for: svr ------> running on: sand
tree for: svr ------> running on: silt
tree for: svr ------> running on: clay
tree for: svr ------> running on: TOC
End time - Start time = 15.116539478302002


### GBRT Branch

In [36]:
# Build and store the gradient-boosting branch. Slow: about 554 s (~9 min) in the recorded run.
Mtree['gbrt'] = build_tree_for ('gbrt')

tree for: gbrt ------> running on: sand
tree for: gbrt ------> running on: silt
tree for: gbrt ------> running on: clay
tree for: gbrt ------> running on: TOC
End time - Start time = 554.2621576786041


### Ridge Regression Branch

In [37]:
# Build and store the ridge branch (about 8 s in the recorded run).
Mtree['ridge'] = build_tree_for ('ridge')

tree for: ridge ------> running on: sand
tree for: ridge ------> running on: silt
tree for: ridge ------> running on: clay
tree for: ridge ------> running on: TOC
End time - Start time = 8.401322841644287


### Cubist Branch

In [38]:
# Build and store the Cubist branch. Slow: about 345 s (~6 min) in the recorded run.
# NOTE(review): the recorded output is flooded with repeated lines mentioning
# `df.iteritems()` - presumably deprecation warnings raised inside the cubist
# package; consider filtering them or clearing this output before sharing.
Mtree['cubist'] = build_tree_for ('cubist')

tree for: cubist ------> running on: sand


  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _

  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _

tree for: cubist ------> running on: silt


  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _

  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _

tree for: cubist ------> running on: clay


  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _

  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _

tree for: cubist ------> running on: TOC


  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _

  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _get_data_format(col_data) for col_name, col_data in df.iteritems()}
  return {col_name: _

End time - Start time = 344.5562596321106


## Best of all worlds

In [39]:
def best_model_parameters (Mtree, target, method, scorer):
    """Find the pre-processing combination with the highest test score.

    Visits every combination of the notebook-level ``prepare_target``,
    ``nbands_sampling`` and ``prepare_spec`` sequences and reads the stored
    result ``Mtree[method][target][tp][n][p]``.

    Parameters
    ----------
    Mtree : nested mapping
        Indexed as ``Mtree[method][target][tp][n][p]``. Each leaf must provide
        ``'iqrp_test'``, ``'rpd_test'`` and ``'r2_test'``, plus ``'n_comp'``
        when ``method == 'plsr'``.
    target, method : hashable
        Keys selecting the branch of ``Mtree`` to search.
    scorer : str
        ``'iqrp'`` or ``'rpd'`` select that metric; any other value falls back
        to ``'r2_test'``.

    Returns
    -------
    list
        ``[scorer, best score rounded to 2 decimals, 'Spec Prc:', p,
        'n_bands:', n, 'Tar Prc:', tp, 'n_comp: ', n_comp]``. The winning
        settings stay ``'NA'`` (and the score ``-inf``) when no combination
        yields a comparable score, e.g. all scores are NaN.
    """
    score_key = {'iqrp': 'iqrp_test', 'rpd': 'rpd_test'}.get(scorer, 'r2_test')

    # Start from -inf rather than -1: R2 has no lower bound, and the old
    # sentinel left best_tp/best_n/best_p unbound when nothing beat it.
    best_score = float('-inf')
    best_tp = best_n = best_p = best_n_comp = 'NA'

    for tp in prepare_target:
        for n in nbands_sampling:
            for p in prepare_spec:
                Y = Mtree[method][target][tp][n][p]
                cur_score = Y[score_key]

                # Strict '>' keeps the first combination on ties.
                if cur_score > best_score:
                    best_score = cur_score
                    best_tp = tp
                    best_n = n
                    best_p = p
                    if method == 'plsr':
                        best_n_comp = Y['n_comp']

    return [scorer, np.round(best_score, 2), 'Spec Prc:', best_p, 'n_bands:', best_n,
            'Tar Prc:', best_tp, 'n_comp: ', best_n_comp]
    

In [40]:
def best_score_for (Mtree, target, scorer):
    """Print, for each entry of ``ml_methods``, its best configuration for ``target``.

    Each line shows the list returned by ``best_model_parameters`` for that
    method, prefixed with the target and suffixed with the method name.
    Returns ``None``.
    """
    for ml_name in ml_methods:
        winner = best_model_parameters(Mtree, target, ml_name, scorer)
        print('For:' + target + '->', winner, ':' + ml_name)

In [41]:
# For each ML method, print the configuration with the highest test IQRP for sand.
best_score_for(Mtree, 'sand', 'iqrp')

For:sand-> ['iqrp', 1.57, 'Spec Prc:', 'fod', 'n_bands:', 10, 'Tar Prc:', 'none', 'n_comp: ', 'NA'] :mult
For:sand-> ['iqrp', 1.59, 'Spec Prc:', 'none', 'n_bands:', 500, 'Tar Prc:', 'minmax', 'n_comp: ', 3] :plsr
For:sand-> ['iqrp', 1.62, 'Spec Prc:', 'fod', 'n_bands:', 5, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :randomforest
For:sand-> ['iqrp', 1.56, 'Spec Prc:', 'continuum', 'n_bands:', 30, 'Tar Prc:', 'none', 'n_comp: ', 'NA'] :cubist
For:sand-> ['iqrp', 1.41, 'Spec Prc:', 'none', 'n_bands:', 5, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :svr
For:sand-> ['iqrp', 1.51, 'Spec Prc:', 'none', 'n_bands:', 0, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :ridge
For:sand-> ['iqrp', 1.56, 'Spec Prc:', 'continuum', 'n_bands:', 100, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :gbrt


In [42]:
# For each ML method, print the configuration with the highest test R2 for sand.
best_score_for(Mtree, 'sand', 'r2')

For:sand-> ['r2', 0.49, 'Spec Prc:', 'none', 'n_bands:', 10, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :mult
For:sand-> ['r2', 0.51, 'Spec Prc:', 'none', 'n_bands:', 500, 'Tar Prc:', 'minmax', 'n_comp: ', 3] :plsr
For:sand-> ['r2', 0.52, 'Spec Prc:', 'fod', 'n_bands:', 5, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :randomforest
For:sand-> ['r2', 0.46, 'Spec Prc:', 'none', 'n_bands:', 10, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :cubist
For:sand-> ['r2', 0.37, 'Spec Prc:', 'none', 'n_bands:', 5, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :svr
For:sand-> ['r2', 0.45, 'Spec Prc:', 'none', 'n_bands:', 0, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :ridge
For:sand-> ['r2', 0.49, 'Spec Prc:', 'continuum', 'n_bands:', 100, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :gbrt


In [43]:
# For each ML method, print the configuration with the highest test IQRP for TOC.
best_score_for(Mtree, 'TOC', 'iqrp')

For:TOC-> ['iqrp', 1.33, 'Spec Prc:', 'continuum', 'n_bands:', 10, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :mult
For:TOC-> ['iqrp', 1.36, 'Spec Prc:', 'continuum', 'n_bands:', 50, 'Tar Prc:', 'minmax', 'n_comp: ', 4] :plsr
For:TOC-> ['iqrp', 1.43, 'Spec Prc:', 'fod', 'n_bands:', 200, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :randomforest
For:TOC-> ['iqrp', 1.56, 'Spec Prc:', 'continuum', 'n_bands:', 0, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :cubist
For:TOC-> ['iqrp', 1.16, 'Spec Prc:', 'fod', 'n_bands:', 20, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :svr
For:TOC-> ['iqrp', 1.14, 'Spec Prc:', 'none', 'n_bands:', 0, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :ridge
For:TOC-> ['iqrp', 1.5, 'Spec Prc:', 'fod', 'n_bands:', 200, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :gbrt


In [44]:
# For each ML method, print the configuration with the highest test R2 for TOC.
best_score_for(Mtree, 'TOC', 'r2')

For:TOC-> ['r2', 0.53, 'Spec Prc:', 'continuum', 'n_bands:', 10, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :mult
For:TOC-> ['r2', 0.55, 'Spec Prc:', 'continuum', 'n_bands:', 50, 'Tar Prc:', 'minmax', 'n_comp: ', 4] :plsr
For:TOC-> ['r2', 0.65, 'Spec Prc:', 'fod', 'n_bands:', 0, 'Tar Prc:', 'none', 'n_comp: ', 'NA'] :randomforest
For:TOC-> ['r2', 0.66, 'Spec Prc:', 'continuum', 'n_bands:', 0, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :cubist
For:TOC-> ['r2', 0.42, 'Spec Prc:', 'fod', 'n_bands:', 10, 'Tar Prc:', 'none', 'n_comp: ', 'NA'] :svr
For:TOC-> ['r2', 0.42, 'Spec Prc:', 'none', 'n_bands:', 0, 'Tar Prc:', 'none', 'n_comp: ', 'NA'] :ridge
For:TOC-> ['r2', 0.63, 'Spec Prc:', 'fod', 'n_bands:', 200, 'Tar Prc:', 'minmax', 'n_comp: ', 'NA'] :gbrt


## Plotting Model Accuracy (ipywidgets)

In [None]:
def _draw_fit_panel (ax, actual, predicted, colour, title, iqrp, rpd, r2, method):
    """Draw one actual-vs-predicted scatter panel on ``ax``.

    Adds the degree-1 least-squares fit of ``predicted`` on ``actual`` (blue),
    the 1:1 line (green), and text labels for the title, IQRP, RPD, R2 and
    method name. ``actual`` must expose ``.values`` (presumably a pandas
    Series - confirm against how Mtree is populated).
    """
    points = pd.DataFrame({'actual': actual.values, 'predic': predicted})
    fit = np.polyfit(actual, predicted, 1)

    points.plot.scatter(ax=ax, x="actual", y="predic", alpha=0.8, color=colour, edgecolors='k')
    ax.plot(actual, np.polyval(fit, actual), c='blue', linewidth=1)
    ax.plot(actual, actual, color='green', linewidth=1)
    ax.tick_params(axis='both', labelsize=10)
    ax.text(0.05, 0.95, title, transform=ax.transAxes, fontsize=20, color=colour)
    ax.text(0.05, 0.90, 'IQRP ={:.2f}'.format(iqrp), transform=ax.transAxes, fontsize=16)
    ax.text(0.05, 0.85, 'RPD ={:.2f}'.format(rpd), transform=ax.transAxes, fontsize=16)
    # Format R2 directly: rounding to 3 decimals first and then to 2 can
    # display a wrongly rounded value (double rounding).
    ax.text(0.05, 0.80, 'R2 ={:.2f}'.format(r2), transform=ax.transAxes, fontsize=16)
    ax.text(0.95, 0.15, 'Method: '+method, transform=ax.transAxes,
            horizontalalignment='right', fontsize=20)


def plot_model_acc (target, target_preprocessing, spec_preprocessing, n_bands, method):
    """Plot predicted vs. actual values for one stored model, test and train side by side.

    Reads the result stored at
    ``Mtree[method][target][target_preprocessing][n_bands][spec_preprocessing]``
    from the notebook-level ``Mtree``. The left panel shows the test split with
    the stored ``iqrp_test`` / ``rpd_test`` / ``r2_test``; the right panel shows
    the training split with metrics recomputed through ``find_iqrp``,
    ``find_r2`` and ``find_rpd``. Marker colour is ``clr`` at the position of
    ``target`` in ``target_names``.

    Parameters
    ----------
    target : an entry of ``target_names``.
    target_preprocessing, spec_preprocessing, n_bands, method :
        Keys selecting the stored result in ``Mtree``.

    Returns
    -------
    None. A new two-panel figure is created on every call.
    """
    i = target_names.index(target)
    Y = Mtree[method][target][target_preprocessing][n_bands][spec_preprocessing]

    y_test, y_pred = Y['test'], Y['testP']
    y_train, yhat_pred = Y['train'], Y['trainP']

    if method == 'plsr':
        n_com = Y['n_comp']
        # PLSR predictions are indexed as 2-D here; keep the first column.
        y_pred = y_pred[:,0]
        yhat_pred = yhat_pred[:,0]

    # Training metrics are not stored in Mtree, so recompute them before drawing.
    iqrp_train = find_iqrp(yhat_pred, y_train)
    r2_train = find_r2(yhat_pred, y_train)
    rpd_train = find_rpd(yhat_pred, y_train)

    fig, axes = plt.subplots(1,2, figsize=(18,8))

    # ---- test-prediction panel ----
    _draw_fit_panel(axes[0], y_test, y_pred, clr[i], target_names[i]+' (Test Data)',
                    Y['iqrp_test'], Y['rpd_test'], Y['r2_test'], method)
    if method == 'plsr':
        axes[0].text(0.95, 0.05, 'n_component={:.2f}'.format(n_com), transform=axes[0].transAxes,
                     horizontalalignment='right', fontsize = 12)

    # ---- train-prediction panel ----
    _draw_fit_panel(axes[1], y_train, yhat_pred, clr[i], target_names[i]+' (Training Data)',
                    iqrp_train, rpd_train, r2_train, method)

In [None]:
# Interactive explorer: one widget per plot_model_acc argument, each populated
# from the corresponding notebook-level option list.
ipywidgets.interact(
    plot_model_acc,
    target=target_names,
    target_preprocessing=prepare_target,
    method=ml_methods,
    spec_preprocessing=prepare_spec,
    n_bands=nbands_sampling,
)

## Random Forest Regression (Hyperparameter Tuning)

In [None]:
# Baseline random forest: default hyper-parameters, fixed seed.
rf_reg = RandomForestRegressor(random_state=23)

# --- Randomized hyper-parameter search (currently disabled) -----------------
# While this stays commented out, `rf_random` is never created in this cell.
#
# n_estimators = [int(x) for x in np.linspace(start = 10, stop = 50, num = 10)]
# max_depth = [int(x) for x in np.linspace(3, 4, num = 2)]
# max_features = ['log2', 'sqrt']
# min_samples_split = [3, 5, 8]
#
# random_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'max_features':max_features,\
#               'min_samples_split':min_samples_split}
#
# print(random_grid)
#
# scorer = make_scorer(r2_score, greater_is_better=True)
# cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=23)
# rf_random = RandomizedSearchCV(estimator=rf_reg, param_distributions = random_grid, cv = cv, n_iter = 300,\
#                             scoring=scorer, verbose=1, random_state = 10, error_score='raise', n_jobs=-1)

# --- Data for this experiment -----------------------------------------------
t = 'sand'
spec = sampled_cr[20]  # presumably the 20-band continuum-removed spectra - confirm
y = NT[target_names.index(t)]
rand_n = rand_nt[target_names.index(t)]
X_train, X_test, y_train, y_test = train_test_split(
    spec, y, test_size=tst_siz, random_state=rand_n
)

# --- Fit and score on the held-out split ------------------------------------
rf_reg.fit(X_train, y_train)

# Predictions are rounded to two decimals before scoring.
y_pred = np.round(rf_reg.predict(X_test), 2)

score_cv = r2_score(y_test, y_pred)
print('Best R2 Score:', score_cv)

In [None]:
# Positional indices of the 13 columns kept for the reduced model; written once
# so the train and test subsets cannot drift apart.
kept_col_idx = [0,1,2,4,7,8,9,11,14,16,17,18,19]
X_train_new = X_train.iloc[:, kept_col_idx]
X_test_new = X_test.iloc[:, kept_col_idx]

# rf_reg is refit in place, so from here on it expects only these columns.
rf_reg.fit(X_train_new, y_train)
y_pred = np.round(rf_reg.predict(X_test_new), 2)

score_cv = r2_score(y_test, y_pred)
print('Best R2 Score:', score_cv)

In [None]:
# Kept so the `pyplot` name stays bound for any later cell that relies on it.
from matplotlib import pyplot

# Importances of the most recently fitted rf_reg; index k is the k-th column of
# whatever matrix that fit used.
importance = rf_reg.feature_importances_

# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

# plot feature importance, labelled so the figure stands alone
importance_fig, importance_ax = pyplot.subplots()
importance_ax.bar(range(len(importance)), importance)
importance_ax.set_xticks(range(len(importance)))
importance_ax.set(title='Random forest feature importance',
                  xlabel='Feature index', ylabel='Importance')
pyplot.show()

In [None]:
# `rf_random` is only defined in commented-out code earlier in the notebook, so
# this cell raised NameError on a fresh kernel. Build and fit the randomized
# search here, using the settings from that disabled block.
random_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 50, num = 10)],
    'max_depth': [int(x) for x in np.linspace(3, 4, num = 2)],
    'max_features': ['log2', 'sqrt'],
    'min_samples_split': [3, 5, 8],
}

# The grid holds fewer combinations than the 300 iterations originally
# requested; cap n_iter at the grid size so every candidate is tried once.
n_candidates = int(np.prod([len(options) for options in random_grid.values()]))

rf_random = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=23),
    param_distributions=random_grid,
    n_iter=min(300, n_candidates),
    scoring=make_scorer(r2_score, greater_is_better=True),
    cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=23),
    verbose=1,
    random_state=10,
    error_score='raise',
    n_jobs=-1,
)
# Fit on the full-width training split created with train_test_split above.
rf_random.fit(X_train, y_train)

print(rf_random.best_estimator_)
y_pred = rf_random.predict(X_test)
y_pred = np.round(y_pred, 2)

score_cv = r2_score(y_test, y_pred)
print('Best R2 Score:', score_cv)