In [1]:
# !pip install openpyxl
# !pip install imbalanced-learn
# !pip3 install ipympl
# !pip install import-ipynb
# !pip install shapely
# !pip install SciencePlots 
# !pip install seaborn
# !pip install tqdm
# !pip install ipywidgets
#!pip install cubist

In [2]:
import pandas as pd
pd.DataFrame.iteritems = pd.DataFrame.items
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots
import os, sys
from numpy import nan
import re
import ipympl
# from IPython.core.display import display, HTML
import ipywidgets
import json
from os import listdir
import glob
import math
from IPython.display import Image, display, HTML
from shapely.geometry import mapping
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedKFold, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error, precision_score, recall_score, mean_absolute_error, make_scorer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from cubist import Cubist
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, LeaveOneOut, cross_validate
from sklearn.impute import SimpleImputer
from tqdm.notebook import tqdm
from sklearn.svm import SVC
from tqdm import tqdm
import pickle
from imblearn.over_sampling import SMOTE
from scipy.stats import ks_2samp
from scipy import stats
np.seterr(divide='ignore', invalid='ignore')
pd.options.display.max_columns = 100
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline
from scipy.signal import savgol_filter
from scipy.spatial import ConvexHull
from scipy.interpolate import interp1d

In [3]:
import import_ipynb
from SoilPrep import * 

importing Jupyter notebook from SoilPrep.ipynb


# Step 0: Setting up decision paramenters (Data Tree)

In [4]:
# 1. Available smoothing filter types: savgol1 and savgol2 ------------------------ (1)
sg_filters = ['sg1', 'sg2']

# 2. Available window lengths for the smoothing filter ---------------------------- (2)
window_lengths = [0, 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]

# 3. Available preprocessing for Spectral data ------------------------------------ (3)
prepare_spec = ['none', 'fod', 'continuum']
#prepare_spec = ['none', 'fod', 'continuum']

# 4. Number of bands available for resampling spectra ----------------------------- (4) 
nbands_sampling = [0, 5, 10, 15, 20, 25, 30, 40, 50, 100]

# 5. Names of target variables in the dataframe ----------------------------------- (5)
target_names = ['sand', 'silt', 'clay', 'TOC']

# 6. Available preprocessing for Target data -------------------------------------- (6)
prepare_target = ['none', 'minmax']

# 7. Available machine learning regression models --------------------------------- (7)
ml_methods = ['mult', 'plsr', 'randomforest', 'cubist','svr', 'ridge', 'gbrt']

# 8. Recorded predictions on test-train data for model accuracy  ------------------ (8)
test_train_predict = ['test', 'testP', 'train', 'trainP']

# Step 1a: Obtaining Spectra  (Noise and Outliers removal)

In [5]:
# Colour scheme definition
kado = '#8B7355'
mati = '#A52A2A'
balu = '#F4A460'

In [6]:
#----------- Reading lab data, emit data and data having contaminated samples-----------------------------
df_lab = pd.read_csv('uae.csv')
df_sat = pd.read_csv('uae_emit.csv')

df_faulty = pd.read_csv('oil.csv')
df_faulty = df_faulty.T
df_faulty.columns = df_faulty.iloc[0,:].copy()
df_faulty = df_faulty.reset_index(drop=True)
df_faulty = df_faulty.iloc[1:, :].copy()
df_faulty.head(5)

#-------------Renaming columns---------------------------------------------------------------------------
df_sat.rename(columns= {'latitude':'long', 'longitude': 'lat'}, inplace=True)
df_lab.rename(columns = {'Lon': 'long', 'Lat':'lat'}, inplace = True)
df_faulty.rename(columns = {'Lon': 'long', 'Lat':'lat'}, inplace = True)

df_sat.rename(columns = {'OM': 'TOC', 'Clay': 'clay', 'Silt':'silt', 'Sand': 'sand'}, inplace = True)

In [7]:
clean_sat = df_sat.copy()
clean_lab = df_lab.copy()

clean_sat.reset_index(drop=True)
clean_lab.reset_index(drop=True)
clean_lab.shape

(294, 2158)

In [8]:
# obtaining corresponding rows of clean_sat using clean_lab and removing extra rows -------------- 

missing_rows =[]

(row, col) = clean_sat.shape
small_lab = clean_lab.iloc[0:row,:].copy()

for i in range (0, row):
    lat = clean_sat.iloc[i, clean_sat.columns.get_loc('lat')]
    long = clean_sat.iloc[i, clean_sat.columns.get_loc('long')]
    temp = clean_lab.loc[:,:][(clean_lab.loc[:,'lat'] == lat) & (clean_lab.loc[:,'long'] == long)]
    (r, c) = temp.shape
    if r == 0:
        #print('missing:', i)
        missing_rows.append(i)
        #print(r, 'at', i)
    else:
        #print(i)
        small_lab.iloc[i,:] = temp.iloc[0, :].copy()
        

small_lab = small_lab.drop(small_lab.index[missing_rows]).copy()
small_sat = clean_sat.drop(clean_sat.index[missing_rows]).copy()
#small_sat.rename(columns = {'OM': 'TOC', 'Clay': 'clay', 'Silt':'silt', 'Sand': 'sand'}, inplace = True)
small_lab.reset_index()
small_sat.reset_index()
small_lab.shape

(83, 2158)

## Extracting spectra from small_sat and small_lab

In [9]:
min_loc_sat = small_sat.columns.get_loc('field_30')
max_loc_sat = small_sat.columns.get_loc('field_312')
small_sat_spec = small_sat.iloc[:, min_loc_sat: max_loc_sat+1].copy()
print(small_sat_spec.shape)

min_loc_lab = small_lab.columns.get_loc('380')
max_loc_lab = small_lab.columns.get_loc('2500')
small_lab_spec = small_lab.iloc[:, min_loc_lab: max_loc_lab].copy()
print(small_lab_spec.shape)

(83, 283)
(83, 2120)


### Resampling small_lab_spec into 283 bands

In [10]:
sampled_lab_spec = resample_spectra(small_lab_spec, 283)
sampled_lab_spec.shape

(83, 283)

### Renaming columns of sampled_sat_spec using columns of sampled_lab_spec

In [11]:
header = list(sampled_lab_spec.columns.values)
sampled_sat_spec = small_sat_spec.copy()
sampled_sat_spec.columns = header
sampled_sat_spec.shape

(83, 283)

### Dropping faulty bands from sampled_sat_spec and sampled_lab_spec

In [12]:
# Select faulty column based on condition (i,e. boundary columns-- [1320/125:1447/142] + [1769/185:1964/211])-
faulty_columns = list(np.arange(125,146,1)) + list(np.arange(185,212,1))
print(faulty_columns)

[125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211]


In [13]:
sat_spec = sampled_sat_spec.drop(sampled_sat_spec.columns[faulty_columns], axis=1).copy()
lab_spec = sampled_lab_spec.drop(sampled_lab_spec.columns[faulty_columns], axis=1).copy()

## Setting the "spectra" for future computations

In [14]:
spectra = sat_spec.copy()
spectra.shape

(83, 235)

# Step 1b: Obtaining Targets (Outliers removal and Normalization)

In [15]:
df = small_sat
# ------------- Target Isolation ----------------------

clr = ['#F4A460', '#8B7355', '#A52A2A', 'green']

def isolate_targets(df, target_names):
    T=[]
    for i in range (0,len(target_names)):
        T.append(df[target_names[i]])
    return(T)
    
T = isolate_targets(df,target_names) 


def normalize_targets(T):          
    NT =[]
    for i in range(0, len(T)):
        NT.append(min_max_normal(T[i].copy()))
    return(NT)

NT = normalize_targets(T)


# Step 1c: Spectra Preprocessing (Smooth, FOD/Contin, and Resample)

## Savgol smoothing (order 1 and order 2)

In [16]:
# -------------- Smoothed Spectra spec1 (savgol order 1) and spec2 (savgol order 2)  -----------

spec1 = {}
for i in window_lengths:
    spec1[i] = filt_sg(spectra, i, 'sg1')                   

spec2 = {}
for i in window_lengths:
    spec2[i] = filt_sg(spectra, i, 'sg2')

smth_spec = sgsmooth (spectra, 3)    

## First Order Derivative

In [17]:
fod_spec = fod(smth_spec)

# for i in range (0,5,1):
#     fod_spec.iloc[i,:].plot()


## Continuum Removal

In [18]:
cr_spec = continuum_removed(spec2[51])

# for i in range (0,5,1):
#     cr_spec.iloc[i,:].plot()
    

## Resampling (n_bands)

### 1. Sampled Original (sampled_spec: sampled clipped_spectra)

In [19]:
sampled_spec = {}
for n in nbands_sampling:
    sampled_spec[n] = resample_spectra (spec2[51], n)

In [20]:
# for i in range (0,5,1):
#     sampled_spec[200].iloc[i,:].plot()

### 2. Sampled Continuum Removed  (sampled_cr)

In [21]:
sampled_cr = {}
for n in nbands_sampling:
    sampled_cr[n] = resample_spectra (cr_spec, n)

In [22]:
# for i in range (0,5,1):
#     sampled_cr[200].iloc[i,:].plot()

### 3. Sampled FOD  (sampled_fod)

In [23]:
sampled_fod = {}
for n in nbands_sampling:
    sampled_fod[n] = resample_spectra (fod_spec, n)

In [24]:
# for i in range (0,10,1):
#     sampled_fod[200].iloc[i,:].plot()

## Visualizing Processed Spectrum (variable samples)

In [25]:
row, col = spectra.shape

def plot_spec (sample, process):
    x1 = spec2[51].iloc[sample,:]
    x1.plot()
    if process == 'continuum':
        x2 = cr_spec.iloc[sample,:]
        x2.plot()
    else: 
        x3 = fod_spec.iloc[sample,:]*10
        
        x3.plot()
    plt.ylim([-0.6, 0.8])

ipywidgets.interact(plot_spec, sample = (0, row,1), process = ['fod', 'continuum'])

interactive(children=(IntSlider(value=41, description='sample', max=83), Dropdown(description='process', optio…

<function __main__.plot_spec(sample, process)>

## Correlation between wavelengths and Targets

In [26]:
plt.style.use(['science','notebook','grid'])

def plot_corr (target, spec_cr_fod, n_bands):
    
    i = target_names.index(target)    
    
    if  spec_cr_fod == 'spec':
        r_val, p_val = find_rpval (resample_spectra(spec2[51], n_bands), T[i])
        r_val.iloc[0,:].plot(color = clr[i])
    elif  spec_cr_fod == 'cr':
        r_cr, p_cr = find_rpval (resample_spectra(cr_spec, n_bands), T[i])
        r_cr.iloc[0,:].plot(color = clr[i])
    else:
        r_fod, p_fod = find_rpval (resample_spectra(fod_spec, n_bands), T[i])
        r_fod.iloc[0,:].plot(color = clr[i])
    
    plt.ylim([-0.7, 0.7])

ipywidgets.interact(plot_corr, target = target_names, spec_cr_fod = ['spec', 'cr','fod'], n_bands = nbands_sampling)



interactive(children=(Dropdown(description='target', options=('sand', 'silt', 'clay', 'TOC'), value='sand'), D…

<function __main__.plot_corr(target, spec_cr_fod, n_bands)>

# Step 2:  Parameters for Best Train-Test Split  

In [27]:
# Building Mtree (Model Tree) ----------------------------------

tst_siz = 0.20

rand_t = [None] * (len(T))
err_t = [None] * (len(T))

print('Without Normalization:')
for i in range (0,len(T)):
    rand_t[i], err_t[i] = best_split(spectra.copy(), T[i], tst_siz) 
    print ('For '+ target_names[i]+ ' :test size =', tst_siz, '\t min bin error=', err_t[i], '\t at randome state =', rand_t[i])
    
rand_nt = [None] * (len(T))
err_nt = [None] * (len(T))

print('After Normalization:')
for i in range (0,len(T)):
    rand_nt[i], err_nt[i] = best_split(spectra.copy(), NT[i], tst_siz)     
    print ('For '+ target_names[i]+ ' :test size =', tst_siz, '\t min bin error=', err_nt[i], '\t at randome state =', rand_nt[i])
    
    

Without Normalization:
For sand :test size = 0.2 	 min bin error= 20.0 	 at randome state = 34
For silt :test size = 0.2 	 min bin error= 10.0 	 at randome state = 25
For clay :test size = 0.2 	 min bin error= 10.0 	 at randome state = 20
For TOC :test size = 0.2 	 min bin error= 10.0 	 at randome state = 0
After Normalization:
For sand :test size = 0.2 	 min bin error= 20.0 	 at randome state = 34
For silt :test size = 0.2 	 min bin error= 20.0 	 at randome state = 25
For clay :test size = 0.2 	 min bin error= 10.0 	 at randome state = 20
For TOC :test size = 0.2 	 min bin error= 15.0 	 at randome state = 30


# Step 3: Leave one out Predictions

##  leave_one_out

In [28]:
def leave_one_out (spectra, target, method):
    m = method
    (r,c) = spectra.shape
    
    Y_test = target.copy()
    Y_pred = target.copy()
    
    n_comp = 'NA'
    
    if method == 'plsr':
        avg_iqrp =[0]
        Yp =['NA']
        for n in range (1,6):
            for i in range (0,r): 
                full_spec = spectra.copy()
                X_train = full_spec.drop(full_spec.index[i], axis=0)
                X_test = full_spec.iloc[[i],:].copy()
                full_tar = target.copy()
                y_train = full_tar.drop(full_tar.index[i], axis=0)
                y_test = full_tar.iloc[i].copy() 
            
                Model = PLSRegression(n_components=n, scale=True)
                Model.fit(X_train, y_train)
                y_pred = Model.predict(X_test, copy=True)
                Y_pred.iloc[i] = y_pred
                
            avg_iqrp.append(np.round(find_iqrp (Y_pred, Y_test),2))
            Yp.append(Y_pred)
        
        max_iqrp = max(avg_iqrp)
        n_comp = avg_iqrp.index(max_iqrp)
        Y_pred = Yp[n_comp]
                
        
    else:  
        for i in range (0,r): 
            full_spec = spectra.copy()
            X_train = full_spec.drop(full_spec.index[i], axis=0)
            X_test = full_spec.iloc[[i],:].copy()
            full_tar = target.copy()
            y_train = full_tar.drop(full_tar.index[i], axis=0)
            y_test = full_tar.iloc[i].copy() 
            
            #----- MULTILINEAR:  fitting, and prediction---------- 
            if m == 'mult':
                Model = linear_model.LinearRegression()    
            #----- RANDOM_FOREST:   fitting, and prediction---------- 
            elif m == 'randomforest': 
                Model = RandomForestRegressor(random_state= 23)    
            #----- CUBIST REGRESSION:    fitting and prediction---------
            elif m == 'cubist':
                Model = Cubist(n_rules = 50, n_committees = 5, random_state = 42)    
            #------ SUPPORT VECTOR MACHINE FOR REGRESSION: fitting and prediction-----------      
            elif m == 'svr': 
                Model = SVR()
            #------ RIDGE REGRESSION: fitting and prediction-----------      
            elif m == 'ridge': 
                Model = KernelRidge()
            #------ GRADIENT BOOSTING REGRESSION: fitting and prediction-----------      
            else: 
                Model = GradientBoostingRegressor()
                    
            Model.fit(X_train, y_train)
            y_pred = Model.predict(X_test)
            Y_pred.iloc[i] = y_pred

    return (Y_pred, n_comp)


In [29]:
y_pred , n_comp = leave_one_out (sampled_spec[50], NT[0],'randomforest')
n_comp

'NA'

In [30]:
find_r2(y_pred, NT[0])

0.2428589913405974

# Step 4: Building Model Tree (Mtree)

In [31]:
import os
os.system('say "M Tree bulding about to start"')
import time

In [32]:
# 0. Available machine learning regression models --------------------------------- (7)
ml_methods = ['mult', 'plsr', 'randomforest', 'cubist', 'svr', 'ridge', 'gbrt']
#ml_methods = ['mult', 'plsr', 'cubist', 'randomforest', 'ridge' 'gbrt', 'svr']

In [33]:
max_n_comp = 5

def build_tree_for (method_name):
    tree ={}
    start = time.time()
    m = method_name
    #-- code to build tree----
    for t in target_names:
        print('tree for: '+ m +' ------> running on: ' + t)
        tree[t] ={}
        for tp in prepare_target:
            tree[t][tp] ={}
            for n in nbands_sampling:
                tree[t][tp][n] ={}
                for p in prepare_spec:
                    tree[t][tp][n][p] ={}
                    Y = tree[t][tp][n][p]
                    
                        
                    #------ setting spec to appropriate (sampled) spectra----
                    if p == 'none':
                        if n == 0:
                            if m == 'randomforest' or m == 'cubist' or m == 'gbrt':
                                #---- reched here due to tree based methods e.g. randomforest--
                                spec = sampled_spec[100]
                            else:
                                spec = spec2[51]
                        else:
                            #---- reached here with some n!=0----
                            spec = sampled_spec[n]
                        
                    elif p == 'fod':
                        if n == 0:
                            if m == 'randomforest' or m == 'cubist' or m == 'gbrt':
                                #---- reched here due to tree based methods e.g. randomforest--
                                spec = sampled_fod[100]
                            else:
                                spec = fod_spec
                        else:
                            #---- reached here with some n!=0----
                            spec = sampled_fod[n]
                                
                    else:  
                        if n == 0:
                            if m == 'randomforest' or m == 'cubist' or m == 'gbrt':
                                #---- reched here due to tree based methods e.g. randomforest--
                                spec = sampled_cr[100]
                            else:
                                spec = cr_spec
                        else:
                            #---- reached here with some n!=0----
                            spec = sampled_cr[n]
                        
                    #---- target selection and normalization ---
                    if tp == 'none':
                        y = T[target_names.index(t)]
                        
                    else:
                        y = NT[target_names.index(t)]
                        
                    #----- calling find_y_pred to compute leave one out predictions----
                    y_pred, n_com = leave_one_out (spec, y, m)
                    y_test = y
                    
                    #----- Model accuracy evaluation and plotting datas-----------
                                
                    Y['test'] = y_test
                    Y['testP'] = y_pred
                    Y['n_comp'] = n_com

                    Y['iqrp_test'] = find_iqrp(y_pred, y_test)
                    Y['r2_test'] = find_r2(y_pred, y_test)
                    Y['rpd_test'] = find_rpd(y_pred, y_test)
                    Y['rmse_test'] = find_rmse(y_pred, y_test)
                                                                
                                                                         
    end = time.time()                            
    os.system('say "M Tree has been built for one more method"')                            

    print('End time - Start time =', (end-start)) 
    
    return (tree.copy())


## Mtree initialisation (do not run below code every time)

In [34]:
Mtree ={}

## Creating different branches of Mtree (for separate methods)

### PLSR Branch 

In [35]:
Mtree['plsr'] = build_tree_for ('plsr')

tree for: plsr ------> running on: sand
tree for: plsr ------> running on: silt
tree for: plsr ------> running on: clay
tree for: plsr ------> running on: TOC
End time - Start time = 390.79838705062866


### Multiple Linear Regression Branch

In [36]:
Mtree['mult'] = build_tree_for ('mult')

tree for: mult ------> running on: sand
tree for: mult ------> running on: silt
tree for: mult ------> running on: clay
tree for: mult ------> running on: TOC
End time - Start time = 76.25055289268494


### SVM Branch

In [37]:
Mtree['svr'] = build_tree_for ('svr')

tree for: svr ------> running on: sand
tree for: svr ------> running on: silt
tree for: svr ------> running on: clay
tree for: svr ------> running on: TOC
End time - Start time = 52.789610862731934


### Ridge Regression Branch

In [38]:
Mtree['ridge'] = build_tree_for ('ridge')

tree for: ridge ------> running on: sand
tree for: ridge ------> running on: silt
tree for: ridge ------> running on: clay
tree for: ridge ------> running on: TOC
End time - Start time = 50.558943033218384



### Random Forest Branch

In [39]:
Mtree['randomforest'] = build_tree_for ('randomforest')

tree for: randomforest ------> running on: sand
tree for: randomforest ------> running on: silt
tree for: randomforest ------> running on: clay
tree for: randomforest ------> running on: TOC
End time - Start time = 3720.8849902153015


### GBRT Branch

In [40]:
Mtree['gbrt'] = build_tree_for ('gbrt')

tree for: gbrt ------> running on: sand
tree for: gbrt ------> running on: silt
tree for: gbrt ------> running on: clay
tree for: gbrt ------> running on: TOC
End time - Start time = 2445.6525502204895


### Cubist Branch

In [41]:
Mtree['cubist'] = build_tree_for ('cubist')

tree for: cubist ------> running on: sand
tree for: cubist ------> running on: silt
tree for: cubist ------> running on: clay
tree for: cubist ------> running on: TOC
End time - Start time = 1189.9978563785553


## Best of all worlds

In [42]:
ml_methods = ['mult', 'plsr', 'randomforest','cubist','svr', 'ridge', 'gbrt']

In [43]:
def best_model_parameters (Mtree, target, method, scorer):
    t=target
    m= method
    
    best_score = -1
    best_n_comp = 'NA'
    
    
    for tp in prepare_target:
        for n in nbands_sampling:
            for p in prepare_spec:
                Y = Mtree[m][t][tp][n][p]
                    
                if scorer == 'iqrp':
                    cur_score = Y['iqrp_test']
                elif scorer == 'rpd':
                    cur_score = Y['rpd_test']
                else:
                    cur_score = Y['r2_test']  
                    
                if cur_score > best_score:
                    best_score = cur_score
                    best_tp = tp
                    best_n = n
                    best_p = p
                    if m == 'plsr':
                        best_n_comp = Y['n_comp']
                            
    param_list = [scorer, np.round(best_score,2), 'Spec:', best_p, 'bands:', best_n, 'Tar:', best_tp]                                 
    return (param_list)                                
    

In [44]:
def best_score_for (Mtree, target, scorer):
    
    for method in ml_methods:
        param_list= best_model_parameters (Mtree, target, method, scorer)
        print('For:'+target+'->', param_list, ':'+method)
     
    return

In [45]:
best_score_for(Mtree, 'sand', 'iqrp')

For:sand-> ['iqrp', 1.51, 'Spec:', 'none', 'bands:', 10, 'Tar:', 'none'] :mult
For:sand-> ['iqrp', 1.56, 'Spec:', 'none', 'bands:', 10, 'Tar:', 'none'] :plsr
For:sand-> ['iqrp', 1.73, 'Spec:', 'continuum', 'bands:', 30, 'Tar:', 'minmax'] :randomforest
For:sand-> ['iqrp', 1.56, 'Spec:', 'continuum', 'bands:', 20, 'Tar:', 'minmax'] :cubist
For:sand-> ['iqrp', 1.45, 'Spec:', 'continuum', 'bands:', 15, 'Tar:', 'minmax'] :svr
For:sand-> ['iqrp', 1.51, 'Spec:', 'none', 'bands:', 0, 'Tar:', 'minmax'] :ridge
For:sand-> ['iqrp', 1.72, 'Spec:', 'continuum', 'bands:', 25, 'Tar:', 'none'] :gbrt


In [46]:
best_score_for(Mtree, 'sand', 'r2')

For:sand-> ['r2', 0.13, 'Spec:', 'none', 'bands:', 10, 'Tar:', 'none'] :mult
For:sand-> ['r2', 0.19, 'Spec:', 'none', 'bands:', 10, 'Tar:', 'none'] :plsr
For:sand-> ['r2', 0.34, 'Spec:', 'continuum', 'bands:', 30, 'Tar:', 'minmax'] :randomforest
For:sand-> ['r2', 0.19, 'Spec:', 'continuum', 'bands:', 20, 'Tar:', 'minmax'] :cubist
For:sand-> ['r2', 0.06, 'Spec:', 'continuum', 'bands:', 15, 'Tar:', 'minmax'] :svr
For:sand-> ['r2', 0.13, 'Spec:', 'none', 'bands:', 0, 'Tar:', 'minmax'] :ridge
For:sand-> ['r2', 0.33, 'Spec:', 'continuum', 'bands:', 25, 'Tar:', 'none'] :gbrt


In [47]:
best_score_for(Mtree, 'TOC', 'iqrp')

For:TOC-> ['iqrp', 1.07, 'Spec:', 'none', 'bands:', 5, 'Tar:', 'minmax'] :mult
For:TOC-> ['iqrp', 1.08, 'Spec:', 'none', 'bands:', 40, 'Tar:', 'minmax'] :plsr
For:TOC-> ['iqrp', 1.08, 'Spec:', 'continuum', 'bands:', 20, 'Tar:', 'minmax'] :randomforest
For:TOC-> ['iqrp', 1.13, 'Spec:', 'fod', 'bands:', 40, 'Tar:', 'minmax'] :cubist
For:TOC-> ['iqrp', 1.08, 'Spec:', 'continuum', 'bands:', 100, 'Tar:', 'minmax'] :svr
For:TOC-> ['iqrp', 1.06, 'Spec:', 'none', 'bands:', 0, 'Tar:', 'minmax'] :ridge
For:TOC-> ['iqrp', 1.05, 'Spec:', 'fod', 'bands:', 5, 'Tar:', 'minmax'] :gbrt


In [48]:
best_score_for(Mtree, 'TOC', 'r2')

For:TOC-> ['r2', 0.18, 'Spec:', 'none', 'bands:', 5, 'Tar:', 'minmax'] :mult
For:TOC-> ['r2', 0.2, 'Spec:', 'none', 'bands:', 40, 'Tar:', 'minmax'] :plsr
For:TOC-> ['r2', 0.19, 'Spec:', 'continuum', 'bands:', 20, 'Tar:', 'minmax'] :randomforest
For:TOC-> ['r2', 0.27, 'Spec:', 'fod', 'bands:', 40, 'Tar:', 'minmax'] :cubist
For:TOC-> ['r2', 0.2, 'Spec:', 'continuum', 'bands:', 100, 'Tar:', 'minmax'] :svr
For:TOC-> ['r2', 0.18, 'Spec:', 'none', 'bands:', 0, 'Tar:', 'minmax'] :ridge
For:TOC-> ['r2', 0.15, 'Spec:', 'fod', 'bands:', 5, 'Tar:', 'minmax'] :gbrt


## Plotting Model Accuracy (ipywidgets)

In [49]:
def plot_model_acc (target, target_preprocessing, spec_preprocessing, n_bands, method):
    
    m = method
    t = target
    
    i = target_names.index(target)
    
    
    p = spec_preprocessing
    n = n_bands
    tp = target_preprocessing
    
    Y = Mtree[m][t][tp][n][p]
    
    y_test = Y['test']
    y_pred = Y['testP']
#     y_train = Y['train']
#     yhat_pred = Y['trainP']
    
    if m == 'plsr':
        n_com = Y['n_comp']
        y_pred = y_pred[:,0]
#         yhat_pred = yhat_pred[:,0]
    
    
    iqrp_test = Y['iqrp_test']
    r2_test = Y['r2_test']
    rpd_test = Y['rpd_test']
    
#     iqrp_train = find_iqrp(yhat_pred, y_train)
#     r2_train = find_r2(yhat_pred, y_train)
#     rpd_train = find_rpd(yhat_pred, y_train)
    
    y_tp = pd.DataFrame({'actual':y_test.values, 'predic': y_pred})
    z = np.polyfit(y_test, y_pred, 1)
    
#     yhat_tp = pd.DataFrame({'actual':y_train.values, 'predic': yhat_pred})
#     zhat = np.polyfit(y_train, yhat_pred, 1)
    
    fig, axes = plt.subplots(1,2, figsize=(18,8))
    
    #with plt.style.context(('ggplot')): ---- PLOT of test-prediction --------------------------------------
    y_tp.plot.scatter(ax= axes[0], x="actual", y="predic", alpha=0.8, color = clr[i], edgecolors='k')
    axes[0].plot(y_test, np.polyval(z, y_test),  c='blue', linewidth=1)
    axes[0].plot(y_test, y_test, color='green', linewidth=1)
    axes[0].tick_params(axis='both', labelsize=10)
    axes[0].text(0.05, 0.95, target_names[i]+' (Test Data)', transform=axes[0].transAxes, fontsize = 20, color = clr[i])
    axes[0].text(0.05, 0.90, 'IQRP ={:.2f}'.format(iqrp_test), transform=axes[0].transAxes, fontsize = 16)
    axes[0].text(0.05, 0.85, 'RPD ={:.2f}'.format(rpd_test), transform=axes[0].transAxes, fontsize = 16)
    axes[0].text(0.05, 0.80, 'R2 ={:.2f}'.format(np.round(r2_test,3)), transform=axes[0].transAxes, fontsize = 16)
    axes[0].text(0.95, 0.15, 'Method: '+method, transform=axes[0].transAxes, 
                    horizontalalignment='right', fontsize = 20)
    
    if method == 'plsr':
        axes[0].text(0.95, 0.05, 'n_component={:.2f}'.format(n_com), transform=axes[0].transAxes, 
                    horizontalalignment='right', fontsize = 12)
    
    #---------------------------------- ---- PLOT of train-prediction --------------------------------------
#     yhat_tp.plot.scatter(ax= axes[1], x="actual", y="predic", alpha=0.8, color = clr[i], edgecolors='k')
#     axes[1].plot(y_train, np.polyval(zhat, y_train),  c='blue', linewidth=1)
#     axes[1].plot(y_train, y_train, color='green', linewidth=1)
#     axes[1].tick_params(axis='both', labelsize=10)
#     axes[1].text(0.05, 0.95,  target_names[i]+' (Training Data)', transform=axes[1].transAxes,fontsize = 20, color = clr[i])
#     axes[1].text(0.05, 0.90, 'IQRP ={:.2f}'.format(iqrp_train), transform=axes[1].transAxes, fontsize = 16)
#     axes[1].text(0.05, 0.85, 'RPD ={:.2f}'.format(rpd_train), transform=axes[1].transAxes, fontsize = 16)
#     axes[1].text(0.05, 0.80, 'R2 ={:.2f}'.format(np.round(r2_train,3)), transform=axes[1].transAxes, fontsize = 16)
#     axes[1].text(0.95, 0.15, 'Method: '+method, transform=axes[1].transAxes, 
#                     horizontalalignment='right', fontsize = 20)
    return    

In [50]:
ipywidgets.interact(plot_model_acc, target = target_names,target_preprocessing = prepare_target, \
                    method = ml_methods, spec_preprocessing = prepare_spec, n_bands = nbands_sampling)

interactive(children=(Dropdown(description='target', options=('sand', 'silt', 'clay', 'TOC'), value='sand'), D…

<function __main__.plot_model_acc(target, target_preprocessing, spec_preprocessing, n_bands, method)>

## Saving Mtree using pickle

In [56]:
with open ('uae_sampled_sat_l1out.pickle', 'wb') as file:
    pickle.dump(Mtree, file)

In [57]:
with open ('uae_sampled_sat_l1out.pickle', 'rb') as file:
    Mtree1 = pickle.load(file)

In [59]:
#Mtree1

In [51]:
person1 = {'name': 'Abhishek', 'age': '34'}

In [53]:
with open ('name_age.pickle', 'wb') as file:
    pickle.dump(person1, file)

In [54]:
with open ('name_age.pickle', 'rb') as file:
    data = pickle.load(file)

In [55]:
data

{'name': 'Abhishek', 'age': '34'}