In [1]:
import import_ipynb 
from HeaderFile import *

import import_ipynb
from SoilPrep import *

importing Jupyter notebook from HeaderFile.ipynb
importing Jupyter notebook from SoilPrep.ipynb


In [2]:
with open ('meta_data.pickle', 'rb') as file:
    MetaData = pickle.load(file)

sg_filters = MetaData['sg_filters']
window_lengths = MetaData['window_lengths']
prepare_spec = MetaData['prepare_spec']
nbands_sampling = MetaData['nbands_sampling']
target_names = MetaData['target_names']
prepare_target = MetaData['prepare_target']
ml_methods = MetaData['ml_methods']


with open ('data.pickle', 'rb') as file:
    Data = pickle.load(file)

spectra = Data['spectra']
T = Data['T']
NT = Data['NT']
smth_spec = Data['smth_spec']
fod_spec = Data['fod_spec']
cr_spec = Data['cr_spec'] 
log_spec = Data['log_spec']
sampled_spec = Data['sampled_spec']
sampled_cr = Data['sampled_cr']
sampled_fod = Data['sampled_fod']
sampled_log = Data['sampled_log'] 
rand_t = Data['rand_t']
rand_nt = Data['rand_nt'] 


with open ('Mtree.pickle', 'rb') as file:
    Mtree = pickle.load(file)

 #  Models for Sand, Silt, Clay, TOC, and CaCO3..

##  Sand (Best 3 Models)

Sand -> [ iqrp - **3.05**, r2 - **0.69**, Spec - none, n_bands - 50, Tar - none] :   **cubist**  (Best Model) <br/><br/>
***Description***: We get the best model for sand when we use cubist method. \
While training the model we utilize the smoothed spectra resampled to 50 bands and
sand values are used without any preprocessing.

In [3]:
sand_model1_cubist = Cubist(n_rules = 50, n_committees = 5, random_state = 42) 
sand_model1_cubist.fit(sampled_spec[50], T[0])

Cubist(n_committees=5, n_rules=50, random_state=42)

Sand-> [ iqrp - **2.99**, r2 - **0.67**, Spec - fod, n_bands - 30, Tar - minmax] : **gbrt**   (Second Best) <br/><br/>
***Description*** We get the second best model for sand when we use gbrt (gradient boosted regression tree) method. \
While training the model we utilize the first order derivative of the smoothed spectra which is resampled to 30 bands 
and sand values are used by  normalizing it in the range of 0-10.

In [4]:
sand_model2_gbrt = GradientBoostingRegressor() 
sand_model2_gbrt.fit(sampled_fod[30], NT[0])

GradientBoostingRegressor()

Sand-> [ iqrp - **2.75**, r2 -**0.62**, Spec - fod, n_bands - 25, Tar - none]: **mult**  (Third Best) <br/><br/>
***Description*** We get the third best model for sand when we use mult (multiple linear regression) method. \
While training the model we utilize the first order derivative of the smoothed spectra which is resampled to 25 bands and sand values are used without any preprocessing.

In [5]:
sand_model3_mult = linear_model.LinearRegression()
sand_model3_mult.fit(sampled_fod[25], T[0])

LinearRegression()

##  Silt (Best  Model)

Silt -> [ iqrp - **1.89**, r2 - **0.41**, Spec - fod, n_bands - 25, Tar - minmax] :   **plsr**  (Best Model) <br/><br/>
***Description***: We get the best model for silt when we use plsr method. \
While training the model we utilize the first order derivative of the smoothed spectra which is resampled to 25 bands and
silt values are used by normalizing it in the range of 0-10.

In [6]:
silt_model_plsr = PLSRegression(n_components= 4, scale=True)
silt_model_plsr.fit(sampled_fod[25], NT[1])

PLSRegression(n_components=4)

##  Clay (Best 3 Models)

Clay -> [ iqrp - **2.99**, r2 - **0.77**, Spec - log, n_bands - 15, Tar - minmax] :   **mult**  (Best Model) <br/><br/>
***Description***: We get the best model for clay when we use mult (multiple linear regression) method. \
While training the model we utilize the inverse logarithmic reflectance (1/logR) of the smoothed spectra which is resampled to 15 bands and
clay values are used by  normalizing it in the range of 0-10.

In [7]:
clay_model1_mult = linear_model.LinearRegression()
clay_model1_mult.fit(sampled_log[15], NT[2])

LinearRegression()

Clay-> [ iqrp - **2.75**, r2 - **0.73**, Spec - continuum, n_bands - 15, Tar - minmax] : **gbrt**   (Second Best) <br/><br/>
***Description*** We get the second best model for clay when we use gbrt (gradient boosted regression tree) method. \
While training the model we utilize the continuum removed version of the smoothed spectra which is resampled to 15 bands 
and clay values are used by  normalizing it in the range of 0-10.

In [8]:
clay_model2_gbrt = GradientBoostingRegressor() 
clay_model2_gbrt.fit(sampled_cr[15], NT[2])

GradientBoostingRegressor()

Clay -> [ iqrp - **2.46**, r2 - **0.66**, Spec - none, n_bands - 40, Tar - none] :   **cubist**  (Best Model) <br/><br/>
***Description***: We get the third best model for clay when we use cubist method. \
While training the model we utilize the smoothed spectra resampled to 40 bands and
clay values are used without any preprocessing.

In [9]:
clay_model3_cubist = Cubist(n_rules = 50, n_committees = 5, random_state = 42) 
clay_model3_cubist.fit(sampled_spec[40], T[2])

Cubist(n_committees=5, n_rules=50, random_state=42)

##  TOC (Best 3 Models)

TOC -> [ iqrp - **2.21**, r2 - **0.75**, Spec - continuum, n_bands - 30, Tar - minmax] :   **gbrt**  (Best Model) <br/><br/>
***Description***: We get the best model for TOC when we use gbrt (gradient boosted regression tree) method. \
While training the model we utilize the continuum removed version of the smoothed spectra which is resampled to 30 bands and
TOC values are used by  normalizing it in the range of 0-10.

In [10]:
TOC_model1_gbrt = GradientBoostingRegressor() 
TOC_model1_gbrt.fit(sampled_cr[30], NT[3])

GradientBoostingRegressor()

TOC-> [ iqrp - **2.02**, r2 - **0.71**, Spec - continuum, n_bands - 30, Tar - minmax] : **randomforest**   (Second Best) <br/><br/>
***Description*** We get the second best model for TOC when we use random forest method. \
While training the model we utilize the continuum removed version of the smoothed spectra which is resampled to 30 bands 
and TOC values are used by  normalizing it in the range of 0-10.

In [11]:
TOC_model2_randomforest =RandomForestRegressor(random_state= 23)   
TOC_model2_randomforest.fit(sampled_cr[30], NT[3])

RandomForestRegressor(random_state=23)

TOC-> [ iqrp - **2.21**, r2 - **0.69**, Spec - log, n_bands - 35, Tar - none] : **cubist**   (Third Best) <br/><br/>
***Description*** We get the third best model for TOC when we use cubist method. \
While training the model we utilize the inverse logarithmic reflectance (1/logR) of the smoothed spectra which is resampled to 35 bands and TOC values are used without any preprocessing.

In [12]:
TOC_model3_cubist = Cubist(n_rules = 50, n_committees = 5, random_state = 42) 
TOC_model3_cubist.fit(sampled_log[35], T[3])

Cubist(n_committees=5, n_rules=50, random_state=42)

##  CaCO3 (Best 3 Models)

CaCO3 -> [ iqrp - **2.27**, r2 - **0.58**, Spec - fod, n_bands - 20, Tar - none] :   **plsr**  (Best Model) <br/><br/>
***Description***: We get the best model for CaCO3 when we use plsr method. \
While training the model we utilize the first order derivative of the smoothed spectra which is resampled to 20 bands and
CaCO3 values are used without any preprocessing.

In [13]:
CaCO3_model1_plsr = PLSRegression(n_components= 3, scale=True)
CaCO3_model1_plsr.fit(sampled_fod[20], T[4])

PLSRegression(n_components=3)

CaCO3-> [ iqrp - **2.18**, r2 - **0.55**, Spec - fod, n_bands - 10, Tar - minmax] : **randomforest**   (Second Best) <br/><br/>
***Description*** We get the second best model for CaCO3 when we use random forest method. \
While training the model we utilize the first order derivative of the smoothed spectra which is resampled to 10 bands 
and CaCO3 values are used by  normalizing it in the range of 0-10.

In [14]:
CaCO3_model2_randomforest =RandomForestRegressor(random_state= 23)   
CaCO3_model2_randomforest.fit(sampled_fod[10], NT[4])

RandomForestRegressor(random_state=23)

CaCO3 -> [ iqrp - **2.14**, r2 - **0.53**, Spec - continuum, n_bands - 30, Tar - none] :   **gbrt**  (Third Best) <br/><br/>
***Description***: We get the third best model for CaCO3 when we use gbrt (gradient boosted regression tree) method. \
While training the model we utilize the continuum removed version of the smoothed spectra which is resampled to 30 bands and
CaCO3 values are used without any preprocessing.

In [15]:
CaCO3_model3_gbrt = GradientBoostingRegressor() 
CaCO3_model3_gbrt.fit(sampled_cr[30], T[4])

GradientBoostingRegressor()