# Importing Header, SoilPrep, and PlotFunctions

In [18]:
import import_ipynb 
from Header import *

import import_ipynb
from SoilPrep import *

In [19]:
import import_ipynb
from PlotFunctions import *

In [20]:
# import import_ipynb
# from HyperMain import * 

## Loading  Mtree 

In [21]:
with open ('HMtree.pickle', 'rb') as file:
    HMtree = pickle.load(file)
 

 #  Best Models for Sand, Silt, Clay,TOC and CaCO3.

##  Sand 

### GBRT

Sand -> [ r2 - **0.35**, Spec - fod, n_bands -11]     <br/><br/>
***Description***: To obtain the best model for sand using gradient boosting regression tree method we trained the model on the first order derivative of the  smoothed spectra which was  resampled to the 11 bands provided by Venus.

In [23]:
sand_model_gbrt = GradientBoostingRegressor(learning_rate = 0.1, max_depth = 3, n_estimators = 30, random_state = 42) 
sand_model_gbrt.fit(fod_sampled11,  T[0])

### Kernel Ridge Regression

Sand -> [ r2 - **0.35**, Spec - log, n_bands -11]

Description: To obtain the best model for sand using kernel ridge regression method we trained the model on the logarithmic transformed spectra which was resampled to the 11 bands provided by Venus.

In [24]:
sand_model_kernelridge = KernelRidge(alpha = 0.1) 
sand_model_kernelridge.fit(sampled_log11,  T[0])

## Silt

### GBRT

Silt -> [ r2 - **0.34**, Spec - fod, n_bands - 11]   <br/><br/>
***Description***:To obtain the best model for silt using gradient boosting regression tree method we trained the model on the first order derivative of the smoothed spectra which was resampled to the 11 bands provided by Venus.


In [25]:
silt_model_gbrt = GradientBoostingRegressor(learning_rate = 0.1, max_depth = 3, n_estimators = 30, random_state = 42) 
silt_model_gbrt.fit(fod_sampled11,  T[1])

###  Support Vector Regression 

Silt -> [ r2 - **0.22**, Spec - log, n_bands - 11]   <br/><br/>
***Description***:To obtain the best model for silt using svr method we trained the model on the logarithmic transformed spectra which was resampled to the 11 bands provided by Venus.


In [26]:
silt_model_svr = SVR(C = 10, gamma = 1, kernel = 'rbf')
silt_model_svr.fit(sampled_log11,  T[1])

## Clay 

### PLSR

Clay -> [ r2 - **0.49**, Spec - log, n_bands - 11]   <br/><br/>
***Description***:To obtain the best model for Clay using plsr method we trained the model on the logarithmic transformed spectra which was resampled to the 11 bands provided by Venus.


In [27]:
clay_model_plsr = PLSRegression(n_components=4, scale=True)
clay_model_plsr.fit(sampled_log11,  T[2])

In [28]:
from sklearn.cross_decomposition import PLSRegression

# Fit PLSR model
clay_model_plsr = PLSRegression(n_components=4, scale=True)
clay_model_plsr.fit(sampled_log11,  T[2]) # X is your predictor variables, y is your response variable

# Access loadings
loadings_clay = clay_model_plsr.x_loadings_
print(loadings_clay)


[[ 0.27567221  0.58356036  0.11510752 -0.53132086]
 [ 0.28481812  0.54343697  0.08533526 -0.1526397 ]
 [ 0.2959181   0.47821452  0.09862668  0.26857369]
 [ 0.31788823  0.28502757 -0.10292882  0.73627816]
 [ 0.33087477 -0.03594014 -0.43090598  0.15085951]
 [ 0.33063984 -0.07583983 -0.36268175  0.04602635]
 [ 0.33010162 -0.10676229 -0.29067039 -0.03953149]
 [ 0.32934031 -0.13514646 -0.17542099 -0.12427914]
 [ 0.32952689 -0.13486746 -0.00310792 -0.1318811 ]
 [ 0.32969903 -0.09394362  0.45118027 -0.11198128]
 [ 0.32839336 -0.08639971  0.6577016  -0.16553547]]


In [29]:
# Spectral bands
mid_locs = ['420', '443', '490', '555', '638', '672', '702', '742', '782', '865', '910']

# Identify most important bands for each component
num_components = len(loadings_clay[0])
for component in range(num_components):
    component_loadings = loadings_clay[:, component]
    absolute_loadings = [abs(loading) for loading in component_loadings]
    most_important_indices = sorted(range(len(absolute_loadings)), key=lambda i: absolute_loadings[i], reverse=True)[:4]
    most_important_bands = [mid_locs[i] for i in most_important_indices]
    print(f"Component {component + 1}: Most important bands are {most_important_bands}")


Component 1: Most important bands are ['638', '672', '702', '865']
Component 2: Most important bands are ['420', '443', '490', '555']
Component 3: Most important bands are ['910', '865', '638', '672']
Component 4: Most important bands are ['555', '420', '490', '910']


### Cubist

Clay -> [ r2 - **0.43**, Spec - fod, n_bands - 11]   <br/><br/>
***Description***:To obtain the best model for clay using cubist method we trained the model on the first order derivative  of the smoothed spectra which was  resampled to the 11 bands provided by Venus.


In [30]:
clay_model_cubist = Cubist(n_committees = 10, n_rules = 10)
clay_model_cubist.fit(fod_sampled11,  T[2])

  x = x.applymap(lambda a: a.lstrip())


## TOC

### Cubist

Clay -> [ r2 - **0.50**, Spec - fod_log, n_bands - 11]   <br/><br/>
***Description***:To obtain the best model for TOC using cubist method we trained the model on the first order derivative  of logarithmic tranformed smoothed spectra which was  resampled to the 11 bands provided by Venus.


In [31]:
TOC_model_cubist = Cubist(n_committees = 20, n_rules = 10)
TOC_model_cubist.fit(fod_log11,  T[3])

  x = x.applymap(lambda a: a.lstrip())


###  Support Vector Regression 

TOC -> [ r2 - **0.42**, Spec - log, n_bands - 11]   <br/><br/>
***Description***:To obtain the best model for TOC using svr method we trained the model on the logarithmic transformed spectra which was resampled to the 11 bands provided by Venus.


In [32]:
TOC_model_svr = SVR(C = 10, gamma = 1, kernel = 'rbf')
TOC_model_svr.fit(sampled_log11,  T[3])

## CaCO3

### Cubist

CaCO3 -> [ r2 - **0.60**, Spec - fod_log, n_bands - 11]   <br/><br/>
***Description***:To obtain the best model for CaCO3 using cubist method we trained the model on the first order derivative  of logarithmic tranformed smoothed spectra which was  resampled to the 11 bands provided by Venus.


In [33]:
CaCO3_model_cubist = Cubist(n_committees = 5, n_rules = 10)
CaCO3_model_cubist.fit(fod_log11,  T[4])

  x = x.applymap(lambda a: a.lstrip())


### PLSR

CaCO3 -> [ r2 - **0.55**, Spec - fod_log, n_bands - 11]   <br/><br/>
***Description***:To obtain the best model for CaCO3 using plsr method we trained the model on the first order derivative of the logarithmic transformed smoothed spectra which was resampled to the 11 bands provided by Venus.


In [34]:
CaCO3_model_plsr = PLSRegression(n_components=10, scale=True)
CaCO3_model_plsr.fit(fod_log11,  T[4])

In [35]:
from sklearn.cross_decomposition import PLSRegression

# Fit PLSR model
CaCO3_model_plsr = PLSRegression(n_components=10, scale=True)
CaCO3_model_plsr.fit(sampled_log11,  T[4]) # X is your predictor variables, y is your response variable

# Access loadings
loadings_CaCO3 = CaCO3_model_plsr.x_loadings_
print(loadings_CaCO3)


[[ 2.92674161e-01  4.48662616e-01  9.38740974e-02 -1.28281102e+00
   5.60856703e-01 -1.95428106e-01 -5.07645584e-02 -4.80728952e-01
   9.35245329e-04  5.14109219e-02]
 [ 2.99374916e-01  4.01413174e-01 -2.17442916e-02 -3.51942165e-01
   1.65711960e-01  1.62225383e-01  2.10720072e-01  7.87707526e-01
   6.81354347e-02 -4.93415644e-02]
 [ 3.06693977e-01  3.28018494e-01 -1.05574018e-01  6.32448799e-01
  -4.88242646e-01  1.99896052e-01 -2.53045121e-01 -2.55452015e-01
  -1.83258972e-01 -6.51334478e-02]
 [ 3.17921924e-01  1.15135816e-01 -3.41287022e-01  1.84663716e+00
  -4.64667992e-01  1.11435595e-02  1.10536422e-02 -1.08290591e-01
   2.63554660e-01  1.40288562e-01]
 [ 3.14174669e-01 -2.22213321e-01 -3.55512068e-01 -2.87635204e-01
  -1.11337516e-02 -6.04411169e-01  6.78419419e-01  1.53532291e-01
  -1.01957570e+00 -3.80635162e-01]
 [ 3.11973175e-01 -2.61529418e-01 -2.58593962e-01 -4.48479609e-01
   4.97019939e-02 -2.93101212e-01 -2.99410123e-01 -9.65883646e-03
   6.90532605e-01 -5.23383329e-02

In [17]:
# Spectral bands
mid_locs = ['420', '443', '490', '555', '638', '672', '702', '742', '782', '865', '910']

# Identify most important bands for each component
num_components = len(loadings_CaCO3[0])
for component in range(num_components):
    component_loadings = loadings_CaCO3[:, component]
    absolute_loadings = [abs(loading) for loading in component_loadings]
    most_important_indices = sorted(range(len(absolute_loadings)), key=lambda i: absolute_loadings[i], reverse=True)[:10]
    most_important_bands = [mid_locs[i] for i in most_important_indices]
    print(f"Component {component + 1}: Most important bands are {most_important_bands}")

Component 1: Most important bands are ['555', '638', '672', '865', '702', '910', '782', '742', '490', '443']
Component 2: Most important bands are ['420', '443', '490', '742', '782', '702', '865', '672', '910', '638']
Component 3: Most important bands are ['910', '865', '638', '555', '672', '702', '490', '782', '420', '742']
Component 4: Most important bands are ['555', '420', '490', '910', '702', '742', '672', '865', '443', '638']
Component 5: Most important bands are ['420', '490', '555', '742', '910', '782', '443', '702', '865', '672']
Component 6: Most important bands are ['638', '782', '742', '910', '672', '490', '420', '443', '702', '865']
Component 7: Most important bands are ['638', '910', '865', '702', '672', '490', '443', '742', '782', '420']
Component 8: Most important bands are ['443', '420', '490', '865', '910', '638', '555', '702', '742', '672']
Component 9: Most important bands are ['638', '865', '910', '672', '702', '782', '742', '555', '490', '443']
Component 10: Most 