In [1]:
# !pip install openpyxl
# !pip install imbalanced-learn
# !pip3 install ipympl
# !pip install shapely
# !pip install SciencePlots 
# !pip install seaborn
# !pip install tqdm
# !pip install ipywidgets

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots
import os, sys
from numpy import nan
import re
import ipympl
# from IPython.core.display import display, HTML
import ipywidgets
import json
from os import listdir
import glob
import math
from IPython.display import Image, display, HTML
from shapely.geometry import mapping
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedKFold, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error, precision_score, recall_score, mean_absolute_error, make_scorer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, LeaveOneOut, cross_validate
from sklearn.impute import SimpleImputer
from tqdm.notebook import tqdm
from sklearn.svm import SVC
from tqdm import tqdm
import pickle
from imblearn.over_sampling import SMOTE
from scipy.stats import ks_2samp
from scipy import stats
np.seterr(divide='ignore', invalid='ignore')
pd.options.display.max_columns = 100
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline
from scipy.signal import savgol_filter
from scipy.spatial import ConvexHull
from scipy.interpolate import interp1d

# 1. Data Preprocessing (Spectra)

## Smoothing Filter (First Order)


In [3]:
# Input spectra as features

def sgsmooth (spectrum,window):
    """Smooth every spectrum (row) with a moving average taken across columns.

    For each column position ``c`` the output holds the row-wise mean of the
    input columns in the half-open positional range ``[c - window, c + window)``,
    clipped to the available columns ``[0, n_cols)``. With ``window == 0`` the
    spectra are returned as an unmodified copy.

    Parameters
    ----------
    spectrum : pandas.DataFrame
        One spectrum per row, one band per column.
    window : int
        Half-width of the averaging window, in columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and columns as ``spectrum``.
        The input is not modified.
    """
    smoothed = spectrum.iloc[:, :].copy()
    n_rows, n_cols = spectrum.shape

    # A one-row frame only triggers this notice; processing still continues.
    if n_rows == 1:
        print('you should have given spectra as a dataframe -- not series')

    if window == 0:
        return smoothed.iloc[:, :].copy()

    for pos in range(n_cols):
        # Clip the window at both ends so edge bands average over whatever
        # neighbours exist instead of running off the frame.
        start = max(0, pos - window)
        stop = min(n_cols, pos + window)
        smoothed.iloc[:, pos] = spectrum.iloc[:, start:stop].mean(axis=1).copy()

    return smoothed.iloc[:, :].copy()





## Smoothing Filter (Savitzky-Golay, First and Second Order)

In [4]:
# -------------- Smoothing Spectra using sg1: (savgol order 1) and sg2: (savgol order 2)  -----------

#--INPUT: window_len (should be odd positive integer), filt_type (should be 'sg1' or 'sg2')


def filt_sg(spectra, window_len, filt_type):
    """Smooth spectra row-wise with a Savitzky-Golay filter.

    Parameters
    ----------
    spectra : pandas.DataFrame
        One spectrum per row, one band per column.
    window_len : int
        Filter window length (should be an odd positive integer). A value of
        0 or 1 disables smoothing and returns a plain copy of ``spectra``.
    filt_type : str
        ``'sg1'`` selects polynomial order 1; any other value (``'sg2'`` is the
        intended one) selects polynomial order 2.

    Returns
    -------
    pandas.DataFrame
        Smoothed spectra with the same column names and row index as the
        input. The input frame is not modified.
    """
    if window_len == 0 or window_len == 1:
        return spectra.copy()

    poly_order = 1 if filt_type == 'sg1' else 2

    # The filtered values must be returned: previously the filtered frame was
    # built and then discarded, so callers always got the unsmoothed copy.
    filtered = savgol_filter(np.asarray(spectra, dtype=float), window_len, poly_order, axis=1)
    return pd.DataFrame(filtered, columns=spectra.columns, index=spectra.index)

#--OUTPUT: smoothed spectra with same column names and row indices as original (input) spectra

## First Order Derivative

In [5]:
def fod (spectra):
    """Compute the first-order (forward) difference of each spectrum.

    Column ``i`` of the result is ``spectra[:, i + 1] - spectra[:, i]``. The
    last column has no right-hand neighbour, so it repeats the difference
    stored in the column before it (for a single-column frame the column is
    left as it is).

    Parameters
    ----------
    spectra : pandas.DataFrame
        One spectrum per row, one band per column.

    Returns
    -------
    pandas.DataFrame
        Frame of differences with the same shape, index and columns as the
        input. The input is not modified.
    """
    deriv = spectra.iloc[:, :].copy()
    n_cols = deriv.shape[1]
    last = n_cols - 1

    # Forward differences for every column that has a right-hand neighbour.
    for pos in range(last):
        deriv.iloc[:, pos] = spectra.iloc[:, pos + 1] - spectra.iloc[:, pos]

    # Pad the final column with the previous column's difference so the
    # output keeps the input width.
    if n_cols > 0:
        deriv.iloc[:, last] = deriv.iloc[:, last - 1]

    return deriv.copy()

## Continuum Removal

In [6]:
def continuum_removal(points, show=False):
    """Subtract a spectrum from its convex-hull continuum.

    Two helper points are added below the data minimum at the first and last
    x positions, the convex hull of the augmented set is computed, and the
    hull vertices that belong to the original points are kept (in index
    order) as the continuum. The continuum is interpolated at every x and the
    spectrum is subtracted from it.

    Parameters
    ----------
    points : numpy.ndarray
        Array of shape ``(n, 2)`` holding ``(x, y)`` pairs.
    show : bool, optional
        When true, plot the data with its continuum and the removed result.

    Returns
    -------
    numpy.ndarray
        ``continuum(x) - y`` for every input point.
    """
    xs, ys = points.T

    # Anchor points sit one unit below the lowest value so they close the
    # hull underneath the data without becoming part of the continuum.
    floor_level = np.min(ys) - 1
    anchors = [(xs[0], floor_level), (xs[-1], floor_level)]
    hull = ConvexHull(np.concatenate([points, anchors], axis=0))

    # Discard the two anchors (indices >= len(points)) and restore x order.
    kept = np.sort([v for v in hull.vertices if v < len(points)])
    continuum_points = points[kept]
    continuum_function = interp1d(*continuum_points.T)

    yprime = continuum_function(xs) - ys

    if show:
        fig, (top_ax, bottom_ax) = plt.subplots(2, 1, sharex=True)
        top_ax.plot(xs, ys, label='Data')
        top_ax.plot(*continuum_points.T, label='Continuum')
        top_ax.legend()
        # NOTE(review): the label reads as a ratio, but the plotted values are
        # the difference continuum - data.
        bottom_ax.plot(xs, yprime, label='Data / Continuum')
        bottom_ax.legend()

    return yprime


def continuum_removed(spectra):
    """Apply ``continuum_removal`` to every row of a spectra frame.

    The x coordinate of each spectrum is the column position ``0..n_cols-1``,
    not the column label.

    Parameters
    ----------
    spectra : pandas.DataFrame
        One spectrum per row, one band per column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``spectra`` whose rows are replaced by their continuum-removed
        values. The input is not modified.
    """
    cr_spec = spectra.copy()
    n_rows, n_cols = spectra.shape
    positions = np.arange(0, n_cols, 1)

    for row_pos in range(n_rows):
        # Pair each band position with the row's value to form (x, y) points.
        row_points = np.c_[positions, cr_spec.iloc[row_pos, :]]
        cr_spec.iloc[row_pos, :] = continuum_removal(row_points, show=False)

    return cr_spec

## Resampling (n_bands)

In [7]:
def resample_spectra (spectra, n_band):
    """Reduce spectra to ``n_band`` evenly spaced, smoothed bands.

    The columns are divided into ``n_band`` equal-width bins of
    ``width / n_band`` columns. The spectra are smoothed with ``sgsmooth``
    using half a bin width as the window, and the smoothed value at the centre
    of each bin is kept.

    Parameters
    ----------
    spectra : pandas.DataFrame
        One spectrum per row, one band per column.
    n_band : int
        Number of output bands. ``0`` returns an unmodified copy of ``spectra``.

    Returns
    -------
    pandas.DataFrame
        Frame with the same rows as ``spectra`` and the sampled columns (column
        labels are those of the sampled positions).
    """
    n_rows, width = spectra.shape
    if n_band == 0:
        return spectra.copy()

    bin_width = width / n_band

    # Centre of each bin as an integer column position. ``.iloc`` is defined
    # for integer positions, so the floored values are cast explicitly rather
    # than handed over as floats.
    sample_positions = [int(np.floor((i + 0.5) * bin_width)) for i in range(n_band)]

    # Smooth first so that each sampled value represents its whole bin.
    half_window = int(np.floor(0.5 * bin_width))
    smoothed = sgsmooth(spectra, half_window)

    return smoothed.iloc[:, sample_positions].copy()

# 2. Data Preprocessing (Target Variables)

## Z-score Normalization 

In [8]:
def z_score (target):
    """Standardise a target variable after clamping extreme values.

    Values at least four standard deviations away from the mean are replaced
    by the mean; the result is then centred and scaled with the mean and
    standard deviation computed *before* that replacement.

    Parameters
    ----------
    target : pandas.Series
        Values to normalise. The input is not modified.

    Returns
    -------
    pandas.Series
        ``(value - mean) / std`` for every (possibly clamped) value.
    """
    values = target.copy()
    centre = values.mean()
    spread = values.std()

    # Outlier handling: pull anything 4+ standard deviations out back to the mean.
    is_outlier = abs(values - centre) >= 4 * spread
    values.loc[is_outlier] = centre

    standardised = (values - centre) / spread
    return standardised.copy()
 

# udf = pd.read_csv('uae.csv')
# Y = lognormal (udf['TOC'].copy())
# plt.hist(udf['TOC']/2.5, bins=98)
# plt.show()
# plt.hist(Y, bins=98)
# plt.show()



## Standard Normalization (Log+MinMax)

In [9]:
def min_max_normal (target):
    """Rescale a target variable to the range 0-10 after clamping outliers.

    Steps: values at least four standard deviations from the mean are replaced
    by the mean; the series is shifted so its minimum equals 1; the shifted
    values are min-max scaled and multiplied by 10. The log transform that the
    shift prepares for is currently commented out.

    Parameters
    ----------
    target : pandas.Series
        Values to normalise. The input is not modified.

    Returns
    -------
    pandas.Series
        Rescaled values; the minimum maps to 0 and the maximum to 10.
    """
    values = target.copy()

    # Outlier handling: pull anything 4+ standard deviations out back to the mean.
    centre = values.mean()
    spread = values.std()
    values.loc[abs(values - centre) >= 4 * spread] = centre

    # Shift so the smallest value becomes 1.
    values = values + (1 - values.min())

    # apply log transform
    #values = np.log(values)

    # Min-max rescale onto [0, 10].
    lowest = values.min()
    highest = values.max()
    span = highest - lowest
    rescaled = ((values - lowest) * 10) / span
    return rescaled.copy()

## Calculating Pearson Correlation Coefficient

In [10]:
# Pearson correlation between each wavelength and the targets/outputs (i.e., sand, clay, silt, and TOC)

def find_rpval (spectra, tar):
    """Pearson correlation of every spectral band with a target variable.

    Parameters
    ----------
    spectra : pandas.DataFrame
        One spectrum per row, one band per column.
    tar : array-like
        Target values, one per row of ``spectra``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(r_val, p_val)``: two single-row frames that reuse the columns and
        the first index label of ``spectra`` and hold, per band, the Pearson
        correlation coefficient and its p-value.
    """
    _, n_bands = spectra.shape

    # The first row only serves as a template that carries the column labels.
    template = spectra.iloc[[0], :]
    r_val = template.copy()
    p_val = template.copy()

    for band in range(n_bands):
        coefficient, p_value = stats.pearsonr(tar, spectra.iloc[:, band])
        r_val.iloc[0, band] = coefficient
        p_val.iloc[0, band] = p_value

    return (r_val, p_val)


# 3. Train Test Split

In [11]:
# best_split finds random state and minimum error for best train test split

def best_split (X, y, tst_siz, n_bins=8, n_states=42):
    """Find the ``random_state`` whose train/test split best preserves the target distribution.

    For every seed ``0 .. n_states - 1`` the data is split with
    ``train_test_split``. The target values of both parts are histogrammed
    over the same ``n_bins`` bins spanning ``[y.min(), y.max()]``, and the bin
    counts are divided by the respective split fraction so they are
    comparable. The seed with the smallest summed absolute difference wins
    (ties keep the lowest seed).

    Parameters
    ----------
    X : array-like
        Features, passed through to ``train_test_split``.
    y : array-like
        Target values; must provide ``.min()`` and ``.max()``.
    tst_siz : float
        Test fraction. The train fraction is taken as ``1 - tst_siz``, so this
        is assumed to be a fraction strictly between 0 and 1.
    n_bins : int, optional
        Number of histogram bins (default 8).
    n_states : int, optional
        Number of seeds to try, starting at 0 (default 42). Should be at
        least 1; with 0 no split is evaluated and ``(0, None)`` is returned.

    Returns
    -------
    tuple
        ``(rand_st, min_err)``: the best seed and its cumulative error.
    """
    ymin = y.min()
    ymax = y.max()
    trn_siz = 1 - tst_siz

    rand_st = 0
    min_err = None

    for seed in range(n_states):
        _, _, y_train, y_test = train_test_split(X, y, test_size=tst_siz, random_state=seed)

        train_counts, _ = np.histogram(y_train, bins=n_bins, range=(ymin, ymax), density=False)
        test_counts, _ = np.histogram(y_test, bins=n_bins, range=(ymin, ymax), density=False)

        # Scale counts by the split fractions so both histograms refer to the
        # same notional sample size before comparing them.
        cum_error = abs(train_counts / trn_siz - test_counts / tst_siz).sum()

        # The first evaluated seed always becomes the baseline.
        if min_err is None or cum_error < min_err:
            min_err = cum_error
            rand_st = seed

    return (rand_st, min_err)


# 4. IQRP, RPD, R2, RMSE

In [12]:
def find_iqrp (Yp, Y):
    """Ratio of the observed interquartile range to the prediction RMSE.

    The quartiles are read from the sorted observations at positions
    ``floor(n / 4)`` and ``floor(3 * n / 4)``.

    Parameters
    ----------
    Yp : array-like
        Predicted values.
    Y : pandas.Series
        Observed values (needs ``sort_values``).

    Returns
    -------
    float
        ``(Q3 - Q1) / RMSE``.
    """
    rmse = np.sqrt(mean_squared_error(Yp, Y))

    # Sort and re-index so the quartile positions can be looked up by label.
    ordered = Y.copy().sort_values().reset_index(drop=True)
    n_obs = len(ordered)
    lower_pos = math.floor(n_obs / 4)
    upper_pos = math.floor(3 * n_obs / 4)

    return (ordered[upper_pos] - ordered[lower_pos]) / rmse

def find_rpd (Yp, Y):
    """Ratio of the standard deviation of the observations to the prediction RMSE.

    Parameters
    ----------
    Yp : array-like
        Predicted values.
    Y : array-like
        Observed values; must provide ``.std()``.

    Returns
    -------
    float
        ``Y.std() / RMSE``.
    """
    rmse = np.sqrt(mean_squared_error(Yp, Y))
    return Y.std() / rmse

def find_r2 (Yp, Y):
    """Coefficient of determination of predictions ``Yp`` against observations ``Y``.

    Note the argument order: predictions come first here, while ``r2_score``
    is called with the observations first.
    """
    return r2_score(Y, Yp)

def find_rmse(Yp, Y):
    """Root-mean-square error between predictions ``Yp`` and observations ``Y``."""
    mse = mean_squared_error(Y, Yp)
    return np.sqrt(mse)

    