## Import All Libraries

In [None]:
#basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['font.family'] = 'Times New Roman'

#import time libraries
import copy
from time import time
from tqdm import tqdm
import sys

#import rest libraries
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn import datasets, linear_model
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor


#import ABC libraries
from Hive import Hive
from Hive import Utilities

plt.rcParams.update({'font.size': 22})


## Dataset Preparation

In [None]:
# Load raw dataset
dataset = pd.read_csv('data/dataset_bisbul_flavonoid.csv')

# Slice dataset into input (X, wavelength features) and output (Y, flavonoid value).
# The target is the single column that directly follows the feature columns;
# it is kept as a one-column DataFrame (not a Series).
num_features = 224
X = dataset.iloc[:, :num_features]
Y = dataset.iloc[:, num_features:num_features + 1]
# Alternative target column (disabled): dataset.iloc[:, 225:226]
wavelengths = X.columns.values.tolist()

# Cross-validation setup: one fold count drives the splitter and the score arrays.
# NOTE(review): shuffle=True without random_state means the folds differ on
# every run -- pass a seed if reproducible scores are wanted.
n_fold = 5
fold_num = n_fold
k_fold = KFold(n_splits=n_fold, shuffle=True)

# Mean test RMSE of each model, appended by the score-table cells
rmse_ori = []
max_components = num_features - 1

# Per-fold score buffers; every model section overwrites them in place
rmse_train = np.zeros(n_fold)
rmse_test  = np.zeros(n_fold)
r2_train   = np.zeros(n_fold)
r2_test    = np.zeros(n_fold)


## Regression with PLS and Optimization via ABC

## 1. Basic PLSR Regression With Few Components

In [None]:
# =====================================================
# CORE METHOD
# -------------
# REGRESSION WITH PLS & PARAMETER OPTIMIZATION VIA ABC
# =====================================================
st = time()

# Step 1. Cross-validated PLSR on all wavelengths with a fixed component count
opt_components = 50

for k, (train, test) in enumerate(k_fold.split(X, Y)):
    X_train, Y_train = X.iloc[train], Y.iloc[train]
    X_test, Y_test = X.iloc[test], Y.iloc[test]

    # A fresh model per fold; after the loop PLSR_ALL is the last fold's model
    PLSR_ALL = PLSRegression(n_components=opt_components)
    PLSR_ALL.fit(X_train, Y_train)

    Y_train_meas_ALL = Y_train
    Y_train_pred_ALL = PLSR_ALL.predict(X_train)
    Y_test_meas_ALL = Y_test
    Y_test_pred_ALL = PLSR_ALL.predict(X_test)

    rmse_train[k] = np.sqrt(mse(Y_train_meas_ALL, Y_train_pred_ALL))
    r2_train[k] = r2_score(Y_train_meas_ALL, Y_train_pred_ALL)
    rmse_test[k] = np.sqrt(mse(Y_test_meas_ALL, Y_test_pred_ALL))
    r2_test[k] = r2_score(Y_test_meas_ALL, Y_test_pred_ALL)

# In-sample predictions of the last fold (no cross_val_predict is run here)
Y_cv = Y_train_pred_ALL

# Coefficients B of the last fold's model; the ABC section builds its
# search limits around these values
coef_plsr_ALL = PLSR_ALL.coef_

end_time = time() - st





In [None]:
# One row per fold, one column per metric
score_table = pd.DataFrame({
    'RMSE_train': rmse_train,
    'RMSE_test': rmse_test,
    'R2_train': r2_train,
    'R2_test': r2_test,
})

# Keep the mean test RMSE of this model for later comparison
rmse = np.average(rmse_test)
rmse_ori.append(rmse)

# Column-wise mean appended as a final row labelled 'Average'
mean_score = score_table.mean().to_frame('Average').transpose()
score_table = pd.concat([score_table, mean_score])
score_table




In [None]:
# Make predictions and determine the error.
# X_test / Y_test are whatever the most recent cross-validation loop left
# behind, i.e. a single fold, not the whole data set.
measured = np.ravel(Y_test)
predictions = np.ravel(PLSR_ALL.predict(X_test))
errors = np.abs(predictions - measured)

# Display the performance metrics (flattened to 1-D above so the means are
# plain scalars rather than per-column pandas objects)
print('Mean Absolute Error:', round(float(np.mean(errors)), 2), 'mg/ml.')
mape = float(np.mean(100 * (errors / measured)))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
#-- Plot Training Data Only (Without Wavelength Selection)
# The plotted arrays are those left by the last cross-validation fold.
plt.title('PLSR Measured vs. Predicted Value (n = 50)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')

# 1:1 reference line (measured == predicted)
gradien_x = list(range(0,1500))
gradien_y = list(range(0,1500))

plt.scatter(Y_train_meas_ALL, Y_train_pred_ALL, label = 'Training Data',color = 'red')
plt.plot(gradien_x,gradien_y,color='black',dashes=[6, 2])
plt.legend(loc = 'upper right')

# Same 0-500 window on both axes
plt.xlim([0, 500])
plt.ylim([0, 500])
plt.gcf().set_size_inches(12.5, 10)
plt.grid()
plt.show()


In [None]:
#-- Plot Testing Data Only (ALL wavelengths)
# The plotted arrays are those left by the last cross-validation fold.
plt.title('PLSR Measured vs. Predicted Value (n=50)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')

# 1:1 reference line (measured == predicted)
gradien_x = list(range(0,1500))
gradien_y = list(range(0,1500))

plt.scatter(Y_test_meas_ALL, Y_test_pred_ALL, label = 'Test Data',color = 'blue')
plt.plot(gradien_x,gradien_y,color='black',dashes=[6, 2])
plt.legend(loc = 'upper right')

# Same 0-500 window on both axes
plt.xlim([0, 500])
plt.ylim([0, 500])
plt.gcf().set_size_inches(12.5, 10)
plt.grid()
plt.show()



In [None]:
#-- Plot Training and Testing Data Together (ALL wavelengths)
# The plotted arrays are those left by the last cross-validation fold.
plt.title('PLSR Measured vs. Predicted Value (n=50)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')

# 1:1 reference line (measured == predicted)
gradien_x = list(range(0,1500))
gradien_y = list(range(0,1500))

plt.scatter(Y_train_meas_ALL, Y_train_pred_ALL, label = 'Training Data',color = 'red')
plt.scatter(Y_test_meas_ALL, Y_test_pred_ALL, label = 'Test Data',color = 'blue')
plt.plot(gradien_x,gradien_y,color='black',dashes=[6, 2])
plt.legend(loc = 'upper right')

# Same 0-500 window on both axes
plt.xlim([0, 500])
plt.ylim([0, 500])
plt.gcf().set_size_inches(12.5, 10)
plt.grid()
plt.show()




## 2. ARTIFICIAL BEE COLONY OPTIMIZATION

In [None]:
# Objective function handed to the ABC optimizer

def R2_abc_ALL(B):
    """Score a candidate PLSR coefficient vector with R^2.

    Parameters
    ----------
    B : sequence of float
        Candidate regression coefficients, ``num_features`` values in total.

    Returns
    -------
    float
        R^2 between the measured targets and ``x_train @ B + y_mean``,
        evaluated on the leading 80 % of the rows of ``X`` / ``Y``.

    Notes
    -----
    Reads the module-level ``dataset``, ``X``, ``Y``, ``num_features`` and the
    fitted ``PLSR_ALL`` model (for its centring/scaling statistics).
    """
    # Column vector, so the dot product yields one prediction per sample
    B = np.reshape(np.array(B), (num_features, 1))

    # Leading 80 % of the rows, taken in file order (no shuffling)
    length_to_split = int(len(dataset) * 0.8)
    X_train = X.iloc[:length_to_split]
    Y_train = Y.iloc[:length_to_split]

    # Standardise the inputs with the statistics stored on the fitted model
    x_train = (X_train - PLSR_ALL.x_mean_) / PLSR_ALL.x_std_

    # NOTE(review): a larger R^2 is better; if the optimizer minimises `fun`
    # this should probably return the negated score or an RMSE -- confirm.
    return r2_score(Y_train, np.dot(x_train, B) + PLSR_ALL.y_mean_)


In [None]:
# Step 4. Optimization of PLSR w/ Selected Wavelengths via ABC
# Prepare ABC
n_bees = 50
iter_count = 1
lr = 0.0001   # relative half-width of the search window around each coefficient
st = time()

# Search window for ALL wavelengths: every fitted PLSR coefficient b may move
# within [b - |b|*lr, b + |b|*lr].  The coefficient array is flattened first so
# the limits are plain per-feature floats regardless of its 2-D orientation.
coef_flat = np.ravel(coef_plsr_ALL)
lower_lim_ALL = [float(b - abs(b) * lr) for b in coef_flat]
upper_lim_ALL = [float(b + abs(b) * lr) for b in coef_flat]

# NOTE(review): R2_abc_ALL returns R^2 (higher is better); confirm that this
# matches the direction in which BeeHive optimises `fun`.
abc_model = Hive.BeeHive(lower = lower_lim_ALL,
                         upper = upper_lim_ALL,
                         fun = R2_abc_ALL,
                         numb_bees = n_bees,
                         max_itrs = iter_count)

# Optimize coefficient/parameters value B
cost = abc_model.run()
rmse_optimized_ALL = abc_model.best        # best objective value found (an R^2 here, despite the name)
coef_optimized_ALL = abc_model.solution

# Copy of the fitted PLSR model that predicts with the ABC coefficients.
# It must NOT be re-fitted afterwards: fit() recomputes coef_ and would
# silently throw the optimised values away.
PLSR_ABC_ALL = copy.deepcopy(PLSR_ALL)
PLSR_ABC_ALL.coef_ = np.reshape(coef_optimized_ALL, np.shape(PLSR_ALL.coef_))

# NOTE(review): PLSR_ALL was fitted on one earlier fold and the objective
# scores the leading 80 % of the rows, so the folds below overlap data the
# coefficients have already seen; the "test" scores are therefore optimistic.
for k, (train, test) in enumerate(k_fold.split(X, Y)):
    X_train, Y_train = X.iloc[train], Y.iloc[train]
    X_test, Y_test = X.iloc[test], Y.iloc[test]

    Y_train_meas_ABC = Y_train
    Y_train_pred_ABC = PLSR_ABC_ALL.predict(X_train)
    Y_test_meas_ABC = Y_test
    Y_test_pred_ABC = PLSR_ABC_ALL.predict(X_test)

    rmse_train[k] = np.sqrt(mse(Y_train_meas_ABC, Y_train_pred_ABC))
    r2_train[k] = r2_score(Y_train_meas_ABC, Y_train_pred_ABC)
    rmse_test[k] = np.sqrt(mse(Y_test_meas_ABC, Y_test_pred_ABC))
    r2_test[k] = r2_score(Y_test_meas_ABC, Y_test_pred_ABC)
end_time = time()-st

In [None]:
# Elapsed wall-clock seconds (time() - st) of the last timed cell that was run
end_time

In [None]:
# Collect the per-fold scores column by column (rows = folds)
fold_scores = {
    'RMSE_train': rmse_train,
    'RMSE_test': rmse_test,
    'R2_train': r2_train,
    'R2_test': r2_test,
}
score_table = pd.DataFrame(fold_scores)

# Mean test RMSE goes into the running model-comparison list
rmse = np.average(rmse_test)
rmse_ori.append(rmse)

# Add the per-metric mean as an extra row named 'Average'
mean_score = score_table.mean().to_frame('Average').transpose()
score_table = pd.concat([score_table, mean_score])
score_table




In [None]:
#-- Plot Training Data Only (PLSR-ABC, ALL wavelengths)
# The plotted arrays are those left by the last cross-validation fold.
plt.title('PLSR ABC Measured vs. Predicted Value (n = 50)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')

# 1:1 reference line (measured == predicted)
gradien_x = list(range(0,1500))
gradien_y = list(range(0,1500))

plt.scatter(Y_train_meas_ABC, Y_train_pred_ABC, label = 'Training Data',color = 'red')
plt.plot(gradien_x,gradien_y,color='black',dashes=[6, 2])
plt.legend(loc = 'upper right')

# Same 0-500 window on both axes
plt.xlim([0, 500])
plt.ylim([0, 500])
plt.gcf().set_size_inches(12.5, 10)
plt.grid()
plt.show()




In [None]:
#-- Plot Testing Data Only (PLSR-ABC, ALL wavelengths)
# The plotted arrays are those left by the last cross-validation fold.
plt.title('PLSR-ABC Measured vs. Predicted Value (n=50)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')

# 1:1 reference line (measured == predicted)
gradien_x = list(range(0,1500))
gradien_y = list(range(0,1500))

plt.scatter(Y_test_meas_ABC, Y_test_pred_ABC, label = 'Test Data',color = 'blue')
plt.plot(gradien_x,gradien_y,color='black',dashes=[6, 2])
plt.legend(loc = 'upper right')

# Same 0-500 window on both axes
plt.xlim([0, 500])
plt.ylim([0, 500])
plt.gcf().set_size_inches(12.5, 10)
plt.grid()
plt.show()





In [None]:
#-- Plot Training and Testing Data Together (PLSR-ABC, ALL wavelengths)
# The plotted arrays are those left by the last cross-validation fold.
plt.title('PLSR-ABC Measured vs. Predicted Value (n=50)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')

# 1:1 reference line (measured == predicted)
gradien_x = list(range(0,1500))
gradien_y = list(range(0,1500))

plt.scatter(Y_train_meas_ABC, Y_train_pred_ABC, label = 'Training Data',color = 'red')
plt.scatter(Y_test_meas_ABC, Y_test_pred_ABC, label = 'Test Data',color = 'blue')
plt.plot(gradien_x,gradien_y,color='black',dashes=[6, 2])
plt.legend(loc = 'upper right')

# Same 0-500 window on both axes
plt.xlim([0, 500])
plt.ylim([0, 500])
plt.gcf().set_size_inches(12.5, 10)
plt.grid()
plt.show()






## 3. RANDOM FOREST ALL WAVELENGTHS REGRESSION

In [None]:
# Re-read the raw table: the 'Flavonoid' column is the regression target and
# every remaining column is used as an input feature.
# NOTE(review): verify the file holds no second target column after the
# wavelength columns -- it would end up among the inputs here.
raw_table = pd.read_csv('data/dataset_bisbul_flavonoid.csv')
labels = raw_table['Flavonoid']
features = raw_table.drop(columns='Flavonoid')


In [None]:
st = time()

# One 1000-tree forest with a fixed seed; the same estimator object is
# re-fitted on every fold, so afterwards RF_ALL holds the last fold's fit.
RF_ALL = RandomForestRegressor(n_estimators = 1000, random_state = 12121323)
for k, (train, test) in enumerate(k_fold.split(features, labels)):
    X_train, Y_train = features.iloc[train], labels.iloc[train]
    X_test, Y_test = features.iloc[test], labels.iloc[test]
    RF_ALL.fit(X_train, Y_train)

    Y_train_meas_RF_ALL = Y_train
    # Predict the training set once and share the result under both names
    Y_train_pred_RF_ALL = pred_train = RF_ALL.predict(X_train)
    Y_test_meas_RF_ALL = Y_test
    Y_test_pred_RF_ALL = RF_ALL.predict(X_test)

    rmse_train[k] = np.sqrt(mse(Y_train_meas_RF_ALL, Y_train_pred_RF_ALL))
    r2_train[k] = r2_score(Y_train_meas_RF_ALL, Y_train_pred_RF_ALL)
    rmse_test[k] = np.sqrt(mse(Y_test_meas_RF_ALL, Y_test_pred_RF_ALL))
    r2_test[k] = r2_score(Y_test_meas_RF_ALL, Y_test_pred_RF_ALL)

end_time = time()-st

In [None]:
# Elapsed wall-clock seconds (time() - st) of the last timed cell that was run
end_time

In [None]:
# The same elapsed time converted from seconds to minutes
end_time/60

In [None]:
# Fold-by-fold scores: four metric columns, one row per fold
metric_names = ['RMSE_train', 'RMSE_test', 'R2_train', 'R2_test']
metric_values = [rmse_train, rmse_test, r2_train, r2_test]
score_table = pd.DataFrame(dict(zip(metric_names, metric_values)))

# Store the mean test RMSE for the model comparison
rmse = np.average(rmse_test)
rmse_ori.append(rmse)

# Final row 'Average' = mean of each metric over the folds
mean_score = score_table.mean().to_frame('Average').transpose()
score_table = pd.concat([score_table, mean_score])
score_table



In [None]:
# Make predictions and determine the error.
# X_test / Y_test are whatever the most recent cross-validation loop left
# behind, i.e. a single fold, not the whole data set.
measured = np.ravel(Y_test)
predictions = np.ravel(RF_ALL.predict(X_test))
errors = np.abs(predictions - measured)

# Display the performance metrics
print('Mean Absolute Error:', round(float(np.mean(errors)), 2), 'mg/ml.')
mape = float(np.mean(100 * (errors / measured)))
accuracy = 100 - mape
print('Accuracy RF:', round(accuracy, 2), '%.')

In [None]:
#-- RF on all wavelengths: measured vs. predicted, training samples only
# (arrays as left by the last cross-validation fold)

# Identity line: measured == predicted
gradien_x = list(range(1500))
gradien_y = gradien_x.copy()

plt.scatter(Y_train_meas_RF_ALL, Y_train_pred_RF_ALL, color='red', label='Training Data')
plt.plot(gradien_x, gradien_y, dashes=[6, 2], color='black')

plt.title('RF Measured vs. Predicted Value (All Wavelengths)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')
plt.legend(loc='upper right')

# Show the 0-500 window on both axes
plt.xlim(0, 500)
plt.ylim(0, 500)
plt.grid()
plt.gcf().set_size_inches(12.5, 10)
plt.show()



In [None]:
#-- RF on all wavelengths: measured vs. predicted, test samples only
# (arrays as left by the last cross-validation fold)

# Identity line: measured == predicted
gradien_x = list(range(1500))
gradien_y = gradien_x.copy()

plt.scatter(Y_test_meas_RF_ALL, Y_test_pred_RF_ALL, color='blue', label='Test Data')
plt.plot(gradien_x, gradien_y, dashes=[6, 2], color='black')

plt.title('RF Measured vs. Predicted Value (All Wavelengths)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')
plt.legend(loc='upper right')

# Show the 0-500 window on both axes
plt.xlim(0, 500)
plt.ylim(0, 500)
plt.grid()
plt.gcf().set_size_inches(12.5, 10)
plt.show()




In [None]:
#-- RF on all wavelengths: measured vs. predicted, training and test samples
# (arrays as left by the last cross-validation fold)

# Identity line: measured == predicted
gradien_x = list(range(1500))
gradien_y = gradien_x.copy()

plt.scatter(Y_train_meas_RF_ALL, Y_train_pred_RF_ALL, color='red', label='Training Data')
plt.scatter(Y_test_meas_RF_ALL, Y_test_pred_RF_ALL, color='blue', label='Test Data')
plt.plot(gradien_x, gradien_y, dashes=[6, 2], color='black')

plt.title('RF Measured vs. Predicted Value (All Wavelengths)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')
plt.legend(loc='upper right')

# Show the 0-500 window on both axes
plt.xlim(0, 500)
plt.ylim(0, 500)
plt.grid()
plt.gcf().set_size_inches(12.5, 10)
plt.show()




## 4. RF SELECTED WAVELENGTH REGRESSION

In [None]:
# Get numerical feature importances
# Per-column importance scores reported by the fitted forest
importances = [*RF_ALL.feature_importances_]

# Column names of the wavelength matrix X, kept for later lookups
feature_list = [*X.columns]

In [None]:
# Get numerical feature importances (RF_ALL holds the fit of the last CV fold)
importances = list(RF_ALL.feature_importances_)

# Saving feature names for later use
feature_list = list(features.columns)

# List of tuples with variable and importance (rounded to 3 decimals)
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

In [None]:
# Second, smaller forest (50 trees, fixed seed) trained only on a hand-picked
# subset of the wavelength columns
rf_most_important = RandomForestRegressor(n_estimators= 50, random_state=1506669923)

# Column names of the selected wavelengths, in selection order
selected_wavelengths = [
    '731.79', '726.33', '421.24', '734.52', '742.72', '729.06',
    '737.25', '413.37', '402.9', '723.6', '416', '400.28',
    # '397.66', '739.98',   (disabled)
    '423.86', '718.15', '972.84', '405.52', '410.75', '720.87',
    '950.38', '426.49', '431.74', '452.79', '925.19', '978.46',
    '418.62', '434.37', '455.43', '712.7', '745.45', '992.54',
    '408.13', '447.52', '715.42', '781.09', '922.39', '970.03',
    '998.17', '429.12', '439.63', '444.89', '463.34', '748.19',
    '891.71', '914.02', '919.6', '947.58', '964.41', '967.22',
    '975.65', '984.09', '437', '442.26', '450.16', '458.06',
    '460.7', '468.62', '471.26', '481.83', '484.47', '487.12',
    '526.93', '529.59', '534.91', '537.57', '540.24', '542.91',
    '545.57', '548.24', '550.91', '553.58', '556.25', '558.92',
    '561.59', '564.26', '661.1', '663.81', '674.65', '685.5',
    '693.65', '699.09', '701.81', '704.53', '707.25', '709.97',
    '750.93', '753.66', '759.14', '764.62', '772.85', '775.6',
    '778.34', '786.58', '800.34', '805.85', '808.61', '875.03',
    '888.93', '897.28', '902.86', '916.81', '930.78', '939.18',
    '955.99', '958.8', '961.6', '981.27', '986.9', '989.72',
    '995.35', '1000.99', '1003.81',
]

# Positions of those columns inside feature_list (raises ValueError if a name is missing)
important_indices = [feature_list.index(name) for name in selected_wavelengths]

# Plain arrays of the current train/test split, reduced to the selected columns
train_features_nparr = np.array(X_train)
test_features_nparr = np.array(X_test)

train_important = train_features_nparr[:, important_indices]
test_important = test_features_nparr[:, important_indices]

# Train the random forest
rf_most_important.fit(train_important, Y_train)



In [None]:
st = time()
for k, (train, test) in enumerate(k_fold.split(features, labels)):
    X_train, y_train = features.iloc[train], labels.iloc[train]
    X_test, y_test = features.iloc[test], labels.iloc[test]

    # Reduce this fold's train/test matrices to the selected wavelength columns
    train_features_nparr = np.array(X_train)
    test_features_nparr = np.array(X_test)
    train_important = train_features_nparr[:, important_indices]
    test_important = test_features_nparr[:, important_indices]

    # One fit per fold is enough: the forest has a fixed random_state, so a
    # second fit on the same data would only rebuild the same model
    rf_most_important.fit(train_important, y_train)

    pred_train = rf_most_important.predict(train_important)
    rmse_train[k] = np.sqrt(mse(y_train, pred_train))
    r2_train[k] = rf_most_important.score(train_important, y_train)
    pred_test = rf_most_important.predict(test_important)
    rmse_test[k] = np.sqrt(mse(y_test, pred_test))
    r2_test[k] = rf_most_important.score(test_important, y_test)

# Stop the clock before building the report so only model work is timed
end_time = time()-st

score_table = pd.DataFrame([rmse_train, rmse_test, r2_train, r2_test]).transpose()
score_table.columns = ['RMSE_train','RMSE_test','R2_train','R2_test']

mean_score =  pd.DataFrame(score_table.mean()).transpose()
mean_score.index = ['Average']
score_table = pd.concat([score_table,mean_score])

# Last expression of the cell, so the notebook renders the table
score_table

In [None]:
# Elapsed wall-clock seconds (time() - st) of the last timed cell that was run
end_time 

In [None]:
#-- RF on the selected wavelengths: measured vs. predicted, training samples only
# (y_train / pred_train as left by the last cross-validation fold)

# Identity line: measured == predicted
gradien_x = list(range(1500))
gradien_y = gradien_x.copy()

plt.scatter(y_train, pred_train, color='red', label='Training Data')
plt.plot(gradien_x, gradien_y, dashes=[6, 2], color='black')

plt.title('RF Measured vs. Predicted Value (Feature Selected)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')
plt.legend(loc='upper right')

# Show the 0-500 window on both axes
plt.xlim(0, 500)
plt.ylim(0, 500)
plt.grid()
plt.gcf().set_size_inches(12.5, 10)
plt.show()





In [None]:
#-- RF on the selected wavelengths: measured vs. predicted, test samples only
# (y_test / pred_test as left by the last cross-validation fold)

# Identity line: measured == predicted
gradien_x = list(range(1500))
gradien_y = gradien_x.copy()

plt.scatter(y_test, pred_test, color='blue', label='Test Data')
plt.plot(gradien_x, gradien_y, dashes=[6, 2], color='black')

plt.title('RF Measured vs. Predicted Value (Feature Selected)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')
plt.legend(loc='upper right')

# Show the 0-500 window on both axes
plt.xlim(0, 500)
plt.ylim(0, 500)
plt.grid()
plt.gcf().set_size_inches(12.5, 10)
plt.show()






In [None]:
#-- RF on the selected wavelengths: measured vs. predicted, training and test samples
# (arrays as left by the last cross-validation fold)

# Identity line: measured == predicted
gradien_x = list(range(1500))
gradien_y = gradien_x.copy()

plt.scatter(y_train, pred_train, color='red', label='Training Data')
plt.scatter(y_test, pred_test, color='blue', label='Test Data')
plt.plot(gradien_x, gradien_y, dashes=[6, 2], color='black')

plt.title('RF Measured vs. Predicted Value (Feature Selected)')
plt.xlabel('Measured Value[Flavonoid (mg/ml)]')
plt.ylabel('Predicted Value[Flavonoid (mg/ml)]')
plt.legend(loc='upper right')

# Show the 0-500 window on both axes
plt.xlim(0, 500)
plt.ylim(0, 500)
plt.grid()
plt.gcf().set_size_inches(12.5, 10)
plt.show()





