## Dimensionality Reduction Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn import manifold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
from time import time

from sqlalchemy import create_engine
import mysql.connector
import pymysql



In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, else CPU.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Extra diagnostics for GPU 0; byte counts are converted to GiB (1024**3 bytes).
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_gib = torch.cuda.memory_allocated(0) / 1024**3
    print('Allocated:', round(allocated_gib, 1), 'GB')
    reserved_gib = torch.cuda.memory_reserved(0) / 1024**3
    print('Cached:   ', round(reserved_gib, 1), 'GB')


In [None]:
# Open the MySQL connection, create a buffered cursor and select the working
# schema. A connector error is only printed (not re-raised), in which case
# `mydb` / `cursor` are not assigned by this cell.
connection_settings = dict(
    host="localhost",
    user="root",
    password="********",
)
try:
    mydb = mysql.connector.connect(**connection_settings)
    print("Connection established")
    cursor = mydb.cursor(buffered=True)
    cursor.execute("use `SERS_ML_TEST`;")
except mysql.connector.Error as err:
    print("An error occurred:", err)

In [None]:
# input_dir = input("Please enter the directory: \n")

# Directory prefixes. Both end with '/' because `output_dir` is joined to file
# names by plain string concatenation (e.g. output_dir + 'PCA_dataset.csv').
# `input_dir` only appears in comments in the cells visible here; the actual
# dataset paths are read from the database.
input_dir = 'Exp_Data/'
output_dir = 'Exp_Result/'

In [None]:
## Candidate training-set sizes. Not referenced by the cells visible here —
## presumably numbers of training samples used further down; TODO confirm.
training_sizes = [50,100,200,400,800,1600,3200]

In [None]:
## Publish the per-class plotting metadata (id, label, colour, marker) to the database.

# SQLAlchemy engine handed to pandas' to_sql below.
engine = create_engine('mysql+pymysql://root:********@localhost:3306/SERS_ML_TEST')

# Parallel lookup tables, indexed by the integer class id (0..4).
x_General = np.array([0, 1, 2, 3, 4])
xlabel_General = np.array([ 'ATCC 27662_Amp16','BL21_Amp16', 'BW25113_Amp16', 'DH5\u03B1(WT)_Amp16','DH5\u03B1(ampR)_Amp16'])
color_map = [ 'darkorange','darkviolet','navy','red','darkgreen']
marker_list = ['o', 'o', 'o','o','o' ]

# Stacking the integer ids with the string columns in one NumPy array coerces
# every cell to a string, so the `id` column is uploaded as text.
visual_columns = ['id', 'label', 'color_map', 'marker_list']
visual_rows = np.array([x_General, xlabel_General, color_map, marker_list]).T
df = pd.DataFrame(visual_rows, columns=visual_columns)

# if_exists='append': re-running this cell adds the same rows again.
df.to_sql('Visual_Labels', engine, index=False, if_exists='append')
print(df)


## Insert the list of data-visualization methods into the database

In [None]:
# Register the available visualization methods in `Visualize_Methods`.

# ensure database connection: drop the old cursor, reconnect, and re-select
# the schema before issuing statements.
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")



# NOTE(review): the method is stored here as 'T_SNE' (underscore), while the
# t-SNE cells further down write 'T-SNE' (hyphen) into `Data_Visualization` —
# confirm which spelling the schema expects.
# Running this cell again issues the same INSERT again.
sql_command = """INSERT INTO `Visualize_Methods`(`method`)
               VALUES ('PCA'),('T_SNE');"""

cursor.execute(sql_command)
mydb.commit()


## PCA: Data Exploration

In [None]:

# PCA of the raw labelled SERS spectra: project onto two components, save the
# scatter figure and the projected dataset, then register both in the database.

# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


# Locate the labelled dataset. The file name is bound as a query parameter so
# the driver takes care of quoting.
sql_command = """SELECT `data_path`, `id` FROM `Preprocess_SERS`
                WHERE `file_name` = %s;"""
cursor.execute(sql_command, ('Labeled_SERS_dataset.csv',))
labeled_info = cursor.fetchall()
if not labeled_info:
    raise LookupError("No `Preprocess_SERS` row for 'Labeled_SERS_dataset.csv'")
labeled_path = labeled_info[0][0]
labeled_id = labeled_info[0][1]


df = pd.read_csv(labeled_path, header=0)  # input_dir + 'Labeled_SERS_dataset.csv'

# Sorting by Label (the original row index is kept by sort_values)
df.sort_values(by='label', inplace=True)


plt.rcParams['font.size'] = 6
plt.rcParams['figure.dpi'] = 300
feature = df.loc[:, '400.0':'1550.0']


pca = PCA(n_components=2)
pca_feature = pca.fit_transform(feature)

# Class id of every row, in the same (sorted) row order as `pca_feature`.
row_labels = df.iloc[:, 0].to_numpy().astype(int)

# One scatter call per class, in ascending class order, so every class present
# in the data gets exactly one legend entry.
plt.figure()
for class_id in np.unique(row_labels):
    class_id = int(class_id)
    in_class = row_labels == class_id
    plt.scatter(pca_feature[in_class, 0], pca_feature[in_class, 1],
                c=color_map[class_id], label=xlabel_General[class_id],
                alpha=0.5, s=10, marker=marker_list[class_id], edgecolors=None)

# Legend entries are listed in reverse class order.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.1, 1), loc='upper left')


plt.xlabel('PC1')
plt.ylabel('PC2')
plt.axis('square')
pca_figure_path = output_dir + 'Labeled_SERS_PCA.png'
plt.savefig(pca_figure_path, dpi=300, transparent=True, bbox_inches='tight')


## Generate PCA dataset
pca_dataframe = pd.DataFrame(pca_feature, columns=['1st_D', '2nd_D'])
# Insert the labels positionally: passing the Series itself would align it on
# the pre-sort index and pair labels with the wrong projected rows.
pca_dataframe.insert(0, 'label', df.iloc[:, 0].to_numpy())
print(pca_dataframe)
pca_result_path = output_dir + 'PCA_dataset.csv'
pca_dataframe.to_csv(pca_result_path, index=False)



## Record PCA Figure & dataset into Database
# Values are bound as strings (matching the quoted literals the statement
# previously embedded) so paths containing quotes or backslashes stay intact.
sql_command = """INSERT INTO `Data_Visualization`
                        (`method`,`method_id`, `dataset_id`,`dataset_path`,
                         `method_parameters_id`,`normalized`,
                           `result_path`, `figure_path`)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""
cursor.execute(sql_command, (
    'PCA', '1',
    str(labeled_id), str(labeled_path),
    '1', '0',
    pca_result_path, pca_figure_path,
))
mydb.commit()





### Normalized PCA

In [None]:


# PCA of the normalized labelled SERS spectra: project onto two components,
# save the scatter figure and the projected dataset, then register both in the
# database.

# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


# Locate the normalized labelled dataset (file name bound as a parameter).
sql_command = """SELECT `data_path`, `id` FROM `Preprocess_SERS`
                WHERE `file_name` = %s;"""
cursor.execute(sql_command, ('Labeled_Nor_SERS_dataset.csv',))
labeled_nor_info = cursor.fetchall()
if not labeled_nor_info:
    raise LookupError("No `Preprocess_SERS` row for 'Labeled_Nor_SERS_dataset.csv'")
labeled_nor_path = labeled_nor_info[0][0]
labeled_nor_id = labeled_nor_info[0][1]

## Normalized PCA
df = pd.read_csv(labeled_nor_path, header=0)  # input_dir + 'Labeled_Nor_SERS_dataset.csv'
# Sorting by Label (the original row index is kept by sort_values)
df.sort_values(by='label', inplace=True)


plt.rcParams['font.size'] = 6
plt.rcParams['figure.dpi'] = 300
feature = df.loc[:, '400.0':'1550.0']


pca = PCA(n_components=2)
pca_feature = pca.fit_transform(feature)

# Class id of every row, in the same (sorted) row order as `pca_feature`.
row_labels = df.iloc[:, 0].to_numpy().astype(int)

# One scatter call per class, in ascending class order, so every class present
# in the data gets exactly one legend entry.
plt.figure()
for class_id in np.unique(row_labels):
    class_id = int(class_id)
    in_class = row_labels == class_id
    plt.scatter(pca_feature[in_class, 0], pca_feature[in_class, 1],
                c=color_map[class_id], label=xlabel_General[class_id],
                alpha=0.5, s=10, marker=marker_list[class_id], edgecolors=None)

# Legend entries are listed in reverse class order.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.1, 1), loc='upper left')

# plt.xlim([-50000, 100000])
# plt.ylim([-50000, 50000])

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.axis('square')
nor_pca_figure_path = output_dir + 'Labeled_Nor_SERS_PCA.png'
plt.savefig(nor_pca_figure_path, dpi=300, transparent=True, bbox_inches='tight')


## Generate PCA dataset
pca_dataframe = pd.DataFrame(pca_feature, columns=['1st_D', '2nd_D'])
# Insert the labels positionally: passing the Series itself would align it on
# the pre-sort index and pair labels with the wrong projected rows.
pca_dataframe.insert(0, 'label', df.iloc[:, 0].to_numpy())
print(pca_dataframe)
nor_pca_result_path = output_dir + 'Nor_PCA_dataset.csv'
pca_dataframe.to_csv(nor_pca_result_path, index=False)



## Record PCA Figure & dataset into Database
# Values are bound as strings (matching the quoted literals the statement
# previously embedded) so paths containing quotes or backslashes stay intact.
sql_command = """INSERT INTO `Data_Visualization`
                        (`method`,`method_id`, `dataset_id`,`dataset_path`,
                         `method_parameters_id`,`normalized`,
                           `result_path`, `figure_path`)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""
cursor.execute(sql_command, (
    'PCA', '1',
    str(labeled_nor_id), str(labeled_nor_path),
    '1', '1',
    nor_pca_result_path, nor_pca_figure_path,
))
mydb.commit()





### t-SNE

In [None]:
# t-SNE (perplexity 50) of the raw labelled SERS spectra: embed in 2-D, rescale
# each axis to [0, 1], save the scatter figure and the embedded dataset, then
# register both in the database.

# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


# Locate the labelled dataset (file name bound as a parameter).
sql_command = """SELECT `data_path`, `id` FROM `Preprocess_SERS`
                WHERE `file_name` = %s;"""
cursor.execute(sql_command, ('Labeled_SERS_dataset.csv',))
labeled_info = cursor.fetchall()
if not labeled_info:
    raise LookupError("No `Preprocess_SERS` row for 'Labeled_SERS_dataset.csv'")
labeled_path = labeled_info[0][0]
labeled_id = labeled_info[0][1]


df = pd.read_csv(labeled_path, header=0)  # input_dir + 'Labeled_SERS_dataset.csv'
# Sorting by Label (the original row index is kept by sort_values)
df.sort_values(by='label', inplace=True)


plt.rcParams['font.size'] = 6
plt.rcParams['figure.dpi'] = 300
feature = df.loc[:, '400.0':'1550.0']


# No random_state is given, so the embedding differs from run to run.
tsne = manifold.TSNE(n_components=2, init='random', learning_rate=200, perplexity=50).fit_transform(feature)
# Min-max rescale each embedding axis to [0, 1].
x_min, x_max = tsne.min(0), tsne.max(0)
tsne = (tsne - x_min) / (x_max - x_min)

# Class id of every row, in the same (sorted) row order as `tsne`.
row_labels = df.iloc[:, 0].to_numpy().astype(int)

# One scatter call per class, in ascending class order, so every class present
# in the data gets exactly one legend entry.
plt.figure()
for class_id in np.unique(row_labels):
    class_id = int(class_id)
    in_class = row_labels == class_id
    plt.scatter(tsne[in_class, 0], tsne[in_class, 1],
                c=color_map[class_id], label=xlabel_General[class_id],
                alpha=0.5, s=10, marker=marker_list[class_id])

# Legend entries are listed in reverse class order.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.1, 1), loc='upper left')
plt.xlabel('T-SNE1')
plt.ylabel('T-SNE2')
plt.axis('square')
tsne_figure_path = output_dir + 'Labeled_SERS_t-SNE.png'
plt.savefig(tsne_figure_path, dpi=300, transparent=True, bbox_inches='tight')


## Generate T-SNE dataset
tsne_dataframe = pd.DataFrame(tsne, columns=['1st_D', '2nd_D'])
# Insert the labels positionally: passing the Series itself would align it on
# the pre-sort index and pair labels with the wrong embedded rows.
tsne_dataframe.insert(0, 'label', df.iloc[:, 0].to_numpy())
print(tsne_dataframe)
tsne_result_path = output_dir + 't-SNE_dataset.csv'
tsne_dataframe.to_csv(tsne_result_path, index=False)



## Record t-SNE Figure & dataset into Database
# Values are bound as strings (matching the quoted literals the statement
# previously embedded) so paths containing quotes or backslashes stay intact.
sql_command = """INSERT INTO `Data_Visualization`
                        (`method`,`method_id`, `dataset_id`,`dataset_path`,
                         `method_parameters_id`,`normalized`,
                           `result_path`, `figure_path`)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""
cursor.execute(sql_command, (
    'T-SNE', '2',
    str(labeled_id), str(labeled_path),
    '1', '0',
    tsne_result_path, tsne_figure_path,
))
mydb.commit()



In [None]:

# t-SNE (perplexity 50) of the normalized labelled SERS spectra: embed in 2-D,
# rescale each axis to [0, 1], save the scatter figure and the embedded
# dataset, then register both in the database.

# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


# Locate the normalized labelled dataset (file name bound as a parameter).
sql_command = """SELECT `data_path`, `id` FROM `Preprocess_SERS`
                WHERE `file_name` = %s;"""
cursor.execute(sql_command, ('Labeled_Nor_SERS_dataset.csv',))
labeled_nor_info = cursor.fetchall()
if not labeled_nor_info:
    raise LookupError("No `Preprocess_SERS` row for 'Labeled_Nor_SERS_dataset.csv'")
labeled_nor_path = labeled_nor_info[0][0]
labeled_nor_id = labeled_nor_info[0][1]


df = pd.read_csv(labeled_nor_path, header=0)  ## input_dir + 'Labeled_Nor_SERS_dataset.csv'

# Sorting by Label (the original row index is kept by sort_values)
df.sort_values(by='label', inplace=True)

plt.rcParams['font.size'] = 6
plt.rcParams['figure.dpi'] = 300
feature = df.loc[:, '400.0':'1550.0']


# No random_state is given, so the embedding differs from run to run.
tsne = manifold.TSNE(n_components=2, init='random', learning_rate=200, perplexity=50).fit_transform(feature)
# Min-max rescale each embedding axis to [0, 1].
x_min, x_max = tsne.min(0), tsne.max(0)
tsne = (tsne - x_min) / (x_max - x_min)

# Class id of every row, in the same (sorted) row order as `tsne`.
row_labels = df.iloc[:, 0].to_numpy().astype(int)

# One scatter call per class, in ascending class order, so every class present
# in the data gets exactly one legend entry.
plt.figure()
for class_id in np.unique(row_labels):
    class_id = int(class_id)
    in_class = row_labels == class_id
    plt.scatter(tsne[in_class, 0], tsne[in_class, 1],
                c=color_map[class_id], label=xlabel_General[class_id],
                alpha=0.5, s=10, marker=marker_list[class_id])

# Legend entries are listed in reverse class order.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.1, 1), loc='upper left')
plt.xlabel('T-SNE1')
plt.ylabel('T-SNE2')
plt.axis('square')
nor_tsne_figure_path = output_dir + 'Labeled_Nor_SERS_t-SNE.png'
plt.savefig(nor_tsne_figure_path, dpi=300, transparent=True, bbox_inches='tight')



## Generate T-SNE dataset
tsne_dataframe = pd.DataFrame(tsne, columns=['1st_D', '2nd_D'])
# Insert the labels positionally: passing the Series itself would align it on
# the pre-sort index and pair labels with the wrong embedded rows.
tsne_dataframe.insert(0, 'label', df.iloc[:, 0].to_numpy())
print(tsne_dataframe)
nor_tsne_result_path = output_dir + 'Nor_t-SNE_dataset.csv'
tsne_dataframe.to_csv(nor_tsne_result_path, index=False)



## Record t-SNE Figure & dataset into Database
# Values are bound as strings (matching the quoted literals the statement
# previously embedded) so paths containing quotes or backslashes stay intact.
sql_command = """INSERT INTO `Data_Visualization`
                        (`method`,`method_id`, `dataset_id`,`dataset_path`,
                         `method_parameters_id`,`normalized`,
                           `result_path`, `figure_path`)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""
cursor.execute(sql_command, (
    'T-SNE', '2',
    str(labeled_nor_id), str(labeled_nor_path),
    '1', '1',
    nor_tsne_result_path, nor_tsne_figure_path,
))
mydb.commit()


### t-SNE perplexity = 500, Normalized dataset

In [None]:
## Nor: perplexity 500
# t-SNE (perplexity 500) of the normalized labelled SERS spectra: embed in 2-D,
# rescale each axis to [0, 1], save the scatter figure and the embedded
# dataset, then register both in the database (method_parameters_id '2').

# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


# Locate the normalized labelled dataset (file name bound as a parameter).
sql_command = """SELECT `data_path`, `id` FROM `Preprocess_SERS`
                WHERE `file_name` = %s;"""
cursor.execute(sql_command, ('Labeled_Nor_SERS_dataset.csv',))
labeled_nor_info = cursor.fetchall()
if not labeled_nor_info:
    raise LookupError("No `Preprocess_SERS` row for 'Labeled_Nor_SERS_dataset.csv'")
labeled_nor_path = labeled_nor_info[0][0]
labeled_nor_id = labeled_nor_info[0][1]


df = pd.read_csv(labeled_nor_path, header=0)  ## input_dir + 'Labeled_Nor_SERS_dataset.csv'
# Sorting by Label (the original row index is kept by sort_values)
df.sort_values(by='label', inplace=True)

plt.rcParams['font.size'] = 6
plt.rcParams['figure.dpi'] = 300
feature = df.loc[:, '400.0':'1550.0']


# No random_state is given, so the embedding differs from run to run.
tsne = manifold.TSNE(n_components=2, init='random', learning_rate=200, perplexity=500).fit_transform(feature)
# Min-max rescale each embedding axis to [0, 1].
x_min, x_max = tsne.min(0), tsne.max(0)
tsne = (tsne - x_min) / (x_max - x_min)

# Class id of every row, in the same (sorted) row order as `tsne`.
row_labels = df.iloc[:, 0].to_numpy().astype(int)

# One scatter call per class, in ascending class order, so every class present
# in the data gets exactly one legend entry.
plt.figure()
for class_id in np.unique(row_labels):
    class_id = int(class_id)
    in_class = row_labels == class_id
    plt.scatter(tsne[in_class, 0], tsne[in_class, 1],
                c=color_map[class_id], label=xlabel_General[class_id],
                alpha=0.5, s=10, marker=marker_list[class_id])

# Legend entries are listed in reverse class order.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.1, 1), loc='upper left')
plt.xlabel('T-SNE1')
plt.ylabel('T-SNE2')
plt.axis('square')
p500_figure_path = output_dir + 'Labeled_Nor_SERS_t-SNE_p500.png'
plt.savefig(p500_figure_path, dpi=300, transparent=True, bbox_inches='tight')

## Generate T-SNE dataset
tsne_dataframe = pd.DataFrame(tsne, columns=['1st_D', '2nd_D'])
# Insert the labels positionally: passing the Series itself would align it on
# the pre-sort index and pair labels with the wrong embedded rows.
tsne_dataframe.insert(0, 'label', df.iloc[:, 0].to_numpy())
print(tsne_dataframe)
p500_result_path = output_dir + 'Nor_t-SNE_p500_dataset.csv'
tsne_dataframe.to_csv(p500_result_path, index=False)



## Record t-SNE Figure & dataset into Database
# Values are bound as strings (matching the quoted literals the statement
# previously embedded) so paths containing quotes or backslashes stay intact.
sql_command = """INSERT INTO `Data_Visualization`
                        (`method`,`method_id`, `dataset_id`,`dataset_path`,
                         `method_parameters_id`,`normalized`,
                           `result_path`, `figure_path`)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""
cursor.execute(sql_command, (
    'T-SNE', '2',
    str(labeled_nor_id), str(labeled_nor_path),
    '2', '1',
    p500_result_path, p500_figure_path,
))
mydb.commit()


# Supervised Machine learning classification

### List of ML methods into database

In [None]:
# Register the four classifiers (short name + full name) in `ML_methods`.

# ensure database connection: drop the old cursor, reconnect, and re-select
# the schema before issuing statements.
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")



# Running this cell again issues the same INSERT again.
sql_command = """INSERT INTO `ML_methods`(`method`, `method_fullname`)
               VALUES ('RF',   'Random forest'),
                      ('SVM',  'Support vector machine'),
                      ('KNN',  'k-nearest neighbors'),
                      ('CNN',  'Convolutional neural network');"""

cursor.execute(sql_command)
mydb.commit()


### Training & Testing Set info collection

In [None]:
# Fetch path, id and `count` of the normalized training and testing datasets
# from `Preprocess_SERS`; the first matching row of each query is used.

# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")

## Training set: normalized rows flagged train=1 / test=0
sql_command = """SELECT `data_path`, `id`, `count` FROM `Preprocess_SERS`
                WHERE (`normalized` = '1')
                AND (`train` = '1')
                AND (`test` = '0');"""
cursor.execute(sql_command)
train_set_info = cursor.fetchall()
print(train_set_info)
# Each row is (data_path, id, count), in SELECT order.
train_set_path, train_set_id, train_set_size = train_set_info[0]

## Test set: normalized rows flagged train=0 / test=1
sql_command = """SELECT `data_path`, `id`,`count`  FROM `Preprocess_SERS`
                WHERE (`normalized` = '1')
                AND (`train` = '0')
                AND (`test` = '1');"""
cursor.execute(sql_command)
test_set_info = cursor.fetchall()
print(test_set_info)
test_set_path, test_set_id, test_set_size = test_set_info[0]


### ML training: RF, SVM, KNN

In [None]:

# Fit the three classical classifiers on the normalized training set.
df = pd.read_csv(train_set_path, header=0)  ## input_dir + 'SERS_Nor_training.csv'

# Targets come from the `label` column; every column after the first is a feature.
# feature = df.loc[:, '400.0':'1550.0']
train_label = df['label'].to_numpy()
feature = df.iloc[:, 1:].to_numpy()
# print(feature)

## Model Training (fit() returns the fitted estimator itself)
# Random Forest
rf = RandomForestClassifier(
    random_state=0,
    max_depth=40,
    min_samples_split=5,
    max_samples=1.0,
).fit(feature, train_label)

# SVM
svm = SVC(C=10).fit(feature, train_label)

# KNN
knn = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    algorithm='brute',
).fit(feature, train_label)

### ML Training: CNN

In [None]:
from time import time


# Hyper-parameters of the CNN run.
Epoch = 200
BATCH_SIZE = 200
learning_rate = 1e-4
wd = 1e-5  # weight decay passed to Adam


def _read_shuffled(csv_path):
    """Read a dataset CSV and return (frame, rows), rows shuffled with seed 8787."""
    frame = pd.read_csv(csv_path, header=0)
    rows = frame.to_numpy()
    np.random.seed(8787)
    np.random.shuffle(rows)  # in-place shuffle along the first axis
    return frame, rows


# Training split: column 0 is the label, the remaining columns are features.
df, combine = _read_shuffled(train_set_path)  ## input_dir + 'SERS_Nor_training.csv'
label_train, feature_train = combine[:, 0], combine[:, 1:]
train_size = int(len(feature_train))

# Testing split, loaded and shuffled the same way (same seed).
df_test, combine = _read_shuffled(test_set_path)  ## input_dir + 'SERS_Nor_testing.csv'
label_test, feature_test = combine[:, 0], combine[:, 1:]
test_size = int(len(feature_test))



#Model
class Net(nn.Module):
    """1-D CNN classifier producing 5 output scores per sample.

    Two conv -> batch-norm -> ReLU -> max-pool stages are followed by three
    fully connected layers. ``fc1`` takes 2288 flattened features, i.e. the
    16 channels of ``conv2`` times 143 positions left after the second pool,
    so the input length must reduce to 143 positions.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=6, kernel_size=3, stride=2)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.bn1 = nn.BatchNorm1d(num_features=6)
        self.conv2 = nn.Conv1d(in_channels=6, out_channels=16, kernel_size=3, stride=2)
        self.bn2 = nn.BatchNorm1d(num_features=16)
        self.fc1 = nn.Linear(in_features=2288, out_features=280)
        self.fc2 = nn.Linear(in_features=280, out_features=14)
        self.fc3 = nn.Linear(in_features=14, out_features=5)

    def forward(self, x):
        """Return the (batch, 5) class scores for a batch of inputs.

        Everything after the batch dimension is collapsed into a single
        one-channel sequence before the first convolution.
        """
        n_samples = x.shape[0]
        signal = x.reshape(n_samples, 1, -1)

        stage1 = self.pool(F.relu(self.bn1(self.conv1(signal))))
        stage2 = self.pool(F.relu(self.bn2(self.conv2(stage1))))
        stage2 = F.relu(stage2)

        flat = stage2.flatten(start_dim=1)  # keep batch, merge channels x positions
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


# NumPy arrays -> tensors -> datasets -> mini-batch loaders.
feature_train, label_train = torch.from_numpy(feature_train), torch.from_numpy(label_train)
feature_test, label_test = torch.from_numpy(feature_test), torch.from_numpy(label_test)

train_dataset = Data.TensorDataset(feature_train, label_train)
test_dataset = Data.TensorDataset(feature_test, label_test)
# Both loaders reshuffle on every pass.
train_loader = Data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = Data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)


# Model, loss and optimizer
gpu_available = torch.cuda.is_available()
device = torch.device('cuda' if gpu_available else 'cpu')
if gpu_available:
    print("GPU run")
cnn_model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=learning_rate, weight_decay=wd)


# Destination of the trained weights (written once, after the last epoch).
cnn_model_path = output_dir + 'SERS_Nor_CNN.pth'

# Per-epoch history: accuracy is correct/samples, loss is the mean over batches.
accuracy_record = {'train': [], 'test': []}
loss_record = {'train': [], 'test': []}
best_train_acc = 0.0
best_train_loss = 0.0  # not updated anywhere in this cell


initial_time = time()

for epoch in range(Epoch):
    train_acc = train_loss = 0.0
    test_acc = test_loss = 0.0

    # ---- optimisation pass over the training set ----
    cnn_model.train()
    for inputs, labels in train_loader:
        inputs = inputs.float().to(device)
        labels = labels.long().to(device)

        optimizer.zero_grad()

        outputs = cnn_model(inputs)
        loss = criterion(outputs, labels)
        _, train_pred = torch.max(outputs, 1)  # index of the highest score per sample
        loss.backward()
        optimizer.step()

        train_acc += (train_pred.cpu() == labels.cpu()).sum().item()
        train_loss += loss.item()

    accuracy_record['train'].append(train_acc/len(train_dataset))
    loss_record['train'].append(train_loss/len(train_loader))

    # Progress line on the first epoch and then every 10th epoch.
    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f'{epoch + 1}, train_loss: {train_loss /len(train_loader)}, train_acc: {train_acc/len(train_dataset)}')

    # Only reports a new best training accuracy; no checkpoint is written here.
    if train_acc > best_train_acc:
        best_train_acc = train_acc
        print('[Save]-- [{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f}'.format(
            epoch + 1, Epoch, train_acc/len(train_dataset), train_loss/len(train_loader)
            ))

    # ---- evaluation pass over the test set (no gradients, eval-mode layers) ----
    cnn_model.eval()
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.float().to(device)
            labels = labels.long().to(device)

            outputs = cnn_model(inputs)
            loss = criterion(outputs, labels)
            _, test_pred = torch.max(outputs, 1)  # index of the highest score per sample
            test_acc += (test_pred.cpu() == labels.cpu()).sum().item()
            test_loss += loss.item()

    accuracy_record['test'].append(test_acc/len(test_dataset))
    loss_record['test'].append(test_loss/len(test_loader))


# Persist the weights as they are after the final epoch.
torch.save(cnn_model.state_dict(), cnn_model_path)

print('Finished Training (02)')
print('Training time', time() - initial_time)


# Learning curves: accuracy columns followed by loss columns. Both frames use
# the column names 'train' and 'test', so the CSV header repeats them.
acc_pd = pd.DataFrame.from_dict(accuracy_record)
loss_pd = pd.DataFrame.from_dict(loss_record)
lc_pd = pd.concat([acc_pd, loss_pd], axis=1)
df = lc_pd
lc_filename = output_dir + 'SERS_Nor_CNN_learnCurve.csv'
lc_pd.to_csv(lc_filename, index=True)



### ML prediction: RF, SVM, KNN

In [None]:
# Score the test set with the three fitted classical models, archive each
# prediction vector as CSV (row index + prediction, no header) and keep the
# predictions as tensors for the accuracy computation.
df = pd.read_csv(test_set_path, header=0)  ## input_dir + 'SERS_Nor_testing.csv'
feature = df.iloc[:, 1:].to_numpy()
label = torch.from_numpy(df['label'].to_numpy())

# Raw predictions: Random Forest, SVM, KNN
rf_result, svm_result, knn_result = (
    classifier.predict(feature) for classifier in (rf, svm, knn)
)

# One single-column frame per model, written next to the other results.
rf_result_df = pd.DataFrame(rf_result)
svm_result_df = pd.DataFrame(svm_result)
knn_result_df = pd.DataFrame(knn_result)
# print(svm_result_df)
rf_result_df.to_csv(output_dir  + 'Nor_RF_prediction.csv', index=True, header=False)
svm_result_df.to_csv(output_dir  + 'Nor_SVM_prediction.csv', index=True, header=False)
knn_result_df.to_csv(output_dir  + 'Nor_KNN_prediction.csv', index=True, header=False)

# Tensor views of the predictions.
rf_result, svm_result, knn_result = map(
    torch.from_numpy, (rf_result, svm_result, knn_result)
)


### ML prediction: CNN

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
from time import time

# Load the test set and wrap it in a loader that yields it as a single batch.
df = pd.read_csv(test_set_path, header=0)  ## input_dir + 'SERS_Nor_testing.csv'

# feature = df.loc[:, 400.0:1550.0].to_numpy()
# Features are the columns named '400.0' through '1550.0' (inclusive slice).
feature = torch.from_numpy(df.loc[:, '400.0':'1550.0'].to_numpy())
label = torch.from_numpy(df['label'].to_numpy())


#Data_Loader
batch_num = feature.shape[0]  # batch size == number of test rows
dataset = Data.TensorDataset(feature, label)
test_loader = Data.DataLoader(dataset, batch_size=batch_num)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


#Model
class Net(nn.Module):
    """1-D CNN classifier producing 5 output scores per sample.

    Two conv -> batch-norm -> ReLU -> max-pool stages are followed by three
    fully connected layers. ``fc1`` takes 2288 flattened features, i.e. the
    16 channels of ``conv2`` times 143 positions left after the second pool,
    so the input length must reduce to 143 positions.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=6, kernel_size=3, stride=2)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.bn1 = nn.BatchNorm1d(num_features=6)
        self.conv2 = nn.Conv1d(in_channels=6, out_channels=16, kernel_size=3, stride=2)
        self.bn2 = nn.BatchNorm1d(num_features=16)
        self.fc1 = nn.Linear(in_features=2288, out_features=280)
        self.fc2 = nn.Linear(in_features=280, out_features=14)
        self.fc3 = nn.Linear(in_features=14, out_features=5)

    def forward(self, x):
        """Return the (batch, 5) class scores for a batch of inputs.

        Everything after the batch dimension is collapsed into a single
        one-channel sequence before the first convolution.
        """
        n_samples = x.shape[0]
        signal = x.reshape(n_samples, 1, -1)

        stage1 = self.pool(F.relu(self.bn1(self.conv1(signal))))
        stage2 = self.pool(F.relu(self.bn2(self.conv2(stage1))))
        stage2 = F.relu(stage2)

        flat = stage2.flatten(start_dim=1)  # keep batch, merge channels x positions
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


# Restore the trained CNN, predict the test set and archive the predictions.
model = Net().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
model.load_state_dict(torch.load(output_dir + 'SERS_Nor_CNN.pth', map_location=device))

model.eval()

pred_acc = 0.0  # running count of correct predictions
batch_predictions = []

with torch.no_grad():
    for features, labels in test_loader:
        features = features.type(torch.FloatTensor)
        labels = labels.type(torch.LongTensor)
        # Net.forward collapses everything after the batch dimension again.
        features = features.reshape((features.shape[0], 1, 1, -1))
        features = features.to(device)
        labels = labels.to(device)
        # calculate outputs by running the spectra through the network
        outputs = model(features)
        # the class with the highest score is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        pred_acc += (predicted.cpu() == labels.cpu()).sum().item()
        # Keep every batch, so the result stays complete even if the loader
        # is configured to yield more than one batch.
        batch_predictions.append(predicted.cpu())


cnn_result = torch.cat(batch_predictions).numpy()
cnn_result_df = pd.DataFrame(cnn_result)
# CSV layout: row index + predicted class, no header.
cnn_result_df.to_csv(output_dir  + 'Nor_CNN_prediction.csv', index=True, header=False)
cnn_result = torch.from_numpy(cnn_result)


### ML prediction results

In [None]:
def acc_calculation(predict, label):
    """Return the accuracy of ``predict`` against ``label`` in percent.

    Both arguments are tensors compared element-wise; the share of matching
    entries (relative to the length of ``label``) is scaled to 0-100 and
    rounded to two decimals.
    """
    n_total = label.size(0)
    matches = predict == label
    n_correct = matches.sum().item()
    return round(100 * n_correct / n_total, 2)

In [None]:

# Accuracy (%) of each classifier against the shared test labels,
# in the order RF, SVM, KNN, CNN.
rf_acc, svm_acc, knn_acc, cnn_acc = (
    acc_calculation(prediction, label)
    for prediction in (rf_result, svm_result, knn_result, cnn_result)
)


print(f'Random Forest ACC: {rf_acc}% \n SVM ACC: {svm_acc}% \n KNN ACC: {knn_acc}% \n CNN ACC: {cnn_acc}%')

### ML Prediction record as csv

In [None]:
# Summary table with one row per classifier, saved as a whole and row by row.
accuracy_rows = [
    ['Random Forest', rf_acc],
    ['SVM', svm_acc],
    ['KNN', knn_acc],
    ['CNN', cnn_acc],
]
ml_acc = pd.DataFrame(accuracy_rows, columns=['method', 'accuracy(%)'])
print(ml_acc)
ml_acc.to_csv(output_dir  + 'Nor_ML_ACC.csv', index=False, header=True)

# print(ml_acc.iloc[0,:]) 
print(ml_acc.iloc[0,:])

# Each row is written as its own two-line CSV (field name, value), no header.
per_method_files = ['Nor_RF_ACC.csv', 'Nor_SVM_ACC.csv', 'Nor_KNN_ACC.csv', 'Nor_CNN_ACC.csv']
for row_number, file_name in enumerate(per_method_files):
    ml_acc.iloc[row_number, :].to_csv(output_dir  + file_name, index=True, header=False)


## Confusion Matrix plot & record ML results into database 

In [None]:
# Row-normalized confusion matrices for the four classifiers: each matrix is
# saved as CSV (full precision) and as a figure (values rounded to 2 decimals).
plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 300


xlabel_General = np.array([ 'ATCC 27662_Amp16','BL21_Amp16', 'BW25113_Amp16', 'DH5\u03B1(WT)_Amp16','DH5\u03B1(ampR)_Amp16'])
x_General = np.array([0, 1, 2, 3, 4])


def _export_confusion(predicted, tag):
    """Save the confusion matrix of ``predicted`` vs. the test labels.

    Writes ``Nor_<tag>_conf.csv`` and ``Nor_<tag>_conf.png`` into
    ``output_dir`` and returns ``(rounded_matrix, matrix_frame, display)``.
    """
    # normalize='true' divides every row by the number of true samples of that class.
    matrix = confusion_matrix(label, predicted, normalize='true')
    matrix_frame = pd.DataFrame(matrix)
    matrix_frame.to_csv(output_dir + 'Nor_' + tag + '_conf.csv', index=False, header=False)

    rounded = np.around(matrix, 2)
    display = ConfusionMatrixDisplay(confusion_matrix=rounded, display_labels=xlabel_General)
    # Draw on an explicitly created figure and save that figure, instead of
    # relying on whichever figure happens to be current.
    fig, ax = plt.subplots()
    display.plot(ax=ax, cmap='gist_yarg', colorbar=False)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')
    fig.savefig(output_dir + 'Nor_' + tag + '_conf.png', dpi=300, transparent=True, bbox_inches='tight')
    return rounded, matrix_frame, display


#RF
rf_con, rf_con_df, rf_disp = _export_confusion(rf_result, 'RF')

#SVM
svm_con, svm_con_df, svm_disp = _export_confusion(svm_result, 'SVM')

#KNN
knn_con, knn_con_df, knn_disp = _export_confusion(knn_result, 'KNN')

#CNN
cnn_con, cnn_con_df, cnn_disp = _export_confusion(cnn_result, 'CNN')



########################################################################################
## Save ML results to database

# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


# Parameterized INSERT: the driver quotes/escapes every value, so paths that
# contain quotes or backslashes can neither break nor alter the statement.
sql_command = """INSERT INTO `Machine_Learning`
                        (`method`,`ML_method_id`, `method_parameters_id`,
                         `train_set_id`,`train_set_path`, `train_size`,
                        `test_set_id` , `test_set_path`, `test_size`,
                         `prediction_path`, `accuracy`,
                         `confusion_matrix_path`, `confusion_matrix_figure` )
               VALUES (%s, %s, %s,
                       %s, %s, %s,
                       %s, %s, %s,
                       %s, %s,
                       %s, %s);"""

# (method tag, ML_method_id, accuracy) for each model evaluated above.
ml_results = [
    ('RF', '1', rf_acc),
    ('SVM', '2', svm_acc),
    ('KNN', '3', knn_acc),
    ('CNN', '4', cnn_acc),
]

for method_tag, ml_method_id, method_acc in ml_results:
    # Every value is sent as text, exactly as the former quoted '{...}' fields
    # were; str() also keeps NumPy scalars acceptable to the driver.
    sql_params = (
        method_tag, ml_method_id, '1',
        str(train_set_id), str(train_set_path), str(train_set_size),
        str(test_set_id), str(test_set_path), str(test_set_size),
        output_dir + 'Nor_' + method_tag + '_prediction.csv', str(method_acc),
        output_dir + 'Nor_' + method_tag + '_conf.csv',
        output_dir + 'Nor_' + method_tag + '_conf.png',
    )
    print(sql_command)
    print(sql_params)
    cursor.execute(sql_command, sql_params)
    # Commit per method so earlier rows survive a later failure.
    mydb.commit()


# Training size difference evaluation

In [None]:
## Training-set sizes to evaluate: start at 50 samples and double six times
## (50, 100, 200, 400, 800, 1600, 3200).
training_sizes = [50 * 2 ** step for step in range(7)]

In [None]:
def acc_calculation(predict, label):
    """Return the classification accuracy in percent, rounded to 2 decimals.

    Args:
        predict: Predicted class labels as a 1-D torch tensor or NumPy array.
        label: Ground-truth class labels, same type family and length as
            ``predict``.

    Returns:
        ``100 * (#matching positions) / len(label)`` rounded to two decimals.

    Raises:
        ZeroDivisionError: If ``label`` is empty.
    """
    # len() works for tensors and NumPy arrays alike, whereas ``label.size(0)``
    # only exists on tensors.
    total = len(label)
    correct = int((predict == label).sum())
    return round(100 * correct / total, 2)

## RF: Training size difference

In [None]:
# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


df = pd.read_csv(train_set_path, header=0)   ## input_dir + 'SERS_Nor_training.csv'
df2 = pd.read_csv(test_set_path, header=0)   ## input_dir + 'SERS_Nor_testing.csv'

# The test set is the same for every training size, so prepare it once.
# Column 0 holds the class label, the remaining columns the features.
feature = df2.iloc[:, 1:].to_numpy()
label = torch.from_numpy(df2.iloc[:, 0].to_numpy())

plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 300

# Parameterized INSERT: values are escaped by the driver instead of being
# pasted into the SQL text.
sql_command = """INSERT INTO `Machine_Learning`
                        (`method`,`ML_method_id`, `method_parameters_id`,
                        `train_set_id`,`train_set_path`, `train_size`,
                        `test_set_id` , `test_set_path`, `test_size`,
                        `prediction_path`, `accuracy`,
                        `confusion_matrix_path`, `confusion_matrix_figure` )
                VALUES (%s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s);"""

rf_record = []

for training_size in training_sizes:
    print('training size is:' + str(training_size))

    # Output files of this run; the very same paths are recorded in the database.
    rf_prediction_path = output_dir + 'Nor_RF_%04d_prediction.csv' % training_size
    rf_conf_csv_path = output_dir + 'Nor_RF_%04d_conf.csv' % training_size
    rf_conf_png_path = output_dir + 'Nor_RF_%04d_conf.png' % training_size

    ## Training: use the first `training_size` rows of the training set.
    # NOTE(review): if the file has fewer rows, all rows are used while the
    # database still records `training_size` - confirm this cannot happen.
    train_feature = df.iloc[:training_size, 1:].to_numpy()
    train_label = df.iloc[:training_size, 0].to_numpy()

    ## Model Training
    # Random Forest
    rf = RandomForestClassifier(max_depth=40, max_samples=1.0, min_samples_split=5, random_state=0)
    rf.fit(train_feature, train_label)

    ## Prediction on the full test set
    rf_result = rf.predict(feature)
    rf_result_df = pd.DataFrame(rf_result)
    rf_result_df.to_csv(rf_prediction_path, index=True, header=False)
    rf_result = torch.from_numpy(rf_result)

    ## RF_acc
    rf_acc = acc_calculation(rf_result, label)
    print(f'Random Forest ACC: {rf_acc}%')
    rf_record.append(['Random Forest', training_size, rf_acc])

    ## RF confusion matrix (row-normalized); the CSV keeps the unrounded values
    rf_con = confusion_matrix(label, rf_result, normalize='true')
    rf_con_df = pd.DataFrame(rf_con)
    rf_con_df.to_csv(rf_conf_csv_path, index=False, header=False)

    rf_con = np.around(rf_con, 2)
    rf_disp = ConfusionMatrixDisplay(confusion_matrix=rf_con, display_labels=xlabel_General)
    # Pass the axes explicitly: plot() would otherwise open its own figure and
    # leave an empty one behind on every iteration.
    rf_fig, rf_ax = plt.subplots()
    rf_disp.plot(ax=rf_ax, cmap='gist_yarg', colorbar=False)
    plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
    # Saved next to the CSV in output_dir, i.e. where the database row points.
    rf_fig.savefig(rf_conf_png_path, dpi=300, transparent=True, bbox_inches='tight')

    ## Record this run in the database (all values sent as text)
    cursor.execute(sql_command, (
        'RF', '1', '1',
        str(train_set_id), str(train_set_path), str(training_size),
        str(test_set_id), str(test_set_path), str(test_set_size),
        rf_prediction_path, str(rf_acc),
        rf_conf_csv_path, rf_conf_png_path,
    ))
    mydb.commit()

rf_record_df = pd.DataFrame(rf_record, columns=['method', 'training_size', 'accuracy'])
rf_record_df.to_csv(output_dir + 'Nor_RF_ChangeSize_accuracy.csv', index=False, header=True)

## SVM: Training size difference

In [None]:
# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


df = pd.read_csv(train_set_path, header=0)   ## input_dir + 'SERS_Nor_training.csv'
df2 = pd.read_csv(test_set_path, header=0)   ## input_dir + 'SERS_Nor_testing.csv'

# The test set is the same for every training size, so prepare it once.
# Column 0 holds the class label, the remaining columns the features.
feature = df2.iloc[:, 1:].to_numpy()
label = torch.from_numpy(df2.iloc[:, 0].to_numpy())

plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 300

# Parameterized INSERT: values are escaped by the driver instead of being
# pasted into the SQL text.
sql_command = """INSERT INTO `Machine_Learning`
                        (`method`,`ML_method_id`, `method_parameters_id`,
                        `train_set_id`,`train_set_path`, `train_size`,
                        `test_set_id` , `test_set_path`, `test_size`,
                        `prediction_path`, `accuracy`,
                        `confusion_matrix_path`, `confusion_matrix_figure` )
                VALUES (%s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s);"""

svm_record = []

for training_size in training_sizes:
    print('training size is:' + str(training_size))

    # Output files of this run; the very same paths are recorded in the database.
    svm_prediction_path = output_dir + 'Nor_SVM_%04d_prediction.csv' % training_size
    svm_conf_csv_path = output_dir + 'Nor_SVM_%04d_conf.csv' % training_size
    svm_conf_png_path = output_dir + 'Nor_SVM_%04d_conf.png' % training_size

    ## Training: use the first `training_size` rows of the training set.
    train_feature = df.iloc[:training_size, 1:].to_numpy()
    train_label = df.iloc[:training_size, 0].to_numpy()

    ## Model Training
    ## SVM
    svm = SVC(C=10)
    svm.fit(train_feature, train_label)

    ## Prediction on the full test set
    svm_result = svm.predict(feature)
    svm_result_df = pd.DataFrame(svm_result)
    svm_result_df.to_csv(svm_prediction_path, index=True, header=False)
    svm_result = torch.from_numpy(svm_result)

    ## SVM accuracy
    svm_acc = acc_calculation(svm_result, label)
    print(f'SVM ACC: {svm_acc}%')
    svm_record.append(['SVM', training_size, svm_acc])

    ## SVM confusion matrix (row-normalized); the CSV keeps the unrounded values
    svm_con = confusion_matrix(label, svm_result, normalize='true')
    svm_con_df = pd.DataFrame(svm_con)
    svm_con_df.to_csv(svm_conf_csv_path, index=False, header=False)

    svm_con = np.around(svm_con, 2)
    svm_disp = ConfusionMatrixDisplay(confusion_matrix=svm_con, display_labels=xlabel_General)
    # Pass the axes explicitly: plot() would otherwise open its own figure and
    # leave an empty one behind on every iteration.
    svm_fig, svm_ax = plt.subplots()
    svm_disp.plot(ax=svm_ax, cmap='gist_yarg', colorbar=False)
    plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
    # Saved next to the CSV in output_dir, i.e. where the database row points.
    svm_fig.savefig(svm_conf_png_path, dpi=300, transparent=True, bbox_inches='tight')

    ## Record this run in the database (all values sent as text)
    cursor.execute(sql_command, (
        'SVM', '2', '1',
        str(train_set_id), str(train_set_path), str(training_size),
        str(test_set_id), str(test_set_path), str(test_set_size),
        svm_prediction_path, str(svm_acc),
        svm_conf_csv_path, svm_conf_png_path,
    ))
    mydb.commit()

svm_record_df = pd.DataFrame(svm_record, columns=['method', 'training_size', 'accuracy'])
svm_record_df.to_csv(output_dir + 'Nor_SVM_ChangeSize_accuracy.csv', index=False, header=True)


## KNN : Training size difference

In [None]:
# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


df = pd.read_csv(train_set_path, header=0)   ## input_dir + 'SERS_Nor_training.csv'
df2 = pd.read_csv(test_set_path, header=0)   ## input_dir + 'SERS_Nor_testing.csv'

# The test set is the same for every training size, so prepare it once.
# Column 0 holds the class label, the remaining columns the features.
feature = df2.iloc[:, 1:].to_numpy()
label = torch.from_numpy(df2.iloc[:, 0].to_numpy())

plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 300

# Parameterized INSERT: values are escaped by the driver instead of being
# pasted into the SQL text.
sql_command = """INSERT INTO `Machine_Learning`
                        (`method`,`ML_method_id`, `method_parameters_id`,
                        `train_set_id`,`train_set_path`, `train_size`,
                        `test_set_id` , `test_set_path`, `test_size`,
                        `prediction_path`, `accuracy`,
                        `confusion_matrix_path`, `confusion_matrix_figure` )
                VALUES (%s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s);"""

knn_record = []

for training_size in training_sizes:
    print('training size is:' + str(training_size))

    # Output files of this run; the very same paths are recorded in the database.
    knn_prediction_path = output_dir + 'Nor_KNN_%04d_prediction.csv' % training_size
    knn_conf_csv_path = output_dir + 'Nor_KNN_%04d_conf.csv' % training_size
    knn_conf_png_path = output_dir + 'Nor_KNN_%04d_conf.png' % training_size

    ## Training: use the first `training_size` rows of the training set.
    train_feature = df.iloc[:training_size, 1:].to_numpy()
    train_label = df.iloc[:training_size, 0].to_numpy()

    ## Model Training
    ## KNN
    knn = KNeighborsClassifier(algorithm='brute', n_neighbors=10, weights='distance')
    knn.fit(train_feature, train_label)

    ## KNN Result (prediction on the full test set)
    knn_result = knn.predict(feature)
    knn_result_df = pd.DataFrame(knn_result)
    knn_result_df.to_csv(knn_prediction_path, index=True, header=False)
    knn_result = torch.from_numpy(knn_result)

    ## KNN Accuracy
    knn_acc = acc_calculation(knn_result, label)
    print(f'KNN ACC: {knn_acc}%')
    knn_record.append(['KNN', training_size, knn_acc])

    ## KNN Confusion Matrix (row-normalized); the CSV keeps the unrounded values
    knn_con = confusion_matrix(label, knn_result, normalize='true')
    knn_con_df = pd.DataFrame(knn_con)
    knn_con_df.to_csv(knn_conf_csv_path, index=False, header=False)

    knn_con = np.around(knn_con, 2)
    knn_disp = ConfusionMatrixDisplay(confusion_matrix=knn_con, display_labels=xlabel_General)
    # Pass the axes explicitly: plot() would otherwise open its own figure and
    # leave an empty one behind on every iteration.
    knn_fig, knn_ax = plt.subplots()
    knn_disp.plot(ax=knn_ax, cmap='gist_yarg', colorbar=False)
    plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
    # Saved next to the CSV in output_dir, i.e. where the database row points.
    knn_fig.savefig(knn_conf_png_path, dpi=300, transparent=True, bbox_inches='tight')

    ## Record this run in the database (all values sent as text)
    cursor.execute(sql_command, (
        'KNN', '3', '1',
        str(train_set_id), str(train_set_path), str(training_size),
        str(test_set_id), str(test_set_path), str(test_set_size),
        knn_prediction_path, str(knn_acc),
        knn_conf_csv_path, knn_conf_png_path,
    ))
    mydb.commit()

knn_record_df = pd.DataFrame(knn_record, columns=['method', 'training_size', 'accuracy'])
knn_record_df.to_csv(output_dir + 'Nor_KNN_ChangeSize_accuracy.csv', index=False, header=True)

    

## CNN: Training size difference

In [None]:
# ensure database connection
cursor.close()
mydb.reconnect()
cursor = mydb.cursor(buffered=True)
cursor.execute("USE `SERS_ML_TEST`;")


#Model (defined once; a fresh instance is trained for every training size)
class Net(nn.Module):
    """1-D CNN that maps one flat feature vector to 5 class logits."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 6, 3, 2)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.bn1 = nn.BatchNorm1d(6)
        self.conv2 = nn.Conv1d(6, 16, 3, 2)
        self.bn2 = nn.BatchNorm1d(16)
        # 2288 = 16 channels x 143 positions left after the conv/pool stack,
        # so the network only accepts inputs whose length reduces to 143.
        self.fc1 = nn.Linear(2288, 280)
        self.fc2 = nn.Linear(280, 14)
        self.fc3 = nn.Linear(14, 5)

    def forward(self, x):
        # Treat every sample as a single-channel 1-D signal: (batch, 1, length).
        x = x.reshape((x.shape[0],1,-1))
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = F.relu(x)
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Hyperparameters (identical for every training size)
Epoch = 200
BATCH_SIZE = 200
learning_rate = 0.0001
wd=0.00001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 300

# Parameterized INSERT: values are escaped by the driver instead of being
# pasted into the SQL text.
sql_command = """INSERT INTO `Machine_Learning`
                        (`method`,`ML_method_id`, `method_parameters_id`,
                        `train_set_id`,`train_set_path`, `train_size`,
                        `test_set_id` , `test_set_path`, `test_size`,
                        `prediction_path`, `accuracy`,
                        `confusion_matrix_path`, `confusion_matrix_figure` )
                VALUES (%s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s);"""

cnn_record = []

for training_size in training_sizes:
    print('training size is:' + str(training_size))

    # Output files of this run; the very same paths are recorded in the database.
    cnn_prediction_path = output_dir + 'Nor_CNN_%04d_prediction.csv' % training_size
    cnn_conf_csv_path = output_dir + 'Nor_CNN_%04d_conf.csv' % training_size
    cnn_conf_png_path = output_dir + 'Nor_CNN_%04d_conf.png' % training_size

    ## Training data: shuffle the rows with a fixed seed, keep the first
    ## `training_size` of them. Column 0 is the label, the rest are features.
    df = pd.read_csv(train_set_path , header = 0)  ## input_dir + 'SERS_Nor_training.csv'
    combine = df.to_numpy()
    np.random.seed(8787)
    np.random.shuffle(combine)
    feature_train = combine[:training_size, 1:]
    label_train =  combine[:training_size, 0]
    train_size = int(feature_train.shape[0] * 1)

    ## Test data: the whole test set, shuffled with the same seed.
    df_test = pd.read_csv(test_set_path, header = 0) ## input_dir + 'SERS_Nor_testing.csv'
    combine = df_test.to_numpy()
    np.random.seed(8787)
    np.random.shuffle(combine)
    feature_test = combine[:, 1:]
    label_test =  combine[:, 0]
    test_size = int(feature_test.shape[0] * 1)

    ## For conf: ground truth in the same (shuffled) order as the predictions
    label = torch.from_numpy(label_test)

    feature_train = torch.from_numpy(feature_train)
    label_train = torch.from_numpy(label_train)
    feature_test = torch.from_numpy(feature_test)
    label_test = torch.from_numpy(label_test)

    train_dataset = Data.TensorDataset(feature_train, label_train)
    test_dataset = Data.TensorDataset(feature_test, label_test)
    train_loader = Data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle = True)
    test_loader = Data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle = True)

    #Model Setting
    cnn_model = Net().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(cnn_model.parameters(), lr=learning_rate, weight_decay=wd)

    #Model Training
    # NOTE(review): weights and learning curve are written to input_dir, not
    # output_dir - kept as-is, confirm this is intended.
    cnn_model_path = input_dir + 'SERS_Nor_CNN_%(number)04d.pth' %{'number': training_size}

    accuracy_record = {'train': [], 'test': []}
    loss_record = {'train': [], 'test': []}

    initial_time = time()

    for epoch in range(Epoch):  # loop over the dataset multiple times
        train_acc = 0.0
        train_loss = 0.0
        test_acc = 0.0
        test_loss = 0.0

        cnn_model.train()
        for inputs, labels in train_loader:
            inputs = inputs.type(torch.FloatTensor)
            labels = labels.type(torch.LongTensor)
            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = cnn_model(inputs)
            loss =  criterion(outputs, labels)
            _, train_pred = torch.max(outputs, 1) # index of the largest logit = predicted class
            loss.backward()
            optimizer.step()
            train_acc += (train_pred.cpu() == labels.cpu()).sum().item()
            train_loss += loss.item()

        # Accuracy is averaged per sample, loss per batch.
        accuracy_record['train'].append(train_acc/len(train_dataset))
        loss_record['train'].append(train_loss/len(train_loader))

        cnn_model.eval() # set the model to evaluation mode
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs = inputs.type(torch.FloatTensor)
                labels = labels.type(torch.LongTensor)
                inputs, labels = inputs.to(device), labels.to(device)

                # forward pass only
                outputs = cnn_model(inputs)
                loss =  criterion(outputs, labels)
                _, test_pred = torch.max(outputs, 1) # index of the largest logit = predicted class
                test_acc += (test_pred.cpu() == labels.cpu()).sum().item()
                test_loss += loss.item()

            accuracy_record['test'].append(test_acc/len(test_dataset))
            loss_record['test'].append(test_loss/len(test_loader))

    torch.save(cnn_model.state_dict(), cnn_model_path)

    print('Finished Training (02)')
    print('Training time', time() - initial_time)

    # Learning curve: per-epoch accuracy and loss side by side.
    # NOTE(review): both frames use the columns 'train'/'test', so the CSV
    # header contains each name twice (accuracy first, then loss).
    acc_pd = pd.DataFrame.from_dict(accuracy_record)
    loss_pd = pd.DataFrame.from_dict(loss_record)
    # Not bound to `df` any more: that silently replaced the training frame.
    lc_pd = pd.concat([acc_pd,loss_pd], axis=1)
    lc_filename =  input_dir + 'SERS_Nor_CNN_%(number)04d_learnCurve.csv'%{'number': training_size}
    lc_pd.to_csv(lc_filename, index=True)

    #######################################################################
    ## Prediction: the whole test set in one batch, in dataset order, so the
    ## predictions line up with `label`.
    batch_num = feature_test.shape[0]
    test_dataset = Data.TensorDataset(feature_test, label_test)
    test_loader = Data.DataLoader(test_dataset, batch_size=batch_num)

    cnn_model.eval()
    with torch.no_grad():
        for data_test in test_loader:
            features, labels = data_test
            features = features.type(torch.FloatTensor)
            labels = labels.type(torch.LongTensor)
            features = features.reshape((features.shape[0],1, 1, -1))
            features = features.to(device)
            labels = labels.to(device)
            # calculate outputs by running the samples through the network
            outputs = cnn_model(features)
            # the class with the highest logit is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            cnn_result = predicted.to('cpu')

    cnn_result = cnn_result.cpu().numpy()
    cnn_result_df = pd.DataFrame(cnn_result)
    cnn_result_df.to_csv(cnn_prediction_path, index= True , header = False)

    cnn_result = torch.from_numpy(cnn_result)

    ## CNN accuracy
    cnn_acc = acc_calculation(cnn_result, label)
    print(f'CNN ACC: {cnn_acc}%')
    cnn_record.append(['CNN', training_size , cnn_acc])

    ## CNN Confusion Matrix (row-normalized); the CSV keeps the unrounded values
    cnn_con = confusion_matrix(label, cnn_result,normalize='true')
    cnn_con_df = pd.DataFrame(cnn_con)
    cnn_con_df.to_csv(cnn_conf_csv_path, index= False , header = False)

    cnn_con = np.around(cnn_con, 2)
    cnn_disp = ConfusionMatrixDisplay(confusion_matrix=cnn_con, display_labels= xlabel_General)
    # Pass the axes explicitly: plot() would otherwise open its own figure and
    # leave an empty one behind on every iteration.
    cnn_fig, cnn_ax = plt.subplots()
    cnn_disp.plot(ax=cnn_ax, cmap ='gist_yarg', colorbar=False)
    plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
    # Saved next to the CSV in output_dir, i.e. where the database row points.
    cnn_fig.savefig(cnn_conf_png_path, dpi=300, transparent=True, bbox_inches='tight')

    ## Record this run in the database (all values sent as text)
    cursor.execute(sql_command, (
        'CNN', '4', '1',
        str(train_set_id), str(train_set_path), str(training_size),
        str(test_set_id), str(test_set_path), str(test_set_size),
        cnn_prediction_path, str(cnn_acc),
        cnn_conf_csv_path, cnn_conf_png_path,
    ))
    mydb.commit()

cnn_record_df = pd.DataFrame(cnn_record, columns=['method', 'training_size', 'accuracy'])
cnn_record_df.to_csv(output_dir + 'Nor_CNN_ChangeSize_accuracy.csv', index= False , header = True)


