## Dimensionality Reduction Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn import manifold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import time


In [None]:
# Ask for the folder that holds the SERS CSV files. Every later cell builds its
# paths as `input_dir + '<file name>'`, so a non-empty answer must end with a
# path separator. An empty answer is kept as-is (paths relative to the current
# working directory).
input_dir = input("Please enter the directory: \n").strip()
if input_dir and not input_dir.endswith(('/', '\\')):
    input_dir += '/'

### PCA

In [None]:
# Load the labelled spectra and order the rows by class so the classes are
# drawn, and listed in the legend, in ascending label order.
df = pd.read_csv(input_dir + 'Labeled_SERS_dataset.csv', header = 0)
df = df.sort_values(by = 'Label')


plt.rcParams['font.size'] = 6
plt.rcParams['figure.dpi'] = 300
feature = df.loc[:, 400.0:1550.0]
# Integer class id of every row, read from the 'Label' column (the column the
# rows are sorted by) rather than from the hard-coded column position 2301.
class_ids = df['Label'].to_numpy().astype(int)
# Per-class display name, colour and marker, indexed by the integer class id.
xlabel_General = np.array([ 'BW25113','BW25113_Uninhibited','BW25113_Inhibited','DH5\u03B1', 'DH5\u03B1_Uninhibited', 'DH5\u03B1_Inhibited', 'DH5\u03B1(ampR)_Uninhibited'])
x_General = np.array([0, 1, 2, 3, 4, 5, 6])
color_map = [ 'powderblue', 'steelblue', 'navy', '#F6BE00','firebrick', 'maroon',  'olivedrab']
marker_list = ['v', 'v', 'v',  'o', '8', '8', 'p', ]


# Three components are fitted; only the first two are plotted below.
pca = PCA(n_components=3)
pca_feature = pca.fit_transform(feature)

plt.figure()
# One scatter call per class instead of one per spectrum: every class present
# in the data gets exactly one legend entry, even when some class ids are
# missing or the ids do not start at 0.
for class_id in np.unique(class_ids):
    in_class = class_ids == class_id
    plt.scatter(
        pca_feature[in_class, 0], pca_feature[in_class, 1],
        color = color_map[class_id], label = xlabel_General[class_id],
        alpha = 0.5, s = 7, marker = marker_list[class_id],
    )

plt.legend(bbox_to_anchor=(1.1, 1), loc='upper left')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.axis('square')
plt.savefig(input_dir + 'Labeled_SERS_PCA.png', dpi=300, transparent=True, bbox_inches='tight')


### t-SNE

In [None]:
# Load the labelled spectra and order the rows by class so the classes are
# drawn, and listed in the legend, in ascending label order.
df = pd.read_csv(input_dir + 'Labeled_SERS_dataset.csv', header = 0)
df = df.sort_values(by = 'Label')


plt.rcParams['font.size'] = 6
plt.rcParams['figure.dpi'] = 300
feature = df.loc[:, 400.0:1550.0]
# Integer class id of every row, read from the 'Label' column (the column the
# rows are sorted by) rather than from the hard-coded column position 2301.
class_ids = df['Label'].to_numpy().astype(int)
# Per-class display name, colour and marker, indexed by the integer class id.
xlabel_General = np.array([ 'BW25113','BW25113_Uninhibited','BW25113_Inhibited','DH5\u03B1', 'DH5\u03B1_Uninhibited', 'DH5\u03B1_Inhibited', 'DH5\u03B1(ampR)_Uninhibited'])
x_General = np.array([0, 1, 2, 3, 4, 5, 6])
color_map = [ 'powderblue', 'steelblue', 'navy', '#F6BE00','firebrick', 'maroon',  'olivedrab']
marker_list = ['v', 'v', 'v',  'o', '8', '8', 'p', ]


# t-SNE with random initialisation is stochastic; random_state pins the
# embedding so that re-running the cell reproduces the same figure.
tsne = manifold.TSNE(n_components=2, init='random', learning_rate=200, perplexity = 50, random_state=0).fit_transform(feature)
# Min-max rescale each embedding axis to [0, 1].
x_min, x_max = tsne.min(0), tsne.max(0)
tsne = (tsne - x_min) / (x_max - x_min)

plt.figure()
# One scatter call per class instead of one per spectrum: every class present
# in the data gets exactly one legend entry, even when some class ids are
# missing or the ids do not start at 0.
for class_id in np.unique(class_ids):
    in_class = class_ids == class_id
    plt.scatter(
        tsne[in_class, 0], tsne[in_class, 1],
        color = color_map[class_id], label = xlabel_General[class_id],
        alpha = 0.5, s = 10, marker = marker_list[class_id],
    )


plt.legend(bbox_to_anchor=(1.1, 1), loc='upper left')
plt.xlabel('T-SNE1')
plt.ylabel('T-SNE2')
plt.axis('square')
plt.savefig(input_dir + 'Labeled_SERS_t-SNE.png', dpi=300, transparent=True, bbox_inches='tight')


# Supervised Machine Learning Classification

### ML training: RF, SVM, KNN

In [None]:
# Training table: the spectral columns are the model inputs, 'Label' the target.
df = pd.read_csv(input_dir + 'SERS_training.csv', header=0)
train_label = df['Label'].to_numpy()
feature = df.loc[:, 400.0:1550.0]

# Build the three classical classifiers first ...
rf = RandomForestClassifier(
    random_state=0,
    max_depth=40,
    min_samples_split=5,
    max_samples=1.0,
)
svm = SVC(kernel='linear', C=10)
knn = KNeighborsClassifier(n_neighbors=10, weights='distance', algorithm='brute')

# ... then fit each of them on the same features and labels.
rf.fit(feature, train_label)
svm.fit(feature, train_label)
knn.fit(feature, train_label)

### ML Training: CNN

In [None]:
# ---- CNN training hyper-parameters ----
Epoch, BATCH_SIZE = 200, 200
learning_rate, wd = 1e-4, 1e-5   # Adam step size and weight decay


# ---- Training set ----
# Take every column from 400.0 through 'Label' as one array, shuffle the rows
# with a fixed NumPy seed, then split off the last column as the target.
df = pd.read_csv(input_dir + 'SERS_training.csv', header=0)
combine = df.loc[:, 400.0:'Label'].to_numpy()
np.random.seed(8787)
np.random.shuffle(combine)
feature, label = combine[:, :-1], combine[:, -1]
feature_train, label_train = combine[:, :-1], combine[:, -1]
train_size = feature.shape[0]


# ---- Test set (same column slice, same seed, same split) ----
df_test = pd.read_csv(input_dir + 'SERS_testing.csv', header=0)
combine = df_test.loc[:, 400.0:'Label'].to_numpy()
np.random.seed(8787)
np.random.shuffle(combine)
feature_test, label_test = combine[:, :-1], combine[:, -1]
test_size = feature_test.shape[0]



#Model
class Net(nn.Module):
    """1-D CNN classifier: two conv / batch-norm / ReLU / max-pool stages, then three linear layers.

    Args:
        input_length: number of points in one input spectrum. The default
            (2301) gives 2288 flattened features going into ``fc1``.
        num_classes: number of output logits produced by ``fc3`` (default 7).

    Raises:
        ValueError: if ``input_length`` is too short to survive both
            conv/pool stages.
    """

    def __init__(self, input_length=2301, num_classes=7):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 6, 3, 2)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.bn1 = nn.BatchNorm1d(6)
        self.conv2 = nn.Conv1d(6, 16, 3, 2)
        self.bn2 = nn.BatchNorm1d(16)
        # 16 output channels of conv2 times the length left after both stages.
        self.fc1 = nn.Linear(16 * self._pooled_length(input_length), 280)
        self.fc2 = nn.Linear(280, 14)
        self.fc3 = nn.Linear(14, num_classes)

    @staticmethod
    def _pooled_length(input_length):
        """Return the sequence length remaining after the two conv+pool stages."""
        length = int(input_length)
        for _ in range(2):
            length = (length - 3) // 2 + 1   # Conv1d: kernel 3, stride 2, no padding
            length = length // 2             # MaxPool1d: kernel 2 (stride defaults to 2)
        if length < 1:
            raise ValueError(
                f'input_length={input_length} is too short for two conv/pool stages'
            )
        return length

    def forward(self, x):
        """Map a batch of spectra to raw class logits of shape (batch, num_classes)."""
        # Collapse everything after the batch axis into one single-channel sequence.
        x = x.reshape((x.shape[0], 1, -1))
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        # The pooled output of a ReLU is already non-negative, so no further
        # ReLU is applied before flattening.
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Wrap the NumPy splits as tensors and pair features with labels.
feature_train, label_train = torch.from_numpy(feature_train), torch.from_numpy(label_train)
feature_test, label_test = torch.from_numpy(feature_test), torch.from_numpy(label_test)

train_dataset = Data.TensorDataset(feature_train, label_train)
test_dataset = Data.TensorDataset(feature_test, label_test)

# Both loaders reshuffle on every pass and use the same batch size.
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = Data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)



#Model Setting
# Use the GPU when one is available (and say so), otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("GPU run")
else:
    device = torch.device('cpu')

cnn_model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    cnn_model.parameters(),
    lr=learning_rate,
    weight_decay=wd,
)


#Model Training
cnn_model_path = input_dir + 'SERS_CNN.pth'


# Per-epoch learning-curve history: one value per epoch for each split.
accuracy_record = {'train': [], 'test': []} 
loss_record = {'train': [], 'test': []} 
best_train_acc = 0.0
best_train_loss = 0.0


# `time` is imported as a module (`import time`), so the wall clock is read
# with time.time(); calling the module itself raises TypeError.
initial_time = time.time()

for epoch in range(Epoch):  # loop over the dataset multiple times
    # Running totals for this epoch: correct-prediction counts and summed batch losses.
    train_acc = 0.0
    train_loss = 0.0
    test_acc = 0.0
    test_loss = 0.0

    cnn_model.train()
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # Cast to float32 inputs / int64 targets before moving to the device.
        inputs = inputs.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.LongTensor).to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = cnn_model(inputs)
        loss =  criterion(outputs, labels)
        _, train_pred = torch.max(outputs, 1) # index of the largest logit = predicted class
        loss.backward()
        optimizer.step()
        train_acc += (train_pred.cpu() == labels.cpu()).sum().item()
        train_loss += loss.item()


    # Accuracy is averaged over samples, loss over batches.
    accuracy_record['train'].append(train_acc/len(train_dataset))
    loss_record['train'].append(train_loss/len(train_loader))    
    if (epoch + 1) % 10 == 0 or epoch == 0:    # report on the first epoch and every 10th epoch
        print(f'{epoch + 1}, train_loss: {train_loss /len(train_loader)}, train_acc: {train_acc/len(train_dataset)}')

    # NOTE(review): this only logs a new best training accuracy; despite the
    # "[Save]" tag nothing is written here. The weights are saved once, after
    # the last epoch. Confirm whether a best-epoch checkpoint was intended.
    if train_acc > best_train_acc:
        best_train_acc = train_acc
        print('[Save]-- [{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f}'.format(
            epoch + 1, Epoch, train_acc/len(train_dataset), train_loss/len(train_loader)
            ))


    cnn_model.eval() # set the model to evaluation mode
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs = inputs.type(torch.FloatTensor).to(device)
            labels = labels.type(torch.LongTensor).to(device)

            # forward pass only (no gradients, no optimizer step)
            outputs = cnn_model(inputs)
            loss =  criterion(outputs, labels)
            _, test_pred = torch.max(outputs, 1) # index of the largest logit = predicted class
            test_acc += (test_pred.cpu() == labels.cpu()).sum().item()
            test_loss += loss.item()

        accuracy_record['test'].append(test_acc/len(test_dataset))
        loss_record['test'].append(test_loss/len(test_loader))



# Persist the weights as they are after the final epoch.
torch.save(cnn_model.state_dict(), cnn_model_path)

print('Finished Training (02)')
print('Training time', time.time() - initial_time)


# Write the learning curves to CSV: accuracy columns first, then loss columns.
# NOTE(review): both frames use the column names 'train' and 'test', so the
# CSV header contains each name twice; the `df` alias is kept as in the
# original cell.
acc_pd = pd.DataFrame.from_dict(accuracy_record)
loss_pd = pd.DataFrame.from_dict(loss_record)
lc_pd = df = pd.concat([acc_pd,loss_pd], axis=1)
lc_filename =  input_dir + 'SERS_CNN_learnCurve.csv'
lc_pd.to_csv(lc_filename, index=True)



### ML prediction: RF, SVM, KNN

In [None]:
# Load the held-out spectra and order them by class label.
df = pd.read_csv(input_dir + 'SERS_testing.csv', header = 0)
df = df.sort_values(by = 'Label')
feature = df.loc[:, 400.0:1550.0]


# Ground-truth labels, in the same (sorted) row order as `feature`.
label = torch.from_numpy(df['Label'].to_numpy())

#Random Forest
rf_result = torch.from_numpy(rf.predict(feature))

#SVM
svm_result = torch.from_numpy(svm.predict(feature))

#KNN
knn_result = torch.from_numpy(knn.predict(feature))

#CNN
# Use stored CNN predictions only when the CSV actually carries a 'Prediction'
# column; otherwise `cnn_result` is left to the CNN prediction cell, instead of
# this cell failing with a KeyError.
if 'Prediction' in df.columns:
    cnn_result = torch.from_numpy(df['Prediction'].to_numpy())

### ML prediction: CNN

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data

df = pd.read_csv(input_dir + 'SERS_testing.csv', header = 0)
# Sort by label, exactly like the RF/SVM/KNN prediction cell does. This cell
# overwrites the global `label` that the results cells compare every model
# against, so it must be in the same sorted order as those predictions.
df = df.sort_values(by = 'Label')
feature = torch.from_numpy(df.loc[:, 400.0:1550.0].to_numpy())
label = torch.from_numpy(df['Label'].to_numpy())


#Data_Loader
# A single batch holding the whole test set.
batch_num = feature.shape[0]
dataset = Data.TensorDataset(feature, label)
test_loader = Data.DataLoader(dataset, batch_size=batch_num)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


#Model
class Net(nn.Module):
    """1-D CNN classifier: two conv / batch-norm / ReLU / max-pool stages, then three linear layers.

    The layer names and default sizes must match the network whose weights
    were saved to 'SERS_CNN.pth', because the state dict is loaded by key.

    Args:
        input_length: number of points in one input spectrum. The default
            (2301) gives 2288 flattened features going into ``fc1``.
        num_classes: number of output logits produced by ``fc3`` (default 7).

    Raises:
        ValueError: if ``input_length`` is too short to survive both
            conv/pool stages.
    """

    def __init__(self, input_length=2301, num_classes=7):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 6, 3, 2)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.bn1 = nn.BatchNorm1d(6)
        self.conv2 = nn.Conv1d(6, 16, 3, 2)
        self.bn2 = nn.BatchNorm1d(16)
        # 16 output channels of conv2 times the length left after both stages.
        self.fc1 = nn.Linear(16 * self._pooled_length(input_length), 280)
        self.fc2 = nn.Linear(280, 14)
        self.fc3 = nn.Linear(14, num_classes)

    @staticmethod
    def _pooled_length(input_length):
        """Return the sequence length remaining after the two conv+pool stages."""
        length = int(input_length)
        for _ in range(2):
            length = (length - 3) // 2 + 1   # Conv1d: kernel 3, stride 2, no padding
            length = length // 2             # MaxPool1d: kernel 2 (stride defaults to 2)
        if length < 1:
            raise ValueError(
                f'input_length={input_length} is too short for two conv/pool stages'
            )
        return length

    def forward(self, x):
        """Map a batch of spectra to raw class logits of shape (batch, num_classes)."""
        # Collapse everything after the batch axis into one single-channel sequence.
        x = x.reshape((x.shape[0], 1, -1))
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        # The pooled output of a ReLU is already non-negative, so no further
        # ReLU is applied before flattening.
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


model = Net().to(device)
# map_location lets weights that were saved from a GPU run be restored on a
# CPU-only machine (and vice versa).
model.load_state_dict(torch.load(input_dir + 'SERS_CNN.pth', map_location=device))

model.eval()

pred_acc = 0.0          # running count of correct predictions
batch_predictions = []  # predicted class ids, one CPU tensor per batch

with torch.no_grad():
    for features, labels in test_loader:
        # Cast to float32 inputs / int64 targets before moving to the device.
        # No extra reshape is needed: Net.forward reshapes its input itself.
        features = features.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.LongTensor).to(device)
        outputs = model(features)
        # the class with the largest logit is the prediction
        _, predicted = torch.max(outputs, 1)
        pred_acc += (predicted.cpu() == labels.cpu()).sum().item()
        # Collect every batch rather than keeping only the last one, so the
        # result stays complete if the loader ever yields more than one batch.
        batch_predictions.append(predicted.cpu())


# Predicted class ids for the whole test set, as a NumPy array.
cnn_result = torch.cat(batch_predictions).numpy()



### ML prediction results

In [None]:
def acc_calculation(predict, label):
    """Return the percentage of predictions that equal their labels.

    Both arguments may be torch tensors, NumPy arrays or plain sequences, in
    any combination; they are converted to NumPy arrays before comparing.

    Args:
        predict: predicted class ids, one per sample.
        label: ground-truth class ids, same length as ``predict``.

    Returns:
        Accuracy in percent, rounded to two decimals.

    Raises:
        ValueError: if ``predict`` and ``label`` have different shapes.
    """
    def _as_array(values):
        # Torch tensors are detached and moved to the CPU before conversion.
        if hasattr(values, 'detach'):
            values = values.detach().cpu().numpy()
        return np.asarray(values)

    predict_arr = _as_array(predict)
    label_arr = _as_array(label)
    # Guard against silent broadcasting between vectors of different length.
    if predict_arr.shape != label_arr.shape:
        raise ValueError(
            f'predict has shape {predict_arr.shape} but label has shape {label_arr.shape}'
        )
    correct = int((predict_arr == label_arr).sum())
    total = label_arr.shape[0]
    return round(100 * correct / total, 2)

In [None]:

# Score RF, SVM, KNN and CNN (in that order) against the same `label` vector.
rf_acc, svm_acc, knn_acc, cnn_acc = (
    acc_calculation(result, label)
    for result in (rf_result, svm_result, knn_result, cnn_result)
)


print(f'Random Forest ACC: {rf_acc}% \n SVM ACC: {svm_acc}% \n KNN ACC: {knn_acc}% \n CNN ACC: {cnn_acc}%')

In [None]:
plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 300


# Per-class display names, indexed by the integer class id in x_General.
xlabel_General = np.array([ 'BW25113','BW25113_Uninhibited','BW25113_Inhibited','DH5\u03B1', 'DH5\u03B1(WT)_Uninhibited', 'DH5\u03B1(WT)_Inhibited', 'DH5\u03B1(ampR)_Uninhibited'])
x_General = np.array([0, 1, 2, 3, 4, 5, 6])
color_map = [ 'powderblue', 'steelblue', 'navy', '#F6BE00','firebrick', 'maroon',  'olivedrab']
marker_list = ['v', 'v', 'v',  'o', '8', '8', 'p' ]


def _save_confusion_matrix(true_label, prediction, file_name):
    """Plot and save one row-normalised confusion matrix.

    Args:
        true_label: ground-truth class ids.
        prediction: predicted class ids, aligned with ``true_label``.
        file_name: PNG file name, written inside ``input_dir``.

    Returns:
        ``(matrix, display)``: the matrix rounded to two decimals and the
        ``ConfusionMatrixDisplay`` that drew it.
    """
    # labels=x_General pins the matrix to one row/column per entry of
    # xlabel_General, even if a class is absent from this test set.
    matrix = confusion_matrix(true_label, prediction, labels=x_General, normalize='true')
    matrix = np.around(matrix, 2)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=xlabel_General)
    # Draw on an explicit axes: without `ax`, plot() opens its own new figure,
    # so a separately created plt.figure(N) would just stay empty.
    fig, ax = plt.subplots()
    display.plot(ax=ax, cmap='gist_yarg', colorbar=False)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')
    fig.savefig(input_dir + file_name, dpi=300, transparent=True, bbox_inches='tight')
    return matrix, display


#RF
rf_con, rf_disp = _save_confusion_matrix(label, rf_result, 'RF_conf.png')

#SVM
svm_con, svm_disp = _save_confusion_matrix(label, svm_result, 'SVM_conf.png')

#KNN
knn_con, knn_disp = _save_confusion_matrix(label, knn_result, 'KNN_conf.png')

#CNN
cnn_con, cnn_disp = _save_confusion_matrix(label, cnn_result, 'CNN_conf.png')
