In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# CSV holding the sensor-array responses; read below for the feature
# columns and the 'Labels' column
data_file_path = "../TrainingData/SensorArrayProteinResponseMatrixPairwiseAveraging.csv"

# Feature columns are every combination of:
#   - current density used to fabricate sensors, correlated with pore size
#     {55mAcm^-2, 40mAcm^-2, 25mAcm^-2}
#   - pH of buffer used to make protein solutions {pH4, pH10}
# ordered by current density first, then by pH.
features = [
    f'{current_density}mAcm^-2pH{buffer_ph}'
    for current_density in (55, 40, 25)
    for buffer_ph in (4, 10)
]

# read the dataset, taking the first row of the file as the column names
df = pd.read_csv(data_file_path, header=0)

# feature matrix (one row per example, one column per entry of `features`)
x = df[features].to_numpy()
# labels kept as a single-column 2D array, as selected by the column list
y = df[['Labels']].to_numpy()

# dimensionality reduction methods to compare ('None' = standardized
# features used directly)
dim_red = ['None', 'PCA', 'LDA']

# Loop over different methods of dimensionality reduction preprocessing and,
# for each one, estimate every classifier's accuracy with leave-one-out
# cross validation: each example is held out once as the test set while the
# remaining examples are used for scaling, dimensionality reduction and
# training.
for preprocessing in dim_red:

    # Unfitted classifiers, recreated for every preprocessing method.
    # The order here must match the indices used in the report below.
    # NOTE(review): no random_state is set, so the stochastic estimators
    # (e.g. the random forest) may give slightly different numbers per run.
    models = [
        SVC(kernel='linear', C=200),
        SVC(kernel='rbf', C=200),
        SVC(kernel='sigmoid', C=200),
        SVC(kernel='poly', C=200),
        RandomForestClassifier(n_estimators=100),
        LogisticRegression(max_iter=10000, solver='saga', penalty='l1', C=10),
        KNeighborsClassifier(n_neighbors=7),
    ]

    # running count of correct held-out predictions, one slot per model
    models_accuracy = np.zeros(len(models))

    print(f'dimensionality reduction: {preprocessing}')
    print('******************************')

    n_examples = len(x)
    for example in range(n_examples):

        # Boolean mask selecting only the held-out example; its complement
        # selects the training set. Building it with np.zeros (rather than
        # from the data values) keeps the mask correct even if the data
        # contains NaN/inf entries.
        indices_test = np.zeros(n_examples, dtype=bool)
        indices_test[example] = True
        indices_train = ~indices_test

        # train/test splits (labels flattened to 1D for the estimators)
        x_train = x[indices_train]
        x_test = x[indices_test]
        y_train = y[indices_train].ravel()
        y_test = y[indices_test].ravel()

        # standardize the data using mean and stdev from training data only,
        # so nothing about the held-out example leaks into preprocessing
        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        # apply dimensionality reduction, fitted on the training data only
        if preprocessing == 'LDA':
            # supervised projection: uses the training labels
            reducer = LinearDiscriminantAnalysis(n_components=3, tol=1e-3, solver='svd')
            reducer.fit(x_train, y_train)
            x_test = reducer.transform(x_test)
            x_train = reducer.transform(x_train)
        elif preprocessing == 'PCA':
            # unsupervised projection: labels are not used
            reducer = PCA(n_components=3)
            reducer.fit(x_train)
            x_test = reducer.transform(x_test)
            x_train = reducer.transform(x_train)

        # train models on training data (fit() refits each estimator in
        # place and returns it)
        trained_models = [model.fit(x_train, y_train) for model in models]

        # evaluate models on test data; with a single held-out example each
        # score is either 0.0 or 1.0
        models_accuracy += np.array(
            [trained_model.score(x_test, y_test) for trained_model in trained_models]
        )

    # average over all held-out examples to get the leave-one-out accuracy
    models_accuracy /= n_examples
    print('Support Vector Machines:')
    print(f'    linear kernel accuracy: {round(models_accuracy[0],2)}')
    print(f'    rbf kernel accuracy: {round(models_accuracy[1],2)}')
    print(f'    sigmoid kernel accuracy: {round(models_accuracy[2],2)}')
    print(f'    poly kernel accuracy: {round(models_accuracy[3],2)}')
    print(f'Random Forest accuracy: {round(models_accuracy[4],2)}')
    print(f'Logistic Regression accuracy: {round(models_accuracy[5],2)}')
    print(f'KNN accuracy: {round(models_accuracy[6],2)}\n')

dimensionality reduction: None
******************************
Support Vector Machines:
    linear kernel accuracy: 0.98
    rbf kernel accuracy: 0.94
    sigmoid kernel accuracy: 0.55
    poly kernel accuracy: 0.98
Random Forest accuracy: 0.99
Logistic Regression accuracy: 0.96
KNN accuracy: 0.92

dimensionality reduction: PCA
******************************
Support Vector Machines:
    linear kernel accuracy: 0.98
    rbf kernel accuracy: 0.96
    sigmoid kernel accuracy: 0.5
    poly kernel accuracy: 0.92
Random Forest accuracy: 0.9
Logistic Regression accuracy: 0.94
KNN accuracy: 0.94

dimensionality reduction: LDA
******************************
Support Vector Machines:
    linear kernel accuracy: 1.0
    rbf kernel accuracy: 0.98
    sigmoid kernel accuracy: 0.51
    poly kernel accuracy: 0.99
Random Forest accuracy: 0.94
Logistic Regression accuracy: 0.9
KNN accuracy: 0.92

