In [52]:
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import confusion_matrix
import numpy as np
from __future__ import print_function

In [53]:
def read_file(filename):
    '''
    Load a tab-delimited fingerprint file into feature and label arrays.

    Parameters
    - filename: str
        File must be tab-delimited as follows: smiles code, fingerprint, label.
        The fingerprint is a string of digits (each character becomes one
        integer feature) and the label is an integer. The smiles column is
        read past but not returned. Blank lines are skipped.
    
    Returns
    - (X, Y): tuple of np.arrays
        X is an array of features, one row per data line
        Y is a vector of labels, one entry per data line

    Raises
    - IndexError
        If a non-blank line has fewer than three tab-separated fields.
    - ValueError
        If a fingerprint character or the label cannot be parsed as an int.
    '''
    X = []
    Y = []

    # `with` closes the file even when a malformed line raises part-way through
    with open(filename, 'r') as input_file:
        for line in input_file:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            # Remove only the line terminator. A full strip() would also eat a
            # leading tab (empty smiles column) and shift the other fields.
            split_line = line.rstrip('\r\n').split('\t')
            fingerprint = [int(c) for c in split_line[1]]
            label = int(split_line[2])

            # append data point to X (features) and Y (labels)
            X.append(fingerprint)
            Y.append(label)
    return (np.array(X), np.array(Y))

def train_and_test(model, train_x, train_y, test_x, test_y):
    '''
    Fit ``model`` on the training split and print how it scores on both splits.

    Parameters
    - model: sklearn model
        Must provide fit, predict and predict_proba.
    - train_x: array, shape (n_samples, n_features)
        Features the model is fitted on.
    - train_y: array, shape (n_samples,) or (n_samples, n_outputs)
        Targets the model is fitted on.
    - test_x: array, shape (n_samples, n_features)
        Held-out features.
    - test_y: array, shape (n_samples,) or (n_samples, n_outputs)
        Held-out targets.
    
    Prints
    - Training confusion matrix
    - Testing confusion matrix
    - AUROC (computed on the test split only)

    Returns nothing; ``model`` is fitted in place.
    '''
    model.fit(train_x, train_y)
    
    # run both predictions up front, then report them in train/test order
    reports = [
        ('train confusion matrix', train_y, model.predict(train_x)),
        ('test confusion matrix', test_y, model.predict(test_x)),
    ]
    for heading, actual, predicted in reports:
        print(heading)
        print(confusion_matrix(actual, predicted))
    
    # Column 1 of predict_proba is used as the ranking score -- presumably the
    # probability of the positive class; assumes binary labels (TODO confirm).
    test_scores = model.predict_proba(test_x)[:, 1]
    print('AUROC: ', end='')
    auroc = sklearn.metrics.roc_auc_score(test_y, test_scores)
    print(auroc)

In [54]:
# Data
# The last N_TEST rows of the file are held out for testing; everything
# before them is used for training (rows are kept in file order, no shuffle).
N_TEST = 100
train_filename = './train_fingerprints/nr-ar.fp'
X, Y  = read_file(train_filename)

# fail early with a clear message instead of fitting on an empty training set
assert len(X) > N_TEST, 'need more than N_TEST rows to leave any training data'

train_x, test_x = X[:-N_TEST], X[-N_TEST:]
train_y, test_y = Y[:-N_TEST], Y[-N_TEST:]

In [55]:
# Baseline 1: Gaussian naive Bayes fitted on the fingerprint features
model = GaussianNB()
train_and_test(
    model,
    train_x=train_x,
    train_y=train_y,
    test_x=test_x,
    test_y=test_y,
)

train confusion matrix
[[619 771]
 [  0  99]]
test confusion matrix
[[40 54]
 [ 0  6]]
AUROC: 0.712765957447


In [56]:
# Baseline 2: Bernoulli naive Bayes fitted on the same split
model = BernoulliNB()
train_and_test(
    model,
    train_x=train_x,
    train_y=train_y,
    test_x=test_x,
    test_y=test_y,
)

train confusion matrix
[[1246  144]
 [  32   67]]
test confusion matrix
[[82 12]
 [ 4  2]]
AUROC: 0.739361702128


In [57]:
## Neural network - default parameters
# random_state is pinned so the random weight initialisation, and therefore
# the printed metrics, are the same on every Restart & Run All.
model = MLPClassifier(random_state=0)
train_and_test(model, train_x, train_y, test_x, test_y)

train confusion matrix
[[1389    1]
 [   4   95]]
test confusion matrix
[[92  2]
 [ 4  2]]
AUROC: 0.890070921986


In [58]:
## Neural network - stronger regularization (alpha=1)
# alpha is MLPClassifier's L2 penalty term. random_state is pinned so the
# random weight initialisation, and therefore the printed metrics, are the
# same on every Restart & Run All.
model = MLPClassifier(alpha=1, random_state=0)
train_and_test(model, train_x, train_y, test_x, test_y)

train confusion matrix
[[1388    2]
 [  31   68]]
test confusion matrix
[[93  1]
 [ 4  2]]
AUROC: 0.882978723404
