In [1]:
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import confusion_matrix
import numpy as np
from __future__ import print_function

In [2]:
def read_file(filename):
    '''
    Load fingerprint features and labels from a tab-delimited text file.

    Parameters
    - filename: str
        Path to a tab-delimited file with one data point per line.
        Column 0 is the smiles code (not used here), column 2 is the
        integer label and column 3 is the fingerprint, a string of digits
        in which every character becomes one integer feature.
        NOTE(review): this docstring previously listed the columns as
        "smiles code, fingerprint, label"; the column indices read by the
        code are unchanged -- confirm them against the .fp file format.
    
    Returns
    - (X, Y): tuple of np.arrays
        X is an array of features, one row per data point
        Y is a vector of labels
    
    Raises
    - IndexError: a non-blank line has fewer than four columns
    - ValueError: the label or a fingerprint character is not an integer
    '''
    X = []
    Y = []
    
    # the with-block closes the file even when a malformed line raises
    with open(filename, 'r') as input_file:
        for line in input_file:
            stripped = line.strip()
            # blank lines (e.g. a trailing newline at the end of the file)
            # carry no data point, so skip them instead of failing
            if not stripped:
                continue
            
            # split line (1 data point) into its tab-separated columns
            split_line = stripped.split('\t')
            fingerprint = [int(c) for c in split_line[3]]
            label = int(split_line[2])
            
            # append data point to X (features) and Y (labels)
            X.append(fingerprint)
            Y.append(label)
    return (np.array(X), np.array(Y))

def train_and_test(model, train_x, train_y, test_x, test_y):
    '''
    Fit a model on the training data, then report how it performs on both
    the training and the testing data.

    Parameters
    - model: sklearn model
        Must provide fit, predict and predict_proba.
    - train_x: array, shape (n_samples, n_features)
        The input data for training.
    - train_y: array, shape (n_samples,) or (n_samples, n_outputs)
        The target training values (class labels in classification, real numbers in regression).
    - test_x: array, shape (n_samples, n_features)
        The input data for testing
    - test_y: array, shape (n_samples,) or (n_samples, n_outputs)
        The target testing values (class labels in classification, real numbers in regression).
    
    Prints
    - Training confusion matrix
    - Testing confusion matrix
    - AUROC, computed on the testing data from column 1 of predict_proba
    '''
    # estimate the model parameters from the training data
    model.fit(train_x, train_y)
    
    # predict on both data sets up front, before anything is printed
    fitted_on_train = model.predict(train_x)
    fitted_on_test = model.predict(test_x)
    
    # one confusion matrix per data set, each under its own heading
    reports = (
        ('train confusion matrix', train_y, fitted_on_train),
        ('test confusion matrix', test_y, fitted_on_test),
    )
    for heading, truth, guessed in reports:
        print(heading)
        print(confusion_matrix(truth, guessed))
    
    # AUROC is scored from the probabilities in column 1 of predict_proba
    class_probabilities = model.predict_proba(test_x)
    print('AUROC: ', end='')
    positive_column = class_probabilities[:, 1]
    print(sklearn.metrics.roc_auc_score(test_y, positive_column))

In [3]:
# Data
# train_filename = './train_fingerprints/nr-ar.fp'
# X, Y  = read_file(train_filename)
# train_x = X[0:-100]
# train_y = Y[0:-100]
# test_x = X[-100:]
# test_y = Y[-100:]

In [4]:
# Paths to the three nr-ar fingerprint files used below.
# NOTE(review): the "val" path points into the test/ directory and the "test"
# path into the score/ directory -- confirm this naming is intended.
train_filename = './fingerprints/train/nr-ar.fp'
val_filename = './fingerprints/test/nr-ar.fp'
test_filename = './fingerprints/score/nr-ar.fp'
# Load features and labels for the training data.
train_x, train_y = read_file(train_filename)
# The validation file is not loaded; in the cells shown here val_filename is
# only referenced by the disabled line below.
# X_val, Y_val = read_file(val_filename)
# Load features and labels for the data the models are evaluated on.
test_x, test_y = read_file(test_filename)

In [5]:
# Naive Bayes - Gaussian
# Fit GaussianNB with its default arguments on the training data, then print
# the train/test confusion matrices and the test AUROC via train_and_test.
model = GaussianNB()
train_and_test(model, train_x, train_y, test_x, test_y)

train confusion matrix
[[1209 7749]
 [   0  378]]
test confusion matrix
[[ 88 484]
 [  2  10]]
AUROC: 0.49358974359


In [6]:
# Naive Bayes - Bernoulli
# Fit BernoulliNB with its default arguments on the same training data and
# print the same report (confusion matrices and test AUROC).
# Rebinds the notebook-level name `model`.
model = BernoulliNB()
train_and_test(model, train_x, train_y, test_x, test_y)

train confusion matrix
[[7996  962]
 [ 141  237]]
test confusion matrix
[[525  47]
 [  8   4]]
AUROC: 0.620775058275


In [7]:
## Neural network - default parameters
# Fit an MLPClassifier constructed with no arguments and print the same report.
# NOTE(review): no random_state is passed, so the numbers presumably change
# from run to run -- confirm, and pin a seed if the results must be reproducible.
model = MLPClassifier()
train_and_test(model, train_x, train_y, test_x, test_y)

train confusion matrix
[[8945   13]
 [  81  297]]
test confusion matrix
[[570   2]
 [ 11   1]]
AUROC: 0.563228438228


In [9]:
## Neural network
# Larger MLPClassifier: hidden_layer_sizes lists four entries of 1024, with
# relu activation, batch_size=50 and alpha=0.01 (alpha is scikit-learn's L2
# regularization strength).
# NOTE(review): no random_state is passed here either, so results presumably
# vary between runs -- confirm.
model = MLPClassifier(hidden_layer_sizes=(1024,1024,1024,1024),activation='relu',batch_size=50,alpha=0.01)
train_and_test(model, train_x, train_y, test_x, test_y)

train confusion matrix
[[8941   17]
 [  94  284]]
test confusion matrix
[[569   3]
 [ 11   1]]
AUROC: 0.61858974359
