In [9]:
from __future__ import print_function

import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import os

In [10]:
## Utility functions

def sign(x, threshold=0):
    '''
    Binarize values against a threshold.

    Parameters
    - x: object supporting `>` and returning something with `.astype`
        (e.g. a numpy array)
    - threshold: value compared against, default 0

    Returns
    - the element-wise result of `x > threshold`, cast to int
      (1 where strictly greater than the threshold, 0 elsewhere)
    '''
    return (x > threshold).astype(int)

def get_data_filenames(data_dir, data_file_ext, assay_name):
    '''
    Build the filenames of the per-split data files for one assay.

    For each of the subfolders 'train', 'test' and 'score', the filename is
    os.getcwd(), data_dir and the subfolder joined together, followed by
    assay_name + '.' + data_file_ext. Nothing is checked on disk.

    Returns
    - dict mapping 'train', 'test', and 'score' to the corresponding data filename
    '''
    paths = {}
    for subfolder in ('train', 'test', 'score'):
        # Joining with a trailing '' leaves the folder ending in a path
        # separator, so the file name is attached by plain concatenation.
        folder = os.path.join(os.getcwd(), data_dir, subfolder, '')
        paths[subfolder] = folder + assay_name + '.' + data_file_ext
    return paths

def read_fingerprint(filename):
    '''
    Load a fingerprint file into feature and label arrays.

    Parameters
    - filename: str
        File must be tab-delimited as follows: smiles code, tox21_id, label, fingerprint
        The fingerprint is a string of digits; every character becomes one
        integer feature. Blank lines are skipped.

    Returns
    - (X, Y): tuple of np.arrays
        X is an array of features
        Y is a vector of labels

    Raises
    - IndexError if a non-blank line has fewer than four tab-separated fields
    - ValueError if the label or a fingerprint character is not an integer
    '''
    X = []
    Y = []

    # The context manager closes the file even when a malformed line raises.
    with open(filename, 'r') as input_file:
        for line in input_file:
            stripped = line.strip()
            if not stripped:
                # blank line (e.g. a trailing newline at end of file): no data point
                continue

            # split line (1 data point) into smiles, tox21_id, label, and fingerprint
            split_line = stripped.split('\t')
            fingerprint = [int(c) for c in split_line[3]]
            label = int(split_line[2])

            # append data point to X (features) and Y (labels)
            X.append(fingerprint)
            Y.append(label)

    return (np.array(X), np.array(Y))

def read_features(filename):
    '''
    Load a features file into float feature and label arrays.

    Parameters
    - filename: str
        File must be tab-delimited as follows: smiles code, cid, pubchem_fingerprint, 33 extra features (tab-delimited), label
        NOTE(review): lines are split on any whitespace, not only tabs, so an
        empty field shifts the remaining columns - confirm this is intended.

    Returns
    - (X, Y): tuple of np.arrays
        X is an array of features (fingerprint digits followed by the 33 extra
        features), dtype float
        Y is a vector of labels, dtype float

    Prints
    - 'failed to parse data point <index>' for every line that is skipped
      because it has too few fields or a value that is not numeric
    '''
    X = []
    Y = []

    # The context manager closes the file even if reading it raises.
    with open(filename, 'r') as input_file:
        for index, line in enumerate(input_file):
            try:
                # split line (1 data point) into smiles, cid, fingerprint (features),
                # 33 extra features, and label
                split_line = line.strip().split()
                fingerprint = [int(c) for c in split_line[2]]
                label = int(split_line[36])
                # Convert here, inside the try, so that one non-numeric value
                # drops only this line instead of failing the final array build.
                extra_features = [float(value) for value in split_line[3:36]]
            except (IndexError, ValueError):
                # Only parse problems are skipped; KeyboardInterrupt and other
                # unrelated errors still propagate.
                print('failed to parse data point %d' % index)
                continue

            # append data point to X (features) and Y (labels)
            fingerprint.extend(extra_features)
            X.append(fingerprint)
            Y.append(label)

    return (np.array(X, dtype=float), np.array(Y, dtype=float))

In [11]:
def train_and_test(model, X_train, Y_train, X_test, Y_test):
    '''
    Fit a model on the training data and print how it performs on both splits.

    Parameters
    - model: sklearn model
        fit, predict and predict_proba are called on it.
    - X_train: array, shape (n_samples, n_features)
        The input data for training.
    - Y_train: array, shape (n_samples,) or (n_samples, n_outputs)
        The target training values.
    - X_test: array, shape (n_samples, n_features)
        The input data for testing.
    - Y_test: array, shape (n_samples,) or (n_samples, n_outputs)
        The target testing values.

    Prints
    - Training confusion matrix
    - Testing confusion matrix
    - Test AUROC
    - Test accuracy
    '''
    # fit model parameters
    model.fit(X_train, Y_train)

    # class predictions for both splits, computed before anything is printed
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    # print confusion matrices, training split first
    for heading, actual, predicted in (
            ('train confusion matrix', Y_train, predicted_train),
            ('test confusion matrix', Y_test, predicted_test)):
        print(heading)
        print(confusion_matrix(actual, predicted))

    # test-set metrics; column 1 of predict_proba is the score passed to the AUROC
    test_proba = model.predict_proba(X_test)
    accuracy = sklearn.metrics.accuracy_score(Y_test, predicted_test)
    auroc = sklearn.metrics.roc_auc_score(Y_test, test_proba[:, 1])
    print('Test AUROC: %0.3g' % auroc)
    print('Test accuracy: %0.3g' % accuracy)

In [12]:
# Which assay to load, the data directory it lives in, and the file extension.
assay_name, data_dir, data_file_ext = 'nr-ahr', 'data_pcfp_ext', 'features'

# Filenames for the 'train', 'test' and 'score' subfolders of data_dir.
filenames = get_data_filenames(
    data_dir=data_dir,
    data_file_ext=data_file_ext,
    assay_name=assay_name,
)

# One (features, labels) pair per split. read_features prints a
# 'failed to parse data point ...' message for each line it skips.
X_train, Y_train = read_features(filename=filenames['train'])
X_test, Y_test = read_features(filename=filenames['test'])
X_score, Y_score = read_features(filename=filenames['score'])

failed to parse data point 1046
failed to parse data point 1206
failed to parse data point 1294
failed to parse data point 2556
failed to parse data point 2632
failed to parse data point 2951
failed to parse data point 2976
failed to parse data point 3093
failed to parse data point 3094
failed to parse data point 3095
failed to parse data point 3108
failed to parse data point 3109
failed to parse data point 3212
failed to parse data point 3226
failed to parse data point 3316
failed to parse data point 3353
failed to parse data point 3400
failed to parse data point 3401
failed to parse data point 3402
failed to parse data point 3513
failed to parse data point 3514
failed to parse data point 3515
failed to parse data point 3516
failed to parse data point 3517
failed to parse data point 3721
failed to parse data point 3722
failed to parse data point 3796
failed to parse data point 4169
failed to parse data point 4170
failed to parse data point 4171
failed to parse data point 4172
failed t

In [15]:
# Comparing different models
# Fixed seed so that the stochastic learners (decision tree, random forest and
# both neural networks) report the same numbers on every Restart & Run All.
RANDOM_STATE = 0

models = [
    GaussianNB(),   # Naive Bayes, Gaussian
    BernoulliNB(),  # Naive Bayes, Bernoulli
    DecisionTreeClassifier(random_state=RANDOM_STATE),  # Decision tree
    RandomForestClassifier(random_state=RANDOM_STATE),  # Random forest
    MLPClassifier(random_state=RANDOM_STATE),           # Neural network, default settings
    # custom neural net: one hidden layer of 512 ReLU units, mini-batches of 100,
    # L2 penalty alpha=0.1, stopped after at most 4 iterations
    MLPClassifier(hidden_layer_sizes=(512,), activation='relu', batch_size=100,
                  alpha=0.1, max_iter=4, random_state=RANDOM_STATE),
]

# Fit each model on the training split and print its train/test report.
for model in models:
    print(type(model))
    train_and_test(model, X_train, Y_train, X_test, Y_test)
    print('\n---\n')

<class 'sklearn.naive_bayes.GaussianNB'>
train confusion matrix
[[3762 1709]
 [  83  712]]
test confusion matrix
[[55 76]
 [ 3 16]]
Test AUROC: 0.648
Test accuracy: 0.473

---

<class 'sklearn.naive_bayes.BernoulliNB'>
train confusion matrix
[[3764 1707]
 [ 122  673]]
test confusion matrix
[[57 74]
 [ 4 15]]
Test AUROC: 0.671
Test accuracy: 0.48

---

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
train confusion matrix
[[5470    1]
 [  40  755]]
test confusion matrix
[[104  27]
 [ 12   7]]
Test AUROC: 0.6
Test accuracy: 0.74

---

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
train confusion matrix
[[5454   17]
 [  63  732]]
test confusion matrix
[[117  14]
 [ 15   4]]
Test AUROC: 0.694
Test accuracy: 0.807

---

<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>
train confusion matrix
[[5150  321]
 [ 175  620]]
test confusion matrix
[[94 37]
 [ 5 14]]
Test AUROC: 0.79
Test accuracy: 0.72

---

<class 'sklearn.neural_network.multilayer_perceptron.MLPCl

