In [213]:
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import tensorflow as tf

In [214]:
df = pd.read_csv('OICleaned.csv')
df2 = pd.read_csv('Demographics.csv')

## Join the two dataframes on MRN

In [215]:
merged_df = pd.merge(df, df2, on='MRN')
merged_df = merged_df[['organism', 'Gender', 'age_at_order', 'race', 'antibiotic']]

## Generate one-hot encoding

In [216]:
one_hot_organism = pd.get_dummies(merged_df['organism'])
one_hot_gender = pd.get_dummies(merged_df['Gender'])
one_hot_race = pd.get_dummies(merged_df['race'])
one_hot_antibiotic = pd.get_dummies(merged_df['antibiotic'])

## Delete useless data

In [217]:
one_hot_organism = one_hot_organism.drop('*', axis=1)

## Find top 20 organisms for training

In [218]:
column_list = df['organism'].value_counts().index.values.tolist()[:30]
one_hot_organism = one_hot_organism[column_list]
column_list2 = df['antibiotic'].value_counts().index.values.tolist()[:5]
one_hot_antibiotic = one_hot_antitic[column_list2]

## Concat useful data and translate to 2d matrix

In [223]:
df_x = pd.concat([one_hot_organism, one_hot_gender, merged_df['age_at_order']], axis=1)
# df_x = one_hot_organism

In [231]:
df_x

Unnamed: 0,ESCHERICHIA COLI,KLEBSIELLA PNEUMONIAE SS. PNEUMONIAE,STAPHYLOCOCCUS AUREUS,PSEUDOMONAS AERUGINOSA,METHICILLIN RESISTANT STAPHYLOCOCCUS AUREUS,ESBL ESCHERICHIA COLI,PROTEUS MIRABILIS,ENTEROCOCCUS FAECALIS,ENTEROBACTER CLOACAE,KLEBSIELLA OXYTOCA,...,ACHROMOBACTER SPECIES,STENOTROPHOMONAS MALTOPHILIA,ENTEROCOCCUS FAECIUM,SERRATIA SPECIES,PROVIDENCIA RETTGERI,ACINETOBACTER SPECIES,Female,Male,Unknown,age_at_order
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,63
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,64
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,63
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,64
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,63
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,64
6,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,63
7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,64
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,63
9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,64


## Train set and test set

In [224]:
data_x = np.array(df_x.as_matrix())
data_y = np.array(one_hot_antibiotic.as_matrix())
X_train = np.array(data_x[:2000000])
y_train = np.array(data_y[:2000000])
X_test = np.array(data_x[2000000:3000000])
y_test = np.array(data_y[2000000:3000000])

In [225]:
print(np.shape(X_train))
print(np.shape(y_train))

(2000000, 34)
(2000000, 5)


## Decision Tree Approach

In [9]:
%store

Stored variables and their in-db values:
mldataset             ->              MRN      Visit_id               Order


In [10]:
%store -r

In [22]:
target = mldataset[['5-Flucytosine', 'Amikacin',
       'Amoxicillin', 'Amoxicillin/Clavulanate', 'Amphotericin-B',
       'Ampicillin', 'Ampicillin/Sulbactam', 'Anidulafungin',
       'Azithromycin', 'Aztreonam', 'Beta lactamase (Penicillinase)',
       'Capreomycin', 'Caspofungin', 'Cefazolin', 'Cefepime', 'Cefotaxime',
       'Cefoxitin', 'Ceftaroline', 'Ceftazidime', 'Ceftriaxone',
       'Cefuroxime', 'Chloramphenicol', 'Ciprofloxacin', 'Clarithromycin',
       'Clindamycin', 'Clofazimine', 'Colistin', 'Cycloserine',
       'Daptomycin', 'Doripenem', 'Doxycycline', 'Ertapenem',
       'Erythromycin', 'Ethambutol', 'Ethionamide', 'Fluconazole',
       'Fosfomycin', 'Gentamicin', 'Gentamicin-Syn', 'Imipenem',
       'Isoniazid', 'Itraconazole', 'Kanamycin', 'Levofloxacin',
       'Linezolid', 'Meropenem', 'Metronidazole', 'Micafungin',
       'Minocycline', 'Moxifloxacin', 'Nitrofurantoin', 'Ofloxacin',
       'Oxacillin', 'Penicillin G', 'Piperacillin',
       'Piperacillin/Tazobactam', 'Posaconazole', 'Pyrazinamide',
       'Quinupristin/Dalfopristin', 'Rifabutin', 'Rifampin',
       'Streptomycin', 'Streptomycin-Syn', 'Tetracycline', 'Tigecycline',
       'Tobramycin', 'Trimethoprim/Sulfa', 'Trimethoprim/Sulfamethoxazole',
       'Vancomycin', 'Voriconazole', 'p-Aminosalicylic acid']]

train_x = mldataset[mldataset.columns.difference(['5-Flucytosine', 'Amikacin',
       'Amoxicillin', 'Amoxicillin/Clavulanate', 'Amphotericin-B',
       'Ampicillin', 'Ampicillin/Sulbactam', 'Anidulafungin',
       'Azithromycin', 'Aztreonam', 'Beta lactamase (Penicillinase)',
       'Capreomycin', 'Caspofungin', 'Cefazolin', 'Cefepime', 'Cefotaxime',
       'Cefoxitin', 'Ceftaroline', 'Ceftazidime', 'Ceftriaxone',
       'Cefuroxime', 'Chloramphenicol', 'Ciprofloxacin', 'Clarithromycin',
       'Clindamycin', 'Clofazimine', 'Colistin', 'Cycloserine',
       'Daptomycin', 'Doripenem', 'Doxycycline', 'Ertapenem',
       'Erythromycin', 'Ethambutol', 'Ethionamide', 'Fluconazole',
       'Fosfomycin', 'Gentamicin', 'Gentamicin-Syn', 'Imipenem',
       'Isoniazid', 'Itraconazole', 'Kanamycin', 'Levofloxacin',
       'Linezolid', 'Meropenem', 'Metronidazole', 'Micafungin',
       'Minocycline', 'Moxifloxacin', 'Nitrofurantoin', 'Ofloxacin',
       'Oxacillin', 'Penicillin G', 'Piperacillin',
       'Piperacillin/Tazobactam', 'Posaconazole', 'Pyrazinamide',
       'Quinupristin/Dalfopristin', 'Rifabutin', 'Rifampin',
       'Streptomycin', 'Streptomycin-Syn', 'Tetracycline', 'Tigecycline',
       'Tobramycin', 'Trimethoprim/Sulfa', 'Trimethoprim/Sulfamethoxazole',
       'Vancomycin', 'Voriconazole', 'p-Aminosalicylic acid'])]

In [45]:
train_x = train_x[train_x.columns.difference(['MRN','Order_time','Visit_id', 'zip'])]

In [43]:
train_x.columns.values

array(['0', '000', '1', '100', '103', '104', '105', '108', '109', '110',
       '112', '115', '116', '117', '119', '120', '121', '123', '124',
       '130', '131', '132', '133', '142', '148', '150', '152', '153',
       '160', '164', '166', '170', '172', '173', '175', '180', '186',
       '189', '190', '191', '193', '194', '195', '196', '197', '200',
       '206', '208', '209', '210', '211', '212', '213', '217', '220',
       '221', '222', '223', '224', '225', '231', '232', '233', '234',
       '238', '242', '246', '249', '253', '254', '256', '275', '277',
       '280', '281', '282', '283', '284', '285', '286', '287', '288',
       '297', '299', '300', '301', '302', '303', '305', '306', '309',
       '317', '319', '320', '321', '322', '324', '325', '326', '327',
       '329', '330', '331', '333', '334', '335', '337', '338', '339',
       '341', '342', '344', '346', '347', '349', '352', '358', '361',
       '366', '368', '370', '372', '373', '378', '379', '382', '394',
       '395', '40

In [21]:
mldataset.columns.values
# training: gram, hospital, zipcod

array(['MRN', 'Visit_id', 'Order_time', '5-Flucytosine', 'Amikacin',
       'Amoxicillin', 'Amoxicillin/Clavulanate', 'Amphotericin-B',
       'Ampicillin', 'Ampicillin/Sulbactam', 'Anidulafungin',
       'Azithromycin', 'Aztreonam', 'Beta lactamase (Penicillinase)',
       'Capreomycin', 'Caspofungin', 'Cefazolin', 'Cefepime', 'Cefotaxime',
       'Cefoxitin', 'Ceftaroline', 'Ceftazidime', 'Ceftriaxone',
       'Cefuroxime', 'Chloramphenicol', 'Ciprofloxacin', 'Clarithromycin',
       'Clindamycin', 'Clofazimine', 'Colistin', 'Cycloserine',
       'Daptomycin', 'Doripenem', 'Doxycycline', 'Ertapenem',
       'Erythromycin', 'Ethambutol', 'Ethionamide', 'Fluconazole',
       'Fosfomycin', 'Gentamicin', 'Gentamicin-Syn', 'Imipenem',
       'Isoniazid', 'Itraconazole', 'Kanamycin', 'Levofloxacin',
       'Linezolid', 'Meropenem', 'Metronidazole', 'Micafungin',
       'Minocycline', 'Moxifloxacin', 'Nitrofurantoin', 'Ofloxacin',
       'Oxacillin', 'Penicillin G', 'Piperacillin',
       '

In [63]:
train_x = train_x.dropna(how='any',axis=0)

In [64]:
import numpy as np
data_x = np.array(train_x.as_matrix())
data_y = np.array(target.as_matrix())

In [65]:
np.any(np.nan)

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [66]:
# training a DescisionTreeClassifier
from sklearn import tree
clf = tree.DecisionTreeClassifier()
print(np.shape(data_x))
clf = clf.fit(data_x, data_y)
clf.score(data_x, data_y)

(180626, 634)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=uint8)

In [None]:
clf.predict()

## Support Vector Machine Approach

In [227]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(15,5), random_state=1)
clf.fit(X_train, y_train)                         

KeyboardInterrupt: 

In [183]:
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)

ValueError: bad input shape (200000, 5)

In [145]:
clf.accuracy_score(X_test, y_test)

AttributeError: 'MLPClassifier' object has no attribute 'accuracy_score'

In [108]:
# from sklearn import svm
# clf = svm.SVC(gamma=0.001, C=100.)
# train_set
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=100,solver='adam',learning_rate_init=0.01,max_iter=500)

mlp.fit(X_train, Y_train)

print(mlp.score(X_test,Y_test))



0.0


In [49]:
# clf.fit(train_set[:-1], train_set[-1])  
# SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
#   decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
#   max_iter=-1, probability=False, random_state=None, shrinking=True,
#   tol=0.001, verbose=False)

## Get the next batch from the train data

In [56]:
def next_batch(num, data, labels):
    '''
    Return a total of `num` random samples and labels. 
    '''
    idx = np.arange(0 , len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[ i] for i in idx]
    labels_shuffle = [labels[ i] for i in idx]

    return np.asarray(data_shuffle), np.asarray(labels_shuffle)

## Build the multilayer perceptron and train

In [229]:
from __future__ import print_function
import tensorflow as tf

# Parameters
learning_rate = 0.0001
training_epochs = 40
batch_size = 1000
display_step = 1

# Network Parameters
n_hidden_1 = 16 # 1st layer number of neurons
n_hidden_2 = 16 # 2nd layer number of neurons
n_input = 34 # MNIST data input (img shape: 28*28)
n_classes = 5 # MNIST total classes (0-9 digits)

# tf Graph input
X = tf.placeholder("float", [None, n_input])
Y = tf.placeholder("float", [None, n_classes])

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}


# Create model
def multilayer_perceptron(x):
    # Hidden fully connected layer with 256 neurons
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    # Hidden fully connected layer with 256 neurons
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    # Output fully connected layer with a neuron for each class
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer

# Construct model
logits = multilayer_perceptron(X)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        total_batch = int(len(train_x)/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = next_batch(batch_size, X_train, y_train)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([train_op, loss_op], feed_dict={X: batch_x,
                                                            Y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost={:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Test model
    pred = tf.nn.softmax(logits)  # Apply softmax to logits
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({X: X_test, Y: y_test}))

Epoch: 0001 cost=2336.696056172
Epoch: 0002 cost=2673.256082722
Epoch: 0003 cost=2988.725436970
Epoch: 0004 cost=3296.047121235


KeyboardInterrupt: 