In [29]:
from ACDC.random_walk_classifier import * 
from ACDC.cell_type_annotation import * 

In [30]:
import pandas as pd
import numpy as np
from collections import Counter

path = 'data/Levine32/40000events/'
df = pd.read_csv(path + 'Levine32_5.csv.gz', sep=',', header = 0, compression = 'gzip')

df = df.drop(['Time', 'Cell_length', 'file_number', 'event_number', 'DNA1()',
              'DNA2()', 'Viability', 'subject'], axis = 1)

channels = [item[:item.find('(')] for item in df.columns[:-1]]
df.columns = channels + ['cell_type']


#df = df.loc[df['cell_type'] != 'NotDebrisSinglets']

table = pd.read_csv(path + 'AML_table.csv', sep=',', header=0, index_col=0)
table = table.fillna(0)

cts, channels = get_label(table)

#X0= np.arcsinh((df[channels].values - 1.0)/5.0)
X0= df[channels].values

In [31]:
idx2ct = [key for idx, key in enumerate(table.index)]
#idx2ct.append('unknown')

ct2idx = {key:idx for idx, key in enumerate(table.index)}
#ct2idx['unknown'] = len(table.index)
        
ct_score = np.abs(table.as_matrix()).sum(axis = 1)

## compute manual gated label
y0 = np.zeros(df.cell_type.shape)

for i, ct in enumerate(df.cell_type):
    if ct in ct2idx:
        y0[i] = ct2idx[ct]
    #else:
        #y0[i] = ct2idx['unknown']

In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix
import phenograph
from sklearn.cross_validation import StratifiedKFold
import pickle
n_neighbor = 10
thres = 0.5


In [33]:
import time
import scipy.io as sio 

skf = StratifiedKFold(y0, n_folds=5, shuffle=True, random_state=0)
result = []
score_final = []


process_time = []
c = 0
for tr, te in skf:
    print('%02d th batch' % c)
    if c == 1:
        break
    c += 1
    
    X = X0.copy()
    y_true = y0.copy()

    X = X[tr, :]
    y_true = y_true[tr]

    mk_model =  compute_marker_model(pd.DataFrame(X, columns = channels), table, 0.0)

    ## compute posterior probs
    tic = time.clock()
    score = get_score_mat(X, [], table, [], mk_model)
    score = np.concatenate([score, 1.0 - score.max(axis = 1)[:, np.newaxis]], axis = 1)    

    ## get indices     
    ct_index = get_unique_index(X, score, table, thres)
    
    ## baseline - classify events    
    y_pred_index = np.argmax(score, axis = 1)
    
    toc = time.clock()
    time0 = toc - tic
    
    
    
    ## running ACDC
    tic = time.clock()
    res_c = get_landmarks(X, score, ct_index, idx2ct, phenograph, thres)

    landmark_mat, landmark_label = output_feature_matrix(res_c, [idx2ct[i] for i in range(len(idx2ct))]) 

    landmark_label = np.array(landmark_label)

    lp, y_pred = rm_classify(X, landmark_mat, landmark_label, n_neighbor)

    process_time.append(toc-tic)
    
    res = phenograph.cluster(X, k=30, directed=False, prune=False, min_cluster_size=10, jaccard=True,
                        primary_metric='euclidean', n_jobs=-1, q_tol=1e-3)
    
    toc = time.clock()
    time1 = toc - tic
    
    
    ## running phenograph classification
    tic = time.clock()
    y_pred_oracle = np.zeros_like(y_true)
    for i in range(max(res[0])+1):
        ic, nc = Counter(y_true[res[0] == i]).most_common(1)[0]
        y_pred_oracle[res[0] == i] = ic
        
    score_final.append([accuracy_score(y_true, [ct2idx[c] for c in y_pred]), 
                    accuracy_score(y_true, y_pred_index), 
                    accuracy_score(y_true, y_pred_oracle)])
    
    toc = time.clock()
    time2 = toc - tic   
    
    
    result.append((y_true, y_pred, y_pred_index, y_pred_oracle))
    process_time.append((time0, time1, time2))
    
    #pickle.dump(result, open('processed_file/AML/event_classidication_AML.p', 'wb'))
    sio.savemat('processed_file/Levine32/40000events/event_classidication_Levine32_5.mat',{'y_true':y_true,'y_pred_index':y_pred_index,'y_pred_oracle':y_pred_oracle,'X':X})

00 th batch
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10400605201721191 seconds
Jaccard graph constructed in 1.8401052951812744 seconds
Wrote graph to binary file in 0.029001712799072266 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.632446
After 2 runs, maximum modularity is Q = 0.634459
After 7 runs, maximum modularity is Q = 0.63611
Louvain completed 27 runs in 3.81021785736084 seconds
PhenoGraph complete in 5.7893311977386475 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10600614547729492 seconds
Jaccard graph constructed in 1.8751075267791748 seconds
Wrote graph to binary file in 0.10100579261779785 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.600986
After 6 runs, maximum modularity is Q = 0.604754
Louvain completed 26 runs in 4.655266284942627 seconds
PhenoGraph complete in 6.74738

In [34]:
np.mean(score_final, axis = 0) # score of ACDC, score-based classification, phenograph classification

array([ 0.22378824,  0.05056408,  0.99484359])

In [35]:
process_time

[-2.2807760160503676e-06,
 (0.34848975560908, 211.36753081471034, 0.027091628468269846)]