In [10]:
from ACDC.random_walk_classifier import * 
from ACDC.cell_type_annotation import * 

In [11]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle

channels = ['Y89Di','Pr141Di','Nd143Di','Nd144Di','Nd145Di','Nd146Di','Nd148Di','Sm149Di','Eu151Di',
            'Gd155Di','Gd160Di','Dy161Di','Dy163Di','Ho165Di','Er167Di','Er168Di','Tm169Di','Er170Di','Yb172Di']

path = 'data/private13class_proportion/colon_20k/'

df = pd.read_csv(path + 'colon_20k_1.csv.gz', sep=',', header = 0, compression = 'gzip')
#df = df[df.cell_type != 'NotGated']


table = pd.read_csv(path + 'private_wrongtable.csv', sep=',', header=0, index_col=0)
table = table.fillna(0)

cts, channels = get_label(table)

#X0= np.arcsinh((df[channels].values - 1.0)/5.0)
X0= df[channels].values

In [12]:
idx2ct = [key for idx, key in enumerate(table.index)]
#idx2ct.append('unknown')

ct2idx = {key:idx for idx, key in enumerate(table.index)}
#ct2idx['unknown'] = len(table.index)
        
ct_score = np.abs(table.as_matrix()).sum(axis = 1)

## compute manual gated label
y0 = np.zeros(df.cell_type.shape)

for i, ct in enumerate(df.cell_type):
    if ct in ct2idx:
        y0[i] = ct2idx[ct]
    #else:
        #y0[i] = ct2idx['unknown']

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix
import phenograph
from sklearn.cross_validation import StratifiedKFold

n_neighbor = 10
thres = 0.5


In [14]:
import time
import scipy.io as sio 

skf = StratifiedKFold(y0, n_folds=5, shuffle=True, random_state=0)
result = []
score_final = []


process_time = []
c = 0
for tr, te in skf:
    print('%02d th batch' % c)
    if c == 1:
        break
    c += 1
    
    X = X0.copy()
    y_true = y0.copy()

    X = X[tr, :]
    y_true = y_true[tr]

    mk_model =  compute_marker_model(pd.DataFrame(X, columns = channels), table, 0.0)

    ## compute posterior probs
    tic = time.clock()
    score = get_score_mat(X, [], table, [], mk_model)
    score = np.concatenate([score, 1.0 - score.max(axis = 1)[:, np.newaxis]], axis = 1)    

    ## get indices     
    ct_index = get_unique_index(X, score, table, thres)
    
    ## baseline - classify events    
    y_pred_index = np.argmax(score, axis = 1)
    
    toc = time.clock()
    time0 = toc - tic
    
    
    
    ## running ACDC
    tic = time.clock()
    res_c = get_landmarks(X, score, ct_index, idx2ct, phenograph, thres)

    landmark_mat, landmark_label = output_feature_matrix(res_c, [idx2ct[i] for i in range(len(idx2ct))]) 

    landmark_label = np.array(landmark_label)

    lp, y_pred = rm_classify(X, landmark_mat, landmark_label, n_neighbor)

    process_time.append(toc-tic)
    
    res = phenograph.cluster(X, k=30, directed=False, prune=False, min_cluster_size=10, jaccard=True,
                        primary_metric='euclidean', n_jobs=-1, q_tol=1e-3)
    
    toc = time.clock()
    time1 = toc - tic
    
    
    ## running phenograph classification
    tic = time.clock()
    y_pred_oracle = np.zeros_like(y_true)
    for i in range(max(res[0])+1):
        ic, nc = Counter(y_true[res[0] == i]).most_common(1)[0]
        y_pred_oracle[res[0] == i] = ic
        
    score_final.append([accuracy_score(y_true, [ct2idx[c] for c in y_pred]), 
                    accuracy_score(y_true, y_pred_index), 
                    accuracy_score(y_true, y_pred_oracle)])
    
    toc = time.clock()
    time2 = toc - tic   
    
    
    result.append((y_true, y_pred, y_pred_index, y_pred_oracle))
    process_time.append((time0, time1, time2))
    
    #pickle.dump(result, open('processed_file/BMMC/event_classidication_BMMC.p', 'wb'))
    sio.savemat('processed_file/private13class/wrong20/event_classidication_private_1.mat',{'y_true':y_true,'y_pred_index':y_pred_index,'y_pred_oracle':y_pred_oracle,'X':X})

00 th batch
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10300588607788086 seconds
Jaccard graph constructed in 1.2270703315734863 seconds
Wrote graph to binary file in 0.026001691818237305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.669607
After 2 runs, maximum modularity is Q = 0.675995
Louvain completed 22 runs in 6.272358655929565 seconds
PhenoGraph complete in 7.63443660736084 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10400605201721191 seconds
Jaccard graph constructed in 1.452082872390747 seconds
Wrote graph to binary file in 0.11500668525695801 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.614231
Louvain completed 21 runs in 1.810103416442871 seconds
PhenoGraph complete in 3.489199638366699 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbo

In [15]:
np.mean(score_final, axis = 0) # score of ACDC, score-based classification, phenograph classification

array([ 0.10859644,  0.01250391,  0.73547984])

In [16]:
score_final

[[0.10859643638637075, 0.012503907471084715, 0.73547983744920287]]

In [17]:
process_time

[-1.1403819826227846e-06,
 (0.1317286449952917, 51.22575769109153, 0.01354517066806693)]