In [112]:
import os
import sys
import argparse
import numpy as np
import sklearn.model_selection
import sklearn.metrics
import csv
from glob import glob

In [2]:
import util
from linear_classifier import LinearClassifier
from sil import SIL

In [3]:

#haven't changed this yet
class ResultsReport:
    def __init__(self,label_names=None):
        self.res = {}
        self.label_names = label_names
    def add(self,metric,result):
        if metric not in self.res:
            self.res[metric] = []
        self.res[metric].append( result )
    def print_summary(self,metric=None):
        if metric is None:
            for metric in sorted(self.res.keys()):
                if metric != 'confusion':
                    self.print_summary(metric)
            self.print_summary('confusion')
            return
        if metric != 'confusion':
            mean = np.mean(self.res[metric])
            std = np.std(self.res[metric])
            ste = std/np.sqrt(len(self.res[metric])-1)
            print('%s %f %f %f' % (metric,mean,std,ste) )
        else:
            print('confusion')
            print(('%s '*len(self.label_names))%tuple(self.label_names))
            print(sum(self.res['confusion']))

In [None]:
if __name__ == '__main__':

    parser = argparse.ArgumentParser( description='Compute CNN features.' )
    parser.add_argument('--out_dir', '-o', required=True, help='output directory' )
    parser.add_argument('--model', '-m', required=True, help='CNN model' )
    parser.add_argument('--layer', '-l', required=True, help='CNN layer' )
    parser.add_argument('--instance-size', help='instance size' )
    parser.add_argument('--instance-stride', help='instance stride' )
    parser.add_argument('--pool-size', '-p', help='mean pooling size' )
    parser.add_argument('--cat', help='label categories to train (comma separated); default: all' )
    parser.add_argument('--calibrate', action='store_true', help='calibrate classifier' )
    parser.add_argument('--metric', help='metric to optimize during parameter search (accuracy, balanced_accuracy, roc_auc); default: accuracy' )
    parser.add_argument('--classifier', '-c', help='classifier (svm or logistic); default: all' )
    parser.add_argument('--kernel', help='SVM kernel; default: linear' )
    parser.add_argument('--mi', help='MI type (none, median, quantile); default: none (compute mean across images)' )
    parser.add_argument('--quantiles', '-q', help='Number of quantiles; default: 16' )
    parser.add_argument('--sample-weight', help='Weight samples by classification category and this one' )
    parser.add_argument('--group', help='Class groups for reporting results' )
    parser.add_argument('--cv-fold-files', help='cross-validation fold files' )
    parser.add_argument('--cv-folds', help='cross-validation folds' )
    parser.add_argument('--cv-lno', help='cross-validation leave n out' )
    parser.add_argument('--n-jobs', help='number of parallel threads' )
    args = parser.parse_args()
    out_dir = args.out_dir
    if len(out_dir) > 1 and out_dir[-1] != '/':
        out_dir += '/'
    model_name = args.model
    layer = args.layer
    instance_size = args.instance_size
    instance_stride = args.instance_stride
    pool_size = args.pool_size
    categories = args.cat
    metric = args.metric
    calibrate = args.calibrate
    classifier = args.classifier
    kernel = args.kernel
    mi_type = args.mi
    quantiles = args.quantiles
    sample_weight = args.sample_weight
    group = args.group
    cv_fold_files = args.cv_fold_files
    cv_folds = args.cv_folds
    cv_lno = args.cv_lno
    n_jobs = args.n_jobs

    if calibrate is None:
        calibrate = False
    else:
        calibrate = bool(calibrate)
        print(calibrate)

    if n_jobs is not None:
        n_jobs = int(n_jobs)

    # load filenames and labels
    sample_images = util.load_sample_images( out_dir )
    samples,cats,labels = util.load_labels( out_dir )

    if sample_weight is not None:
        # get labels for sample_weight category
        c = np.where(cats==sample_weight)[0][0]
        ln = np.unique([l[c] for l in labels])
        ln.sort()
        ln = list(ln)
        if '' in ln:
            del ln[ln.index('')]
        label_names_sw = ln
        labels_sw = np.array([ ln.index(l) if l in ln else -1 for l in labels[:,c] ])
    if group is not None:
        # get labels for group category
        if group == sample_weight:
            label_names_group = label_names_sw
            labels_group = labels_sw
        else:
            c = np.where(cats==group)[0][0]
            ln = np.unique([l[c] for l in labels])
            ln.sort()
            ln = list(ln)
            if '' in ln:
                del ln[ln.index('')]
            label_names_group = ln
            labels_group = np.array([ ln.index(l) if l in ln else -1 for l in labels[:,c] ])
    if categories is None:
        # get labels for list of categories
        label_names = []
        new_labels = np.zeros(labels.shape,dtype='int')
        for c,cat in enumerate(cats):
            ln = np.unique([l[c] for l in labels])
            ln.sort()
            ln = list(ln)
            label_names.append( ln )
            new_labels[:,c] = [ ln.index(l) for l in labels[:,c] ]
        labels = new_labels
    else:
        # get labels for all categories
        label_names = []
        categories = categories.split(',')
        new_labels = np.zeros((labels.shape[0],len(categories)),dtype='int')
        for i,cat in enumerate(categories):
            c = np.where(cats==cat)[0][0]
            ln = np.unique([l[c] for l in labels])
            ln.sort()
            ln = list(ln)
            if '' in ln:
                del ln[ln.index('')]
            label_names.append( ln )
            new_labels[:,i] = np.array([ ln.index(l) if l in ln else -1 for l in labels[:,c] ])
        labels = new_labels
        cats = categories
        
    # read in CNN features
    feats = {}
    for sample,imagelist in sample_images.items():
        feats[sample] = []
        for fn in imagelist:
            feat_fn = out_dir+fn[:fn.rfind('.')]+'_'+model_name+'-'+layer
            if pool_size is not None:
                feat_fn += '_p'+str(pool_size)
            if instance_size is not None:
                feat_fn += '_i'+str(instance_size)
            if instance_stride is not None:
                feat_fn += '-'+str(instance_stride)
            feat_fn += '.npy'
            feat = np.load(feat_fn)
            if len(feat) == 0:
                continue
            feats[sample].append( feat )

        print('%s %d'%(sample,len(feats[sample])))
        feats[sample] = np.concatenate(feats[sample],axis=0)
        if len(feats[sample].shape) == 1:
            feats[sample] = feats[sample].reshape((1,len(feats[sample])))
            
        # compute mean if needed
        if mi_type is None or mi_type.lower() == 'none':
            if len(feats[sample].shape) > 1:
                feats[sample] = feats[sample].mean(axis=0)

    # build train/test sets
    if cv_fold_files is not None:
        idx_train_test = util.load_cv_files( out_dir, samples, cv_fold_files )
    elif cv_folds is not None or cv_lno is not None:
        if cv_folds is not None:
            cv_folds = int(cv_folds)
        else:
            cv_lno = int(cv_lno)
            if cv_folds is None:
                cv_folds = len(samples) // cv_lno
        idx = np.arange(len(samples))
        if len(label_names) == 1:
            if cv_lno == 1:
                skf = sklearn.model_selection.LeaveOneOut()
            else:
                skf = sklearn.model_selection.StratifiedKFold( n_splits=cv_folds, shuffle=True )
            idx_train_test = list(skf.split(idx,labels[:,0]))
        else:
            # merge label categories to do stratified folds
            skf = sklearn.model_selection.StratifiedKFold( n_splits=cv_folds, shuffle=True )
            la_all = np.array(labels[:,0])
            p = 1
            for i in range(labels.shape[1]):
                la_all += labels[:,i] * p
                p *= len(label_names[i])
            idx_train_test = list(skf.split(idx,la_all))
    else:
        print('Error: train/test split not specified')
        sys.exit(1)

    options = {}
    if kernel is not None:
        options['kernel'] = kernel
    else:
        options['kernel'] = 'linear'
    if classifier is not None:
        options['classifier'] = classifier
    if mi_type is not None:
        options['predict_type'] = mi_type
    if metric is not None:
        options['metric'] = metric
                        
    for c,cat_name in enumerate(cats):
        print(cat_name)
        res = ResultsReport(label_names[c])
        nfolds = len(idx_train_test)
        for f,(idx_train,idx_test) in enumerate(idx_train_test):
            print('Fold '+str(f+1)+'/'+str(len(idx_train_test)))
            idx_train = idx_train[np.where(labels[idx_train,c]!=-1)[0]]
            idx_test = idx_test[np.where(labels[idx_test,c]!=-1)[0]]
            X_train = [ feats[samples[i]] for i in idx_train ]
            y_train = labels[idx_train,c]
            X_test = [ feats[samples[i]] for i in idx_test ]
            y_test = labels[idx_test,c]

            if sample_weight is not None:
                # figure out sample weights
                print('Weighting by '+sample_weight)
                # discard samples missing a label for sample_weight category
                idx_train = idx_train[np.where(labels_sw[idx_train]!=-1)[0]]
                X_train = [ feats[samples[i]] for i in idx_train ]
                
                y_train = labels[idx_train,c]
                y_sw = y_train + len(label_names[c])*labels_sw[idx_train]

                uniq = np.unique(y_sw).tolist()
                counts = np.array([ (y_sw==l).sum() for l in uniq ])
                counts = counts.sum().astype(float) / ( counts * len(counts) )
                sw = np.array([ counts[uniq.index(y)] for y in y_sw ])
            else:
                sw = None

            if mi_type is None:
                model = LinearClassifier( n_jobs=n_jobs, **options )
                model.fit( X_train, y_train, calibrate=calibrate, param_search=True, sample_weight=sw )
            elif mi_type in ['median','max']:
                model = SIL( n_jobs=n_jobs, **options )
                model.fit( X_train, y_train, calibrate=calibrate, param_search=True, sample_weight=sw )
            elif mi_type == 'quantile':
                if quantiles is not None:
                    options['quantiles'] = int(quantiles)
                model = SIL( n_jobs=n_jobs, **options )
                model.fit( X_train, y_train, calibrate=calibrate, param_search=True, sample_weight=sw )
                
            p_predict = model.predict( X_test )
            y_predict = np.argmax(p_predict,axis=1)
            acc = sklearn.metrics.accuracy_score( y_test, y_predict )
            if len(y_test) == 1:
                auc = 0.0
            elif len(np.unique(y_train)) == 2:
                auc = sklearn.metrics.roc_auc_score( y_test, p_predict[:,1] )
            else:
                auc = 0.0
                for i in range(p_predict.shape[1]):
                    auc += sklearn.metrics.roc_auc_score( y_test==i, p_predict[:,i] )
                auc /= p_predict.shape[1]
            kappa = sklearn.metrics.cohen_kappa_score( y_test, y_predict )
            classes = np.unique(y_train)
            np.sort(classes)
            confusion = sklearn.metrics.confusion_matrix( y_test, y_predict, labels=classes )
            res.add('acc',acc)
            res.add('auc',auc)
            res.add('kappa',kappa)
            if len(label_names[c]) == 2:
                res.add('sensitivity', float( np.logical_and(y_test==1, y_predict==y_test).sum() ) / (y_test==1).sum() )
                res.add('specificity', float( np.logical_and(y_test!=1, y_predict==y_test).sum() ) / (y_test!=1).sum() )
            res.add('confusion',confusion)

            print('accuracy %f auc %f' % (acc,auc))
            print(confusion)

            if group is not None:
                # within group class metrics
                l_group = labels_group[idx_test]
                uniq = np.unique(l_group)
                uniq.sort()
                for u in uniq:
                    if u == -1:
                        continue
                    idx = (l_group==u)

                    group_name = '(%s=%s)'%(group,label_names_group[u])
                    res.add('accuracy '+group_name,sklearn.metrics.accuracy_score( y_test[idx], y_predict[idx] ))
                    if len(np.unique(y_train)) == 2:
                        if (y_test[idx]==0).sum() == 0 or (y_test[idx]==1).sum() == 0:
                            auc = 0
                        else:
                            auc = sklearn.metrics.roc_auc_score( y_test[idx], p_predict[idx,1] )
                    else:
                        auc = 0.0
                        for i in range(p_predict.shape[1]):
                            auc += sklearn.metrics.roc_auc_score( y_test[idx]==i, p_predict[idx,i] )
                        auc /= p_predict.shape[1]
                    res.add('auc '+group_name,auc)
                    res.add('kappa '+group_name,sklearn.metrics.cohen_kappa_score( y_test[idx], y_predict[idx] ) )
                    if len(label_names[c]) == 2:
                        res.add('sensitivity '+group_name,float( np.logical_and(y_test[idx]==1, y_predict[idx]==y_test[idx]).sum() ) / (y_test[idx]==1).sum() )
                        res.add('specificity '+group_name,float( np.logical_and(y_test[idx]!=1, y_predict[idx]==y_test[idx]).sum() ) / (y_test[idx]!=1).sum() )
            
        print('Cross-validation results')
        res.print_summary()


In [None]:
#python run_mi_classify.py -o BreaKHis200/ -m vgg16 -l block4_pool --cat tumor --cv-fold-files fold* --pool-size 5 --mi median

In [23]:
#description='Compute CNN features.'

#required parameters
out_dir = os.path.join(os.getcwd(), 'BreaKHis200/')  #'-o', required=True, help='output directory'
model_name = 'vgg16' #'-m', required=True, help='CNN model'
layer = 'block4_pool' #'-l', required=True, help='CNN layer.'

#if only 3 parameters are used:
#python run_mi_classify.py -o BreaKHis200/ -m vgg16 -l block4_pool_p5

#optional parameters 
pool_size = '5' #'--pool-size', '-p', help='mean pooling size'
metric = 'accuracy' #'--metric', help='metric to optimize during parameter search (accuracy, balanced_accuracy, roc_auc); default: accuracy'
calibrate = False #'--calibrate', action='store_true', help='calibrate classifier (True or False); default: False'
classifier = 'svm' #'--classifier', '-c', help='classifier (svm or logistic); default: all'
kernel = 'linear' #'--kernel', help='SVM kernel; default: linear' 
mi_type = 'median' #'--mi', help='MI type (none, median, quantile); default: none (compute mean across images)' 

#not defined
instance_size = None #'--instance-size', help='instance size' 
instance_stride = None #'--instance-stride', help='instance stride'
cv_fold_files = None #'--cv-fold-files', help='cross-validation fold files; default: None'
cv_folds = None #'--cv-folds', help='cross-validation folds; default: None'
cv_lno = None #'--cv-lno', help='cross-validation leave n out; default: None'
n_jobs = '0' #'--n-jobs', help='number of parallel threads; default: 0'
group = None #'--group', help='Class groups for reporting results'
quantiles = '16' #'--quantiles', '-q', help='Number of quantiles; default: 16'
sample_weight = None #'--sample-weight', help='Weight samples by classification category and this one'
categories = None #'--cat', help='label categories to train (comma separated, tumor); default: tumor,tumor_type,benign_type,malignant_type'

In [49]:
# load filenames and labels
sample_images = util.load_sample_images(out_dir)
samples,cats,labels = util.load_labels(out_dir)

In [20]:
print(type(sample_images))
print(len(sample_images.keys()))
#print(sample_images.keys())

<class 'dict'>
2013


In [13]:
def load_labels(out_dir):

    samples = []
    labels = []
    #d = np.loadtxt( out_dir+'labels.csv', dtype=str, delimiter=',' )
    #d = np.array(d)
    d = []
    with open( out_dir+'labels.csv', 'r' ) as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            d.append( row )
    #print([len(di) for di in d])
    d = np.vstack(d)
    print(d)
    samples = d[1:,0]
    cats = d[0,1:]
    labels = d[1:,1:]
    return samples,cats,labels



<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [25]:
#populate optional features
if sample_weight is not None:
    # get labels for sample_weight category
    c = np.where(cats==sample_weight)[0][0]
    ln = np.unique([l[c] for l in labels])
    ln.sort()
    ln = list(ln)
    if '' in ln:
        del ln[ln.index('')]
    label_names_sw = ln
    labels_sw = np.array([ ln.index(l) if l in ln else -1 for l in labels[:,c] ])
    
if group is not None:
    # get labels for group category
    if group == sample_weight:
        label_names_group = label_names_sw
        labels_group = labels_sw
    else:
        c = np.where(cats==group)[0][0]
        ln = np.unique([l[c] for l in labels])
        ln.sort()
        ln = list(ln)
        if '' in ln:
            del ln[ln.index('')]
        label_names_group = ln
        labels_group = np.array([ ln.index(l) if l in ln else -1 for l in labels[:,c] ])

In [66]:
#for i in labels:
#    print(i)
print(categories)    
print(labels.shape[0])
print(labels.shape)

print(labels_none)

print(labels_tumor)

tumor
2013
(2013, 4)
[]
[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]


In [67]:
#create labels and categories
labels_none = []
labels_tumor = []

if categories is None:
    # get labels for all categories
    label_names = []
    new_labels = np.zeros(labels.shape,dtype='int')
    
    for c,cat in enumerate(cats):
        ln = np.unique([l[c] for l in labels])
        ln.sort()
        ln = list(ln)
        
        label_names.append(ln)
        
        new_labels[:,c] = [ln.index(l) for l in labels[:,c]]

    labels = new_labels #labels_none = new_labels
    
else:
    # get labels for list of categories: tumor,tumor_type,benign_type,malignant_type
    label_names = []
    cats = categories.split(',')
    new_labels = np.zeros((labels.shape[0],len(cats)),dtype='int')
    
    for i,cat in enumerate(cats):
        if len(cats) > 1:
            c = np.where(cats==cat)[0][0]
        else:
            c = 0
        ln = np.unique([l[c] for l in labels])
        ln.sort()
        ln = list(ln)
        
        if '' in ln:
            del ln[ln.index('')]
        label_names.append(ln)
        
        new_labels[:,i] = np.array([ln.index(l) if l in ln else -1 for l in labels[:,c]])
        
    labels = new_labels #labels_tumor = new_labels
    cats = categories

In [68]:
# read in CNN features
feats = {}
for sample,imagelist in sample_images.items():
    feats[sample] = []
    for fn in imagelist:
        feat_fn = out_dir+fn[:fn.rfind('.')]+'_'+model_name+'-'+layer
        if pool_size is not None:
            feat_fn += '_p'+str(pool_size)
        if instance_size is not None:
            feat_fn += '_i'+str(instance_size)
        if instance_stride is not None:
            feat_fn += '-'+str(instance_stride)
        feat_fn += '.npy'
        feat = np.load(feat_fn)
        if len(feat) == 0:
            continue
        feats[sample].append( feat )

    print('%s %d'%(sample,len(feats[sample])))
    feats[sample] = np.concatenate(feats[sample],axis=0)
    if len(feats[sample].shape) == 1:
        feats[sample] = feats[sample].reshape((1,len(feats[sample])))
            
    # compute mean if needed
    if mi_type is None or mi_type.lower() == 'none':
        if len(feats[sample].shape) > 1:
            feats[sample] = feats[sample].mean(axis=0)

SOB_M_DC-14-16716-029 1
SOB_M_DC-14-16716-034 1
SOB_M_DC-14-16716-031 1
SOB_M_DC-14-16716-024 1
SOB_M_DC-14-16716-007 1
SOB_M_DC-14-16716-015 1
SOB_M_DC-14-16716-028 1
SOB_M_DC-14-16716-013 1
SOB_M_DC-14-16716-017 1
SOB_M_DC-14-16716-003 1
SOB_M_DC-14-16716-012 1
SOB_M_DC-14-16716-020 1
SOB_M_DC-14-16716-019 1
SOB_M_DC-14-16716-027 1
SOB_M_DC-14-16716-004 1
SOB_M_DC-14-16716-006 1
SOB_M_DC-14-16716-022 1
SOB_M_DC-14-16716-032 1
SOB_M_DC-14-16716-026 1
SOB_M_DC-14-16716-030 1
SOB_M_DC-14-16716-016 1
SOB_M_DC-14-16716-021 1
SOB_M_DC-14-16716-005 1
SOB_M_DC-14-16716-002 1
SOB_M_DC-14-16716-008 1
SOB_M_DC-14-16716-014 1
SOB_M_DC-14-16716-033 1
SOB_M_DC-14-16716-025 1
SOB_M_DC-14-16716-009 1
SOB_M_DC-14-16716-001 1
SOB_M_DC-14-16716-010 1
SOB_M_DC-14-16716-023 1
SOB_M_DC-14-16716-011 1
SOB_M_DC-14-16716-018 1
SOB_M_DC-14-11951-019 1
SOB_M_DC-14-11951-028 1
SOB_M_DC-14-11951-021 1
SOB_M_DC-14-11951-018 1
SOB_M_DC-14-11951-009 1
SOB_M_DC-14-11951-001 1
SOB_M_DC-14-11951-008 1
SOB_M_DC-14-1195

SOB_M_DC-14-16336-009 1
SOB_M_DC-14-16336-003 1
SOB_M_DC-14-16336-013 1
SOB_M_DC-14-16336-017 1
SOB_M_DC-14-16336-011 1
SOB_M_DC-14-16336-006 1
SOB_M_DC-14-16336-002 1
SOB_M_DC-14-16336-018 1
SOB_M_DC-14-16336-014 1
SOB_M_DC-14-16336-012 1
SOB_M_DC-14-16336-010 1
SOB_M_DC-14-16336-015 1
SOB_M_DC-14-16336-019 1
SOB_M_DC-14-16336-008 1
SOB_M_DC-14-16336-007 1
SOB_M_DC-14-9461-040 1
SOB_M_DC-14-9461-053 1
SOB_M_DC-14-9461-004 1
SOB_M_DC-14-9461-034 1
SOB_M_DC-14-9461-023 1
SOB_M_DC-14-9461-035 1
SOB_M_DC-14-9461-027 1
SOB_M_DC-14-9461-019 1
SOB_M_DC-14-9461-029 1
SOB_M_DC-14-9461-061 1
SOB_M_DC-14-9461-056 1
SOB_M_DC-14-9461-011 1
SOB_M_DC-14-9461-058 1
SOB_M_DC-14-9461-039 1
SOB_M_DC-14-9461-009 1
SOB_M_DC-14-9461-068 1
SOB_M_DC-14-9461-063 1
SOB_M_DC-14-9461-030 1
SOB_M_DC-14-9461-070 1
SOB_M_DC-14-9461-007 1
SOB_M_DC-14-9461-065 1
SOB_M_DC-14-9461-064 1
SOB_M_DC-14-9461-067 1
SOB_M_DC-14-9461-073 1
SOB_M_DC-14-9461-072 1
SOB_M_DC-14-9461-054 1
SOB_M_DC-14-9461-025 1
SOB_M_DC-14-9461-03

SOB_M_DC-14-14926-001 1
SOB_M_DC-14-14926-016 1
SOB_M_DC-14-14926-015 1
SOB_M_DC-14-14926-003 1
SOB_M_DC-14-14926-008 1
SOB_M_DC-14-14926-010 1
SOB_M_DC-14-14926-007 1
SOB_M_DC-14-14926-006 1
SOB_M_DC-14-14926-013 1
SOB_M_DC-14-14926-002 1
SOB_M_DC-14-14926-009 1
SOB_M_DC-14-14926-018 1
SOB_M_DC-14-14926-011 1
SOB_M_DC-14-14926-012 1
SOB_M_DC-14-14926-004 1
SOB_M_DC-14-14926-005 1
SOB_M_DC-14-17915-007 1
SOB_M_DC-14-17915-011 1
SOB_M_DC-14-17915-010 1
SOB_M_DC-14-17915-009 1
SOB_M_DC-14-17915-017 1
SOB_M_DC-14-17915-013 1
SOB_M_DC-14-17915-001 1
SOB_M_DC-14-17915-016 1
SOB_M_DC-14-17915-021 1
SOB_M_DC-14-17915-019 1
SOB_M_DC-14-17915-015 1
SOB_M_DC-14-17915-018 1
SOB_M_DC-14-17915-006 1
SOB_M_DC-14-17915-012 1
SOB_M_DC-14-17915-003 1
SOB_M_DC-14-17915-004 1
SOB_M_DC-14-17915-020 1
SOB_M_DC-14-17915-005 1
SOB_M_DC-14-17915-002 1
SOB_M_DC-14-17915-022 1
SOB_M_DC-14-17915-014 1
SOB_M_DC-14-2523-023 1
SOB_M_DC-14-2523-021 1
SOB_M_DC-14-2523-030 1
SOB_M_DC-14-2523-019 1
SOB_M_DC-14-2523-027

SOB_M_PC-14-15687B-002 1
SOB_M_PC-14-9146-017 1
SOB_M_PC-14-9146-001 1
SOB_M_PC-14-9146-012 1
SOB_M_PC-14-9146-010 1
SOB_M_PC-14-9146-004 1
SOB_M_PC-14-9146-020 1
SOB_M_PC-14-9146-006 1
SOB_M_PC-14-9146-008 1
SOB_M_PC-14-9146-014 1
SOB_M_PC-14-9146-007 1
SOB_M_PC-14-9146-009 1
SOB_M_PC-14-9146-021 1
SOB_M_PC-14-9146-019 1
SOB_M_PC-14-9146-002 1
SOB_M_PC-14-9146-015 1
SOB_M_PC-14-9146-018 1
SOB_M_PC-14-9146-003 1
SOB_M_PC-14-9146-011 1
SOB_M_PC-14-9146-013 1
SOB_M_PC-14-9146-005 1
SOB_M_PC-14-9146-016 1
SOB_M_PC-14-15704-012 1
SOB_M_PC-14-15704-011 1
SOB_M_PC-14-15704-020 1
SOB_M_PC-14-15704-006 1
SOB_M_PC-14-15704-016 1
SOB_M_PC-14-15704-004 1
SOB_M_PC-14-15704-023 1
SOB_M_PC-14-15704-001 1
SOB_M_PC-14-15704-014 1
SOB_M_PC-14-15704-003 1
SOB_M_PC-14-15704-025 1
SOB_M_PC-14-15704-030 1
SOB_M_PC-14-15704-013 1
SOB_M_PC-14-15704-033 1
SOB_M_PC-14-15704-010 1
SOB_M_PC-14-15704-019 1
SOB_M_PC-14-15704-008 1
SOB_M_PC-14-15704-028 1
SOB_M_PC-14-15704-007 1
SOB_M_PC-14-15704-022 1
SOB_M_PC-14-

SOB_B_F-14-9133-003 1
SOB_B_F-14-9133-024 1
SOB_B_F-14-9133-022 1
SOB_B_F-14-9133-015 1
SOB_B_F-14-9133-035 1
SOB_B_F-14-9133-002 1
SOB_B_F-14-9133-019 1
SOB_B_F-14-9133-017 1
SOB_B_F-14-9133-036 1
SOB_B_F-14-9133-018 1
SOB_B_F-14-9133-027 1
SOB_B_F-14-9133-007 1
SOB_B_F-14-9133-010 1
SOB_B_F-14-9133-009 1
SOB_B_F-14-9133-021 1
SOB_B_F-14-9133-025 1
SOB_B_F-14-9133-008 1
SOB_B_F-14-9133-031 1
SOB_B_F-14-9133-026 1
SOB_B_F-14-9133-023 1
SOB_B_F-14-9133-028 1
SOB_B_F-14-9133-030 1
SOB_B_F-14-9133-004 1
SOB_B_F-14-9133-006 1
SOB_B_F-14-9133-037 1
SOB_B_F-14-9133-001 1
SOB_B_F-14-9133-020 1
SOB_B_F-14-23060CD-002 1
SOB_B_F-14-23060CD-013 1
SOB_B_F-14-23060CD-016 1
SOB_B_F-14-23060CD-001 1
SOB_B_F-14-23060CD-003 1
SOB_B_F-14-23060CD-004 1
SOB_B_F-14-23060CD-011 1
SOB_B_F-14-23060CD-015 1
SOB_B_F-14-23060CD-010 1
SOB_B_F-14-23060CD-008 1
SOB_B_F-14-23060CD-006 1
SOB_B_F-14-23060CD-012 1
SOB_B_F-14-23060CD-009 1
SOB_B_F-14-23060CD-007 1
SOB_B_F-14-23060CD-005 1
SOB_B_F-14-23060CD-014 1
SOB_B_

In [131]:
def load_cv_files( out_dir, samples, cv_fold_files ):
        
    cv_files = sorted(list(glob(out_dir + cv_fold_files)))
    print(cv_files)
    idx_train_test = []
    for fn in cv_files:
        print(fn)
        f = np.loadtxt( fn, dtype=str, delimiter=',' )
        idx_train = np.where(f[:,1]=='train')[0]
        idx_test = np.where(f[:,1]=='test')[0]
        name_train = f[idx_train,0]
        name_test = f[idx_test,0]
        idx_train = np.array([ np.where(samples==name)[0] for name in name_train ]).flatten()
        idx_test = np.array([ np.where(samples==name)[0] for name in name_test ]).flatten()
        idx_train_test.append( [idx_train,idx_test] )
    return idx_train_test

#a = util.load_cv_files(out_dir, samples, cv_fold_files)
#print(a)
#print(out_dir)
#print(samples)
#print(type(samples))
#samples_list = list(samples)
#print(type(samples_list))
#print(cv_fold_files)

#cv_files = sorted(list(glob(out_dir + cv_fold_files)))
#print(cv_files)
idx_train_test = []
for fn in cv_files:
    #print(fn)
    f = np.loadtxt(fn, dtype=str, delimiter=',')
    #print(type(f))
    idx_train = np.where(f[:,1]=='train')[0]
    idx_test = np.where(f[:,1]=='test')[0]
    #print(len(idx_train))
    #print(len(idx_test))
    name_train = f[idx_train,0]
    #print(len(name_train))
    name_test = f[idx_test,0]
    #print(len(name_test))
    #print(name_train[0])
    
    name = name_train[0]
    print(samples_list.index(name))
    break
    print(np.where(samples==name))
    break
    idx_train = np.array([np.where(samples==name)[0] for name in name_train]).flatten()
    print(idx_train.shape)
    #idx_test = np.array([ np.where(samples==name)[0] for name in name_test ]).flatten()
    break




BreaKHis_v1/histology_slides/breast/benign/SOB/fibroadenoma/SOB_B_F_14-21998EF/200X/SOB_B_F-14-200
BreaKHis_v1/histology_slides/breast/benign/SOB/adenosis/SOB_B_A_14-22549AB/200X/SOB_B_A-14-200


ValueError: 'BreaKHis_v1/histology_slides/breast/benign/SOB/adenosis/SOB_B_A_14-22549AB/200X/SOB_B_A-14-200' is not in list

In [108]:
#get the list of cv_fold_files
#not populating the idx_train_test files

cv_fold_files = 'fold*'
#print(cv_fold_files)

# build train/test sets
if cv_fold_files is not None:
    idx_train_test = util.load_cv_files(out_dir, samples, cv_fold_files)
    print(idx_train_test)
    
elif cv_folds is not None or cv_lno is not None:
    if cv_folds is not None:
        cv_folds = int(cv_folds)
    else:
        cv_lno = int(cv_lno)
        if cv_folds is None:
            cv_folds = len(samples) // cv_lno
    idx = np.arange(len(samples))
    if len(label_names) == 1:
        if cv_lno == 1:
            skf = sklearn.model_selection.LeaveOneOut()
        else:
            skf = sklearn.model_selection.StratifiedKFold( n_splits=cv_folds, shuffle=True )
        idx_train_test = list(skf.split(idx,labels[:,0]))
    else:
        # merge label categories to do stratified folds
        skf = sklearn.model_selection.StratifiedKFold( n_splits=cv_folds, shuffle=True )
        la_all = np.array(labels[:,0])
        p = 1
        for i in range(labels.shape[1]):
            la_all += labels[:,i] * p
            p *= len(label_names[i])
        idx_train_test = list(skf.split(idx,la_all))
else:
    print('Error: train/test split not specified')
    sys.exit(1)

options = {}
if kernel is not None:
    options['kernel'] = kernel
else:
    options['kernel'] = 'linear'
if classifier is not None:
    options['classifier'] = classifier
if mi_type is not None:
    options['predict_type'] = mi_type
if metric is not None:
    options['metric'] = metric    

/home/vibha/Documents/Research/project2/ImageMIL/BreaKHis200/fold0.txt
/home/vibha/Documents/Research/project2/ImageMIL/BreaKHis200/fold1.txt
/home/vibha/Documents/Research/project2/ImageMIL/BreaKHis200/fold2.txt
/home/vibha/Documents/Research/project2/ImageMIL/BreaKHis200/fold3.txt
/home/vibha/Documents/Research/project2/ImageMIL/BreaKHis200/fold4.txt
[[array([], dtype=int64), array([], dtype=int64)], [array([], dtype=int64), array([], dtype=int64)], [array([], dtype=int64), array([], dtype=int64)], [array([], dtype=int64), array([], dtype=int64)], [array([], dtype=int64), array([], dtype=int64)]]


In [107]:
idx_train_test

[[array([], dtype=int64), array([], dtype=int64)],
 [array([], dtype=int64), array([], dtype=int64)],
 [array([], dtype=int64), array([], dtype=int64)],
 [array([], dtype=int64), array([], dtype=int64)],
 [array([], dtype=int64), array([], dtype=int64)]]

In [104]:
#handle the case of only one category
#cats = 'tumor'
if cats == str:
    print('string category')
    cats = [cats]

for c,cat_name in enumerate(cats):
        res = ResultsReport(label_names[c])
        #print(res)
        nfolds = len(idx_train_test)
        #print(nfolds)

        for f,(idx_train,idx_test) in enumerate(idx_train_test):
            #print(f)
            #print(idx_train)
            #print(idx_test)
            print('Fold '+str(f+1)+'/'+str(len(idx_train_test)))

            idx_train = idx_train[np.where(labels[idx_train,c]!=-1)[0]]
            idx_test = idx_test[np.where(labels[idx_test,c]!=-1)[0]]
            X_train = [ feats[samples[i]] for i in idx_train ]
            y_train = labels[idx_train,c]
            X_test = [ feats[samples[i]] for i in idx_test ]
            y_test = labels[idx_test,c]

            if sample_weight is not None:
                # figure out sample weights
                print('Weighting by '+sample_weight)
                # discard samples missing a label for sample_weight category
                idx_train = idx_train[np.where(labels_sw[idx_train]!=-1)[0]]
                X_train = [ feats[samples[i]] for i in idx_train ]
                
                y_train = labels[idx_train,c]
                y_sw = y_train + len(label_names[c])*labels_sw[idx_train]

                uniq = np.unique(y_sw).tolist()
                counts = np.array([ (y_sw==l).sum() for l in uniq ])
                counts = counts.sum().astype(float) / ( counts * len(counts) )
                sw = np.array([ counts[uniq.index(y)] for y in y_sw ])
            else:
                sw = None

            if mi_type is None:
                model = LinearClassifier( n_jobs=n_jobs, **options )
                model.fit( X_train, y_train, calibrate=calibrate, param_search=True, sample_weight=sw )
            elif mi_type in ['median','max']:
                model = SIL( n_jobs=n_jobs, **options )
                model.fit( X_train, y_train, calibrate=calibrate, param_search=True, sample_weight=sw )
            elif mi_type == 'quantile':
                if quantiles is not None:
                    options['quantiles'] = int(quantiles)
                model = SIL( n_jobs=n_jobs, **options )
                model.fit( X_train, y_train, calibrate=calibrate, param_search=True, sample_weight=sw )
                
            p_predict = model.predict( X_test )
            y_predict = np.argmax(p_predict,axis=1)
            acc = sklearn.metrics.accuracy_score( y_test, y_predict )
            if len(y_test) == 1:
                auc = 0.0
            elif len(np.unique(y_train)) == 2:
                auc = sklearn.metrics.roc_auc_score( y_test, p_predict[:,1] )
            else:
                auc = 0.0
                for i in range(p_predict.shape[1]):
                    auc += sklearn.metrics.roc_auc_score( y_test==i, p_predict[:,i] )
                auc /= p_predict.shape[1]
            kappa = sklearn.metrics.cohen_kappa_score( y_test, y_predict )
            classes = np.unique(y_train)
            np.sort(classes)
            confusion = sklearn.metrics.confusion_matrix( y_test, y_predict, labels=classes )
            res.add('acc',acc)
            res.add('auc',auc)
            res.add('kappa',kappa)
            if len(label_names[c]) == 2:
                res.add('sensitivity', float( np.logical_and(y_test==1, y_predict==y_test).sum() ) / (y_test==1).sum() )
                res.add('specificity', float( np.logical_and(y_test!=1, y_predict==y_test).sum() ) / (y_test!=1).sum() )
            res.add('confusion',confusion)

            print('accuracy %f auc %f' % (acc,auc))
            print(confusion)

            if group is not None:
                # within group class metrics
                l_group = labels_group[idx_test]
                uniq = np.unique(l_group)
                uniq.sort()
                for u in uniq:
                    if u == -1:
                        continue
                    idx = (l_group==u)

                    group_name = '(%s=%s)'%(group,label_names_group[u])
                    res.add('accuracy '+group_name,sklearn.metrics.accuracy_score( y_test[idx], y_predict[idx] ))
                    if len(np.unique(y_train)) == 2:
                        if (y_test[idx]==0).sum() == 0 or (y_test[idx]==1).sum() == 0:
                            auc = 0
                        else:
                            auc = sklearn.metrics.roc_auc_score( y_test[idx], p_predict[idx,1] )
                    else:
                        auc = 0.0
                        for i in range(p_predict.shape[1]):
                            auc += sklearn.metrics.roc_auc_score( y_test[idx]==i, p_predict[idx,i] )
                        auc /= p_predict.shape[1]
                    res.add('auc '+group_name,auc)
                    res.add('kappa '+group_name,sklearn.metrics.cohen_kappa_score( y_test[idx], y_predict[idx] ) )
                    if len(label_names[c]) == 2:
                        res.add('sensitivity '+group_name,float( np.logical_and(y_test[idx]==1, y_predict[idx]==y_test[idx]).sum() ) / (y_test[idx]==1).sum() )
                        res.add('specificity '+group_name,float( np.logical_and(y_test[idx]!=1, y_predict[idx]==y_test[idx]).sum() ) / (y_test[idx]!=1).sum() )
            
        print('Cross-validation results')
        res.print_summary()


0
[]
[]
Fold 1/5
Cross-validation results
confusion
B M 


KeyError: 'confusion'

In [89]:

list_c = []
list_c.append(cats_test)
print(list_c)

['tumor']


In [91]:
cats

'tumor'

In [80]:
cats_l = ['tumor']