In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import os
import csv
import random
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.svm import SVR, SVC

### train data: map file paths with labels

In [None]:
a = "../label_train_test.pkl"
with open(a, 'rb') as f:
    label, train, test = pickle.load(f)
print(len(train), len(test))

data_path = '../gnet2/data2800'
all_data = os.listdir(data_path)
print('# files', len(all_data))
data = []
all_labels = set()
for file_ in all_data:
    basename = file_
    name = os.path.splitext(basename)[0]
    data.append(os.path.join(data_path, basename))
    label_ = label[name]
    ls = label_.split('+')
    for l in ls:
        all_labels.add(l)
    # print(name, label_)
print(len(all_labels), all_labels)

### test data

In [3]:
area_file = '../roi_results.txt'
area_map = {}
with open(area_file, 'r') as f:
    for line in f.readlines():
        name, patches, area = line.strip().split()
        name = os.path.splitext(name)[0]
        patches = int(patches)
        area = float(area)
        area_map[name] = {'patches':patches, 'area':area}
print(area_map)

{'TC19010210': {'area': 2797025499.8828063, 'patches': 1867}, 'TC19012687': {'area': 2053898240.0, 'patches': 1441}, 'TC19005393': {'area': 2280798701.240744, 'patches': 1537}, 'TC19020557': {'area': 2410073600.0, 'patches': 1693}, 'TC19020559': {'area': 2726311904.5400586, 'patches': 1899}, 'TC19005274': {'area': 2594693120.0, 'patches': 1771}, 'TC19016108': {'area': 2460992000.0, 'patches': 1733}, 'TC19020569': {'area': 3338923691.375258, 'patches': 2272}, 'TC19010318': {'area': 826399379.964972, 'patches': 2150}, 'TC19012680': {'area': 2251472896.0, 'patches': 1584}, 'TC19012672': {'area': 2053581057.5332284, 'patches': 1415}, 'JS19': {'area': 2743246848.0, 'patches': 1905}, 'TC19005405': {'area': 2031676789.2151313, 'patches': 1428}, 'TC19005303': {'area': 2201920748.3048005, 'patches': 1510}, 'JS47': {'area': 2735316587.2397575, 'patches': 1921}, 'TC19005158': {'area': 2100500384.343844, 'patches': 1447}, 'TC19011584': {'area': 3740956611.2054214, 'patches': 2505}, 'TC19011603': {

In [4]:
label_file = '../test1521.xlsx'
df_l = pd.read_excel(label_file)
df_l.head(10)

label_map = {}
for i,row in df_l.iterrows():
    label_map[row['case_no']] = row['old_label'].split('+')

In [5]:
data_path = '../gnet2/test1521'
data = [os.path.join(data_path, f) for f in os.listdir(data_path)]

In [6]:
f = data[5]
df = pd.read_csv(f)
# area = float(df.area[df.area.notnull()])
area = area_map[os.path.basename(f).split('_BATCH')[0]]['area']
print(area)
# patches = float(df.patches[df.patches.notnull()])
# print(patches)
df.head(10)

1456699369.9440808


Unnamed: 0,yolo_cell_class,yolo_cell_class_det,xcp_cell_class,xcp_cell_class_det,x,y,w,h,ACTINO_det,AGC_A_det,...,LSIL_E_softmax_det,LSIL_F_softmax_det,MC_softmax_det,PH_softmax_det,RC_softmax_det,SC_softmax_det,SCC_G_softmax_det,SCC_R_softmax_det,TRI_softmax_det,HSV_softmax_det
0,FUNGI,0.655732,CANDIDA,0.936778,25326,27807,158,56,4.48929,-0.00157,...,1.529697e-05,5.357431e-05,6.073481e-05,0.0003916048,3.170923e-05,5.88192e-07,0.03109014,0.01312842,0.000457,1.1e-05
1,SC,0.249201,HSV,0.996674,20518,27457,44,42,0.438997,-2.7065,...,7.0084e-07,3.20448e-07,5.89161e-07,2.92458e-06,6.71612e-07,0.003093922,2.1545e-08,4.54345e-07,0.000192,0.996674
2,SC,0.171056,HSV,0.976276,22534,21383,46,44,-0.477307,-2.36043,...,1.84301e-07,9.493e-08,1.13527e-07,6.35957e-07,3.73478e-07,0.0236901,5.713e-09,6.0065e-08,4e-06,0.976276
3,SC,0.264347,HSV,0.931605,22930,24583,40,42,-0.494387,-2.28268,...,4.69926e-07,2.11381e-07,2.61418e-07,1.665571e-06,6.19406e-07,0.06829115,1.4872e-08,1.40915e-07,1e-05,0.931605
4,SC,0.120689,HSV,0.919178,24154,26737,44,38,-0.819844,-2.18405,...,3.22192e-07,2.1288e-07,1.89383e-07,1.314374e-06,5.81259e-07,0.08053885,2.0868e-08,1.31682e-07,6e-06,0.919178
5,VIRUS,0.10001,HSV,0.880271,2710,24921,62,46,-0.806261,-2.17609,...,7.21156e-07,3.50566e-07,4.72936e-07,2.64488e-06,2.20569e-06,0.1195794,3.1223e-08,2.09562e-07,1.2e-05,0.880271
6,SC,0.701848,HSV,0.853876,11986,25005,38,38,-0.427922,-2.31996,...,1.543471e-06,6.01021e-07,6.74871e-07,5.241451e-06,1.503874e-06,0.1458,4.0613e-08,6.09396e-07,0.000145,0.853876
7,VIRUS,0.107953,HSV,0.83951,27550,4809,46,42,-0.509021,-2.25843,...,5.37544e-07,2.17523e-07,2.93234e-07,1.769429e-06,8.77179e-07,0.1604135,1.4799e-08,1.5714e-07,1.4e-05,0.83951
8,VIRUS,0.157765,HSV,0.754153,14570,10743,62,48,-0.758364,-1.94768,...,1.921335e-06,8.93678e-07,1.212679e-06,6.215367e-06,3.511304e-06,0.2453673,1.25075e-07,6.00945e-07,2.6e-05,0.754153
9,VIRUS,0.351283,HSV,0.734386,16848,32323,60,70,-0.534236,-2.0198,...,3.851467e-06,2.140914e-06,4.166717e-06,1.619352e-05,1.398373e-05,0.2651098,4.13517e-07,2.106181e-06,6.5e-05,0.734386


### features

In [7]:
tolerate = {"AGC":{"AGC_A", "AGC_B"}, 
            "LSIL":{"ASCUS", "LSIL_E", "LSIL_F"}, 
            "ASCUS":{"ASCUS", "LSIL_E", "LSIL_F"}, 
            "HSIL-SCC_G":{"HSIL_B", "HSIL_M", "HSIL_S", "SCC_G"}, 
            "SCC_R":{"SCC_R"}, 
            "EC":{"EC"}, 
            "CC":{"CC"}, 
            "VIRUS":{"VIRUS", "HSV"}, 
            "FUNGI":{"FUNGI", "CANDIDA"}, 
            "ACTINO":{"ACTINO"}, 
            "TRI":{"TRI"}, 
            "PH":{"PH"}, 
            "SC":{"SC", "RC", "MC", "GEC"}}

# cross 
cross_dtct_p = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
cross_clas_p = [0.6, 0.7, 0.8, 0.9, 0.99, 0.999]
cross_header = ["{}_{:.2f}_{:.3f}".format(key, dp, cp) for key in tolerate for dp in cross_dtct_p for cp in cross_clas_p]

# detect
dclass = ["AGC", "LSIL", "ASCUS", "HSIL-SCC_G", "SCC_R", "EC", "CC", "VIRUS", "FUNGI", "ACTINO", "TRI", "PH", "SC"]
dtct_p = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
dheader = ["{}_{:.2f}".format(key, dp) for key in dclass for dp in dtct_p]

# classify
cclass = ["ACTINO", "AGC_A", "AGC_B", "ASCUS", "CC", "EC", "FUNGI", 
          "GEC", "HSIL_B", "HSIL_M", "HSIL_S", "LSIL_E", "LSIL_F", 
          "MC", "PH", "RC", "SC", "SCC_G", "SCC_R", "TRI", "VIRUS"]
clas_p = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999]
cheader = ["{}_{:.3f}".format(key, cp) for key in cclass for cp in clas_p]

header = cross_header + dheader + cheader
header_map = {key:i for i,key in enumerate(header)}

all_labels = {'ACTINO':0, 'AGC':1, 'ASCH':2, 'ASCUS':3, 'CC':4, 'EC':5, 
              'FUNGI':6, 'CANDIDA':6, 'HSIL':7, 'VIRUS':8, 'HSV':8, 
              'LSIL':9, 'NILM':10, 'SCC':11, 'TRI':12}
bin_labels = {}
for l in all_labels:
    if l == 'NILM':
        bin_labels[l] = 0
    else:
        bin_labels[l] = 1
print(all_labels)
print(bin_labels)

{'AGC': 1, 'VIRUS': 8, 'CANDIDA': 6, 'ASCH': 2, 'NILM': 10, 'CC': 4, 'TRI': 12, 'EC': 5, 'ASCUS': 3, 'FUNGI': 6, 'HSV': 8, 'SCC': 11, 'HSIL': 7, 'ACTINO': 0, 'LSIL': 9}
{'AGC': 1, 'VIRUS': 1, 'ASCUS': 1, 'ASCH': 1, 'HSIL': 1, 'CC': 1, 'TRI': 1, 'EC': 1, 'CANDIDA': 1, 'FUNGI': 1, 'HSV': 1, 'SCC': 1, 'NILM': 0, 'ACTINO': 1, 'LSIL': 1}


In [None]:
# header_imap = {(key, dp, cp):header_map["{}_{:.2f}_{:.3f}".format(key, dp, cp)] for key in tolerate for dp in dtct_p for cp in clas_p}
# print(header_imap)

# with open('header_imap.pkl', 'wb') as f:
#     pickle.dump(header_imap, f)
    
# # # read pkl file
# # with open('header_imap.pkl', 'rb') as f:
# #     header_imap = pickle.load(f)

In [8]:
def extract_oldfashion(f):
    df = pd.read_csv(f)
    features = [0 for i in range(len(header))]
    # check if is empty
    if df.yolo_cell_class.isnull().values.any():  
        return features
    for i,row in df.iterrows():
        # cross levelup features
        for dp in cross_dtct_p:
            for cp in cross_clas_p:
                if row['xcp_cell_class'] in tolerate[row['yolo_cell_class']]:
                    if row['yolo_cell_class_det'] > dp and row['xcp_cell_class_det'] > cp:
                        key = "{}_{:.2f}_{:.3f}".format(row['yolo_cell_class'], dp, cp)
                        features[header_map[key]] += 1
        # detect levelup features
        for dp in dtct_p:
            if row['yolo_cell_class_det'] > dp:
                key = "{}_{:.2f}".format(row['yolo_cell_class'], dp)
                features[header_map[key]] += 1
        # classify levelup features
        for cp in clas_p:
            if row['xcp_cell_class_det'] > cp:
                xcp_key = row['xcp_cell_class']
                if xcp_key == 'CANDIDA':
                    xcp_key = 'FUNGI'
                elif xcp_key == 'HSV':
                    xcp_key = 'VIRUS'
                key = "{}_{:.3f}".format(xcp_key, cp)
                features[header_map[key]] += 1
    return features

def extract(f):
    df = pd.read_csv(f)
    features = [0 for i in range(len(header))]
    # check if is empty
    if df.detect_label.isnull().values.any():  
        return features
    for i,row in df.iterrows():
        # cross levelup features
        for dp in cross_dtct_p:
            for cp in cross_clas_p:
                if row['classify_label'] in tolerate[row['detect_label']]:
                    if row['detect_probability'] > dp and row['classify_probability'] > cp:
                        key = "{}_{:.2f}_{:.3f}".format(row['detect_label'], dp, cp)
                        features[header_map[key]] += 1
        # detect levelup features
        for dp in dtct_p:
            if row['detect_probability'] > dp:
                key = "{}_{:.2f}".format(row['detect_label'], dp)
                features[header_map[key]] += 1
        # classify levelup features
        for cp in clas_p:
            if row['classify_probability'] > cp:
                key = "{}_{:.3f}".format(row['classify_label'], cp)
                features[header_map[key]] += 1
    return features

def collect(data, test=True):
    X = []
    ya = []  # all labels
    yb = []  # binary labels
    names = []
    for f in data:
        features = extract_oldfashion(f)
        if not test:
            basename = os.path.splitext(os.path.basename(f))[0]
            ls = label[basename].split('+')
        else:
            basename = os.path.basename(f).split('_BATCH')[0]
            if not basename in label_map:
                continue
            ls = label_map[basename]
        if sum(features) == 0:
            continue
        for l in ls:
            a = all_labels[l]
            b = bin_labels[l]
            X.append(features)
            ya.append(a)
            yb.append(b)
            names.append(f)
    return X, ya, yb, names

def worker():
    files = data
    random.shuffle(files)
    random.shuffle(files)
    print("# files:", len(files))

    X, ya, yb, names = [], [], [], []
    
    executor = ProcessPoolExecutor(max_workers=36)
    tasks = []

    batch_size = 50
    for i in range(0, len(files), batch_size):
        batch = files[i : i+batch_size]
#         collect(batch)
        tasks.append(executor.submit(collect, batch))

    job_count = len(tasks)
    for future in as_completed(tasks):
        X_, ya_, yb_, names_ = future.result()  # get the returning result from calling fuction
        X += X_
        ya += ya_
        yb += yb_
        names += names_
        job_count -= 1
        if job_count % 8 == 0: 
            print("One Job Done, Remaining Job Count: %s" % (job_count))

    X = np.asarray(X)
    ya = np.asarray(ya)
    yb = np.asarray(yb)
    print(X.shape, ya.shape, yb.shape)
    
    return X, ya, yb, names

X, ya, yb, names = worker()

# files: 1521
One Job Done, Remaining Job Count: 24
One Job Done, Remaining Job Count: 16
One Job Done, Remaining Job Count: 8
One Job Done, Remaining Job Count: 0
(1420, 1050) (1420,) (1420,)


In [9]:
with open('./gnet2test1500.pkl', 'wb') as f:
    pickle.dump(X, f)
    pickle.dump(ya, f)
    pickle.dump(yb, f)
    pickle.dump(names, f)
    
# with open('./train15test1500.pkl', 'rb') as f:
#     X = pickle.load(f)
#     ya = pickle.load(f)
#     yb = pickle.load(f)
#     names = pickle.load(f)

# with open('/home/ssd_array0/Develop/liyu/codect/set1/feature_dict.pkl', 'rb') as f:
#     feature_dict = pickle.load(f)
    
# X, ya, yb = [], [], []
# for key,value in feature_dict.items():
#     ya += [all_labels[key]] * len(value)
#     yb += [0 if key == 'NILM' else 1] * len(value)
#     X += value
# X = np.asarray(X)
# ya = np.asarray(ya)
# yb = np.asarray(yb)
# print(X.shape, ya.shape, yb.shape)

### classification

In [10]:
class RFESVM:
    def __init__(self):
        self.estimator = SVR(kernel="linear")
        self.selector = None
        
    def select(self, X, y, num_feature):
        self.selector = RFE(self.estimator, num_feature, step=1)
        self.selector = self.selector.fit(X, y)
        selected_feature_indices = self.selector.support_ # ndarray of True/False
        return selected_feature_indices

def split(X, y, mode, test_size, seed):
    random.seed(seed)
    N = 2 if mode == "bin" else 13
    idx = {i:[] for i in range(N)}
    for i,c in enumerate(y):
        idx[c].append(i)
    idx_t, idx_v = [], []
    for c,indices in idx.items():
        n = len(indices)
        idx_t += indices[:-int(n*test_size)]
        idx_v += indices[-int(n*test_size):]
    X_train = X[idx_t]
    X_valid = X[idx_v]
    y_train = y[idx_t]
    y_valid = y[idx_v]
    return X_train, X_valid, y_train, y_valid

def rfe(X_train, X_valid, y_train, y_valid, num_features):
    rfe_svm = RFESVM()
    selected_feature_indices = rfe_svm.select(X_train, y_train, num_features)
    X_train = X_train[:, selected_feature_indices] # Select elements of numpy array via boolean mask array
    X_valid = X_valid[:, selected_feature_indices]
    return X_train, X_valid, y_train, y_valid

def evaluate(y_valid, y_pred, mode):
    if mode == "bin":
        cur_labels = {0:'NILM', 1:'ABN'}
    else:
        cur_labels = {value:key for key,value in all_labels.items()}
    
    results = {i:[0,0,0] for i in range(len(cur_labels))}  # tp, fn, fp
    for t,p in zip(y_valid, y_pred):
        if t == p:
            results[t][0] += 1
        else:
            results[t][1] += 1
            results[p][2] += 1
    for i in range(len(results)):
        labeli = cur_labels[i]
        recall = results[i][0] / (results[i][0] + results[i][1]) if results[i][0] + results[i][1] != 0 else 0.0
        precision = results[i][0] / (results[i][0] + results[i][2]) if results[i][0] + results[i][2] != 0 else 0.0
        print(labeli, results[i][0] + results[i][1], ' recall = {:.4f}'.format(recall), 'precision = {:.4f}'.format(precision))
        
    
def classify(mode="bin"):  # mode = "bin" or "all"
    seed = 2019
    test_size = 0.2
#     num_features = 1170

    y = yb if mode == "bin" else ya
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=seed)
    X_train, X_valid, y_train, y_valid = split(X, y, mode, test_size, seed)
    # X_train, X_valid, y_train, y_valid = rfe(X_train, X_valid, y_train, y_valid, num_features)

    model = XGBClassifier(max_depth=15, 
                          n_jobs=36, 
                          subsample=0.8, 
                          colsample_bylevel=1,
                          colsample_bytree=0.6, 
                          scale_pos_weight=1, 
                          n_estimators=500, 
                          min_child_weight=1, 
                          learning_rate=0.1, 
                          gamma=0,
                          random_state=seed)
    eval_set = [(X_train, y_train), (X_valid, y_valid)]
    if mode == "bin":
        eval_metric = ["auc", "error"]
    else:
        eval_metric = ["merror"]
    model.fit(X_train, y_train, 
              early_stopping_rounds=50, 
              eval_metric=eval_metric, 
              eval_set=eval_set, 
              verbose=False)

    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print("accuracy: {:.4f}".format(accuracy))
    evaluate(y_valid, y_pred, mode)
    
    return model

# print('binary classification')
# bin_model = classify("bin")
# print()
# print('multilabel classification')
# all_model = classify("all")

In [11]:
bin_imap = {0:'NILM', 1:'ABN'}
all_imap = {0: 'ACTINO', 1: 'AGC', 2: 'ASCH', 3: 'ASCUS', 4: 'CC', 5: 'EC', 6: 'CANDIDA', 7: 'HSIL', 8: 'HSV', 9: 'LSIL', 10: 'NILM', 11: 'SCC', 12: 'TRI'}

# with open("gnet2models.pkl", 'wb') as f:
#     pickle.dump(bin_model, f)
#     pickle.dump(all_model, f)
#     pickle.dump(bin_imap, f)
#     pickle.dump(all_imap, f)


# with open("train15models.pkl", 'rb') as f:
#     bin_model = pickle.load(f)
#     all_model = pickle.load(f)
#     bin_imap = pickle.load(f)
#     all_imap = pickle.load(f)

#### release

In [12]:
class Diagnoser:
    def __init__(self, pkl_file):
        self.bin_cls = range(len(bin_imap))
        self.all_cls = range(len(all_imap))
        with open(pkl_file, 'rb') as f:
            self.bin_model = pickle.load(f)
            self.all_model = pickle.load(f)
            self.bin_imap = pickle.load(f)
            self.all_imap = pickle.load(f)
    
    def extract_old(self, csv_file):
        df = pd.read_csv(f)
        features = [0 for i in range(len(header))]
        # check if is empty
        if df.yolo_cell_class.isnull().values.any():  
            return features
        for i,row in df.iterrows():
            # cross levelup features
            for dp in cross_dtct_p:
                for cp in cross_clas_p:
                    if row['xcp_cell_class'] in tolerate[row['yolo_cell_class']]:
                        if row['yolo_cell_class_det'] > dp and row['xcp_cell_class_det'] > cp:
                            key = "{}_{:.2f}_{:.3f}".format(row['yolo_cell_class'], dp, cp)
                            features[header_map[key]] += 1
            # detect levelup features
            for dp in dtct_p:
                if row['yolo_cell_class_det'] > dp:
                    key = "{}_{:.2f}".format(row['yolo_cell_class'], dp)
                    features[header_map[key]] += 1
            # classify levelup features
            for cp in clas_p:
                if row['xcp_cell_class_det'] > cp:
                    xcp_key = row['xcp_cell_class']
                    if xcp_key == 'CANDIDA':
                        xcp_key = 'FUNGI'
                    elif xcp_key == 'HSV':
                        xcp_key = 'VIRUS'
                    key = "{}_{:.3f}".format(xcp_key, cp)
        return features
    
    def extract_new(self, csv_file):
        df = pd.read_csv(csv_file)
        features = [0 for i in range(len(header))]
        # check if is empty
        if df.detect_label.isnull().values.any():  
            return features
        for i,row in df.iterrows():
            # cross levelup features
            for dp in cross_dtct_p:
                for cp in cross_clas_p:
                    if row['classify_label'] in tolerate[row['detect_label']]:
                        if row['detect_probability'] > dp and row['classify_probability'] > cp:
                            key = "{}_{:.2f}_{:.3f}".format(row['detect_label'], dp, cp)
                            features[header_map[key]] += 1
            # detect levelup features
            for dp in dtct_p:
                if row['detect_probability'] > dp:
                    key = "{}_{:.2f}".format(row['detect_label'], dp)
                    features[header_map[key]] += 1
            # classify levelup features
            for cp in clas_p:
                if row['classify_probability'] > cp:
                    key = "{}_{:.3f}".format(row['classify_label'], cp)
        return features
    
#     def bin_predict(self, csv_file):
#         f = self.extract_old(csv_file)
#         p = self.bin_model.predict(f)[0]
#         l = self.bin_imap[p]
#         return l
        
#     def all_predict(self, csv_file):
#         f = self.extract_old(csv_file)
#         p = self.all_model.predict(f)[0]
#         l = self.all_imap[p]
#         return l

#     def bin_and_all_predict(self, csv_file):
#         f = self.extract_old(csv_file)
#         pb = self.bin_model.predict(f)[0]
#         lb = self.bin_imap[pb]
#         pa = self.all_model.predict(f)[0]
#         la = self.all_imap[pa]
#         return lb, la
    
    def bin_and_all_predict(self, X):
        pb = self.bin_model.predict(X)
        lb = [self.bin_imap[p] for p in pb]
        pa = self.all_model.predict(X)
        la = [self.all_imap[p] for p in pa]
        return lb, la
    
    def bin_predict(self, X, y):
        y_pred = self.bin_model.predict(X)
        accuracy = accuracy_score(y, y_pred)
        print("accuracy: {:.4f}".format(accuracy))
        evaluate(y, y_pred, 'bin')
        cm = confusion_matrix(y, y_pred, labels=self.bin_cls)
        return cm
        
    def all_predict(self, X, y):
        y_pred = self.all_model.predict(X)
        accuracy = accuracy_score(y, y_pred)
        print("accuracy: {:.4f}".format(accuracy))
        evaluate(y, y_pred, 'all')
        cm = confusion_matrix(y, y_pred, labels=self.all_cls)
        return cm

In [13]:
pkl_file = "gnet2models.pkl"
d = Diagnoser(pkl_file)

b_cm = d.bin_predict(X, yb)
a_cm = d.all_predict(X, ya)

data_p = {'case_no':[], 'diagnosis_b':[], 'diagnosis_m':[], 'label_b':[], 'label_m':[]}
lb, la = d.bin_and_all_predict(X)
data_p['case_no'] = [os.path.basename(n).split('_BATCH')[0] for n in names]
data_p['diagnosis_b'] = lb
data_p['diagnosis_m'] = la
data_p['label_b'] = [bin_imap[i] for i in yb]
data_p['label_m'] = [all_imap[i] for i in ya]    

df_p = pd.DataFrame(data=data_p)
df_p.to_csv('gnet2p.csv')
df_p.head(10)

accuracy: 0.6930
NILM 1260  recall = 0.7294 precision = 0.9063
ABN 160  recall = 0.4062 precision = 0.1601
accuracy: 0.7225
ACTINO 0  recall = 0.0000 precision = 0.0000
AGC 1  recall = 0.0000 precision = 0.0000
ASCH 7  recall = 0.0000 precision = 0.0000
ASCUS 72  recall = 0.1944 precision = 0.0757
CC 24  recall = 0.0000 precision = 0.0000
EC 3  recall = 0.0000 precision = 0.0000
FUNGI 27  recall = 0.1481 precision = 0.8000
HSIL 3  recall = 0.3333 precision = 0.5000
HSV 0  recall = 0.0000 precision = 0.0000
LSIL 9  recall = 0.0000 precision = 0.0000
NILM 1260  recall = 0.7992 precision = 0.8951
SCC 1  recall = 0.0000 precision = 0.0000
TRI 13  recall = 0.0000 precision = 0.0000


Unnamed: 0,case_no,diagnosis_b,diagnosis_m,label_b,label_m
0,TC19005329,ABN,NILM,NILM,NILM
1,TC19012703,NILM,NILM,NILM,NILM
2,TC19005171,NILM,NILM,NILM,NILM
3,TC19005273,NILM,NILM,NILM,NILM
4,TC19010303,NILM,NILM,NILM,NILM
5,TC19010251,NILM,NILM,NILM,NILM
6,TC19012776,NILM,NILM,NILM,NILM
7,TC19014647,ABN,ASCUS,NILM,NILM
8,TC19012864,ABN,NILM,NILM,NILM
9,TC19005157,NILM,NILM,NILM,NILM


In [14]:
def write_cm(cm, label_imap, csv_name):
    l = len(label_imap)
    header = [label_imap[i] for i in range(l)]
    with open(csv_name, 'w') as csvf:
        writer = csv.writer(csvf, delimiter=',')
        writer.writerow(['-']+header)
        for i in range(l):
            writer.writerow([header[i]] + list(cm[i,:]))
    
write_cm(b_cm, bin_imap, 'cm-gnet2bin.csv')
write_cm(a_cm, all_imap, 'cm-gnet2all.csv')

In [15]:
TP = len(df_p[(df_p.diagnosis_m != 'NILM') & (df_p.label_m != 'NILM')])
FN = len(df_p[(df_p.diagnosis_m == 'NILM') & (df_p.label_m != 'NILM')])
FP = len(df_p[(df_p.diagnosis_m != 'NILM') & (df_p.label_m == 'NILM')])
TN = len(df_p[(df_p.diagnosis_m == 'NILM') & (df_p.label_m == 'NILM')])

paiyin = TN / (TN + FP)
jiayin = FN / (TP + FN)
jiayang = FP / (TN + FP)
accuracy = (TN + TP) / (TP + FN + FP + TN)
sensitivity = TP / (TP + FN)
print(paiyin, jiayin, jiayang, accuracy, sensitivity)

0.7992063492063493 0.7375 0.2007936507936508 0.7387323943661972 0.2625


In [None]:
nocan = df_p[(df_p.diagnosis_m != 'CC') & (df_p.label_m == 'CC')]
nocan.case_no

### cross validation

In [None]:
def classify(mode="bin"):  # mode = "bin" or "all"
    seed = 2018
    
    y = yb if mode == "bin" else ya
    num_folds = 5
    X_train_folds = np.array_split(X, num_folds)
    y_train_folds = np.array_split(y, num_folds)
    
    best = [0.0, None] # accuracy, i
    for i in range(0, num_folds):
        X_train = np.concatenate(X_train_folds[:i] + X_train_folds[i+1:])
        y_train = np.concatenate(y_train_folds[:i] + y_train_folds[i+1:])
        X_valid = X_train_folds[i]
        y_valid = y_train_folds[i]

        model = XGBClassifier(max_depth=15, 
                              n_jobs=24, 
                              subsample=0.8, 
                              colsample_bylevel=1,
                              colsample_bytree=0.6, 
                              scale_pos_weight=1, 
                              n_estimators=500, 
                              min_child_weight=1, 
                              learning_rate=0.1, 
                              gamma=0,
                              random_state=seed)
        eval_set = [(X_train, y_train), (X_valid, y_valid)]
        if mode == "bin":
            eval_metric = ["auc", "error"]
        else:
            eval_metric = ["merror"]
        model.fit(X_train, y_train, 
                  early_stopping_rounds=50, 
                  eval_metric=eval_metric, 
                  eval_set=eval_set, 
                  verbose=False)

        y_pred = model.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        print("accuracy: {:.4f}".format(accuracy))
        
        if accuracy > best[0]:
            best = [accuracy, i]
            
    print(best)

In [None]:
print('binary classification')
classify("bin")
print()
print('multilabel classification')
classify("all")