In [1]:
import torch
#from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from Optim import Optim
import torch.optim as optim
import argparse
import random
import numpy as np
import timeit
from models import GCN, ChebyNet, MoNet, DSGC
from utils import computeLaplacian
import torchvision
import torchvision.transforms as transforms
import pandas as pd
from sklearn.preprocessing import normalize

from CustomCVs import StratifiedKFoldMixedSizes, StratifiedKFoldByGroups
from data_handling_Monet_DTI import create_data_set
from sklearn.model_selection import GridSearchCV, GroupKFold, LeaveOneGroupOut
from scipy.spatial.distance import cdist

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score,cross_val_predict,StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

# parser 객체 생성
parser = argparse.ArgumentParser(description='PyTorch Time series forecasting')
# argument 추가
parser.add_argument('--model', type=str, default='DPIEnn' ,help='Model Name')
parser.add_argument('--epochs', type=int, default=60,help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=128, metavar='N',help='batch size')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--seed', type=int, default=1234,help='random seed')
parser.add_argument('--gpu', type=int, default=0)
parser.add_argument('--save', type=str,  default='save/model.pt',help='path to save the final model')
parser.add_argument('--cuda', type=str, default=True, help='use gpu or not')
parser.add_argument('--nn', type=int, default=16, help='number of the nearest neighbors')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--sigma', type=float, default=None, help='sigma coefficient.')
parser.add_argument('--embedding', type=int, default=40, help='embedding_num')
parser.add_argument('--cv', type=int, default=0, help='cv type: 0, 1, 2')
parser.add_argument('--numlayers', type=int, default=2, help='# of layers')

#parser.add_argument('--nn', type=str, default='8,8,8,8,3', help='number of the nearest neighbors')
#parser.add_argument('--l', type=float, default=0.25, help='laplacian constant for testing')

# method로 명령창에서 주어진 인자를 args 이름으로 파싱
#jupyter notebook에서는 이 부분 에러 발생
# : argparse 라이브러리를 사용하기 원한다면 터미널이나 다른 프레임워크에서 실행
#args = parser.parse_args()

import easydict

# 전체 최고 : 50 1024 : 54 / 54
# test 최고 : 60 1024
 
args = easydict.EasyDict({
    "model": "MoNet",
    "epochs": 60,
    "batch_size": 1024,
    "dropout": 0.001,
    "seed": "1234",
    "gpu": 0,
    "save": 'save/model.pt',
    "cuda": True,
    "nn": 16,
    "lr": 0.001,
    "sigma": None,
    "embedding": 40,
    "cv": 0,
    "numlayers": 2,
})

print('args', args)

# Data
print('==> Preparing data..')

args {'model': 'MoNet', 'epochs': 60, 'batch_size': 1024, 'dropout': 0.001, 'seed': '1234', 'gpu': 0, 'save': 'save/model.pt', 'cuda': True, 'nn': 16, 'lr': 0.001, 'sigma': None, 'embedding': 40, 'cv': 0, 'numlayers': 2}
==> Preparing data..


In [2]:
#set a device
# gpu/cpu를 통해서 학습 및 test 실행
device = torch.device('cuda:'+str(args.gpu) if torch.cuda.is_available() else 'cpu')
print(device)

cudnn.benchmark = True
bShowParameter = False

if bShowParameter:
    for name, param in model.named_parameters():
        if param.requires_grad:      # requires_grad == True : tensor의 모든 연산에 대하여 추적& 기록
            print(name, param.data.size())

# #print # of parameters
# nParams = sum([p.nelement() for p in model.parameters()])
# print('* number of parameters: %d' % nParams)

def miniBatchDat(X,y,batchSize = 128,del_idx=None):
    n = len(X)
    # X = normalize(X, axis=0, norm='l1')

    # n개의 data를 batchSize별로 나누고 그 batch들의 시작점마다
    for begin in range(0, n, batchSize):
        # y = torch.Tensor(y[begin:begin + batchSize]).type(torch.LongTensor)
        #if not del_idx:
            # X, y data에서 나눠진 batch들의 Tensor를 2차원으로 
        #    yield torch.unsqueeze(torch.Tensor(X[begin:begin + batchSize, :]),2), torch.Tensor(y[begin:begin + batchSize]).long()
        #else:
            X1 = np.delete(X[begin:begin + batchSize, :],del_idx,1)
            X2 = X[begin:begin + batchSize, del_idx]
            yield torch.unsqueeze(torch.Tensor(X1),2), torch.Tensor(X2), torch.Tensor(y[begin:begin + batchSize]).long()

# 가중치 초기화
def weights_init(m):
    if type(m) == nn.Linear:
        m.reset_parameters()

#training
def train(model,epoch,X,y):
    # training 시작 시간
    time_st = timeit.default_timer()
    # 학습
    model.train()
    
    train_loss = 0
    correct = 0
    total = 0
    batch_idx = 0
    
    #for i1, i2 in miniBatchDat(X,y,args.batch_size,del_idx):
    #    print(i1, i2)
    # batch로 나눠서 batch 순서대로 학습 진행 + loss 계산 + optimize
    for in1,in2, targets in miniBatchDat(X,y,args.batch_size,del_idx):
        in1,in2, targets = in1.to(device),in2.to(device), targets.to(device)
        optimizer.zero_grad()
        """
        print('in1: ' + str(in1))
        print('in1 type: ' + str(type(in1)))
        print('in1 size: ' + str(in1.size()))
        """

        #print('in1 size: ' + str(in1.size())): in1 size: torch.Size([64, 251, 1])
        #print('targets size: ' + str(targets.size())): targets size: torch.Size([64])
        
        #print(len(in1), len(in1[0]), len(in1[0][0]))
        if args.model == 'MoNet':
            outputs = model(in1)
        else: 
            outputs = model(in1,in2)
        
        #print(outputs, targets)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        batch_idx += 1

    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data, param.grad)
    train_time = timeit.default_timer() - time_st
    print('[%3d/%4d] '%(epoch, args.epochs),'Training Time: %2f'% train_time, 'Loss: %.3f | Acc: %.3f%% (%d/%d)'%(train_loss/(batch_idx+1),100.*correct/total, correct, total), end =" ")


#testing
def test(purpose, model, X, y):
    #print("test 중")
    time_st = timeit.default_timer()
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        batch_idx = 0
        for in1, in2, targets in miniBatchDat(X, y, args.batch_size, del_idx):
            in1, in2, targets = in1.to(device), in2.to(device), targets.to(device)
            if args.model == 'MoNet': 
                outputs = model(in1)
                #print(in1, outputs)
            else: 
                outputs = model(in1,in2)
                
            for i in range(list(targets.size())[0]):
                if targets[i] < 0: targets[i] = 0
                    
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            #print("test 중:", predicted)
            total += targets.size(0)
            #print(predicted)
            #print(targets)
            #print(predicted.eq(targets))
            #print(predicted.eq(targets).sum())
            correct += predicted.eq(targets).sum().item()
            batch_idx += 1
    test_time = timeit.default_timer() - time_st
    test_acc = 100.*correct/total

    print('Testing Time: %2f'% test_time, 'Acc: %.3f%% (%d/%d)'%(test_acc, correct, total))
    if(purpose == 'acc'):
        return test_acc
    else:
        return predicted

best_acc = 0.0
# best_epoch = 0
cv_type = 0

cpu


In [3]:
# Here we will import our data from the csv: we do NOT drop out participants with incomplete features (complete=False), but they should have at least 90% feature completeness (completeness_threshold=0.9), we add Age and Sex and SiteID (age_group_tesla_site) as extra columns (covariates), and we automaticly exclude sites with too little participants (min_counts_per_site). I added documentation to this function, check it for further details.

# ENIGMA_OCD_26-01-2019.csv 파일 upload하여 data 이용
X,fs_labels,cov,y, groups = create_data_set(complete=True, completeness_threshold=0.9,
                        covariates=['site'], min_counts_per_site='auto')

#print('x:{},fs_labels:{},cov:{},y:{},groups:{}'.format(X,fs_labels,cov,y,groups))
# X : 각 환자별로 정보 저장
# y : 강박증 여부 (0-강박증x / 1-강박증ㅇ)
# fs_labels : 뇌 구조와 관련된 열의 이름 값
# groups : site 열의 모든 행 값

#print(relevant_features)
#print("X: ", X)
#print("y: ", y)
#print("cov: ", cov)
#print("fs_labels: ", fs_labels)
#print("groups: ", groups)

"""
df = pd.read_csv('./dat/dti.merge13.csv')
dfdf = pd.DataFrame()
covariates = ['Age', 'Sex', 'site']

for i in range(len(df.columns)):
    if(i>=20):
        dfdf[df.columns[i]] = df[df.columns[i]].reset_index(drop=True)
        
X = dfdf.dropna(axis=0)  # 뇌 구조물들 사이의 tract 값들 저장
y = df['Dx']
fs_labels = df.columns[20:]
#cov = df.iloc[df.columns['Age', 'Sex', 'site']]
#groups
"""
# dai와 train set 동일시
X_test,fs_labels_test,cov_test,y_test, groups_testing = create_data_set(TesTrain="Test", complete=True, completeness_threshold=0.9,
                        covariates=['site'], min_counts_per_site='auto')

Loading ENIGMA dataset: Complete = True, Min_Threshold = 0.9, Covariates = ['site'], Min_Counts_per_Site = auto, y_label 

X:  938
Finished loading data set: 938 samples, 248 FS features, 1 covariates 
 
 
test set 설정
Loading ENIGMA dataset: Complete = True, Min_Threshold = 0.9, Covariates = ['site'], Min_Counts_per_Site = auto, y_label 

X:  314
Finished loading data set: 308 samples, 248 FS features, 1 covariates 
 
 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [4]:
print('X shape:{}'.format(X.shape))
#print(relevant_features)
#print(cov)

# fs_labels : csv에서 사용된 feature들 저장된 배열
#print(fs_labels)

feat_surf = []
feat_thick = []

feat_idx = []
x_del_idx = []

del_idx = []

def computeDist(df,feat1,feat2,min,max,group):
    # dtype='f8' : a, b 배열을 float64로 맞춰주기 위한 코드
    a = np.array(df.loc[feat1].tolist()[:-1], dtype='f8')  # .reshape(1, -1)
    b = np.array(df.loc[feat2].tolist()[:-1], dtype='f8')  # .reshape(1, -1)
    
    for idx in range(3):
        a[idx] = (a[idx] - min[group][idx])/(max[group][idx] - min[group][idx])
        b[idx] = (b[idx] - min[group][idx])/(max[group][idx] - min[group][idx])

    return np.linalg.norm(a - b)

# 각 label별 min, max 계산 + 저장
def computeMinMax(row,min,max,group):
    for idx in range(3):
        if min[group][idx] > float(row[idx]):
            min[group][idx] = float(row[idx])
        if max[group][idx] < float(row[idx]):
            max[group][idx] = float(row[idx])
    return min,max

X shape:(938, 248)


In [5]:
if args.model == 'GCN' or args.model == 'DSGC' or args.model == 'ChebyNet':

    df = pd.read_csv('./dat/dictionary.csv', index_col=0)
    #print(df.index)

    dist = np.zeros((fs_labels.shape[0], fs_labels.shape[0]))
    max = [[-9999,-9999,-9999],[-9999,-9999,-9999],[-9999,-9999,-9999]]
    min = [[9999,9999,9999],[9999,9999,9999],[9999,9999,9999]]

    del_idx = []
    index = df.index
    #print(df.index.values)

    for feat1 in index:
        # if label_group[feat1] == 1:
        row = df.loc[feat1].tolist()[:-1]
        if(not (df.loc[feat1].tolist()[3]=='skip' or df.loc[feat1].tolist()[3]==' skip')):
            #print(row)
            min,max = computeMinMax(row,min,max,label_group[feat1] - 1)


    #remove LSurfArea,RSurfArea,LThickness,RThickness
    for i, feat1 in enumerate(fs_labels):
        
        if feat1 in index:
            for j, feat2 in enumerate(fs_labels):
                if feat2 in index:
                    
                    if label_group[feat1] == 1:
                        if label_group[feat2] == 1:
                            dist[i, j] = computeDist(df,feat1,feat2,min,max,0)
                        elif label_group[feat2] == 2 or label_group[feat2] == 3:
                            dist[i, j] = 1.0
                            
                    elif label_group[feat1] == 2:
                        if label_group[feat2] == 1:
                            dist[i, j] = 1.0
                        elif label_group[feat2] == 2:
                            dist[i, j] = computeDist(df, feat1, feat2,min,max,1)
                            
                    elif label_group[feat1] == 3:
                        if label_group[feat2] == 1:
                            dist[i, j] = 1.0
                        elif label_group[feat2] == 3:
                            dist[i, j] = computeDist(df, feat1, feat2,min,max,2)
        else:
            del_idx.append(i)
            #print(feat1): subcort_ICV & cort_ICV : ENIGMA data에는 존재 but dictionary에는 존재x feature
            
    dist = np.delete(dist,del_idx,axis = 0)
    dist = np.delete(dist,del_idx,axis = 1)

    print(dist.shape)
    print(dist)
    print(del_idx)
    #print(np.sum(np.isnan(dist)))
    # X = np.delete(X,del_idx,1)

    # 각 feature별로 가장 가까운 순서대로 feature의 index 저장
    print(np.argsort(dist)[:, 1:args.nn+1] )

if args.model == 'MoNet':
    dist = np.full(X.shape, 0.5)
    print('dist: ' + str(dist))
    print(len(dist), len(dist[0]))

dist: [[0.5 0.5 0.5 ... 0.5 0.5 0.5]
 [0.5 0.5 0.5 ... 0.5 0.5 0.5]
 [0.5 0.5 0.5 ... 0.5 0.5 0.5]
 ...
 [0.5 0.5 0.5 ... 0.5 0.5 0.5]
 [0.5 0.5 0.5 ... 0.5 0.5 0.5]
 [0.5 0.5 0.5 ... 0.5 0.5 0.5]]
938 248


In [6]:
"""
유전자 data decode/encode -> team3
ocd에서는 영향 x
abcd, conn data와 같이 큰 data non-lin
        
<data pre-processing>
age에 큰 영향, age: non-linear
1. subcort + cor_ICV = total_ICV -> 분모 / 분자 : feature들 -> ratio 이용
   -> age related된 effect 보정 가능 예상
      thickness 는 잘 보정 x
      missing data 많은 col 찾기
   => 보정하고 다시 학습 진행
2. 10대가 많으면 , 나눠서 실험
3. 23개 feature -> dictionary.csv 파일 feature들로 새로 생성
   개수 적으면 RF에서 강력 -> light model이 더 잘나올듯
   gcn & 다른 모델 학습 진행
"""

'\n유전자 data decode/encode -> team3\nocd에서는 영향 x\nabcd, conn data와 같이 큰 data non-lin\n        \n<data pre-processing>\nage에 큰 영향, age: non-linear\n1. subcort + cor_ICV = total_ICV -> 분모 / 분자 : feature들 -> ratio 이용\n   -> age related된 effect 보정 가능 예상\n      thickness 는 잘 보정 x\n      missing data 많은 col 찾기\n   => 보정하고 다시 학습 진행\n2. 10대가 많으면 , 나눠서 실험\n3. 23개 feature -> dictionary.csv 파일 feature들로 새로 생성\n   개수 적으면 RF에서 강력 -> light model이 더 잘나올듯\n   gcn & 다른 모델 학습 진행\n'

In [7]:
# Now we import the cross-validators we want to use, depending on the specific analysis we want to perform. These are based on scikit learn's CV classes (http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection)

# Since I am performing gridsearches to optimize my models, I have to use a nested cross-validation loop (inner_cv).
# If you are not doing any nested cross-validation you can ignore the inner_cv's for now and only have to use the outer_cv lines.
random_seed = 0 # By setting this on zero we ensure we have the exact same splits!

# nested(중첩) cross-validation하기 위해 outer_cv, inner_cv 두 개 사용
if cv_type == 0:
    # '1. Outer CV: Site-stratified fixed fold sizes, Inner CV: Site-stratified fixed fold sizes'
    #outer_cv = StratifiedKFoldByGroups(n_splits=10, random_state=random_seed, shuffle=True)
    #inner_cv = StratifiedKFoldByGroups(n_splits=5, random_state=random_seed, shuffle=True)
    outer_cv = StratifiedKFold(n_splits=10, random_state=random_seed, shuffle=True)
    inner_cv = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
elif cv_type == 1:
    # '2. Outer CV: Leave One Group Out, Inner CV: Group K-Fold'
    outer_cv = LeaveOneGroupOut() # Note that here we don't have to use a random seed, as these splits will always be the same
    inner_cv = GroupKFold(n_splits=5)
elif cv_type == 2:
    # '3. Outer CV: Site-stratified mixed fold sizes, Inner CV: Site-stratified fixed fold sizes'
    outer_cv = StratifiedKFoldMixedSizes(random_state=random_seed)
    inner_cv = StratifiedKFoldByGroups(n_splits=5, random_state=random_seed, shuffle=True)

testAccArr = []
avgValAcc = []
avgAUC = []
avgAUC_train = []
sc = StandardScaler()

for outer_id, (train_id) in enumerate(outer_cv.split(X, y, cov)):

        print("Outer ID: {}".format(outer_id + 1))
        
        """Train & Test set 맞추는 코드"""
        X_train_val = X
        y_train_val = y
        groups_train_val = cov
        groups_test = cov_test
        
        #X_train_val, X_test = X[train_id], X[test_id]
        #y_train_val, y_test = y[train_id], y[test_id]
        #groups_train_val, groups_test = cov[train_id], cov[test_id]

        if args.model == 'GCN' or args.model == 'DSGC' or args.model == 'ChebyNet' or args.model == 'MoNet':

            del_idx=[]
            X_graph_feat = np.delete(X_train_val,del_idx,1)
            datAdj = computeLaplacian(X_graph_feat,args,dist)  #adjacency Matrix
            valAcc = []
            
            for inner_id, (train_id, val_id) in enumerate(inner_cv.split(X_train_val, y_train_val, groups_train_val)):
                print("Inner ID: {}".format(inner_id + 1))
                X_train, X_val = X_train_val[train_id], X_train_val[val_id]
                y_train, y_val = y_train_val[train_id], y_train_val[val_id]
                groups_train, groups_val = groups_train_val[train_id], groups_train_val[val_id]

                X_train = sc.fit_transform(X_train)
                X_val = sc.transform(X_val)
                
                #print(len(X_train), len(X_train[0]))
                
                #initialize weights & bias
                # set a model
                model = eval(args.model).Model(args, datAdj)
                model = model.to(device)
                
                # set a loss function & an optimizer
                # optimizer : sgd, adam만 가능
                criterion = nn.CrossEntropyLoss()
                optimizer = Optim(model.parameters(), 'sgd', lr=args.lr, weight_decay=1e-4)

                #GCN
                for epoch in range(args.epochs):
                    train(model,epoch,X_train,y_train)
                    val_acc = test("acc", model, X_val, y_val)
                #valAcc.append(val_acc)
                avgValAcc.append(val_acc)
                #print('Avg validation accuracy:{}({})'.format(np.mean(valAcc),np.std(valAcc)))
                # if test_acc > best_acc:
                #     # best_epoch = epoch
                #     torch.save(model.state_dict(), 'best_'+ str(args.lr)+'.pt')
                #     best = test_acc


        
        else:
            
            if args.model == "RF":
                params = {'randomforest__min_samples_leaf': np.arange(1, 51, 5),
                          'randomforest__n_estimators': np.arange(10, 100, 10)}

                pipe = Pipeline([
                    ('featureExtract', VarianceThreshold()),
                    ('scaling', StandardScaler()),
                    ('randomforest', RandomForestClassifier(random_state=0))
                ])
            elif args.model == 'SVM':
                params = {'svm__alpha': np.logspace(-4, 7, 12)}

                pipe = Pipeline([
                    ('featureExtract', VarianceThreshold()),
                    ('scaling', StandardScaler()),
                    ("svm", SGDClassifier(max_iter=1000, tol=1e-5, random_state=0))
                ])
            elif args.model == 'LR':
                params = {'lr__C': np.logspace(-3, 8, 12)}

                pipe = Pipeline([
                    ('featureExtract', VarianceThreshold()),
                    ('scaling', StandardScaler()),
                    ('lr', linear_model.LogisticRegression(random_state=0))
                ])
            elif args.model == 'MLP':
                params = {'mlp__hidden_layer_sizes': [[64,],[128,],[64,32],[128,64]]}

                pipe = Pipeline([
                    ('featureExtract', VarianceThreshold()),
                    ('scaling', StandardScaler()),
                    ('mlp', MLPClassifier(random_state=0,early_stopping=True))
                ])

            clf = GridSearchCV(estimator=pipe, param_grid=params, cv=inner_cv, scoring='accuracy', n_jobs=-1)
            clf.fit(X_train_val, y_train_val,groups_train_val)
            print(clf.best_params_)
            
            # AUC (Area Under Curve) -> train data로
            y_pred = clf.predict(X_train_val)
            avgAUC_train.append(roc_auc_score(y_train_val, y_pred))

            fpr, tpr, _ = roc_curve(y_train_val, y_pred)
            roc_auc = auc(fpr, tpr)
            
            fs = clf.best_estimator_.named_steps['featureExtract']
            
            # Accuracy
            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            testAccArr.append(acc)
            
            # 모델의 학습 결과를 믿을 수 있는지를 보는 것이므로 
            # train data로 AUC 계산하는게 맞는 듯
            # AUC (Area Under Curve) -> test data로
            #y_pred = clf.predict(X_test)
            #avgAUC.append(roc_auc_score(y_test, y_pred))
            

Outer ID: 1
==> Computing Pseudo Coordinates..
Inner ID: 1


KeyError: "None of [Int64Index([  0,   1,   2,   4,   6,   7,   8,   9,  10,  11,\n            ...\n            926, 927, 929, 930, 931, 932, 933, 935, 936, 937],\n           dtype='int64', length=750)] are in the [columns]"

In [None]:
# Accuracy
X_test = sc.transform(X_test)
testAccArr.append(test("acc", model, X_test,y_test))
            
# AUC (Area Under Curve)
y_pred = test("predict", model, X_test, y_test)
avgAUC_train.append(roc_auc_score(y_test, y_pred.cpu().numpy()))

fpr, tpr, _ = roc_curve(y_test, y_pred.cpu().numpy())
roc_auc = auc(fpr, tpr)

In [None]:
#if args.model == 'GCN' or args.model == 'DPIEnn' or args.model == 'DSGC' or args.model == 'ChebyNet':
print('Avg valication Accuracy:{}({})'.format(np.mean(avgValAcc),np.std(avgValAcc)))
print('Avg Test Accuracy:{}({})'.format(np.mean(testAccArr),np.std(testAccArr)))
print('Avg train AUC:{}({})'.format(np.mean(avgAUC_train),np.std(avgAUC_train)))
#print('Avg AUC:{}({})'.format(np.mean(avgAUC),np.std(avgAUC)))

In [None]:
import matplotlib.pyplot as plt
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()