In [1]:
from pathlib import Path

import pandas as pd

#Get data
#Paths are built with pathlib so the same cell works on Windows and POSIX
#(a literal '.\data\...' string is only a valid path on Windows).
DATA_DIR = Path('data')
#Both CSV files are read with the cp932 (Shift-JIS family) codec.
train_data = pd.read_csv(DATA_DIR / 'traindata_overtANDlatentThyroidism.csv', encoding='cp932')
test_data = pd.read_csv(DATA_DIR / 'testdata_overtThyroidism.csv', encoding='cp932')

In [2]:
#Column configuration
#Numeric explanatory variables
num_features = [
    'AST', 'ALT', 'γ-GTP', 'Total_cholesterol', 'RBC',
    'Hb', 'UA', 'S-Cr', 'UA_S-Cr', 'ALP',
]
#Categorical explanatory variables
cat_features = ['Sex']
#Objective (label) column and the per-row information column
obj_variable, info_variable = 'class', 'attribute'
#All columns of interest: features first, then the label, then the information column
target_columns = [*num_features, *cat_features, obj_variable, info_variable]

In [3]:
#Extract "target_columns"
#Explicit copies keep the subsets independent of the frames they were cut from,
#so later column assignments act on these frames only.
train_data = train_data.loc[:,target_columns].copy()
test_data = test_data.loc[:,target_columns].copy()

#Exclude "info_variable" including "gunma" from the training data
#regex=False: 'gunma' is matched as a plain substring.
#na=False: a missing attribute is treated as "does not contain gunma"; without it
#the mask would hold NaN for such rows and the "~" negation below would fail.
is_gunma = train_data[info_variable].str.contains('gunma', regex=False, na=False)
train_data = train_data[~is_gunma].reset_index(drop=True)

In [4]:
#Label encoding("cat_features")
sex_encoder = {'male':0, 'female':1}

#Label encoding("obj_variable")
#'hyper' is the positive class (1); 'hypox' and 'normal' are both mapped to 0.
class_encoder = {'hyper':1, 'hypox':0, 'normal':0}

#Backward compatibility: the original cell left "label_encoder" bound to the
#class mapping once it had finished.
label_encoder = class_encoder


def _encode_column(column, encoder):
    """Map every value of ``column`` through ``encoder`` and return an int64 Series.

    Parameters
    ----------
    column : pandas.Series
        Column holding the raw labels.
    encoder : dict
        Mapping from raw label to integer code.

    Raises
    ------
    KeyError
        If the column holds a value (including a missing value) that is not a
        key of ``encoder``.
    """
    encoded = column.map(encoder)
    #Series.map yields NaN for anything absent from the dict; report those explicitly.
    unknown = column[encoded.isna()].unique()
    if len(unknown) > 0:
        raise KeyError(f'Unexpected value(s) in column "{column.name}": {list(unknown)}')
    return encoded.astype('int64')


#Each column is replaced as a whole (frame[col] = ...) so it takes the integer
#dtype of the encoded values. DataFrame.applymap is avoided because it is
#deprecated in recent pandas releases.
#NOTE: this cell is not re-runnable on already-encoded frames (0/1 are not keys
#of the encoders); re-run the preceding cells first.
for frame in (train_data, test_data):
    for col in cat_features:
        frame[col] = _encode_column(frame[col], sex_encoder)
    frame[obj_variable] = _encode_column(frame[obj_variable], class_encoder)

In [5]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from tensorflow.python.keras.layers import Input, Dense
from tensorflow.python.keras.models import Model

##Define machine learning model
#Exactly one "model = ..." line is meant to be active at a time; the other
#candidates are kept below as commented-out alternatives.
#The active choice is a CatBoostClassifier created with no arguments (library defaults).
model = CatBoostClassifier() #Catboost
#model = LogisticRegression() #Logistic Regression
#NOTE(review): SVC() is created here without probability=True, which scikit-learn
#requires for predict_proba -- confirm before enabling this line.
#model = SVC() #SVM


##Neural Network
#The triple-quoted block below is a bare string literal, so none of it executes;
#it is a disabled Keras alternative. As written it would build a network with
#"inter_num" hidden Dense layers of "inter_node" ReLU units and a sigmoid output
#layer with one unit per distinct class value, compiled with Adam and binary cross-entropy.
'''input_node = len(num_features+cat_features)
inter_node = 32
inter_num = 1
output_node = len(train_data[obj_variable].unique())

inputs = Input(shape=(input_node,))
x_nn = inputs
for i in range(0,inter_num): x_nn = Dense(inter_node, activation='relu')(x_nn)            
outputs = Dense(output_node, activation='sigmoid')(x_nn)
        
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='Adam', loss='binary_crossentropy')'''

#Last expression of the cell: displays the repr of the selected model object.
model

<catboost.core.CatBoostClassifier at 0x1de76c0ef70>

In [6]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score

#Validation
#10 stratified folds; no shuffling, so the splits are deterministic.
kf = StratifiedKFold(n_splits=10)

X_train = train_data.loc[:,num_features+cat_features]
y_train = train_data[obj_variable]
X_test = test_data.loc[:,num_features+cat_features]
y_true = test_data[obj_variable]

#Fold i of the training data is paired with fold i of the test data:
#the model is fitted on the "train" part of the i-th split of the training data
#and evaluated on the "test" part of the i-th split of the test data.
#NOTE(review): the held-out part of each training split is never used -- confirm this is intended.
fold_records = []
for train_indexes, test_indexes in zip(kf.split(X_train, y_train), kf.split(X_test, y_true)):
    train_index = train_indexes[0]
    test_index = test_indexes[1]

    #kf.split yields positional indices, so .iloc is used; label-based .loc
    #would only be correct while both frames carry a default RangeIndex.
    model.fit(X_train.iloc[train_index], y_train.iloc[train_index], verbose=0)
    #model.fit(X_train.iloc[train_index], pd.get_dummies(y_train.iloc[train_index]), epochs=50, verbose=0) #For neural network

    proba = model.predict_proba(X_test.iloc[test_index])
    #proba = model.predict(X_test.iloc[test_index]) #For neural network
    #Column 1 is used as the score of the positive class (label 1)
    positive_proba = proba[:,1]
    auroc = roc_auc_score(y_true.iloc[test_index], positive_proba)

    #Hard predictions with a 0.5 probability threshold
    y_pred = np.where(positive_proba>=0.5, 1, 0)
    #labels=[0, 1] pins the 2x2 layout, so ravel() always yields tn, fp, fn, tp
    cm = confusion_matrix(y_true.iloc[test_index], y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    recall = tp / (tp+fn)
    specificity = tn / (tn+fp)

    #Collect plain records; DataFrame.append no longer exists in pandas 2.x
    #and growing a frame row by row is quadratic anyway.
    fold_records.append({'AUROC': auroc, 'Recall': recall, 'Specificity': specificity})

#One row per fold, columns in the same order as before
result = pd.DataFrame(fold_records, columns=['AUROC', 'Recall', 'Specificity'])

In [7]:
#Per-fold metrics: one row per fold with the columns AUROC, Recall and Specificity
result

Unnamed: 0,AUROC,Recall,Specificity
0,0.51395,0.01,0.99
1,0.5852,0.0,1.0
2,0.53445,0.0,0.985
3,0.5069,0.0,1.0
4,0.4695,0.0,0.995
5,0.5096,0.0,0.99
6,0.464,0.0,1.0
7,0.50055,0.01,0.995
8,0.4865,0.0,0.99
9,0.5024,0.0,0.995
