# Load libraries and data

In [1]:
from torch import nn
import torch
from torch import tensor 
from torch.autograd import Variable
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import torch
import itertools
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm.notebook import tqdm
from pathlib import Path
import os

# Read the preprocessed statin cohort from the current working directory.
total_out = Path(os.getcwd()) / 'Statin_preprocess.csv'
total_data = pd.read_csv(total_out)
print(total_data.shape)
print(list(total_data.columns))

# Name of the label column used by every model below.
target = 'mace'

# Position of the "group_after_detail_1" column. np.flatnonzero yields an
# array of matching positions, so the first element is taken.
# NOTE(review): the original comment described this as the index where the
# "MRI features" start, which does not match the column name — confirm intent.
column_names = total_data.columns.values
start_features_index = np.flatnonzero(column_names == "group_after_detail_1")[0]

# This notebook trains on the single 'risk' column only.
features = ['risk']
Num_feat = len(features)

print(Num_feat)
print(features)

(122398, 27)
['interval_statin', 'mace', 'group_after_detail_1', 'statin_drug', 'AGE', 'SEX', 'HTN', 'ICDHTN', 'trt_ICD_DM', 'DM', 'BMI', 'WAIST', 'BP_HIGH', 'BP_LWST', 'BLDS', 'TOT_CHOLE', 'TRIGLYCERIDE', 'HDL_CHOLE', 'LDL_CHOLE', 'HMG', 'smoking', 'econo', 'drinking', 'HMG.1', 'LDL_CHOLE.1', 'current_ascvd', 'risk']
1
['risk']


In [2]:
def feature(Num_feat, clf, test_data_processed, features):
    """Return the top ``Num_feat`` features ranked by importance.

    Parameters
    ----------
    Num_feat : int
        Number of rows to return. Values larger than the number of available
        importances are clamped; values <= 0 give an empty table.
    clf : object
        Fitted estimator exposing a ``feature_importances_`` array whose
        order matches ``features``.
    test_data_processed : pandas.DataFrame
        Frame containing every column named in ``features``; only its column
        labels are used.
    features : list of str
        Feature column names, in the order the estimator was trained on.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature name'`` and ``'ratio'``, sorted by descending
        importance, with a fresh 0..n-1 index.
    """
    importance = np.asarray(clf.feature_importances_)

    # Indices of the features from most to least important, truncated to the
    # requested number of rows.
    top = importance.argsort()[::-1][:max(Num_feat, 0)]
    feat_names = test_data_processed[features].columns[top]

    # Build the table in one shot instead of concatenating one-row frames in
    # a loop (quadratic, and warns when the seed frame is empty).
    return pd.DataFrame({
        'feature name': list(feat_names),
        'ratio': importance[top],
    })

# Finding best parameters

In [3]:
# Augmented
import torch
import itertools
from sklearn.metrics import confusion_matrix
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm.notebook import tqdm


def find_bestpar(train_data, test_data, features):
    """Grid-search TabNet hyperparameters and return the best model found.

    The last 20% of ``train_data`` (by row position) is held out as the
    validation set used for early stopping. One ``TabNetClassifier`` is
    fitted per hyperparameter combination and scored on ``test_data``; the
    combination with the highest test AUC is kept.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Must contain every column in ``features`` and the column named by the
        module-level ``target``.
    features : list of str
        Names of the input columns.

    Returns
    -------
    tuple
        ``(clf, max_hy, max_valid_score, max_auc, max_acc, preds_prob)`` where
        ``clf`` and ``preds_prob`` belong to the same configuration as
        ``max_hy`` (previously they came from the last grid point tried).
    """
    # NOTE(review): hyperparameters are selected on the *test* AUC, so the
    # returned test metrics are optimistically biased. Kept for backward
    # compatibility; consider selecting on ``clf.best_cost`` instead.

    # No shuffling happens here; the split is purely positional.
    valid_cut = int(len(train_data) * 0.8)
    for_Train = train_data[:valid_cut]
    for_Validation = train_data[valid_cut:]

    # Training arrays
    X_train = for_Train[features].values
    Y_train = for_Train[target].values

    # Validation arrays
    X_valid = for_Validation[features].values
    Y_valid = for_Validation[target].values

    # Test arrays
    X_test = test_data[features].values
    Y_test = test_data[target].values

    print(len(X_train), len(X_valid), len(X_test))

    # State of the best configuration seen so far. Starting from -inf makes
    # sure the first fitted model is always recorded, so none of the returned
    # values can be left unbound.
    max_auc = float("-inf")
    max_hy = []
    max_valid_score = None
    max_acc = None
    best_clf = None
    best_preds_prob = None

    # Original (full) hyperparameter space, kept for reference:
    #   n_  = [4, 8, 16]                              # n_a / n_d width
    #   lr_ = [2e-2, 1e-2, 5e-3, 2e-3, 1e-3, 1e-4]    # learning rate
    #   w_  = [0.01, 0.001, 0.0001]                   # weight decay
    #   g_  = [0.95, 0.99, 0.9]                       # scheduler params - gamma
    #   ss_ = [10, 20, 30]                            # scheduler params - step_size

    # Hyperparameter space actually searched (quick version)
    n_ = [4, 16]
    lr_ = [2e-2, 1e-3]
    w_ = [0.01, 0.001]
    g_ = [0.95, 0.99]
    ss_ = [10, 30]

    h_space = list(itertools.product(n_, lr_, w_, g_, ss_))

    print("start training")
    for hy in tqdm(h_space):
        width, lr, weight_decay, gamma, step_size = hy
        clf = TabNetClassifier(n_a=width,
                               n_d=width,
                               optimizer_params=dict(lr=lr, weight_decay=weight_decay),
                               scheduler_params={"step_size": step_size, "gamma": gamma},
                               scheduler_fn=torch.optim.lr_scheduler.StepLR,
                               verbose=0)

        # The last eval set ('valid') drives early stopping.
        clf.fit(X_train, Y_train, eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
                eval_name=['train', 'valid'], eval_metric=['auc'],
                max_epochs=200, patience=20)

        preds_acc = clf.predict(X_test)
        preds_prob = clf.predict_proba(X_test)
        test_auc = roc_auc_score(y_score=preds_prob[:, 1], y_true=Y_test)
        test_acc = accuracy_score(Y_test, preds_acc)

        # All three scores are fractions in [0, 1], so no '%' suffix.
        print('Valid score: %.6f' % clf.best_cost, 'Test AUC: %.3f' % test_auc, 'Test ACC: %.3f' % test_acc)

        if test_auc > max_auc:
            print("Find new maximum test AUC!!\n")
            max_hy = hy
            max_valid_score = clf.best_cost
            max_auc = test_auc
            max_acc = test_acc
            # Keep the model and predictions that produced the best score so
            # the returned objects match the returned metrics.
            best_clf = clf
            best_preds_prob = preds_prob

    return best_clf, max_hy, max_valid_score, max_auc, max_acc, best_preds_prob

In [4]:
def preprocessing(total_data, random_state=2020):
    """Fill missing values with 0 and shuffle the rows reproducibly.

    Parameters
    ----------
    total_data : pandas.DataFrame
        Input frame; it is not modified.
    random_state : int, optional
        Seed for the shuffle. Defaults to 2020, the value previously
        hard-coded, so existing callers get the same row order.

    Returns
    -------
    pandas.DataFrame
        A new frame with NaNs replaced by 0, all rows in shuffled order, and
        a fresh 0..n-1 index.
    """
    # frac=1 returns every row, so sample() only permutes the row order;
    # random_state makes that permutation repeatable. The single
    # reset_index at the end discards the old row labels.
    total_data_processed = (
        total_data
        .fillna(0)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    print("done preprocessing")
    return total_data_processed

# Replace the raw frame with its NaN-filled, shuffled version.
# NOTE: this rebinds `total_data`, so re-running this cell shuffles the
# already-shuffled frame again and changes the row order; reload the CSV
# (or restart the kernel) before re-running to reproduce the results.
total_data = preprocessing(total_data)
print(len(total_data))

done preprocessing
122398


In [5]:
# Positional 80/20 split: the first 80% of rows become the training set and
# the remaining rows the test set.
train_cut = int(len(total_data) * 0.8)
train_data, test_data = total_data.iloc[:train_cut], total_data.iloc[train_cut:]

In [6]:
class model():
    """Container that runs ``find_bestpar`` once and stores its results.

    Attributes set by ``__init__``: ``train_data``, ``test_data``,
    ``features`` (the constructor arguments) and ``clf``, ``max_hy``,
    ``valid_score``, ``test_auc``, ``test_acc``, ``preds_prob`` (the values
    returned by ``find_bestpar``).
    """

    def __init__(self, train_data, test_data, Num_feat, features):
        # Num_feat is accepted for call compatibility but is not used here.
        search_result = find_bestpar(train_data, test_data, features)
        fitted_clf, chosen_hy, valid_score, test_auc, test_acc, test_probs = search_result

        self.train_data = train_data
        self.test_data = test_data
        self.preds_prob = test_probs
        self.max_hy = chosen_hy
        self.test_auc = test_auc
        self.test_acc = test_acc
        self.valid_score = valid_score
        self.clf = fitted_clf
        self.features = features

In [7]:
# Constructing the wrapper runs the full hyperparameter search (via
# find_bestpar inside model.__init__) on the current `features` list.
TabNet_mace_only_risk_model = model(train_data, test_data, Num_feat, features)

78334 19584 24480
start training


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=32.0), HTML(value='')))


Early stopping occurred at epoch 20 with best_epoch = 0 and best_valid_auc = 0.69021
Best weights from best epoch are automatically used!
Valid score: 0.690208 Test AUC: 0.684% Test ACC: 0.960%
Find new maximum test AUC!!


Early stopping occurred at epoch 20 with best_epoch = 0 and best_valid_auc = 0.69021
Best weights from best epoch are automatically used!
Valid score: 0.690208 Test AUC: 0.684% Test ACC: 0.960%

Early stopping occurred at epoch 20 with best_epoch = 0 and best_valid_auc = 0.69021
Best weights from best epoch are automatically used!
Valid score: 0.690208 Test AUC: 0.684% Test ACC: 0.960%

Early stopping occurred at epoch 20 with best_epoch = 0 and best_valid_auc = 0.69021
Best weights from best epoch are automatically used!
Valid score: 0.690208 Test AUC: 0.684% Test ACC: 0.960%

Early stopping occurred at epoch 23 with best_epoch = 3 and best_valid_auc = 0.69018
Best weights from best epoch are automatically used!
Valid score: 0.690183 Test AUC: 0.684% Test ACC: 0.9

In [9]:
# Rank the features by the stored classifier's importances, attach the
# resulting table to the wrapper object, and display it as the cell output.
print("<<Important Feature>>")
import_feat=feature(Num_feat, TabNet_mace_only_risk_model.clf, TabNet_mace_only_risk_model.test_data, TabNet_mace_only_risk_model.features)
TabNet_mace_only_risk_model.import_feat =  import_feat
import_feat

<<Important Feature>>


Unnamed: 0,feature name,ratio
0,risk,1.0


In [12]:
import dill

# Serialize the whole wrapper object (stored data frames, classifier and
# metrics) to a file in the current directory.
# NOTE: only load this file back from a trusted source — unpickling can
# execute arbitrary code.
with open('./TabNet_mace_only_risk_model.pkl', 'wb') as f:
    dill.dump(TabNet_mace_only_risk_model, f)

In [11]:
import xgboost as xgb

# Candidate grid for a hyperparameter search (not used by the code below):
# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }

# Positional 80/20 split of the current `total_data`.
train_cut = int(len(total_data) * 0.8)
train_data, test_data = total_data.iloc[:train_cut], total_data.iloc[train_cut:]

# Feature matrices and label vectors as NumPy arrays.
X_train, y_train = train_data[features].values, train_data[target].values
X_test, y_test = test_data[features].values, test_data[target].values

# XGBoost baseline with default hyperparameters and a fixed seed.
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, eval_metric="auc")
xgb_model.fit(X_train, y_train)

# Hard class predictions for accuracy, positive-class probabilities for AUC.
y_pred = xgb_model.predict(X_test)
y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))
# Left disabled: no parameter search is run in this cell.
#print(xgb_model.best_params_)



0.9598447712418301
0.6755930664236737
