In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, auc, roc_auc_score
from dmba import classificationSummary

from imblearn.over_sampling import SMOTE

In [2]:
train_df = pd.read_csv('D:/숙탯 2기/2022-여름-캐글/최종데이터/Train.csv')
test_df = pd.read_csv('D:/숙탯 2기/2022-여름-캐글/최종데이터/Test.csv')

train_df['ownerChange'] = round(train_df['ownerChange'])
train_df['bedCount_class'] = round(train_df['bedCount_class'])

test_df['ownerChange'] = round(train_df['ownerChange'])
test_df['bedCount_class'] = round(train_df['bedCount_class'])

train_df.drop('instkind_nan', axis=1, inplace=True)
test_df.drop('instkind_nan', axis=1, inplace=True)

train_df.set_index('inst_id', inplace=True)
test_df.set_index('inst_id', inplace=True)

In [3]:
train_lr = pd.read_csv('D:/숙탯 2기/2022-여름-캐글/최종데이터/Train_lr.csv')

train_lr['ownerChange'] = round(train_lr['ownerChange'])
train_lr['bedCount_class'] = round(train_lr['bedCount_class'])

train_lr.head()

Unnamed: 0,inst_id,sido_choongchung,sido_gyeongsang,sido_jeonra,instkind_nursing_hospital,instkind_traditional_hospital,revenue1,sga1,salary1,noe1,...,profit2,inventoryAsset2,OnonCAsset2,shortLoan2,NCLiabilities2,longLoan2,surplus2,ownerChange,bedCount_class,OC
0,1,1.0,0.0,0.0,1.0,0.0,22.162515,22.099796,21.433189,16.539187,...,19.152229,16.425505,18.798422,0.0,20.099863,19.781657,20.963246,0.0,3.0,1
1,3,0.0,1.0,0.0,0.0,0.0,23.98028,23.481393,19.521179,16.823799,...,17.025473,16.044684,15.372413,9.180719,18.812726,17.568559,12.258481,0.0,4.0,1
2,4,0.0,0.0,0.0,1.0,0.0,20.727778,19.918561,19.507228,10.308986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1
3,7,0.0,0.0,0.0,0.0,0.0,25.006954,24.981392,24.182293,20.954034,...,20.546991,20.934483,22.088313,22.944579,23.755157,23.437961,22.769775,0.0,4.0,1
4,9,0.0,1.0,0.0,0.0,0.0,24.615974,24.587275,23.920337,21.073281,...,17.301128,19.908536,20.486709,23.59081,23.784786,23.37001,22.923325,0.0,3.0,1


In [4]:
train_dt = pd.read_csv('D:/숙탯 2기/2022-여름-캐글/최종데이터/Train_dt.csv')
train_dt.drop('instkind_nan', axis=1, inplace=True)

train_dt.head()

Unnamed: 0,inst_id,instkind_hospital,instkind_traditional_clinic,revenue1,salescost1,sga1,salary1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset2,receivableL2,employee2,OC
0,1,0.0,0.0,22.162515,0.0,22.099796,21.433189,20.735886,20.720935,19.968362,16.425505,0.0,4.174387,1
1,3,0.0,0.0,23.98028,9.659782,23.481393,19.521179,19.074269,19.03727,10.917985,16.044684,0.0,6.70196,1
2,4,0.0,0.0,20.727778,20.060616,19.918561,19.507228,19.422937,19.351593,15.906875,0.0,0.0,0.693147,1
3,7,0.0,0.0,25.006954,0.0,24.981392,24.182293,23.291406,23.16863,0.0,20.934483,0.0,6.498282,1
4,9,0.0,0.0,24.615974,0.0,24.587275,23.920337,22.566524,22.493677,22.134087,19.908536,0.0,5.288267,1


In [5]:
train_dt.shape

(301, 14)

# 1. 전체 columns

### 1) GridSearchCV

In [69]:
X_train = train_df.drop('OC', axis=1)
y_train = train_df['OC']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

smote = SMOTE(random_state=42)
X_tr_over, y_tr_over = smote.fit_resample(X_train, y_train)

In [71]:
dtree = DecisionTreeClassifier(max_depth = 1)
logreg = LogisticRegression()

params = {'base_estimator' : [dtree, logreg],
          'n_estimators' : [50, 100, 150, 200, 250, 300],
          'algorithm' : ['SAMME', 'SAMME.R']
         }

kfold = KFold(n_splits=5, shuffle = True, random_state=0)

# AdaBoostClassifier 객체 생성 후 GridSearchCV 수행
clf = AdaBoostClassifier()
grid_cv = GridSearchCV(clf, param_grid = params, cv = kfold, n_jobs = -1)
grid_cv.fit(X_tr_over, y_tr_over)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'algorithm': 'SAMME.R', 'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 250}
최고 예측 정확도: 0.9678


In [61]:
def clf_eval(y_test, pred):
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    specificity = confusion_matrix(y_test, pred)[0][0]/(confusion_matrix(y_test, pred)[0][0]+confusion_matrix(y_test, pred)[0][1])
    print('오차행렬')
    classificationSummary(y_test, pred)
    print()
    print('정확도 : {:.4f}\n정밀도 : {:.4f}\n재현율(민감도) : {:.4f}'.format(accuracy, precision, recall))
    print('특이도 : {:.4f}'.format(specificity))
    
    try:
        print('AUC : {:.4f}'.format(roc_auc_score(y_test, pred)))
    except ValueError:
        print('AUC : test data에 하나의 레이블만이 존재하여 AUC를 구할 수 없습니다.')

In [72]:
best_model = grid_cv.best_estimator_  # 최적의 하이퍼 파라미터로 모델 생성
pred = best_model.predict(X_val)

clf_eval(y_val, pred)

오차행렬
Confusion Matrix (Accuracy 0.9333)

       Prediction
Actual  0  1
     0  0  1
     1  3 56

정확도 : 0.9333
정밀도 : 0.9825
재현율(민감도) : 0.9492
특이도 : 0.0000
AUC : 0.4746


In [73]:
clf_eval(y_test, best_model.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.9180)

       Prediction
Actual  0  1
     0  1  4
     1  1 55

정확도 : 0.9180
정밀도 : 0.9322
재현율(민감도) : 0.9821
특이도 : 0.2000
AUC : 0.5911


### 2) RandomizedSearchCV

In [74]:
dtree = DecisionTreeClassifier(max_depth = 1)
logreg = LogisticRegression()

params = {'base_estimator' : [dtree, logreg],
          'n_estimators' : [50, 100, 150, 200, 250, 300],
          'algorithm' : ['SAMME', 'SAMME.R']
         }

kfold = KFold(n_splits=5, shuffle = True, random_state=0)

# AdaBoostClassifier 객체 생성 후 GridSearchCV 수행
clf = AdaBoostClassifier()
rand_cv = RandomizedSearchCV(clf, param_distributions = params, n_iter = 100, cv = kfold, n_jobs = -1)
rand_cv.fit(X_tr_over, y_tr_over)

print('최적 하이퍼 파라미터: ', rand_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(rand_cv.best_score_))

최적 하이퍼 파라미터:  {'n_estimators': 250, 'base_estimator': DecisionTreeClassifier(max_depth=1), 'algorithm': 'SAMME.R'}
최고 예측 정확도: 0.9678


In [75]:
best_model = rand_cv.best_estimator_  # 최적의 하이퍼 파라미터로 모델 생성
pred = best_model.predict(X_val)

clf_eval(y_val, pred)

오차행렬
Confusion Matrix (Accuracy 0.9333)

       Prediction
Actual  0  1
     0  0  1
     1  3 56

정확도 : 0.9333
정밀도 : 0.9825
재현율(민감도) : 0.9492
특이도 : 0.0000
AUC : 0.4746


In [76]:
clf_eval(y_test, best_model.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.9180)

       Prediction
Actual  0  1
     0  1  4
     1  1 55

정확도 : 0.9180
정밀도 : 0.9322
재현율(민감도) : 0.9821
특이도 : 0.2000
AUC : 0.5911


# 2. Train_lr

### 1) GridSearchCV

In [77]:
X_train = train_lr.drop('OC', axis=1)
y_train = train_lr['OC']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

smote = SMOTE(random_state=42)
X_tr_over, y_tr_over = smote.fit_resample(X_train, y_train)

In [79]:
dtree = DecisionTreeClassifier(max_depth = 1)
logreg = LogisticRegression()

params = {'base_estimator' : [dtree, logreg],
          'n_estimators' : [50, 100, 150, 200, 250, 300],
          'algorithm' : ['SAMME', 'SAMME.R']
         }

kfold = KFold(n_splits=5, shuffle = True, random_state=0)

# AdaBoostClassifier 객체 생성 후 GridSearchCV 수행
clf1 = AdaBoostClassifier()
grid_cv = GridSearchCV(clf1, param_grid = params, cv = kfold, n_jobs = -1)
grid_cv.fit(X_tr_over, y_tr_over)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'algorithm': 'SAMME', 'base_estimator': LogisticRegression(), 'n_estimators': 50}
최고 예측 정확도: 0.9737


In [80]:
best_model = grid_cv.best_estimator_  # 최적의 하이퍼 파라미터로 모델 생성
pred = best_model.predict(X_val)

clf_eval(y_val, pred)

오차행렬
Confusion Matrix (Accuracy 0.9500)

       Prediction
Actual  0  1
     0  1  0
     1  3 56

정확도 : 0.9500
정밀도 : 1.0000
재현율(민감도) : 0.9492
특이도 : 1.0000
AUC : 0.9746


In [81]:
clf_eval(y_test, best_model.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.8852)

       Prediction
Actual  0  1
     0  0  5
     1  2 54

정확도 : 0.8852
정밀도 : 0.9153
재현율(민감도) : 0.9643
특이도 : 0.0000
AUC : 0.4821


### 2) RandomizedSearchCV

In [82]:
dtree = DecisionTreeClassifier(max_depth = 1)
logreg = LogisticRegression()

params = {'base_estimator' : [dtree, logreg],
          'n_estimators' : [50, 100, 150, 200, 250, 300],
          'algorithm' : ['SAMME', 'SAMME.R']
         }

kfold = KFold(n_splits=5, shuffle = True, random_state=0)

# AdaBoostClassifier 객체 생성 후 GridSearchCV 수행
clf1 = AdaBoostClassifier()
rand_cv = RandomizedSearchCV(clf1, param_distributions = params, n_iter = 100, cv = kfold, n_jobs = -1)
rand_cv.fit(X_tr_over, y_tr_over)

print('최적 하이퍼 파라미터: ', rand_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(rand_cv.best_score_))

최적 하이퍼 파라미터:  {'n_estimators': 50, 'base_estimator': LogisticRegression(), 'algorithm': 'SAMME'}
최고 예측 정확도: 0.9737


In [83]:
best_model = rand_cv.best_estimator_  # 최적의 하이퍼 파라미터로 모델 생성
pred = best_model.predict(X_val)

clf_eval(y_val, pred)

오차행렬
Confusion Matrix (Accuracy 0.9500)

       Prediction
Actual  0  1
     0  1  0
     1  3 56

정확도 : 0.9500
정밀도 : 1.0000
재현율(민감도) : 0.9492
특이도 : 1.0000
AUC : 0.9746


In [84]:
clf_eval(y_test, best_model.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.8852)

       Prediction
Actual  0  1
     0  0  5
     1  2 54

정확도 : 0.8852
정밀도 : 0.9153
재현율(민감도) : 0.9643
특이도 : 0.0000
AUC : 0.4821


# 3. Train_dt

### 1) GridSearchCV

In [87]:
X_train = train_dt.drop('OC', axis=1)
y_train = train_dt['OC']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

smote = SMOTE(random_state=42)
X_tr_over, y_tr_over = smote.fit_resample(X_train, y_train)

In [89]:
dtree = DecisionTreeClassifier(max_depth = 1)
logreg = LogisticRegression()

params = {'base_estimator' : [dtree, logreg],
          'n_estimators' : [50, 100, 150, 200, 250, 300],
          'algorithm' : ['SAMME', 'SAMME.R']
         }

kfold = KFold(n_splits=5, shuffle = True, random_state=0)

# AdaBoostClassifier 객체 생성 후 GridSearchCV 수행
clf2 = AdaBoostClassifier()
grid_cv = GridSearchCV(clf2, param_grid = params, cv = kfold, n_jobs = -1)
grid_cv.fit(X_tr_over, y_tr_over)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'algorithm': 'SAMME', 'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 50}
최고 예측 정확도: 0.9239


In [90]:
best_model = grid_cv.best_estimator_  # 최적의 하이퍼 파라미터로 모델 생성
pred = best_model.predict(X_val)

clf_eval(y_val, pred)

오차행렬
Confusion Matrix (Accuracy 0.9333)

       Prediction
Actual  0  1
     0  0  1
     1  3 56

정확도 : 0.9333
정밀도 : 0.9825
재현율(민감도) : 0.9492
특이도 : 0.0000
AUC : 0.4746


In [91]:
clf_eval(y_test, best_model.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.8689)

       Prediction
Actual  0  1
     0  0  5
     1  3 53

정확도 : 0.8689
정밀도 : 0.9138
재현율(민감도) : 0.9464
특이도 : 0.0000
AUC : 0.4732


### 2) RandomizedSearchCV

In [92]:
dtree = DecisionTreeClassifier(max_depth = 1)
logreg = LogisticRegression()

params = {'base_estimator' : [dtree, logreg],
          'n_estimators' : [50, 100, 150, 200, 250, 300],
          'algorithm' : ['SAMME', 'SAMME.R']
         }

kfold = KFold(n_splits=5, shuffle = True, random_state=0)

# AdaBoostClassifier 객체 생성 후 GridSearchCV 수행
clf2 = AdaBoostClassifier()
rand_cv = RandomizedSearchCV(clf2, param_distributions = params, n_iter = 100, cv = kfold, n_jobs = -1)
rand_cv.fit(X_tr_over, y_tr_over)

print('최적 하이퍼 파라미터: ', rand_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(rand_cv.best_score_))

최적 하이퍼 파라미터:  {'n_estimators': 50, 'base_estimator': DecisionTreeClassifier(max_depth=1), 'algorithm': 'SAMME'}
최고 예측 정확도: 0.9239


In [93]:
best_model = rand_cv.best_estimator_  # 최적의 하이퍼 파라미터로 모델 생성
pred = best_model.predict(X_val)

clf_eval(y_val, pred)

오차행렬
Confusion Matrix (Accuracy 0.9333)

       Prediction
Actual  0  1
     0  0  1
     1  3 56

정확도 : 0.9333
정밀도 : 0.9825
재현율(민감도) : 0.9492
특이도 : 0.0000
AUC : 0.4746


In [94]:
clf_eval(y_test, best_model.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.8689)

       Prediction
Actual  0  1
     0  0  5
     1  3 53

정확도 : 0.8689
정밀도 : 0.9138
재현율(민감도) : 0.9464
특이도 : 0.0000
AUC : 0.4732
