# Ensemble

In [1]:
%matplotlib inline
import os
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix, parallel_coordinates
import seaborn as sns
import matplotlib.pylab as plt
import warnings

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Directory holding the preprocessed CSVs. The original absolute path is kept
# as the default so existing runs behave the same; set the HOSPITAL_DATA_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "HOSPITAL_DATA_DIR",
        "C:/Users/kimch/Desktop/hospital_preprocessed_dataset",
    )
)

# Load the train and test tables as exported to CSV.
Train = pd.read_csv(DATA_DIR / "train_df.csv")
Test = pd.read_csv(DATA_DIR / "test_df.csv")

In [4]:
# Remove "instkind_nan" and "Unnamed: 0" from both tables. The "Unnamed: 0"
# column seems to have been created when the data was converted to CSV.
Train = Train.drop(["instkind_nan", "Unnamed: 0"], axis="columns")
Test = Test.drop(["instkind_nan", "Unnamed: 0"], axis="columns")

In [5]:
# Index both tables by their "inst_id" column.
Train = Train.set_index(keys="inst_id")
Test = Test.set_index(keys="inst_id")

In [6]:
# Columns carried forward from the raw tables; "OC" is the label column.
fs = [
    "sga1",
    "salary1",
    "revenue1",
    "profit2",
    "interest2",
    "interest1",
    "receivableS1",
    "receivableL1",
    "quickAsset1",
    "liquidAsset1",
    "employee2",
    "debt1",
    "instkind_nursing_hospital",
    "OC",
]

In [7]:
# Restrict both tables to the selected columns. The explicit .copy() makes
# `train` / `test` independent frames, so adding a column to them afterwards
# is a plain assignment rather than a write into a selection of
# `Train` / `Test` (which pandas flags with SettingWithCopyWarning).
train = Train[fs].copy()
test = Test[fs].copy()

In [8]:
# Derived feature: interest1 minus interest2. `assign` returns a new frame
# instead of writing a column into a selection of another frame, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
train = train.assign(interestDiff=train["interest1"] - train["interest2"])

# Modelling table: interest1, interest2 and quickAsset1 are left out,
# interestDiff is added, and "OC" is kept.
train_df = train[
    [
        "sga1",
        "salary1",
        "revenue1",
        "profit2",
        "receivableS1",
        "receivableL1",
        "liquidAsset1",
        "employee2",
        "debt1",
        "instkind_nursing_hospital",
        "interestDiff",
        "OC",
    ]
]

In [9]:
# Derived feature: interest1 minus interest2. `assign` returns a new frame
# instead of writing a column into a selection of another frame, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
test = test.assign(interestDiff=test["interest1"] - test["interest2"])

# Same column layout as the training modelling table.
test_df = test[
    [
        "sga1",
        "salary1",
        "revenue1",
        "profit2",
        "receivableS1",
        "receivableL1",
        "liquidAsset1",
        "employee2",
        "debt1",
        "instkind_nursing_hospital",
        "interestDiff",
        "OC",
    ]
]

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Features are every modelling column except the "OC" label.
X = train_df.drop("OC", axis="columns")
y = train["OC"]

# 70 % goes to training; the remaining 30 % is halved into validation and
# test parts. Both splits shuffle with the same fixed seed.
# NOTE(review): neither split is stratified on y - consider `stratify=` if
# the label is imbalanced.
train_X, resid_X, train_y, resid_y = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=22
)
valid_X, test_X, valid_y, test_y = train_test_split(
    resid_X, resid_y, test_size=0.5, shuffle=True, random_state=22
)

In [12]:
from imblearn.over_sampling import SMOTE

In [13]:
# Resample the training split with SMOTE (fixed seed). Only train_X / train_y
# are passed in; the validation and test splits are not resampled.
smote = SMOTE(random_state=11)
train_X_smote, train_y_smote = smote.fit_resample(X=train_X, y=train_y)

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Random forest with fixed hyperparameters, fitted on the SMOTE-resampled
# training data.
rf = RandomForestClassifier(
    random_state=0,
    max_depth=10,
    min_impurity_decrease=0.001862602113776709,
    min_samples_leaf=3,
    min_samples_split=7,
    n_estimators=316,
)
rf.fit(train_X_smote, train_y_smote)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [17]:
# Gradient boosting with fixed hyperparameters, fitted on the SMOTE-resampled
# training data.
gbm = GradientBoostingClassifier(
    random_state=0,
    learning_rate=0.1,
    max_depth=8,
    min_samples_leaf=2,
    min_samples_split=8,
    n_estimators=100,
    subsample=0.6,
)
gbm.fit(train_X_smote, train_y_smote)

In [18]:
from xgboost import XGBClassifier

In [19]:
# XGBoost classifier with fixed hyperparameters, fitted on the
# SMOTE-resampled training data.
xgb = XGBClassifier(
    colsample_bytree=0.8,
    gamma=0,
    learning_rate=0.01,
    max_depth=3,
    n_estimators=100,
)
xgb.fit(train_X_smote, train_y_smote)

In [20]:
from lightgbm import LGBMClassifier

In [22]:
# LightGBM classifier fitted on the SMOTE-resampled training data.
# The previous call passed pipeline-style keys ("lgbm__learging_rate",
# "lgbm__reg__lambda", ...) directly to the constructor. Those are not
# LGBMClassifier parameter names (one is also misspelled), so the intended
# hyperparameters were not applied. The estimator's real names are used here.
# NOTE(review): in LightGBM, `subsample` only takes effect when
# `subsample_freq` is > 0 - confirm whether bagging was intended.
lgbm = LGBMClassifier(
    learning_rate=0.001,
    max_depth=5,
    n_estimators=500,
    reg_lambda=0.1,
    subsample=0.5,
)
lgbm.fit(train_X_smote, train_y_smote)



## Hard voting
- 다수결로 분류 (each model votes for a class; the majority class wins)

## Soft voting
- 확률의 평균값으로 분류 (classify by the mean of the models' predicted probabilities)

In [23]:
# Column 0 of predict_proba on the SMOTE-resampled training data, as a plain
# list per model (random forest first, then GBM). The original comments
# describe this value as the closure probability.
rf_proba, gbm_proba = (
    model.predict_proba(train_X_smote)[:, 0].tolist() for model in (rf, gbm)
)

In [24]:
# One row per resampled training sample: each model's probability and the
# true label.
result = pd.DataFrame(
    dict(
        RandomForest=rf_proba,
        GBM=gbm_proba,
        real=train_y_smote.tolist(),
    )
)

In [25]:
# Soft voting: average the two models' probabilities per row and round to
# 4 decimals. The columns are selected by name instead of by position, and
# the nested per-cell .iloc loop is replaced by one vectorised pandas call.
# skipna=False keeps a NaN input as NaN, as plain addition would.
proba_lst = (
    result[["RandomForest", "GBM"]].mean(axis=1, skipna=False).round(4).tolist()
)

In [26]:
# Attach the averaged probability as the "proba_mean" column.
result = result.assign(proba_mean=proba_lst)

In [27]:
# Threshold the averaged probability at 0.5: label 0 when it is >= 0.5,
# otherwise label 1. Vectorised, so it no longer relies on the frame having
# 0..n-1 index labels the way `result["proba_mean"][i]` did.
prediction = np.where(result["proba_mean"] >= 0.5, 0, 1).tolist()

In [28]:
# Attach the thresholded labels as the "prediction" column.
result = result.assign(prediction=prediction)

In [29]:
# Display the per-sample probabilities, their mean, the true label and the
# ensemble prediction for the resampled training data.
result

Unnamed: 0,RandomForest,GBM,real,proba_mean,prediction
0,0.105990,0.001169,1,0.0536,1
1,0.717949,0.254631,0,0.4863,1
2,0.180622,0.003196,1,0.0919,1
3,0.082665,0.000275,1,0.0415,1
4,0.012895,0.000195,1,0.0065,1
...,...,...,...,...,...
397,0.978991,0.999646,0,0.9893,0
398,0.941185,0.999260,0,0.9702,0
399,0.991446,0.999859,0,0.9957,0
400,0.842767,0.998919,0,0.9208,0


In [30]:
from dmba import classificationSummary

In [31]:
# Confusion matrix and accuracy of the averaged-probability ensemble on the
# resampled training data (true labels first, predictions second).
classificationSummary(result["real"], result["prediction"])

Confusion Matrix (Accuracy 0.9925)

       Prediction
Actual   0   1
     0 199   2
     1   1 200


In [32]:
# Column 0 of predict_proba on the validation split, as a plain list per
# model (random forest first, then GBM). The original comments describe this
# value as the closure probability.
rf_proba, gbm_proba = (
    model.predict_proba(valid_X)[:, 0].tolist() for model in (rf, gbm)
)

In [33]:
# One row per validation sample: each model's probability and the true label.
result = pd.DataFrame(
    dict(
        RandomForest=rf_proba,
        GBM=gbm_proba,
        real=valid_y.tolist(),
    )
)

In [34]:
# Soft voting: average the two models' probabilities per row and round to
# 4 decimals. The columns are selected by name instead of by position, and
# the nested per-cell .iloc loop is replaced by one vectorised pandas call.
# skipna=False keeps a NaN input as NaN, as plain addition would.
proba_lst = (
    result[["RandomForest", "GBM"]].mean(axis=1, skipna=False).round(4).tolist()
)

In [35]:
# Attach the averaged probability as the "proba_mean" column.
result = result.assign(proba_mean=proba_lst)

In [36]:
# Threshold the averaged probability at 0.5: label 0 when it is >= 0.5,
# otherwise label 1. Vectorised, so it no longer relies on the frame having
# 0..n-1 index labels the way `result["proba_mean"][i]` did.
prediction = np.where(result["proba_mean"] >= 0.5, 0, 1).tolist()

In [37]:
# Attach the thresholded labels as the "prediction" column.
result = result.assign(prediction=prediction)

In [38]:
# Display the per-sample probabilities, their mean, the true label and the
# ensemble prediction for the validation split.
result

Unnamed: 0,RandomForest,GBM,real,proba_mean,prediction
0,0.128909,0.000279,1,0.0646,1
1,0.411341,0.326732,1,0.369,1
2,0.289338,0.001138,1,0.1452,1
3,0.044237,0.000814,1,0.0225,1
4,0.053103,0.000665,1,0.0269,1
5,0.367622,0.059035,0,0.2133,1
6,0.458554,0.22464,1,0.3416,1
7,0.02896,9.8e-05,1,0.0145,1
8,0.098638,0.000606,1,0.0496,1
9,0.145897,0.003214,1,0.0746,1


In [39]:
# Confusion matrix and accuracy of the averaged-probability ensemble on the
# validation split (true labels first, predictions second).
classificationSummary(result["real"], result["prediction"])

Confusion Matrix (Accuracy 0.9556)

       Prediction
Actual  0  1
     0  2  1
     1  1 41


In [40]:
# Column 0 of predict_proba on the held-out test split, as a plain list per
# model (random forest first, then GBM). The original comments describe this
# value as the closure probability.
rf_proba, gbm_proba = (
    model.predict_proba(test_X)[:, 0].tolist() for model in (rf, gbm)
)

In [41]:
# One row per test sample: each model's probability and the true label.
result = pd.DataFrame(
    dict(
        RandomForest=rf_proba,
        GBM=gbm_proba,
        real=test_y.tolist(),
    )
)

In [42]:
# Soft voting: average the two models' probabilities per row and round to
# 4 decimals. The columns are selected by name instead of by position, and
# the nested per-cell .iloc loop is replaced by one vectorised pandas call.
# skipna=False keeps a NaN input as NaN, as plain addition would.
proba_lst = (
    result[["RandomForest", "GBM"]].mean(axis=1, skipna=False).round(4).tolist()
)

In [43]:
# Attach the averaged probability as the "proba_mean" column.
result = result.assign(proba_mean=proba_lst)

In [44]:
# Threshold the averaged probability at 0.5: label 0 when it is >= 0.5,
# otherwise label 1. Vectorised, so it no longer relies on the frame having
# 0..n-1 index labels the way `result["proba_mean"][i]` did.
prediction = np.where(result["proba_mean"] >= 0.5, 0, 1).tolist()

In [45]:
# Attach the thresholded labels as the "prediction" column.
result = result.assign(prediction=prediction)

In [46]:
# Display the per-sample probabilities, their mean, the true label and the
# ensemble prediction for the held-out test split.
result

Unnamed: 0,RandomForest,GBM,real,proba_mean,prediction
0,0.063808,0.000553,1,0.0322,1
1,0.271839,0.010553,1,0.1412,1
2,0.02726,9.6e-05,1,0.0137,1
3,0.011679,0.000218,1,0.0059,1
4,0.055467,0.002955,1,0.0292,1
5,0.004297,5.1e-05,1,0.0022,1
6,0.043321,0.000209,1,0.0218,1
7,0.234345,0.000315,1,0.1173,1
8,0.026202,0.001114,1,0.0137,1
9,0.431149,0.014951,1,0.223,1


In [48]:
# Confusion matrix and accuracy of the averaged-probability ensemble on the
# held-out test split (true labels first, predictions second).
classificationSummary(result["real"], result["prediction"])

Confusion Matrix (Accuracy 0.9565)

       Prediction
Actual  0  1
     0  1  2
     1  0 43
