In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
import matplotlib.font_manager as fm
import matplotlib

font_path = 'C:\\Windows\\Fonts\\gulim.ttc'
font = fm.FontProperties(fname=font_path).get_name()
matplotlib.rc('font', family=font)

### EDA

In [46]:
df = pd.read_csv('./../../eda/data/merged_data.csv')

In [47]:
df = df.drop(['폐업_점포_수', '폐업_영업_개월_평균', '서울시_폐업_영업_개월_평균', '기준_년분기_코드'], axis=1)

In [48]:
# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder

cols = ['자치구_코드_명', '서비스_업종_코드_명', '상권_변화_지표']

for col in cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    print(encoder.classes_)

['강남구' '강동구' '강북구' '강서구' '관악구' '광진구' '구로구' '금천구' '노원구' '도봉구' '동대문구' '동작구'
 '마포구' '서대문구' '서초구' '성동구' '성북구' '송파구' '양천구' '영등포구' '용산구' '은평구' '종로구' '중구'
 '중랑구']
['PC방' '가구' '가방' '가전제품' '가전제품수리' '고시원' '골프연습장' '네일숍' '노래방' '당구장' '문구'
 '미곡판매' '미용실' '반찬가게' '부동산중개업' '분식전문점' '서적' '섬유제품' '세탁소' '수산물판매' '슈퍼마켓'
 '스포츠 강습' '스포츠클럽' '시계및귀금속' '신발' '안경' '애완동물' '양식음식점' '여관' '예술학원' '완구'
 '외국어학원' '운동/경기용품' '육류판매' '의료기기' '의약품' '인테리어' '일반교습학원' '일반의류' '일반의원'
 '일식음식점' '자동차미용' '자동차수리' '자전거 및 기타운송장비' '전자상거래업' '제과점' '조명용품' '중식음식점'
 '철물점' '청과상' '치과의원' '치킨전문점' '커피-음료' '컴퓨터및주변장치판매' '패스트푸드점' '편의점' '피부관리실'
 '한식음식점' '한의원' '핸드폰' '호프-간이주점' '화장품' '화초']
['HH' 'HL' 'LH' 'LL']


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# 전체 데이터의 75%을 기준으로 등급 나누기
df['폐업률_등급'] = pd.qcut(df['폐업_률'], q=[0, 0.75, 1.0], labels=[0, 1])

X = df.drop(['폐업_률', '폐업률_등급'], axis=1) 
y_class = df['폐업률_등급']

smote = SMOTE(random_state=42)
X_resample, y_resample = smote.fit_resample(X, y_class)

X_train, X_test, y_train, y_test = train_test_split(
    X_resample, y_resample, test_size=0.2, random_state=42, stratify=y_resample
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((48244, 132), (12062, 132), (48244,), (12062,))

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# XGBoost 이진 분류 학습
xgb_clf = XGBClassifier()

s_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_accuracy = []

for train_index, val_index in s_kfold.split(X_train, y_train):
    X_tra, y_tra = X_train.to_numpy()[train_index], y_train.to_numpy()[train_index]
    X_val, y_val = X_train.to_numpy()[val_index], y_train.to_numpy()[val_index]


    print(np.unique(y_tra, return_counts=True))
    print(np.unique(y_val, return_counts=True))
    print('=========================================')

    # 모델 학습 > 예측 > 평가
    xgb_clf.fit(X_tra, y_tra)
    y_pred = xgb_clf.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    cv_accuracy.append(acc)

print('훈련별 정확도: ', cv_accuracy)
print('분류 모델 정확도: ', np.mean(cv_accuracy))

(array([0, 1]), array([19297, 19298]))
(array([0, 1]), array([4825, 4824]))
(array([0, 1]), array([19297, 19298]))
(array([0, 1]), array([4825, 4824]))
(array([0, 1]), array([19298, 19297]))
(array([0, 1]), array([4824, 4825]))
(array([0, 1]), array([19298, 19297]))
(array([0, 1]), array([4824, 4825]))
(array([0, 1]), array([19298, 19298]))
(array([0, 1]), array([4824, 4824]))
훈련별 정확도:  [0.8693128821639549, 0.8635091719349155, 0.8671364908280651, 0.8613327805990258, 0.8641169154228856]
분류 모델 정확도:  0.8650816481897694


In [53]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print(f'정확도: {accuracy_score(y_test, y_pred)}, 정밀도: {precision_score(y_test, y_pred)}, 재현율: {recall_score(y_test, y_pred)}, f1: {f1_score(y_test, y_pred)}')


정확도: 0.8555795058862543, 정밀도: 0.8580731340791451, 재현율: 0.8520974962692754, f1: 0.8550748752079866


---

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# 폐업률 70%을 기준으로 등급 나누기
df['폐업률_등급'] = (df['폐업_률'] > 2.6).astype(int)

X = df.drop(['폐업_률', '폐업률_등급'], axis=1) 
y_class = df['폐업률_등급']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# XGBoost 이진 분류 학습
xgb_clf = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
)

xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

print(classification_report(y_test, y_pred, target_names=['안정', '위험']))

Parameters: { "iterations" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

          안정       0.76      0.85      0.80      4634
          위험       0.75      0.63      0.69      3361

    accuracy                           0.76      7995
   macro avg       0.76      0.74      0.74      7995
weighted avg       0.76      0.76      0.75      7995



In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print(f'정확도: {accuracy_score(y_test, y_pred)}, 정밀도: {precision_score(y_test, y_pred)}, 재현율: {recall_score(y_test, y_pred)}, f1: {f1_score(y_test, y_pred)}')

정확도: 0.777322256985651, 정밀도: 0.7901987353206865, 재현율: 0.7550712127751402, f1: 0.7722357095563893


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# 테스트해볼 파라미터 후보군 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]

    iterations=500,
    learning_rate=0.1,
    max_depth=6,
}

# GridSearchCV 객체 생성
grid = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3,
                           scoring='accuracy', verbose=2)

# 학습 시작
grid.fit(X_train, y_train)

print('최적의 파라미터: ', grid.best_params_)
print('최적화된 모델 객체: ', grid.best_estimator_)
print('최적화된 점수: ', grid.best_score_)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.7; total time=   1.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.7; total time=   1.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.7; total time=   1.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.9; total time=   1.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.9; total time=   1.1s
[CV] END 

KeyboardInterrupt: 