In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import matplotlib.font_manager as fm
import matplotlib

font_path = 'C:\\Windows\\Fonts\\gulim.ttc'
font = fm.FontProperties(fname=font_path).get_name()
matplotlib.rc('font', family=font)

### EDA

In [3]:
df = pd.read_csv('../../eda/data/merged_data.csv')

In [4]:
df = df.drop(['폐업_점포_수', '폐업_영업_개월_평균', '서울시_폐업_영업_개월_평균', '기준_년분기_코드'], axis=1)

In [5]:
# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder

cols = ['자치구_코드_명', '서비스_업종_코드_명', '상권_변화_지표']

for col in cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    print(encoder.classes_)

['강남구' '강동구' '강북구' '강서구' '관악구' '광진구' '구로구' '금천구' '노원구' '도봉구' '동대문구' '동작구'
 '마포구' '서대문구' '서초구' '성동구' '성북구' '송파구' '양천구' '영등포구' '용산구' '은평구' '종로구' '중구'
 '중랑구']
['PC방' '가구' '가방' '가전제품' '가전제품수리' '고시원' '골프연습장' '네일숍' '노래방' '당구장' '문구'
 '미곡판매' '미용실' '반찬가게' '부동산중개업' '분식전문점' '서적' '섬유제품' '세탁소' '수산물판매' '슈퍼마켓'
 '스포츠 강습' '스포츠클럽' '시계및귀금속' '신발' '안경' '애완동물' '양식음식점' '여관' '예술학원' '완구'
 '외국어학원' '운동/경기용품' '육류판매' '의료기기' '의약품' '인테리어' '일반교습학원' '일반의류' '일반의원'
 '일식음식점' '자동차미용' '자동차수리' '자전거 및 기타운송장비' '전자상거래업' '제과점' '조명용품' '중식음식점'
 '철물점' '청과상' '치과의원' '치킨전문점' '커피-음료' '컴퓨터및주변장치판매' '패스트푸드점' '편의점' '피부관리실'
 '한식음식점' '한의원' '핸드폰' '호프-간이주점' '화장품' '화초']
['HH' 'HL' 'LH' 'LL']


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# 전체 데이터의 75%을 기준으로 등급 나누기
df['폐업률_등급'] = pd.qcut(df['폐업_률'], q=[0, 0.75, 1.0], labels=[0, 1])

X = df.drop(['폐업_률', '폐업률_등급'], axis=1) 
y_class = df['폐업률_등급']

smote = SMOTE(random_state=42)
X_resample, y_resample = smote.fit_resample(X, y_class)

X_train, X_test, y_train, y_test = train_test_split(
    X_resample, y_resample, test_size=0.2, random_state=42, stratify=y_resample
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((48244, 132), (12062, 132), (48244,), (12062,))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import optuna

def rf_optuna_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42,
    }
    rf_clf = RandomForestClassifier()
    
    return cross_val_score(rf_clf, X_train, y_train, scoring='accuracy', cv=3).mean()

study = optuna.create_study(direction='maximize')

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-10-14 00:04:37,239] A new study created in memory with name: no-name-1798c0fc-56a6-436b-b3cf-6e86dda9120f


In [None]:
study.optimize(rf_optuna_objective, n_trials=50)

print(study.best_value)
print(study.best_params)

Positional arguments ['self', 'name', 'low', 'high', 'step', 'log'] in suggest_int() have been deprecated since v3.5.0. They will be replaced with the corresponding keyword arguments in v5.0.0, so please use the keyword specification instead. See https://github.com/optuna/optuna/releases/tag/v3.5.0 for details.
  'n_estimators': trial.suggest_int('n_estimators', 100, 500, 100),
[I 2025-10-14 00:05:33,265] Trial 0 finished with value: 0.8503233509879924 and parameters: {'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.08586195482792605, 'colsample_bytree': 0.8948518385652394}. Best is trial 0 with value: 0.8503233509879924.
Positional arguments ['self', 'name', 'low', 'high', 'step', 'log'] in suggest_int() have been deprecated since v3.5.0. They will be replaced with the corresponding keyword arguments in v5.0.0, so please use the keyword specification instead. See https://github.com/optuna/optuna/releases/tag/v3.5.0 for details.
  'n_estimators': trial.suggest_int('n_estimators

0.8508829906080879
{'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.05130020228572684, 'colsample_bytree': 0.9061504564748334}


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

rf_clf = RandomForestClassifier(n_estimators=500, max_depth=10, max_features=0.7615338373458207)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_accuracy = []

for train_index, val_index in kfold.split(X_train, y_train):
    X_tra, y_tra = X_train.to_numpy()[train_index], y_train.to_numpy()[train_index]
    X_val, y_val = X_train.to_numpy()[val_index], y_train.to_numpy()[val_index]

    rf_clf.fit(X_tra, y_tra)
    y_pred = rf_clf.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    cv_accuracy.append(acc)

print(np.mean(cv_accuracy))

0.8254704966032447


---
### test 데이터 예측 -> 평가

In [16]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score 
from sklearn.metrics import classification_report

y_pred = rf_clf.predict(X_test)

print(f'ACC    : {accuracy_score(y_test, y_pred)}')
print(f'PREC   : {precision_score(y_test, y_pred)}')
print(f'REC    : {recall_score(y_test, y_pred)}')
print(f'F1     : {f1_score(y_test, y_pred)}')
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred)}")



ACC    : 0.8251533742331288
PREC   : 0.813058748403576
REC    : 0.844470237108274
F1     : 0.8284668564457096
ROC-AUC: 0.8251533742331288
