In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

In [2]:
import os
import sys

# 1) Best-effort switch of standard output to UTF-8 (Jupyter / Windows).
#    Any failure is deliberately swallowed so the notebook keeps running with
#    whatever encoding stdout already has.
try:
    sys.stdout.reconfigure(encoding="utf-8")  # Py3.7+
except Exception:
    pass
os.environ["PYTHONIOENCODING"] = "utf-8"

# 2) Use joblib's threading backend for parallel work, to sidestep encoding
#    problems seen with multi-process workers.
from joblib import parallel_backend

# NOTE(review): the object is kept so it can also be used as a context manager
# (`with THREADING_BACKEND:`), as the commented-out search cells further down
# intend — confirm whether the call alone is meant to apply the backend.
THREADING_BACKEND = parallel_backend("threading")


In [3]:
# Load the merged dataset. The path is relative, so it resolves against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="../eda/data/merged_data.csv")

In [4]:
# Columns removed before modelling.
# NOTE(review): presumably dropped because they describe closures directly and
# would leak the target — confirm with the EDA notebook.
drop_cols = [
    "폐업_점포_수",
    "운영_영업_개월_평균",
    "폐업_영업_개월_평균",
    "서울시_운영_영업_개월_평균",
    "서울시_폐업_영업_개월_평균",
]
# errors="ignore" skips any listed column that is not present in df.
df = df.drop(columns=drop_cols, errors="ignore")

In [5]:
# Column-name constants shared by the cells that follow.
(KEY_Q, KEY_G, KEY_S) = (
    "기준_년분기_코드",      # year-quarter code
    "자치구_코드_명",        # district name
    "서비스_업종_코드_명",   # service-industry name
)
# Continuous column from which the binary target is derived (closure rate).
TARGET_CONT = "폐업_률"

In [6]:
# Build the binary target y: for each (KEY_G, KEY_S) series, whether the row
# that follows it (in KEY_Q order) is at or above that quarter's closure-rate
# threshold.

# Per-quarter quantile of TARGET_CONT used as the "high closure rate" cut-off.
HIGH_QUANTILE = 0.50

# Remove derived columns left over from an earlier execution of this cell.
# Without this, the merge below would emit thr_x / thr_y (overlapping column
# names get suffixes) and df["thr"] would raise KeyError on a re-run.
# NOTE: the cell still narrows df in place (rows without a following
# observation are dropped), so reload df before re-running for identical results.
df = df.drop(columns=["thr", "is_high_t", "y"], errors="ignore")

# Sorting places every (KEY_G, KEY_S) series in KEY_Q order, which the
# group-wise shift below relies on.
df = df.sort_values([KEY_G, KEY_S, KEY_Q]).copy()

# NOTE: despite its name, q80 holds the HIGH_QUANTILE (currently the median)
# threshold per quarter; the name is kept so later references keep working.
q80 = df.groupby(KEY_Q)[TARGET_CONT].quantile(HIGH_QUANTILE).rename("thr")
df = df.merge(q80, left_on=KEY_Q, right_index=True, how="left")

# Flag for the current row; a missing TARGET_CONT compares False and becomes 0.
df["is_high_t"] = (df[TARGET_CONT] >= df["thr"]).astype(int)

# shift(-1) pulls the NEXT row's flag within each series.
# NOTE(review): this is "next quarter" only if every series has contiguous
# quarters — confirm there are no gaps in the data.
df["y"] = df.groupby([KEY_G, KEY_S])["is_high_t"].shift(-1)

# The last row of each series has no following row, so it has no label.
df = df[df["y"].notna()].copy()
df["y"] = df["y"].astype(int)

In [7]:
# Assemble the feature matrix and the train / validation split.

# Columns that are, or are derived from, the target must not become features.
base_exclude = {TARGET_CONT, "thr", "is_high_t", "y"}
num_cols = df.select_dtypes(include=[np.number]).columns.difference(base_exclude).tolist()

# Categorical features. KEY_G / KEY_S are reused instead of repeating their
# string literals, so the column names are defined in one place only.
cat_cols = [c for c in [KEY_G, KEY_S, "상권_변화_지표"] if c in df.columns]
for c in cat_cols:
    df[c] = df[c].astype("category")

feature_cols = num_cols + cat_cols
X = df[feature_cols].copy()
y = df["y"].copy()

# One-hot encode the categoricals; drop_first drops one level per column.
if cat_cols:
    X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Stratified 80/20 hold-out with a fixed seed.
# NOTE(review): the split is shuffled, so rows from adjacent quarters of the
# same series can land on both sides while y is a next-row label. Consider a
# time-based split (train on earlier KEY_Q values, validate on later ones) to
# rule out temporal leakage — left unchanged here so reported metrics stay valid.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
# Random forest baseline: 400 trees, fixed seed for repeatability, and
# n_jobs=-1 to use every available core.
clf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    n_jobs=-1,
)
# Kept as the last expression so the notebook displays the fitted estimator.
clf.fit(X_train, y_train)

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [9]:
# Evaluate the fitted model on the validation split.

# Column 1 of predict_proba is the probability of the positive class (y == 1).
proba = clf.predict_proba(X_valid)[:, 1]

# Custom decision threshold; a value below 0.5 labels more rows as positive.
THRESH = 0.35
y_pred = (proba >= THRESH).astype(int)

# Threshold-dependent metrics use the hard labels. zero_division=0 is passed to
# all three precision/recall-based scores so they behave the same way when a
# class is never predicted (f1_score previously lacked it).
acc  = accuracy_score(y_valid, y_pred)
prec = precision_score(y_valid, y_pred, zero_division=0)
rec  = recall_score(y_valid, y_pred, zero_division=0)
f1   = f1_score(y_valid, y_pred, zero_division=0)
# ROC-AUC is threshold-free, so it is computed from the raw probabilities.
auc  = roc_auc_score(y_valid, proba)

# Labels are left-padded to 9 characters to keep the colons aligned.
for label, value in (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1 Score", f1),
    ("AUC(ROC)", auc),
):
    print(f"{label:<9}: {value:.6f}")

Accuracy : 0.688834
Precision: 0.650708
Recall   : 0.850735
F1 Score : 0.737397
AUC(ROC) : 0.799165


In [10]:
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.svm import SVC


In [11]:
# present_cat = [c for c in cat_cols if c in X.columns] if 'cat_cols' in globals() else []
# present_num = [c for c in num_cols if c in X.columns] if 'num_cols' in globals() else []

# print("present_num:", len(present_num), "present_cat:", len(present_cat))

# # 2) 데이터 분리 (이미 되어 있으면 건너뛰어도 됨)
# if not all(v in globals() for v in ["X_train","X_valid","y_train","y_valid"]):
#     X_train, X_valid, y_train, y_valid = train_test_split(
#         X, y, test_size=0.2, random_state=42, stratify=y
#     )

# # 3) 파이프라인 구성 분기
# if len(present_cat) > 0:
#     # (A) 원본 범주형 컬럼이 아직 X에 남아있는 경우 → 원-핫 + 수치 스케일
#     preprocessor = ColumnTransformer(
#         transformers=[
#             ("num", StandardScaler(), present_num),
#             ("cat", OneHotEncoder(handle_unknown="ignore", drop=None), present_cat),
#         ],
#         remainder="drop"
#     )
# else:
#     # (B) 이미 원-핫되어 범주 원본 컬럼이 없는 경우 → 전체에 스케일만 적용
#     preprocessor = ColumnTransformer(
#         transformers=[("all", StandardScaler(), list(X.columns))],
#         remainder="drop"
#     )

# svc = SVC(probability=True)  # ROC/AUC 및 threshold 적용 위해 확률 출력

# pipe = Pipeline(steps=[
#     ("prep", preprocessor),
#     ("clf", svc),
# ])

In [12]:
# from sklearn.model_selection import GridSearchCV
# from joblib import parallel_backend
# THREADING_BACKEND = parallel_backend("threading")  # 윈도우 한글 인코딩 이슈 회피

# param_grid = {
#     "clf__kernel": ["linear", "rbf", "poly", "sigmoid"],
#     "clf__C": [0.1, 1, 3, 10, 30, 100],
# }

# gscv = GridSearchCV(
#     estimator=pipe,
#     param_grid=param_grid,
#     scoring="roc_auc",
#     cv=5,
#     n_jobs=-1,
#     refit=True
# )

# with THREADING_BACKEND:
#     gscv.fit(X_train, y_train)

# print("GridSearch best params:", gscv.best_params_)
# print("GridSearch best ROC-AUC (cv):", round(gscv.best_score_, 6))

In [13]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import loguniform
# from joblib import parallel_backend
# THREADING_BACKEND = parallel_backend("threading")

# param_dist = {
#     "clf__kernel": ["linear", "rbf", "poly", "sigmoid"],
#     "clf__C": loguniform(1e-3, 1e3),
# }

# rscv = RandomizedSearchCV(
#     estimator=pipe,
#     param_distributions=param_dist,
#     n_iter=20,
#     scoring="roc_auc",
#     cv=5,
#     n_jobs=-1,
#     random_state=42,
#     refit=True
# )

# with THREADING_BACKEND:
#     rscv.fit(X_train, y_train)

# print("RandomSearch best params:", rscv.best_params_)
# print("RandomSearch best ROC-AUC (cv):", round(rscv.best_score_, 6))


In [14]:
# from sklearn.metrics import (
#     accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# )

# best_model = gscv if gscv.best_score_ >= rscv.best_score_ else rscv
# print(">> Using:", "GridSearch" if best_model is gscv else "RandomSearch")

# proba = best_model.predict_proba(X_valid)[:, 1]
# THRESH = 0.35
# y_pred = (proba >= THRESH).astype(int)

# acc  = accuracy_score(y_valid, y_pred)
# prec = precision_score(y_valid, y_pred, zero_division=0)
# rec  = recall_score(y_valid, y_pred, zero_division=0)
# f1   = f1_score(y_valid, y_pred)
# auc  = roc_auc_score(y_valid, proba)

# print("\n=== Hold-out Metrics (thr=0.35) ===")
# print(f"Accuracy : {acc:.6f}")
# print(f"Precision: {prec:.6f}")
# print(f"Recall   : {rec:.6f}")
# print(f"F1 Score : {f1:.6f}")
# print(f"AUC(ROC) : {auc:.6f}")


In [15]:
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.metrics import confusion_matrix, roc_curve

# cm = confusion_matrix(y_valid, y_pred)
# tn, fp, fn, tp = cm.ravel()

# plt.figure(figsize=(5,4))
# plt.imshow(cm, interpolation="nearest")
# plt.title("Confusion Matrix (thr=0.35)")
# plt.xticks([0,1], ["Pred 0", "Pred 1"])
# plt.yticks([0,1], ["True 0", "True 1"])
# for (i, j), v in np.ndenumerate(cm):
#     plt.text(j, i, str(v), ha="center", va="center")
# plt.xlabel("Predicted")
# plt.ylabel("True")
# plt.tight_layout()
# plt.show()

# fpr, tpr, ths = roc_curve(y_valid, proba)
# plt.figure(figsize=(5,4))
# plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
# plt.plot([0,1], [0,1], linestyle="--")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve")
# plt.legend(loc="lower right")
# plt.tight_layout()
# plt.show()
