In [3]:
# inspect_pkl.py
import joblib, pickle, sys, pprint, json, os
from pathlib import Path

# === 파일 경로 지정 ===
PKL_PATH = Path("./model_and_data_encoded.pkl")  # 또는 model_and_data.pkl

# === 안전: 신뢰할 수 있는 파일만 언피클하세요! ===
if not PKL_PATH.exists():
    print(f"파일 없음: {PKL_PATH.resolve()}")
    sys.exit(1)

# joblib 우선, 실패 시 pickle로 시도
try:
    obj = joblib.load(PKL_PATH)
except Exception as e:
    print("[joblib.load 실패] -> pickle로 재시도:", e)
    with open(PKL_PATH, "rb") as f:
        obj = pickle.load(f)

print("== 최상위 객체 타입 ==", type(obj))

# 딕셔너리일 가능성이 높음
if isinstance(obj, dict):
    print("\n== 딕셔너리 키 목록 ==")
    print(list(obj.keys()))

    # 주요 항목 미리보기
    for key in ("model", "district_encoder", "industry_encoder", "sanggwon_encoder", "feature_names"):
        if key in obj:
            val = obj[key]
            print(f"\n[{key}] 타입: {type(val)}")
            if key == "feature_names" and isinstance(val, (list, tuple)):
                print(" feature_names[:20] =", val[:20])
            if hasattr(val, "classes_"):
                # 라벨인코더 계열
                classes = getattr(val, "classes_", None)
                if classes is not None:
                    print(" classes_ (앞 20개):", list(classes)[:20])

    # 원하면 텍스트로 덤프 저장
    dump_path = PKL_PATH.with_suffix(".summary.txt")
    with open(dump_path, "w", encoding="utf-8") as f:
        f.write("== keys ==\n")
        f.write(pprint.pformat(list(obj.keys())))
        f.write("\n\n== feature_names (앞 100개) ==\n")
        fn = obj.get("feature_names", [])
        f.write(pprint.pformat(fn[:100]))
    print(f"\n요약 파일 저장: {dump_path}")

else:
    print("딕셔너리가 아닙니다. 속성 dir() 일부를 출력합니다.")
    attrs = [a for a in dir(obj) if not a.startswith("_")]
    print(attrs[:50])

== 최상위 객체 타입 == <class 'dict'>

== 딕셔너리 키 목록 ==
['model', 'district_encoder', 'industry_encoder', 'sanggwon_encoder', 'X_train', 'y_train', 'X_test', 'y_test', 'feature_names']

[model] 타입: <class 'catboost.core.CatBoostClassifier'>
 classes_ (앞 20개): [np.int64(0), np.int64(1)]

[district_encoder] 타입: <class 'sklearn.preprocessing._label.LabelEncoder'>
 classes_ (앞 20개): ['강남구', '강동구', '강북구', '강서구', '관악구', '광진구', '구로구', '금천구', '노원구', '도봉구', '동대문구', '동작구', '마포구', '서대문구', '서초구', '성동구', '성북구', '송파구', '양천구', '영등포구']

[industry_encoder] 타입: <class 'sklearn.preprocessing._label.LabelEncoder'>
 classes_ (앞 20개): ['PC방', '가구', '가방', '가전제품', '가전제품수리', '고시원', '골프연습장', '네일숍', '노래방', '당구장', '문구', '미곡판매', '미용실', '반찬가게', '부동산중개업', '분식전문점', '서적', '섬유제품', '세탁소', '수산물판매']

[sanggwon_encoder] 타입: <class 'sklearn.preprocessing._label.LabelEncoder'>
 classes_ (앞 20개): ['HH', 'HL', 'LH', 'LL']

[feature_names] 타입: <class 'list'>
 feature_names[:20] = ['자치구_코드_명', '서비스_업종_코드_명', '점포_수', '유사_업종_점포_수', '개업_률'

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
import joblib, pickle, pandas as pd, os, sys, inspect, hashlib
from pathlib import Path

paths = [
    Path("model_and_data.pkl"),
    Path("model_and_data_encoded.pkl"),
]

def md5(path):
    import hashlib
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def load_any(path):
    try:
        return joblib.load(path), "joblib"
    except Exception as e1:
        with open(path, "rb") as f:
            try:
                return pickle.load(f), "pickle"
            except Exception as e2:
                print(f"[!] Load failed: {path.name}\n joblib: {repr(e1)}\n pickle: {repr(e2)}")
                return None, "error"

def describe(obj):
    if isinstance(obj, dict):
        keys = list(obj.keys())
        sample = {k: type(obj[k]).__name__ for k in keys[:15]}
        return {"type":"dict","n_keys":len(keys),"sample_keys":sample}
    elif isinstance(obj, (list, tuple, set)):
        seq = list(obj)
        return {"type":type(obj).__name__,"length":len(seq),"sample_types":[type(x).__name__ for x in seq[:10]]}
    else:
        t = type(obj)
        out = {"type": t.__name__, "module": getattr(t, "__module__", "?")}
        for a in ["classes_", "n_features_in_", "feature_names_in_", "best_params_"]:
            if hasattr(obj, a):
                v = getattr(obj, a)
                out[a] = (v if isinstance(v, (int, float, str)) else f"{type(v).__name__}")
        return out

for p in paths:
    print("="*80)
    print(f"FILE: {p.name}  SIZE(KB): {p.stat().st_size/1024:.1f}  MD5: {md5(p)}")
    obj, how = load_any(p)
    print("Loaded via:", how)
    if obj is None: 
        continue
    top = describe(obj)
    print("Top:", top)
    if isinstance(obj, dict):
        # 자주 쓰는 키 힌트
        for k in ["model","final_model","clf","pipeline","district_encoder","industry_encoder","sanggwon_encoder","X","y","X_train","y_train","X_val","y_val"]:
            if k in obj:
                print(f" - {k}: {type(obj[k]).__name__}")
                try:
                    if isinstance(obj[k], pd.DataFrame):
                        print(f"   shape={obj[k].shape}, cols(sample)={obj[k].columns[:12].tolist()}")
                except Exception:
                    pass


FILE: model_and_data.pkl  SIZE(KB): 76620.2  MD5: 76f283f8366acc39f71186b85bb3d272
Loaded via: joblib
Top: {'type': 'dict', 'n_keys': 5, 'sample_keys': {'model': 'CatBoostClassifier', 'X_train': 'DataFrame', 'y_train': 'Series', 'X_test': 'DataFrame', 'y_test': 'Series'}}
 - model: CatBoostClassifier
 - X_train: DataFrame
   shape=(48244, 132), cols(sample)=['자치구_코드_명', '서비스_업종_코드_명', '점포_수', '유사_업종_점포_수', '개업_률', '개업_점포_수', '프랜차이즈_점포_수', '당월_매출_금액', '당월_매출_건수', '월요일_매출_금액', '화요일_매출_금액', '수요일_매출_금액']
 - y_train: Series
FILE: model_and_data_encoded.pkl  SIZE(KB): 76626.4  MD5: 58d9dc00da265eaa23b2a76cb8c1f70d
Loaded via: joblib
Top: {'type': 'dict', 'n_keys': 9, 'sample_keys': {'model': 'CatBoostClassifier', 'district_encoder': 'LabelEncoder', 'industry_encoder': 'LabelEncoder', 'sanggwon_encoder': 'LabelEncoder', 'X_train': 'DataFrame', 'y_train': 'Series', 'X_test': 'DataFrame', 'y_test': 'Series', 'feature_names': 'list'}}
 - model: CatBoostClassifier
 - district_encoder: LabelEncode

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
# inspect_pkl.py
import joblib, pickle, sys, pprint, json, os
from pathlib import Path

# === 파일 경로 지정 ===
PKL_PATH = Path("./model_and_data.pkl")

# === 안전: 신뢰할 수 있는 파일만 언피클하세요! ===
if not PKL_PATH.exists():
    print(f"파일 없음: {PKL_PATH.resolve()}")
    sys.exit(1)

# joblib 우선, 실패 시 pickle로 시도
try:
    obj = joblib.load(PKL_PATH)
except Exception as e:
    print("[joblib.load 실패] -> pickle로 재시도:", e)
    with open(PKL_PATH, "rb") as f:
        obj = pickle.load(f)

print("== 최상위 객체 타입 ==", type(obj))

# 딕셔너리일 가능성이 높음
if isinstance(obj, dict):
    print("\n== 딕셔너리 키 목록 ==")
    print(list(obj.keys()))

    # 주요 항목 미리보기
    for key in ("model", "district_encoder", "industry_encoder", "sanggwon_encoder", "feature_names"):
        if key in obj:
            val = obj[key]
            print(f"\n[{key}] 타입: {type(val)}")
            if key == "feature_names" and isinstance(val, (list, tuple)):
                print(" feature_names[:20] =", val[:20])
            if hasattr(val, "classes_"):
                # 라벨인코더 계열
                classes = getattr(val, "classes_", None)
                if classes is not None:
                    print(" classes_ (앞 20개):", list(classes)[:20])

    # 원하면 텍스트로 덤프 저장
    dump_path = PKL_PATH.with_suffix(".summary.txt")
    with open(dump_path, "w", encoding="utf-8") as f:
        f.write("== keys ==\n")
        f.write(pprint.pformat(list(obj.keys())))
        f.write("\n\n== feature_names (앞 100개) ==\n")
        fn = obj.get("feature_names", [])
        f.write(pprint.pformat(fn[:100]))
    print(f"\n요약 파일 저장: {dump_path}")

else:
    print("딕셔너리가 아닙니다. 속성 dir() 일부를 출력합니다.")
    attrs = [a for a in dir(obj) if not a.startswith("_")]
    print(attrs[:50])

== 최상위 객체 타입 == <class 'dict'>

== 딕셔너리 키 목록 ==
['model', 'X_train', 'y_train', 'X_test', 'y_test']

[model] 타입: <class 'catboost.core.CatBoostClassifier'>
 classes_ (앞 20개): [np.int64(0), np.int64(1)]

요약 파일 저장: model_and_data.summary.txt
