In [6]:
import pandas as pd
import numpy as np
import gc
from pandas.api.types import is_numeric_dtype

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import xgboost as xgb
from catboost import CatBoostClassifier

from tqdm.auto import tqdm  # 진행률 표시용
import warnings
warnings.filterwarnings("ignore")


class ML:
    def __init__(self, months=None):
        self.months = months or ['07','08','09','10','11','12']
        self.loaded_data = {}
        self.le_target = None

    # ----------------------------- 1. 데이터 로드 -----------------------------
    def data_load(self):
        cats = {
            "회원정보":("1.회원정보","회원정보","customer"),
            "신용정보":("2.신용정보","신용정보","credit"),
            "승인매출정보":("3.승인매출정보","승인매출정보","sales"),
            "청구정보":("4.청구입금정보","청구정보","billing"),
            "잔액정보":("5.잔액정보","잔액정보","balance"),
            "채널정보":("6.채널정보","채널정보","channel"),
            "마케팅정보":("7.마케팅정보","마케팅정보","marketing"),
            "성과정보":("8.성과정보","성과정보","performance"),
        }
        print("▶ Loading data...")
        for split in tqdm(["train","test"], desc="Splits"):
            for folder, suffix, prefix in cats.values():
                for m in self.months:
                    fp = f"./{split}/{folder}/2018{m}_{split}_{suffix}.parquet"
                    key = f"{prefix}_{split}_{m}"
                    try:
                        self.loaded_data[key] = pd.read_parquet(fp)
                    except Exception:
                        pass
        print("✔ Data load complete\n")
        gc.collect()

    # ----------------------------- 2. 전처리 -----------------------------
    def data_preprocessing(self, select_features=False, slice_n=1, NA_ratio=0.2):
        cats = ["customer","credit","sales","billing",
                "balance","channel","marketing","performance"]
        train_dfs, test_dfs = {}, {}

        print("▶ Preprocessing TRAIN data...")
        for p in tqdm(cats, desc="Categories"):
            chunks = []
            for m in self.months:
                k = f"{p}_train_{m}"
                if k in self.loaded_data:
                    dfm = self.loaded_data.pop(k)
                    rows = len(dfm) if slice_n <= 1 else max(1, len(dfm)//slice_n)
                    chunks.append(dfm.iloc[:rows])
            if not chunks:
                continue
            df = pd.concat(chunks, axis=0)
            df = df.drop(columns=df.columns[df.isna().mean() > NA_ratio])
            for c in df.columns:
                if df[c].isna().any():
                    if is_numeric_dtype(df[c]):
                        df[c] = df[c].fillna(df[c].median())
                    else:
                        mode_val = df[c].mode()
                        v = mode_val.iloc[0] if not mode_val.empty else "MISSING"
                        df[c] = df[c].fillna(v)
            train_dfs[f"{p}_train_df"] = df

        # ------------------ 피처 셀렉션 적용 ------------------
        if select_features:
            sel = pd.read_csv("selected_features.csv")["feature"].tolist()
            for k, v in train_dfs.items():
                # CSV 선택 피처 + 키·타깃 컬럼
                keep = [c for c in sel if c in v.columns] + ["기준년월", "ID", "Segment"]
                # 실제 존재하는 컬럼만, 중복 제거
                keep = [c for c in keep if c in v.columns]
                keep = list(dict.fromkeys(keep))
                train_dfs[k] = v[keep]

        train_df = self._merge(train_dfs, "train")

        print("\n▶ Preprocessing TEST data...")
        for p in tqdm(cats, desc="Categories"):
            chunks = []
            for m in self.months:
                k = f"{p}_test_{m}"
                if k in self.loaded_data:
                    dfm = self.loaded_data.pop(k)
                    rows = len(dfm) if slice_n <= 1 else max(1, len(dfm)//slice_n)
                    chunks.append(dfm.iloc[:rows])
            if not chunks:
                continue
            df = pd.concat(chunks, axis=0)
            extra = [c for c in df.columns if c not in train_df.columns]
            if extra:
                df = df.drop(columns=extra)
            test_dfs[f"{p}_test_df"] = df

        test_df = self._merge(test_dfs, "test")
        for c in test_df.columns:
            if test_df[c].isna().any():
                if is_numeric_dtype(test_df[c]):
                    test_df[c] = test_df[c].fillna(test_df[c].median())
                else:
                    mode_val = test_df[c].mode()
                    v = mode_val.iloc[0] if not mode_val.empty else "MISSING"
                    test_df[c] = test_df[c].fillna(v)

        print("✔ Preprocessing complete\n")
        return train_df, test_df

    # ----------------------------- 안전한 병합 -----------------------------
    def _merge(self, dct, prefix):
        order = ["customer","credit","sales","billing",
                 "balance","channel","marketing","performance"]
        base = dct.get(f"customer_{prefix}_df")
        if base is None:
            raise ValueError("Missing customer base DataFrame for merge")
        for p in order[1:]:
            key = f"{p}_{prefix}_df"
            if key not in dct:
                continue
            df = dct[key]
            # 키 컬럼 보존 여부 확인
            if not {'기준년월','ID'}.issubset(df.columns):
                print(f"[WARN] {key} missing merge keys, skipped")
                continue
            base = base.merge(df, on=['기준년월','ID'], how='left')
        print(f"{prefix.upper()} merged shape: {base.shape}\n")
        return base

    # ----------------------------- 3. 인코딩 -----------------------------
    def data_encoding(self, train_df, test_df, encoding="label"):
        feats = [c for c in train_df.columns if c not in ("ID","Segment")]
        X = train_df[feats].copy()
        y = train_df["Segment"].copy()
        self.le_target = LabelEncoder().fit(y)
        y_enc = self.le_target.transform(y)

        cat_cols = X.select_dtypes("object").columns.tolist()
        if encoding == "label":
            enc = {c: LabelEncoder().fit(X[c]) for c in cat_cols}
            for c, e in enc.items():
                X[c] = e.transform(X[c])
            X_test = test_df.drop(columns="ID", errors="ignore").copy()
            for c, e in enc.items():
                unseen = set(X_test[c]) - set(e.classes_)
                if unseen:
                    e.classes_ = np.append(e.classes_, list(unseen))
                X_test[c] = e.transform(X_test[c])
        else:
            ct = ColumnTransformer(
                [("ohe", OneHotEncoder(handle_unknown='ignore'), cat_cols)],
                remainder='passthrough'
            )
            X = pd.DataFrame(ct.fit_transform(X), columns=ct.get_feature_names_out())
            X_test = pd.DataFrame(
                ct.transform(test_df.drop(columns="ID", errors="ignore")),
                columns=ct.get_feature_names_out()
            )

        print("✔ Encoding complete\n")
        return X, y_enc, X_test

    # ----------------------------- 4. 모델 학습 -----------------------------
    def train_model(self, X, y, X_test, test_df,
                    method, file_name="submit",
                    use_gpu=True, tune=True, val_ratio=0.3):

        if method == 1:   # XGBoost
            base = xgb.XGBClassifier(
                random_state=42, eval_metric="mlogloss",
                tree_method=("gpu_hist" if use_gpu else "hist"),
                predictor=("gpu_predictor" if use_gpu else "auto"),
                gpu_id=(0 if use_gpu else -1)
            )
            grid = {"max_depth":[3,5], "learning_rate":[0.05,0.1], "n_estimators":[300,500]}
        elif method == 2: # CatBoost
            base = CatBoostClassifier(
                random_state=42, verbose=False,
                task_type=("GPU" if use_gpu else "CPU"),
                devices=("0" if use_gpu else None)
            )
            grid = {"depth":[4,6], "learning_rate":[0.05,0.1], "iterations":[400,600]}
        elif method == 3: # RandomForest (CPU only)
            base = RandomForestClassifier(random_state=42, n_jobs=-1)
            grid = {"n_estimators":[300,500], "max_depth":[None,20], "min_samples_leaf":[1,3]}
        else:
            raise ValueError("method must be 1, 2, or 3")

        print("▶ Starting training...")
        if tune and grid:
            gs = GridSearchCV(
                base, grid,
                cv=StratifiedKFold(5, shuffle=True, random_state=42),
                scoring="f1_macro", n_jobs=-1, verbose=2
            ).fit(X, y)
            model = gs.best_estimator_
            print("✔ Best hyperparams:", gs.best_params_)
        else:
            model = base.fit(X, y)

        X_tr, X_val, y_tr, y_val = train_test_split(
            X, y, test_size=val_ratio, random_state=42, stratify=y
        )
        model.fit(X_tr, y_tr)
        print("▶ Validation results:")
        print(classification_report(y_val, model.predict(X_val)))

        if not (tune and grid):
            model.fit(X, y)

        X_test = X_test.reindex(columns=X.columns, fill_value=0)

        preds = self.le_target.inverse_transform(model.predict(X_test))
        out = (
            test_df.copy().reset_index(drop=True)
            .assign(pred=preds)
            .groupby("ID")["pred"].agg(lambda s: s.value_counts().idxmax())
            .reset_index().rename(columns={"pred":"Segment"})
        )
        out.to_csv(f"{file_name}.csv", index=False)
        print(f"✔ Model saved → {file_name}.csv\n")
        gc.collect()

    # ----------------------------- 5. 피처 선택 -----------------------------
    def select_features(self, X, y, top_n=400, corr_threshold=0.9):
        print("▶ Calculating feature importances...")
        rf = RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)
        rf.fit(X, y)
        imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

        # 전체 피처 중요도 CSV 저장
        imp_df = imp.reset_index()
        imp_df.columns = ['feature','importance']
        imp_df.to_csv("feature_importances.csv", index=False)
        print(f"✔ Saved full importances ({len(imp_df)}) to feature_importances.csv")

        # 상위 top_n 필터링
        top_feats = imp.head(top_n).index.tolist()
        print(f"▶ Filtering top {top_n} features by correlation (threshold={corr_threshold})")
        corr = X[top_feats].corr().abs()
        selected = []
        total = len(top_feats)
        for i, f in enumerate(top_feats, 1):
            print(f"   Progress {i}/{total}: {f}", end="\r")
            if all(corr.loc[f, s] <= corr_threshold for s in selected):
                selected.append(f)
        print()

        # 최종 선택 피처 CSV 저장
        sel_df = pd.DataFrame(selected, columns=["feature"])
        sel_df.to_csv("selected_features.csv", index=False)
        print(f"✔ Saved {len(selected)} selected features to selected_features.csv\n")

In [None]:
ml = ML()
ml.data_load()                       # 모든 parquet 로드


train_sub, test_sub = ml.data_preprocessing(
    select_features=False, slice_n=10, NA_ratio=0.2
)
X_sub, y_sub, X_test_dummy = ml.data_encoding(train_sub, test_sub)


# RandomForest + 하이퍼파라미터 서치로 중요도 기반 상위 500개 저장
ml.select_features(
    X=X_sub, y=y_sub,
    top_n=500,
    corr_threshold=0.9             # 내부에서 selected_features.csv 생성
)

ml.data_load()
train_df, test_df = ml.data_preprocessing(
    select_features=True,    # 방금 저장한 CSV를 사용
    slice_n=5,
    NA_ratio=0.2
)
X, y, X_test = ml.data_encoding(train_df, test_df)


ml.train_model(
    X=X, y_encoded=y,
    X_test=X_test,
    method=1,               # 1:XGBoost  2:CatBoost  3:RandomForest
    test_df=test_df,
    file_name="submit", # 저장 파일명
    val_ratio=0.3,
    tune=True               # GridSearchCV 사용
)


In [8]:
ml = ML()
ml.data_load()
train_df, test_df = ml.data_preprocessing(
    select_features=True,    # 방금 저장한 CSV를 사용
    slice_n=5,
    NA_ratio=0.2
)
X, y, X_test = ml.data_encoding(train_df, test_df)


ml.train_model(
    X=X, y=y,
    X_test=X_test,
    method=3,               # 1:XGBoost  2:CatBoost  3:RandomForest
    test_df=test_df,
    file_name="submit", # 저장 파일명
    val_ratio=0.3,
    tune=True               # GridSearchCV 사용
)

▶ Loading data...


Splits:   0%|          | 0/2 [00:00<?, ?it/s]

✔ Data load complete

▶ Preprocessing TRAIN data...


Categories:   0%|          | 0/8 [00:00<?, ?it/s]

TRAIN merged shape: (480000, 272)


▶ Preprocessing TEST data...


Categories:   0%|          | 0/8 [00:00<?, ?it/s]

TEST merged shape: (120000, 271)

✔ Preprocessing complete

✔ Encoding complete

▶ Starting training...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
✔ Best hyperparams: {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 500}
▶ Validation results:
              precision    recall  f1-score   support

           0       1.00      0.44      0.61        68
           1       1.00      0.29      0.44         7
           2       0.91      0.72      0.81      7668
           3       0.86      0.75      0.80     20965
           4       0.95      0.99      0.97    115292

    accuracy                           0.94    144000
   macro avg       0.94      0.64      0.73    144000
weighted avg       0.94      0.94      0.94    144000

✔ Model saved → submit.csv

