In [2]:
import pandas as pd
import numpy as np
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb

# 추가: 결측치 imputation에 사용 (KNNImputer)
from sklearn.impute import KNNImputer

# 추가: 모델 튜닝에 사용
from sklearn.model_selection import GridSearchCV, KFold

# 추가: One-Hot Encoding을 위한 도구
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# 추가: Lasso 기반 피처 선택
from sklearn.linear_model import LassoCV

class ML:
    def __init__(self, months=None):
        """
        Initializes the ML class with a list of months.
        """
        if months is None:
            months = ['07', '08', '09', '10', '11', '12']
        self.months = months
        self.loaded_data = {}  # 모든 로드한 데이터를 저장할 딕셔너리

    def data_load(self):
        """
        지정된 폴더 및 파일에서 데이터를 로드하여 self.loaded_data에 저장합니다.
        """
        data_splits = ["train", "test"]
        data_categories = {
            "회원정보": {"folder": "1.회원정보", "suffix": "회원정보", "var_prefix": "customer"},
            "신용정보": {"folder": "2.신용정보", "suffix": "신용정보", "var_prefix": "credit"},
            "승인매출정보": {"folder": "3.승인매출정보", "suffix": "승인매출정보", "var_prefix": "sales"},
            "청구정보": {"folder": "4.청구입금정보", "suffix": "청구정보", "var_prefix": "billing"},
            "잔액정보": {"folder": "5.잔액정보", "suffix": "잔액정보", "var_prefix": "balance"},
            "채널정보": {"folder": "6.채널정보", "suffix": "채널정보", "var_prefix": "channel"},
            "마케팅정보": {"folder": "7.마케팅정보", "suffix": "마케팅정보", "var_prefix": "marketing"},
            "성과정보": {"folder": "8.성과정보", "suffix": "성과정보", "var_prefix": "performance"}
        }

        for split in data_splits:
            for category, info in data_categories.items():
                folder = info["folder"]
                suffix = info["suffix"]
                var_prefix = info["var_prefix"]

                for month in self.months:
                    file_path = f"./{split}/{folder}/2018{month}_{split}_{suffix}.parquet"
                    variable_name = f"{var_prefix}_{split}_{month}"
                    try:
                        df = pd.read_parquet(file_path)
                        self.loaded_data[variable_name] = df
                    except Exception as e:
                        print(f"Failed to load {variable_name} from {file_path}: {e}")
        print('데이터 로드 완료')
        gc.collect()

    def data_preprocessing(self, select_features=False, slice_n=1, NA_ratio=0.2):
        """
        월별 데이터를 slice_n 인자에 따라 각 파일에서 1/slice_n만큼(최소 1행) 선택 후, 
        category별로 concat하여 train 및 test 데이터프레임을 생성합니다.
        select_features=True이면 CSV에 저장된 feature 목록에 따라 불필요한 컬럼을 제거합니다.
        NA_ratio 이상 결측치 비율을 가진 컬럼은 먼저 제거하고, 남은 컬럼들은 imputation을 수행합니다.
        """
        info_categories = ["customer", "credit", "sales", "billing", "balance", "channel", "marketing", "performance"]
        train_dfs = {}
        test_dfs = {}

        # Train 데이터 처리
        for prefix in info_categories:
            df_list = []
            for month in self.months:
                key = f"{prefix}_train_{month}"
                if key in self.loaded_data:
                    df = self.loaded_data[key]
                    # 최소 1행 이상 선택
                    num_rows = max(1, len(df) // slice_n)
                    sliced_df = df.iloc[:num_rows]
                    df_list.append(sliced_df)
                    del self.loaded_data[key]
            if df_list:
                merged_df = pd.concat(df_list, axis=0)
                # 먼저 NA_ratio 이상 결측치 비율을 가진 컬럼 목록 계산
                missing_ratio = merged_df.isna().mean()
                cols_to_drop = [col for col in missing_ratio.index if missing_ratio[col] > NA_ratio]
                if cols_to_drop:
                    merged_df = merged_df.drop(columns=cols_to_drop)
                    print(f"Prefix {prefix}: Dropped columns due to high NA ratio: {cols_to_drop}")
                # ## Modified: Imputation for remaining missing values without inplace
                for col in merged_df.columns:
                    if merged_df[col].isna().sum() > 0:
                        if merged_df[col].dtype in [np.float64, np.int64]:
                            median_val = merged_df[col].median()
                            merged_df[col] = merged_df[col].fillna(median_val)
                        else:
                            mode_val = merged_df[col].mode()[0]
                            merged_df[col] = merged_df[col].fillna(mode_val)
                        print(f"Prefix {prefix}: Imputed missing values in column {col}")
                train_dfs[f"{prefix}_train_df"] = merged_df
                print(f"{prefix}_train_df is created with shape: {merged_df.shape}")
            else:
                print(f"No train data found for prefix: {prefix}")

        gc.collect()

        if select_features:
            selected_features = pd.read_csv('selected_features.csv')['feature'].tolist()
            for prefix in info_categories:
                key = f"{prefix}_train_df"
                if key in train_dfs:
                    cols_to_drop = [col for col in train_dfs[key].columns if col not in selected_features]
                    train_dfs[key] = train_dfs[key].drop(columns=cols_to_drop)
                    print(f"{key} now has shape: {train_dfs[key].shape}")
        gc.collect()

        if "customer_train_df" in train_dfs and "credit_train_df" in train_dfs:
            train_df = train_dfs["customer_train_df"].merge(train_dfs["credit_train_df"], on=['기준년월', 'ID'], how='left')
            print("Train Step1 merged, shape:", train_df.shape)
        else:
            train_df = None
            print("Missing customer_train_df or credit_train_df in train data.")

        merge_list = [
            ("sales_train_df",    "Step2"),
            ("billing_train_df",  "Step3"),
            ("balance_train_df",  "Step4"),
            ("channel_train_df",  "Step5"),
            ("marketing_train_df","Step6"),
            ("performance_train_df", "Final")
        ]
        for df_name, step in merge_list:
            if df_name in train_dfs:
                train_df = train_df.merge(train_dfs[df_name], on=['기준년월', 'ID'], how='left')
                print(f"{step} merge completed, shape:", train_df.shape)
                del train_dfs[df_name]
                gc.collect()
            else:
                print(f"{df_name} not found during merging in train data.")

        train_columns = train_df.columns

        # Test 데이터 처리 (Train과 유사)
        for prefix in info_categories:
            df_list = []
            for month in self.months:
                key = f"{prefix}_test_{month}"
                if key in self.loaded_data:
                    df = self.loaded_data[key]
                    num_rows = max(1, len(df) // slice_n)
                    sliced_df = df.iloc[:num_rows]
                    df_list.append(sliced_df)
                    del self.loaded_data[key]
            if df_list:
                merged_df = pd.concat(df_list, axis=0)
                test_dfs[f"{prefix}_test_df"] = merged_df
                for col in test_dfs[f"{prefix}_test_df"].columns:
                    if col not in train_columns:
                        test_dfs[f"{prefix}_test_df"] = test_dfs[f"{prefix}_test_df"].drop(columns=[col])
                print(f"{prefix}_test_df is created with shape: {merged_df.shape}")
            else:
                print(f"No test data found for prefix: {prefix}")

        gc.collect()

        if select_features:
            selected_features = pd.read_csv('selected_features.csv')['feature'].tolist()
            for prefix in info_categories:
                key = f"{prefix}_test_df"
                if key in test_dfs:
                    cols_to_drop = [col for col in test_dfs[key].columns if col not in selected_features]
                    test_dfs[key] = test_dfs[key].drop(columns=cols_to_drop)
            gc.collect()

        if "customer_test_df" in test_dfs and "credit_test_df" in test_dfs:
            test_df = test_dfs["customer_test_df"].merge(test_dfs["credit_test_df"], on=['기준년월', 'ID'], how='left')
            print("Test Step1 merged, shape:", test_df.shape)
        else:
            test_df = None
            print("Missing customer_test_df or credit_test_df in test data.")

        merge_list_test = [
            ("sales_test_df",    "Step2"),
            ("billing_test_df",  "Step3"),
            ("balance_test_df",  "Step4"),
            ("channel_test_df",  "Step5"),
            ("marketing_test_df","Step6"),
            ("performance_test_df", "Final")
        ]
        for df_name, step in merge_list_test:
            if df_name in test_dfs:
                test_df = test_df.merge(test_dfs[df_name], on=['기준년월', 'ID'], how='left')
                print(f"{step} merge completed in test data, shape:", test_df.shape)
                del test_dfs[df_name]
                gc.collect()
            else:
                print(f"{df_name} not found during merging in test data.")

        ## Modified: Impute missing values in test data (if any)
        for col in test_df.columns:
            if test_df[col].isna().mean() > 0:
                if test_df[col].dtype in [np.float64, np.int64]:
                    test_df[col] = test_df[col].fillna(test_df[col].median())
                else:
                    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])
                print(f"Test data: Imputed missing values in column {col}")

        self.loaded_data['train'] = train_df
        return train_df, test_df



    def data_encoding(self, train_df, test_df, encoding_method="label"):
        """
        인코딩 방법을 선택할 수 있도록 합니다.
        encoding_method: "label" (기본), "onehot"
        """
        feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]
        X = train_df[feature_cols].copy()
        y = train_df["Segment"].copy()
        
        # 타깃 인코딩 (Label Encoding)
        self.le_target = LabelEncoder()
        y_encoded = self.le_target.fit_transform(y)
        
        # 범주형 컬럼 인코딩: 옵션에 따라 라벨 인코딩 또는 원-핫 인코딩 수행
        categorical_features = X.select_dtypes(include=['object']).columns.tolist()
        if encoding_method == "label":
            encoders = {}
            for col in categorical_features:
                le_col = LabelEncoder()
                X[col] = le_col.fit_transform(X[col])
                encoders[col] = le_col
            # Test 데이터에 동일 적용
            X_test = test_df.drop(columns=["ID"], errors='ignore').copy()
            for col in categorical_features:
                le_col = encoders[col]
                test_unique = set(X_test[col].unique())
                train_classes = set(le_col.classes_)
                unseen = test_unique - train_classes
                if unseen:
                    le_col.classes_ = np.append(le_col.classes_, list(unseen))
                X_test[col] = le_col.transform(X_test[col])
        elif encoding_method == "onehot":
            # One-Hot Encoding using ColumnTransformer
            ct = ColumnTransformer(transformers=[('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
                                   remainder='passthrough')
            X = pd.DataFrame(ct.fit_transform(X))
            X_test = pd.DataFrame(ct.transform(test_df.drop(columns=["ID"], errors='ignore')))
        else:
            raise ValueError("unsupported encoding_method")
        
        gc.collect()
        return X, y_encoded, X_test

    def train_model(self, X, y_encoded, X_test, method, test_df, file_name, val_ratio=0.3, tune=True):
        """
        모델 선택 및 학습 + 하이퍼파라미터 튜닝과 KFold 교차 검증을 도입합니다.
        method: 1: XGBoost, 2: CatBoost, 3: RandomForest
        tune: True이면 GridSearchCV를 통해 하이퍼파라미터 튜닝 수행
        """
        if method == 1:
            print("XGBoost 모델 선택")
            base_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
            param_grid = {
                'max_depth': [3, 5, 7],
                'learning_rate': [0.01, 0.1, 0.2],
                'n_estimators': [100, 200]
            }
        elif method == 2:
            print("CatBoost 모델 선택")
            base_model = CatBoostClassifier(random_state=42, verbose=False)
            param_grid = {
                'depth': [3, 5, 7],
                'learning_rate': [0.01, 0.1, 0.2],
                'iterations': [100, 200]
            }
        elif method == 3:
            print("RandomForest 모델 선택")
            base_model = RandomForestClassifier(random_state=42)
            param_grid = {
                'n_estimators': [100, 200],
                'max_depth': [None, 10, 20]
            }
        else:
            print("올바른 모델 번호를 입력하세요: 1 (XGBoost), 2 (CatBoost), 3 (RandomForest)")
            return

        ## Added: Hyperparameter tuning using GridSearchCV and KFold cross-validation
        if tune:
            cv = KFold(n_splits=5, shuffle=True, random_state=42)
            grid = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
            grid.fit(X, y_encoded)
            print("Best parameters:", grid.best_params_)
            model = grid.best_estimator_
        else:
            model = base_model
            model.fit(X, y_encoded)
        
        # Stratified train/validation split
        from sklearn.model_selection import train_test_split
        X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=val_ratio, random_state=42, stratify=y_encoded)
        
        model.fit(X_train, y_train)
        y_val_pred = model.predict(X_val)
        recall_val = recall_score(y_val, y_val_pred, average='macro')
        precision_val = precision_score(y_val, y_val_pred, average='macro')
        f1_val = f1_score(y_val, y_val_pred, average='macro')
        
        print("\nValidation Performance Metrics:")
        print(f"Recall:    {recall_val:.4f}")
        print(f"Precision: {precision_val:.4f}")
        print(f"F1 Score:  {f1_val:.4f}")
        print("\nValidation Classification Report:")
        print(classification_report(y_val, y_val_pred))
        
        model.fit(X, y_encoded)
        preds = model.predict(X_test)
        pred_labels = self.le_target.inverse_transform(preds)
        
        test_data = test_df.copy().reset_index(drop=True)
        if len(test_data) != len(pred_labels):
            print(f"Error: test data rows ({len(test_data)}) and prediction length ({len(pred_labels)}) do not match.")
            return

        test_data["pred_label"] = pred_labels
        submission = test_data.groupby("ID")["pred_label"] \
            .agg(lambda x: x.value_counts().idxmax()).reset_index()
        submission.columns = ["ID", "Segment"]
        submission.to_csv(f"{file_name}.csv", index=False)
        print(f"Submission CSV created: {file_name}.csv")
        
        gc.collect()

    def select_features(self, X, y_encoded, X_test, top_n=500, corr_threshold=0.9, selection_method="rf"):
        """
        상위 top_n 피처를 기반으로 두 가지 방법 중 선택:
          - "rf": RandomForest 기반 피처 중요도와 상관계수 필터링 (기존 방식)
          - "lasso": LassoCV를 이용한 피처 선택
        최종적으로 필수 컬럼 ["기준년월", "ID", "Segment"]를 추가하고 CSV 파일로 저장합니다.
        """
        if selection_method == "rf":
            rf = RandomForestClassifier(random_state=42, n_estimators=100)
            rf.fit(X, y_encoded)
            feature_importances = rf.feature_importances_
            importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
            importance_df = importance_df.sort_values(by='importance', ascending=False)
            print("전체 feature importance:")
            print(importance_df)
            top_features = importance_df.head(top_n)['feature'].tolist()
            corr_matrix = X[top_features].corr().abs()
            final_features = []
            for feature in top_features:
                add_feature = True
                for sel_feat in final_features:
                    if corr_matrix.loc[feature, sel_feat] > corr_threshold:
                        add_feature = False
                        break
                if add_feature:
                    final_features.append(feature)
            print("\n최종 선택된 피처들 (상관관계 필터 적용):")
            print(final_features)
        elif selection_method == "lasso":
            lasso = LassoCV(cv=5, random_state=42).fit(X, y_encoded)
            coef = pd.Series(lasso.coef_, index=X.columns)
            final_features = coef[coef != 0].index.tolist()
            print("Lasso selected features:")
            print(final_features)
        else:
            raise ValueError("unsupported selection_method")
        
        X_top = X[final_features]
        print("\n최종 선택된 피처를 가진 데이터프레임 (X_top) 일부 출력:")
        print(X_top.head())
        
        mandatory = ["기준년월", "ID", "Segment"]
        for col in mandatory:
            if col not in final_features:
                final_features.append(col)
        top_features_df = pd.DataFrame(final_features, columns=['feature'])
        save_path = 'selected_features.csv'
        top_features_df.to_csv(save_path, index=False)
        print(f"선택한 최종 컬럼들이 '{save_path}' 파일로 저장되었습니다.")
       
        gc.collect()


In [10]:
# Sample run: keep 1/50 of each monthly file and drop only columns whose
# missing ratio exceeds 0.9999. The last expression is left bare so the
# notebook displays the returned (train_df, test_df) tuple.
ml = ML()
ml.data_load()
ml.data_preprocessing(
    select_features=False,
    slice_n=50,
    NA_ratio=0.9999,
)

데이터 로드 완료
Prefix customer: Imputed missing values in column 가입통신회사코드
Prefix customer: Imputed missing values in column 직장시도명
Prefix customer: Imputed missing values in column _1순위신용체크구분
Prefix customer: Imputed missing values in column _2순위신용체크구분
Prefix customer: Imputed missing values in column 최종유효년월_신용_이용가능
Prefix customer: Imputed missing values in column 최종유효년월_신용_이용
Prefix customer: Imputed missing values in column 최종카드발급일자
customer_train_df is created with shape: (48000, 78)
Prefix credit: Imputed missing values in column RV신청일자
Prefix credit: Imputed missing values in column RV전환가능여부
credit_train_df is created with shape: (48000, 42)
Prefix sales: Imputed missing values in column _1순위업종
Prefix sales: Imputed missing values in column _2순위업종
Prefix sales: Imputed missing values in column _3순위업종
Prefix sales: Imputed missing values in column _1순위쇼핑업종
Prefix sales: Imputed missing values in column _2순위쇼핑업종
Prefix sales: Imputed missing values in column _3순위쇼핑업종
Prefix sales: Impute

(         기준년월            ID  남녀구분코드   연령 Segment  회원여부_이용가능  회원여부_이용가능_CA  \
 0      201807  TRAIN_000000       2  40대       D          1             1   
 1      201807  TRAIN_000001       1  30대       E          1             1   
 2      201807  TRAIN_000002       1  30대       C          1             1   
 3      201807  TRAIN_000003       2  40대       D          1             1   
 4      201807  TRAIN_000004       2  40대       E          1             1   
 ...       ...           ...     ...  ...     ...        ...           ...   
 47995  201812  TRAIN_007995       2  30대       E          1             1   
 47996  201812  TRAIN_007996       2  40대       E          1             1   
 47997  201812  TRAIN_007997       2  40대       E          1             1   
 47998  201812  TRAIN_007998       2  40대       E          1             1   
 47999  201812  TRAIN_007999       1  50대       E          1             1   
 
        회원여부_이용가능_카드론  소지여부_신용  소지카드수_유효_신용  ...  변동률_RV일시불평잔 

In [11]:
# Fetch the merged training frame that data_preprocessing stored under 'train'.
df = ml.loaded_data['train']

In [12]:
# Write the merged training frame to CSV; index=False is not passed, so the
# row index is written as the first column.
df.to_csv('big_sample_data.csv')

In [5]:
# Show how many rows each value of the target column 'Segment' has.
df['Segment'].value_counts()

Segment
E    1922052
D     349242
C     127590
A        972
B        144
Name: count, dtype: int64

In [8]:
# Feature-selection run on 1/100 of each monthly file: preprocess, encode,
# then rank features with the RandomForest strategy.
ml = ML()
ml.data_load()
train_df, test_df = ml.data_preprocessing(
    select_features=False,
    slice_n=100,
    NA_ratio=0.2,
)
X, y_encoded, X_test = ml.data_encoding(train_df, test_df)
ml.select_features(
    X=X,
    y_encoded=y_encoded,
    X_test=X_test,
    top_n=500,
    corr_threshold=0.9,
    selection_method='rf',
)
# Training step, disabled for this run:
# ml.train_model(X=X, y_encoded=y_encoded, X_test=X_test,
#                method=1, test_df=test_df,
#                file_name='xgboost_test_scoreFix')

데이터 로드 완료
Prefix customer: Dropped columns due to high NA ratio: ['_2순위신용체크구분', '최종유효년월_신용_이용']
Prefix customer: Imputed missing values in column 가입통신회사코드
Prefix customer: Imputed missing values in column 직장시도명
Prefix customer: Imputed missing values in column _1순위신용체크구분
Prefix customer: Imputed missing values in column 최종유효년월_신용_이용가능
Prefix customer: Imputed missing values in column 최종카드발급일자
customer_train_df is created with shape: (24000, 76)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_dfs[f"{prefix}_train_df"][col].fillna(train_dfs[f"{prefix}_train_df"][col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_dfs[f"{prefix}_train_df"][col].fillna(train_dfs[f"{prefix}_train_df"][col].median(), inplace=True)


Prefix credit: Dropped columns due to high NA ratio: ['RV신청일자']
Prefix credit: Imputed missing values in column RV전환가능여부
credit_train_df is created with shape: (24000, 41)
Prefix sales: Dropped columns due to high NA ratio: ['_1순위업종', '_2순위업종', '_3순위업종', '_1순위쇼핑업종', '_2순위쇼핑업종', '_3순위쇼핑업종', '_1순위교통업종', '_2순위교통업종', '_3순위교통업종', '_1순위여유업종', '_2순위여유업종', '_3순위여유업종', '_1순위납부업종', '_2순위납부업종', '_3순위납부업종', '최종카드론_금융상환방식코드', '최종카드론_신청경로코드', '최종카드론_대출일자']
sales_train_df is created with shape: (24000, 388)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_dfs[f"{prefix}_train_df"][col].fillna(train_dfs[f"{prefix}_train_df"][col].mode()[0], inplace=True)


billing_train_df is created with shape: (24000, 46)
Prefix balance: Dropped columns due to high NA ratio: ['연체일자_B0M']
balance_train_df is created with shape: (24000, 81)
Prefix channel: Dropped columns due to high NA ratio: ['OS구분코드']
channel_train_df is created with shape: (24000, 104)
marketing_train_df is created with shape: (24000, 64)
Prefix performance: Dropped columns due to high NA ratio: ['혜택수혜율_B0M']
Prefix performance: Imputed missing values in column 혜택수혜율_R3M
performance_train_df is created with shape: (24000, 48)
Train Step1 merged, shape: (24000, 115)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_dfs[f"{prefix}_train_df"][col].fillna(train_dfs[f"{prefix}_train_df"][col].median(), inplace=True)


Step2 merge completed, shape: (24000, 501)
Step3 merge completed, shape: (24000, 545)
Step4 merge completed, shape: (24000, 624)
Step5 merge completed, shape: (24000, 726)
Step6 merge completed, shape: (24000, 788)
Final merge completed, shape: (24000, 834)
customer_test_df is created with shape: (6000, 75)
credit_test_df is created with shape: (6000, 41)
sales_test_df is created with shape: (6000, 388)
billing_test_df is created with shape: (6000, 46)
balance_test_df is created with shape: (6000, 81)
channel_test_df is created with shape: (6000, 104)
marketing_test_df is created with shape: (6000, 64)
performance_test_df is created with shape: (6000, 48)
Test Step1 merged, shape: (6000, 114)
Step2 merge completed in test data, shape: (6000, 500)
Step3 merge completed in test data, shape: (6000, 544)
Step4 merge completed in test data, shape: (6000, 623)
Step5 merge completed in test data, shape: (6000, 725)
Step6 merge completed in test data, shape: (6000, 787)
Final merge completed i

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].median(), inplace=True)


전체 feature importance:
              feature  importance
474        정상청구원금_B5M    0.030151
41      이용금액_R3M_신용체크    0.026018
466        정상청구원금_B0M    0.025370
378     이용금액_오프라인_B0M    0.016503
507           청구금액_B0    0.016002
..                ...         ...
680     IB문의건수_CA_B0M    0.000000
681  IB문의건수_CL_RV_B0M    0.000000
682     IB문의건수_CS_B0M    0.000000
683    IB상담건수_VOC_B0M    0.000000
696   IB문의건수_비밀번호_R6M    0.000000

[832 rows x 2 columns]

최종 선택된 피처들 (상관관계 필터 적용):
['정상청구원금_B5M', '이용금액_R3M_신용체크', '정상청구원금_B0M', '이용금액_오프라인_B0M', '청구금액_B0', '이용금액_일시불_R12M', '입회일자_신용', '연속유실적개월수_기본_24M_카드', '최대이용금액_일시불_R12M', '청구금액_R6M', '이용금액대', '월중평잔_일시불_B0M', '카드이용한도금액', '정상입금원금_B5M', '정상입금원금_B2M', '최종카드발급일자', '평잔_3M', '이용금액_체크_R12M', '월상환론한도금액', '최대이용금액_체크_R12M', '이용금액_오프라인_R6M', '이용건수_신용_R12M', '최대이용금액_CA_R12M', '_1순위업종_이용금액', '_1순위교통업종_이용금액', '이용건수_오프라인_R6M', '_3순위업종_이용금액', '이용금액_페이_온라인_R6M', '최종이용일자_CA', 'CL이자율_할인전', '연체입금원금_B5M', '최종유효년월_신용_이용가능', '최종이용일자_할부', '_1순위쇼핑업종_이용금액', '_3순위쇼핑업종_

In [2]:
# Training run on 1/5 of each monthly file, restricted to the columns listed
# in selected_features.csv; trains XGBoost (method=1) and writes xgboost.csv.
ml = ML()
ml.data_load()
train_df, test_df = ml.data_preprocessing(
    select_features=True,
    slice_n=5,
    NA_ratio=0.2,
)
X, y_encoded, X_test = ml.data_encoding(train_df, test_df)
ml.train_model(
    X=X,
    y_encoded=y_encoded,
    X_test=X_test,
    method=1,
    test_df=test_df,
    file_name='xgboost',
)

데이터 로드 완료
Prefix customer: Dropped columns due to high NA ratio: ['_2순위신용체크구분', '최종유효년월_신용_이용']
Prefix customer: Imputed missing values in column 가입통신회사코드
Prefix customer: Imputed missing values in column 직장시도명
Prefix customer: Imputed missing values in column _1순위신용체크구분
Prefix customer: Imputed missing values in column 최종유효년월_신용_이용가능
Prefix customer: Imputed missing values in column 최종카드발급일자
customer_train_df is created with shape: (480000, 76)
Prefix credit: Dropped columns due to high NA ratio: ['RV신청일자']
Prefix credit: Imputed missing values in column RV전환가능여부
credit_train_df is created with shape: (480000, 41)
Prefix sales: Dropped columns due to high NA ratio: ['_1순위업종', '_2순위업종', '_3순위업종', '_1순위쇼핑업종', '_2순위쇼핑업종', '_3순위쇼핑업종', '_1순위교통업종', '_2순위교통업종', '_3순위교통업종', '_1순위여유업종', '_2순위여유업종', '_3순위여유업종', '_1순위납부업종', '_2순위납부업종', '_3순위납부업종', '최종카드론_금융상환방식코드', '최종카드론_신청경로코드', '최종카드론_대출일자']
sales_train_df is created with shape: (480000, 388)
billing_train_df is created with shape: (480000, 4

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}


Parameters: { "use_label_encoder" } are not used.




Validation Performance Metrics:
Recall:    0.7454
Precision: 0.9461
F1 Score:  0.8040

Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.84      0.91        68
           1       1.00      0.29      0.44         7
           2       0.90      0.81      0.85      7668
           3       0.86      0.81      0.83     20965
           4       0.97      0.98      0.97    115292

    accuracy                           0.95    144000
   macro avg       0.95      0.75      0.80    144000
weighted avg       0.95      0.95      0.95    144000



Parameters: { "use_label_encoder" } are not used.



Submission CSV created: xgboost.csv
