In [None]:
import pandas as pd
import numpy as np
import warnings
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

from utils.reduce_memory import trainform_columns_type
import xgboost as xgb
warnings.filterwarnings('ignore')

### 단일 값 가지는 컬럼 & NULL 값 가지는 컬럼 모두 제거.

In [2]:
train_df = pd.read_parquet('./data/train/train_filtered.parquet')
test_df = pd.read_parquet('./data/test/test_filtered.parquet')

# reduce memory by chaing data types of columns
train_df = trainform_columns_type(train_df)
test_df = trainform_columns_type(test_df)

print('shape of train dataset(drop colums)', train_df.shape)
print('shape of test dataset(drop colums)', test_df.shape)

shape of train dataset(drop colums) (2400000, 743)
shape of test dataset(drop colums) (600000, 742)


In [3]:
# remove columns to have null values
null_cols = train_df.columns[train_df.isnull().any()]
train_df = train_df.drop(columns=null_cols)          
test_df = test_df.drop(columns=null_cols, errors='ignore')
print('shape of train dataset(drop colums to have null values)', train_df.shape)
print('shape of test dataset(drop colums to have null values)', test_df.shape)

shape of train dataset(drop colums to have null values) (2400000, 712)
shape of test dataset(drop colums to have null values) (600000, 711)


In [4]:
train_df.dtypes

기준년월                int32
ID                 object
남녀구분코드              int32
연령                 object
Segment            object
                   ...   
변동률_RVCA평잔        float32
변동률_카드론평잔         float32
변동률_잔액_B1M        float32
변동률_잔액_일시불_B1M    float32
변동률_잔액_CA_B1M     float32
Length: 712, dtype: object

## Split X, y in train datasets 분리

In [5]:
# X: ID, Segement 제거
# y: Segment 추출
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]

X = train_df[feature_cols].copy()
y = train_df["Segment"].copy()

# Target Label Encoding
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

categorical_features = X.select_dtypes(include=['object']).columns.tolist()

X_test = test_df.copy()

encoders = {}  # 각 컬럼별 encoder 저장

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train
    unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels_val:
        le_train.classes_ = np.append(le_train.classes_, list(unseen_labels_val))
    X_test[col] = le_train.transform(X_test[col])

In [7]:
gc.collect()

0

In [10]:
try:
    model = xgb.XGBClassifier(
        tree_method='gpu_hist',  # GPU 모드 설정
        gpu_id=0,
        random_state=42
    )
    print("GPU 사용 가능: gpu_hist 모드 적용")
    model.fit(X, y_encoded)
    
except Exception:
    model = xgb.XGBClassifier(
        random_state=42
    )
    print("GPU 사용 불가: CPU 모드 적용")
    model.fit(X, y_encoded)

GPU 사용 가능: gpu_hist 모드 적용



    E.g. tree_method = "hist", device = "cuda"



In [11]:
X_test.drop(columns=['ID'],inplace=True)

In [12]:
# row-level 예측 수행
y_test_pred = model.predict(X_test)
# 예측 결과를 변환
y_test_pred_labels = le_target.inverse_transform(y_test_pred)

# row 단위 예측 결과를 test_data에 추가
test_data = test_df.copy()  # 원본 유지
test_data["pred_label"] = y_test_pred_labels


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [13]:
y_test_pred_labels

array(['E', 'E', 'D', ..., 'E', 'C', 'E'], dtype=object)

In [14]:
submission = test_data.groupby("ID")["pred_label"] \
    .agg(lambda x: x.value_counts().idxmax()) \
    .reset_index()

submission.columns = ["ID", "Segment"]

In [15]:
submission

Unnamed: 0,ID,Segment
0,TEST_00000,E
1,TEST_00001,E
2,TEST_00002,E
3,TEST_00003,E
4,TEST_00004,E
...,...,...
99995,TEST_99995,E
99996,TEST_99996,E
99997,TEST_99997,E
99998,TEST_99998,C


In [16]:
submission.to_csv('./results/drop_balance_info_datasets(0327).csv',index=False)

In [17]:
X_test.dtypes

기준년월                int32
남녀구분코드              int32
연령                  int32
회원여부_이용가능           int32
회원여부_이용가능_CA        int32
                   ...   
변동률_RVCA평잔        float32
변동률_카드론평잔         float32
변동률_잔액_B1M        float32
변동률_잔액_일시불_B1M    float32
변동률_잔액_CA_B1M     float32
Length: 641, dtype: object