In [1]:
import pandas as pd
import numpy as np
import warnings
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from utils.reduce_memory import trainform_columns_type


import xgboost as xgb
warnings.filterwarnings('ignore')

from sklearn.utils import resample

### 단일 값 가지는 컬럼 & NULL 값 가지는 컬럼 모두 제거.

In [2]:
train_df = pd.read_parquet('./data/train/train_filtered.parquet')
test_df = pd.read_parquet('./data/test/test_filtered.parquet')

# reduce memory by chaing data types of columns
train_df = trainform_columns_type(train_df)
test_df = trainform_columns_type(test_df)

print('shape of train dataset(drop colums)', train_df.shape)
print('shape of test dataset(drop colums)', test_df.shape)

shape of train dataset(drop colums) (2400000, 743)
shape of test dataset(drop colums) (600000, 742)


In [3]:
train_df

Unnamed: 0,기준년월,ID,남녀구분코드,연령,Segment,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,...,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M
0,201807,TRAIN_000000,2,40대,D,1,1,0,1,1,...,0.999998,1.042805,0.999700,0.999998,0.999998,0.261886,0.270752,0.000000,1.044401,1.280542
1,201807,TRAIN_000001,1,30대,E,1,1,1,1,1,...,1.092698,0.905663,0.999998,0.999998,0.999998,-0.563388,-0.670348,0.000000,0.000000,0.000000
2,201807,TRAIN_000002,1,30대,C,1,1,0,1,1,...,1.006124,1.993590,0.852567,0.999998,0.999998,-0.046516,0.058114,-0.014191,0.524159,1.208420
3,201807,TRAIN_000003,2,40대,D,1,1,0,1,2,...,0.999998,1.050646,0.999877,0.999998,0.999998,0.023821,0.258943,0.000000,0.880925,1.657124
4,201807,TRAIN_000004,2,40대,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,2,70대이상,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,
2399996,201812,TRAIN_399996,2,50대,D,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.921733,-0.203251,-0.159143,0.000000,1.377071,2.533815
2399997,201812,TRAIN_399997,1,30대,C,1,1,0,1,1,...,0.999998,0.345027,0.999998,0.999998,0.999998,0.027319,0.126581,0.000000,0.000000,0.000000
2399998,201812,TRAIN_399998,1,40대,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,


In [4]:
# remove columns to have null values
null_cols = train_df.columns[train_df.isnull().any()]
train_df = train_df.drop(columns=null_cols)          
test_df = test_df.drop(columns=null_cols, errors='ignore')
print('shape of train dataset(drop colums to have null values)', train_df.shape)
print('shape of test dataset(drop colums to have null values)', test_df.shape)

shape of train dataset(drop colums to have null values) (2400000, 712)
shape of test dataset(drop colums to have null values) (600000, 711)


In [5]:
train_df.dtypes

기준년월                int32
ID                 object
남녀구분코드              int32
연령                 object
Segment            object
                   ...   
변동률_RVCA평잔        float32
변동률_카드론평잔         float32
변동률_잔액_B1M        float32
변동률_잔액_일시불_B1M    float32
변동률_잔액_CA_B1M     float32
Length: 712, dtype: object

## Split X, y in train datasets 분리

In [6]:
# X: ID, Segement 제거
# y: Segment 추출
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]

X = train_df[feature_cols].copy() 
y = train_df["Segment"].copy() 

# Target Label Encoding
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

categorical_features = X.select_dtypes(include=['object']).columns.tolist()

X_test = test_df.copy()

encoders = {}  # 각 컬럼별 encoder 저장

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train
    unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels_val:
        le_train.classes_ = np.append(le_train.classes_, list(unseen_labels_val))
    X_test[col] = le_train.transform(X_test[col])

In [7]:
gc.collect()

0

In [16]:

classes = np.unique(y_encoded)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_encoded)
class_weight_dict = dict(zip(classes, weights))

In [17]:
class_weight_dict

{0: 493.82716049382714,
 1: 3333.3333333333335,
 2: 3.7620503174229953,
 3: 1.3744051402752246,
 4: 0.2497330977517778}

In [20]:
sample_weights = np.array([class_weight_dict[y] for y in y_encoded])
sample_weights

array([1.37440514, 3.76205032, 3.76205032, ..., 3.76205032, 3.76205032,
       3.76205032])

In [21]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = [] 
accuracy_scores = []
recall_scores = []
precision_scores = []
fi_scores = []
classification_reports = []


for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    sample_weights_fold = sample_weights[train_idx]
    print('-'*40)
    print(f'Fold {fold + 1}-th XGBoost model training...')
    
    # XGBoost
    model = xgb.XGBClassifier(
        tree_method='gpu_hist',  # GPU mode
        gpu_id=0,
        random_state=42,
        sample_weight=sample_weights_fold,
        use_label_encoder=False)
    
    # training and validation mornitoring
    model.fit(X_train, y_train, 
              sample_weight=sample_weights[train_idx]
              )
    models.append(model)

    y_val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred, average='macro')
    precision = precision_score(y_val, y_val_pred, average='macro')
    f1 = f1_score(y_val, y_val_pred, average='macro')
    report = classification_report(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)
    print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
    print(f"Fold {fold + 1} Recall: {recall:.4f}")
    print(f"Fold {fold + 1} Precision: {precision:.4f}")
    print(f"Fold {fold + 1} F1-score: {f1:.4f}")
    print(f"Fold {fold + 1} Classification Report\n:{report}")
    print(f"Fold {fold + 1} Confusion Matrix:\n{conf_matrix}")
    print('-'*40)
    
    accuracy_scores.append(accuracy)
    recall_scores.append(recall)
    precision_scores.append(precision)
    fi_scores.append(f1)
    
    
print(f"K-Fold mean Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"K-Fold mean Recall: {np.mean(recall_scores):.4f}")
print(f"K-Fold mean Precision: {np.mean(precision_scores):.4f}")
print(f"K-Fold mean F1-score: {np.mean(fi_scores):.4f}")

----------------------------------------
Fold 1-th XGBoost model training...
Fold 1 Accuracy: 0.9080
Fold 1 Recall: 0.8265
Fold 1 Precision: 0.8378
Fold 1 F1-score: 0.8194
Fold 1 Classification Report
:              precision    recall  f1-score   support

           0       0.76      0.92      0.83       201
           1       0.95      0.91      0.93        45
           2       0.68      0.81      0.74     25532
           3       0.87      0.50      0.64     69969
           4       0.93      0.99      0.96    384253

    accuracy                           0.91    480000
   macro avg       0.84      0.83      0.82    480000
weighted avg       0.91      0.91      0.90    480000

Fold 1 Confusion Matrix:
[[   185      0     16      0      0]
 [     0     41      4      0      0]
 [    45      2  20658   2408   2419]
 [     9      0   8273  35234  26453]
 [     4      0   1646   2875 379728]]
----------------------------------------
----------------------------------------
Fold 2-th X

In [22]:
X_test.drop(columns=['ID'],inplace=True)

KeyError: "['ID'] not found in axis"

In [23]:
n_classes = models[0].n_classes_ 
test_probabilities = np.zeros((len(X_test), n_classes)) # (600000, 5)

for model in models:
    test_probabilities += model.predict_proba(X_test) # (600000, 5)

test_probabilities /= len(models)
test_predictions = np.argmax(test_probabilities, axis=1)

print('Soft Voting Inference Done.')

Soft Voting Inference Done.


In [24]:
y_test_pred_labels = le_target.inverse_transform(test_predictions)

# row 단위 예측 결과를 test_data에 추가
test_data = test_df.copy()  # 원본 유지
test_data["pred_label"] = y_test_pred_labels

submission = test_data.groupby("ID")["pred_label"] \
    .agg(lambda x: x.value_counts().idxmax()) \
    .reset_index()

submission.columns = ["ID", "Segment"]

In [25]:
submission

Unnamed: 0,ID,Segment
0,TEST_00000,E
1,TEST_00001,E
2,TEST_00002,E
3,TEST_00003,E
4,TEST_00004,E
...,...,...
99995,TEST_99995,E
99996,TEST_99996,E
99997,TEST_99997,E
99998,TEST_99998,C


In [26]:
submission.to_csv('./results/xgboost_K5fold(class_weight02).csv',index=False)