In [2]:
import pandas as pd
import numpy as np
import warnings
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from utils.reduce_memory import trainform_columns_type
import xgboost as xgb
warnings.filterwarnings('ignore')

### 단일 값 가지는 컬럼 & NULL 값 가지는 컬럼 모두 제거.

In [3]:
# Load each parquet split and pass it straight through the project helper
# `trainform_columns_type` (imported from utils.reduce_memory), which is used
# here to shrink memory by changing column dtypes.
train_df = trainform_columns_type(
    pd.read_parquet('./data/train/train_filtered.parquet')
)
test_df = trainform_columns_type(
    pd.read_parquet('./data/test/test_filtered.parquet')
)

# Report (rows, columns) of both frames after loading.
train_shape = train_df.shape
test_shape = test_df.shape
print('shape of train dataset(drop colums)', train_shape)
print('shape of test dataset(drop colums)', test_shape)

shape of train dataset(drop colums) (2400000, 743)
shape of test dataset(drop colums) (600000, 742)


In [4]:
# Drop every column that contains at least one missing value in the TRAIN set.
# The same column list is removed from the test set; columns that do not exist
# there (e.g. the target) are silently skipped via errors='ignore'.
null_cols = train_df.columns[train_df.isna().any(axis=0)]
train_df = train_df.drop(null_cols, axis=1)
test_df = test_df.drop(null_cols, axis=1, errors='ignore')

print('shape of train dataset(drop colums to have null values)', train_df.shape)
print('shape of test dataset(drop colums to have null values)', test_df.shape)

shape of train dataset(drop colums to have null values) (2400000, 712)
shape of test dataset(drop colums to have null values) (600000, 711)


In [5]:
# Display the dtype of every column that survived the null-column filter.
train_df.dtypes

기준년월                int32
ID                 object
남녀구분코드              int32
연령                 object
Segment            object
                   ...   
변동률_RVCA평잔        float32
변동률_카드론평잔         float32
변동률_잔액_B1M        float32
변동률_잔액_일시불_B1M    float32
변동률_잔액_CA_B1M     float32
Length: 712, dtype: object

## Split the train dataset into X (features) and y (target)

In [6]:
# Build the model inputs.
# X: every train column except the row identifier (ID) and the target (Segment)
# y: the Segment column
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]

X = train_df[feature_cols].copy()
y = train_df["Segment"].copy()

# Encode the target labels as integers; le_target is kept so predictions can be
# mapped back to the original labels later.
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

# Object-dtype feature columns are treated as categorical and label-encoded.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Work on a copy so test_df keeps its raw values (X_test still contains ID here).
X_test = test_df.copy()

encoders = {}  # fitted encoder for each categorical column

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train

    # Labels that occur only in the test set get new codes after the train codes.
    unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels_val:
        # Sort before appending: set iteration order for strings changes between
        # interpreter runs, which would make the assigned codes non-reproducible.
        # key=str keeps the sort well-defined even if values of mixed type appear.
        le_train.classes_ = np.append(
            le_train.classes_, sorted(unseen_labels_val, key=str)
        )
    X_test[col] = le_train.transform(X_test[col])

In [7]:
# Run a full garbage-collection pass; the cell output is gc.collect()'s return
# value (the number of unreachable objects it found).
gc.collect()

0

In [8]:
from sklearn.utils.class_weight import compute_class_weight

# Compute one 'balanced' weight per encoded target class and expose them as a
# {class_id: weight} lookup.
classes = np.unique(y_encoded)
weights = compute_class_weight('balanced', classes=classes, y=y_encoded)
class_weight_dict = {class_id: weight for class_id, weight in zip(classes, weights)}

In [9]:
# Expand the per-class weights into one weight per training row by looking up
# each row's encoded label.
sample_weights = np.fromiter(
    (class_weight_dict[label] for label in y_encoded),
    dtype=np.float64,
    count=len(y_encoded),
)
sample_weights

array([1.37440514, 0.2497331 , 3.76205032, ..., 3.76205032, 0.2497331 ,
       0.2497331 ])

In [None]:
# 5-fold stratified cross-validation of a class-weighted XGBoost classifier.
# One model per fold is kept in `models` for later ensembling; per-fold metrics
# are collected so their means can be reported at the end.
#
# NOTE(review): the folds are neither shuffled nor grouped by ID. If one ID owns
# several rows (the submission step aggregates test rows per ID), rows of the
# same ID can land in both the train and validation part of a fold. Consider
# StratifiedGroupKFold -- confirm against the data layout.
skf = StratifiedKFold(n_splits=5)

models = []
accuracy_scores = []
recall_scores = []
precision_scores = []
fi_scores = []  # per-fold macro F1 scores
classification_reports = []


for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    # Class-balancing weights for the training rows of this fold only.
    sample_weights_fold = sample_weights[train_idx]
    print('-'*40)
    print(f'stratified Fold {fold + 1}-th XGBoost model training...')

    # XGBoost
    # Sample weights are NOT an estimator parameter: passed to the constructor
    # they are forwarded as an unknown booster kwarg. They belong in fit() only.
    # NOTE(review): tree_method='gpu_hist' / gpu_id / use_label_encoder are
    # deprecated in recent XGBoost releases -- confirm the installed version.
    model = xgb.XGBClassifier(
        tree_method='gpu_hist',  # GPU mode
        gpu_id=0,
        random_state=42,
        use_label_encoder=False)

    # Train on this fold's training rows with the per-row class weights.
    model.fit(X_train, y_train, sample_weight=sample_weights_fold)
    models.append(model)

    # Evaluate on the held-out rows; macro averaging weights every class equally.
    y_val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred, average='macro')
    precision = precision_score(y_val, y_val_pred, average='macro')
    f1 = f1_score(y_val, y_val_pred, average='macro')
    report = classification_report(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
    print(f"Fold {fold + 1} Recall: {recall:.4f}")
    print(f"Fold {fold + 1} Precision: {precision:.4f}")
    print(f"Fold {fold + 1} F1-score: {f1:.4f}")
    # Colon goes before the newline so the report's header row stays aligned.
    print(f"Fold {fold + 1} Classification Report:\n{report}")
    print(f"Fold {fold + 1} Confusion Matrix:\n{conf_matrix}")
    print('-'*40)

    accuracy_scores.append(accuracy)
    recall_scores.append(recall)
    precision_scores.append(precision)
    fi_scores.append(f1)
    classification_reports.append(report)


print(f"Stratified-K-Fold mean Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Stratified-K-Fold mean Recall: {np.mean(recall_scores):.4f}")
print(f"Stratified-K-Fold mean Precision: {np.mean(precision_scores):.4f}")
print(f"Stratified-K-Fold mean F1-score: {np.mean(fi_scores):.4f}")

----------------------------------------
stratified Fold 1-th XGBoost model training...
Fold 1 Accuracy: 0.8861
Fold 1 Recall: 0.8876
Fold 1 Precision: 0.7860
Fold 1 F1-score: 0.8281
Fold 1 Classification Report
:              precision    recall  f1-score   support

           0       0.82      0.95      0.88       195
           1       0.89      0.89      0.89        28
           2       0.63      0.89      0.73     25518
           3       0.61      0.80      0.69     69849
           4       0.99      0.90      0.94    384410

    accuracy                           0.89    480000
   macro avg       0.79      0.89      0.83    480000
weighted avg       0.91      0.89      0.89    480000

Fold 1 Confusion Matrix:
[[   186      2      7      0      0]
 [     0     25      3      0      0]
 [    33      1  22707   2545    232]
 [     3      0   9607  55858   4381]
 [     5      0   3952  33901 346552]]
----------------------------------------
----------------------------------------


In [11]:
# Remove the row identifier so X_test has the same feature columns the models
# were trained on. Reassigning with errors='ignore' (instead of an in-place
# drop) keeps this cell idempotent: re-running it no longer raises KeyError.
X_test = X_test.drop(columns=['ID'], errors='ignore')

In [12]:
# Soft voting: accumulate each fold model's class-probability matrix, average
# over the models, then take the most probable class for every test row.
n_classes = models[0].n_classes_
test_probabilities = np.zeros((X_test.shape[0], n_classes))  # (n_rows, n_classes)

for model in models:
    test_probabilities = test_probabilities + model.predict_proba(X_test)

test_probabilities = test_probabilities / len(models)
test_predictions = test_probabilities.argmax(axis=1)

print('Soft Voting Inference Done.')

Soft Voting Inference Done.


In [13]:
# Translate the integer class predictions back into the original Segment labels.
y_test_pred_labels = le_target.inverse_transform(test_predictions)

# Attach the row-level predictions to a copy so test_df itself stays unchanged.
test_data = test_df.copy()
test_data["pred_label"] = y_test_pred_labels

# Collapse to one row per ID by taking the most frequent predicted label among
# that ID's rows, then name the result column "Segment".
votes_by_id = test_data.groupby("ID")["pred_label"]
submission = votes_by_id.agg(lambda labels: labels.value_counts().idxmax())
submission = submission.reset_index().rename(columns={"pred_label": "Segment"})

In [14]:
# Display the submission frame: one row per ID with its predicted Segment.
submission

Unnamed: 0,ID,Segment
0,TEST_00000,D
1,TEST_00001,D
2,TEST_00002,D
3,TEST_00003,E
4,TEST_00004,E
...,...,...
99995,TEST_99995,E
99996,TEST_99996,E
99997,TEST_99997,E
99998,TEST_99998,C


In [15]:
# Write the submission to CSV without the DataFrame index column.
# NOTE(review): assumes the ./results directory already exists -- confirm.
submission.to_csv('./results/0327_xgboost_Stratified_5fold(class-weight).csv',index=False)