### Import

In [1]:
import pandas as pd
import numpy as np
import gc

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

### Data Load

In [None]:
## 데이터의 일부만 사용
TRAIN_DATA_RATIO = 1
np.random.seed(42)  # or any other seed value
customer_first = pd.read_parquet(f"./train/1.회원정보/201807_train_회원정보.parquet")
unique_ids = customer_first['ID'].unique()
del customer_first
sampled_ids = np.random.choice(unique_ids, size=int(len(unique_ids) * TRAIN_DATA_RATIO), replace=False)


# 데이터 분할(폴더) 구분
data_splits = ["train", "test"]

# 각 데이터 유형별 폴더명, 파일 접미사, 변수 접두어 설정
data_categories = {
    "회원정보": {"folder": "1.회원정보", "suffix": "회원정보", "var_prefix": "customer"},
    "신용정보": {"folder": "2.신용정보", "suffix": "신용정보", "var_prefix": "credit"},
    "승인매출정보": {"folder": "3.승인매출정보", "suffix": "승인매출정보", "var_prefix": "sales"},
    "청구정보": {"folder": "4.청구입금정보", "suffix": "청구정보", "var_prefix": "billing"},
    "잔액정보": {"folder": "5.잔액정보", "suffix": "잔액정보", "var_prefix": "balance"},
    "채널정보": {"folder": "6.채널정보", "suffix": "채널정보", "var_prefix": "channel"},
    "마케팅정보": {"folder": "7.마케팅정보", "suffix": "마케팅정보", "var_prefix": "marketing"},
    "성과정보": {"folder": "8.성과정보", "suffix": "성과정보", "var_prefix": "performance"}
}

# 2018년 7월부터 12월까지의 월 리스트
months = ['07', '08', '09', '10', '11', '12']

for split in data_splits:
    for category, info in data_categories.items():
        folder = info["folder"]
        suffix = info["suffix"]
        var_prefix = info["var_prefix"]
        
        for month in months:
            # 파일명 형식: 2018{month}_{split}_{suffix}.parquet
            file_path = f"./{split}/{folder}/2018{month}_{split}_{suffix}.parquet"
            # 변수명 형식: {var_prefix}_{split}_{month}
            variable_name = f"{var_prefix}_{split}_{month}"
            
            ## 훈련 데이터의 일부만 사용
            if split == "train":
                df = pd.read_parquet(file_path)
                df = df[df['ID'].isin(sampled_ids)]
            else:
                df = pd.read_parquet(file_path)

            globals()[variable_name] = df
            print(f"{variable_name} is loaded from {file_path}")

gc.collect()

### Data Preprocessing(1) : Concat & Merge

In [2]:
# 데이터 유형별 설정 
info_categories = ["customer", "credit", "sales", "billing", "balance", "channel", "marketing", "performance"]

# 월 설정
months = ['07', '08', '09', '10', '11', '12']

In [None]:
#### Train ####

# 각 유형별로 월별 데이터를 합쳐서 새로운 변수에 저장
train_dfs = {}

for prefix in info_categories:
    # globals()에서 동적 변수명으로 데이터프레임들을 가져와 리스트에 저장
    df_list = [globals()[f"{prefix}_train_{month}"] for month in months]
    train_dfs[f"{prefix}_train_df"] = pd.concat(df_list, axis=0)
    gc.collect()
    print(f"{prefix}_train_df is created with shape: {train_dfs[f'{prefix}_train_df'].shape}")


customer_train_df = train_dfs["customer_train_df"]
credit_train_df   = train_dfs["credit_train_df"]
sales_train_df    = train_dfs["sales_train_df"]
billing_train_df  = train_dfs["billing_train_df"]
balance_train_df  = train_dfs["balance_train_df"]
channel_train_df  = train_dfs["channel_train_df"]
marketing_train_df= train_dfs["marketing_train_df"]
performance_train_df = train_dfs["performance_train_df"]

gc.collect()

In [None]:
#### Test ####

# test 데이터에 대해 train과 동일한 방법 적용
test_dfs = {}

for prefix in info_categories:
    df_list = [globals()[f"{prefix}_test_{month}"] for month in months]
    test_dfs[f"{prefix}_test_df"] = pd.concat(df_list, axis=0)
    gc.collect()
    print(f"{prefix}_test_df is created with shape: {test_dfs[f'{prefix}_test_df'].shape}")


customer_test_df = test_dfs["customer_test_df"]
credit_test_df   = test_dfs["credit_test_df"]
sales_test_df    = test_dfs["sales_test_df"]
billing_test_df  = test_dfs["billing_test_df"]
balance_test_df  = test_dfs["balance_test_df"]
channel_test_df  = test_dfs["channel_test_df"]
marketing_test_df= test_dfs["marketing_test_df"]
performance_test_df = test_dfs["performance_test_df"]

gc.collect()

In [None]:
#### Train ####

train_df = customer_train_df.merge(credit_train_df, on=['기준년월', 'ID'], how='left')
print("Step1 저장 완료: train_step1, shape:", train_df.shape)
del customer_train_df, credit_train_df
gc.collect()

# 이후 merge할 데이터프레임 이름과 단계 정보를 리스트에 저장
merge_list = [
    ("sales_train_df",    "Step2"),
    ("billing_train_df",  "Step3"),
    ("balance_train_df",  "Step4"),
    ("channel_train_df",  "Step5"),
    ("marketing_train_df","Step6"),
    ("performance_train_df", "최종")
]

# 나머지 단계 merge
for df_name, step in merge_list:
    # globals()로 동적 변수 접근하여 merge 수행
    train_df = train_df.merge(globals()[df_name], on=['기준년월', 'ID'], how='left')
    print(f"{step} 저장 완료: train_{step}, shape:", train_df.shape)
    # 사용한 변수는 메모리 해제를 위해 삭제
    del globals()[df_name]
    gc.collect()

print(train_df.shape)

In [None]:
#### Test ####

test_df = customer_test_df.merge(credit_test_df, on=['기준년월', 'ID'], how='left')
print("Step1 저장 완료: test_step1, shape:", test_df.shape)
del customer_test_df, credit_test_df
gc.collect()

# 이후 merge할 데이터프레임 이름과 단계 정보를 리스트에 저장
merge_list = [
    ("sales_test_df",    "Step2"),
    ("billing_test_df",  "Step3"),
    ("balance_test_df",  "Step4"),
    ("channel_test_df",  "Step5"),
    ("marketing_test_df","Step6"),
    ("performance_test_df", "최종")
]

# 나머지 단계 merge
for df_name, step in merge_list:
    # globals()로 동적 변수 접근하여 merge 수행
    test_df = test_df.merge(globals()[df_name], on=['기준년월', 'ID'], how='left')
    print(f"{step} 저장 완료: test_{step}, shape:", test_df.shape)
    # 사용한 변수는 메모리 해제를 위해 삭제
    del globals()[df_name]
    gc.collect()

print(test_df.shape)

### PCA

#### train

In [None]:
train_df_with_na = train_df.T[train_df.isna().any(axis=0)].T

categorical_cols = train_df.select_dtypes(include=['object']).columns
train_df_categorical = train_df[categorical_cols]

train_df_no_pca = pd.concat([train_df.loc[:, ['Segment']], train_df_with_na,train_df_categorical], axis=1)
train_df_no_pca = train_df_no_pca.loc[:,~train_df_no_pca.columns.duplicated()]
del train_df_with_na, train_df_categorical
gc.collect()

In [None]:
# PCA를 위한 데이터 전처리
# 결측치가 있는 컬럼 제거
train_df = train_df.dropna(axis=1)

# ID와 Segment 컬럼 제외
train_df = train_df.drop(['ID', 'Segment'], axis=1)

# 수치형 데이터만 선택
numeric_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
train_df = train_df[numeric_cols]
gc.collect()

In [12]:
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_df = scaler.fit_transform(train_df)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=300)

pca.fit(train_df)
print("누적 설명된 분산 비율: ", sum(pca.explained_variance_ratio_))

In [None]:
import joblib
joblib.dump(pca, 'pca_model.pkl')

In [15]:
train_df = pca.transform(train_df)
train_df = pd.DataFrame(train_df, columns=[f'PC{i+1}' for i in range(300)])
train_df = pd.concat([train_df_no_pca, train_df], axis=1)
del train_df_no_pca


In [None]:
train_df.to_parquet('cache/train_df_pca.parquet')

#### Load Train Data

In [2]:
# load
import joblib
from sklearn.decomposition import PCA
pca = joblib.load('cache/pca_model.pkl')
train_df = pd.read_parquet('cache/train_df_pca.parquet')

#### test

In [None]:
test_df_with_na = test_df.T[test_df.isna().any(axis=0)].T

categorical_cols = test_df.select_dtypes(include=['object']).columns
test_df_categorical = test_df[categorical_cols]

test_df_no_pca = pd.concat([test_df_with_na,test_df_categorical], axis=1)
test_df_no_pca = test_df_no_pca.loc[:,~test_df_no_pca.columns.duplicated()]
del test_df_with_na, test_df_categorical
gc.collect()

In [None]:
test_df = test_df.dropna(axis=1)
test_df = test_df.drop(['ID'], axis=1)
numeric_cols = test_df.select_dtypes(include=['int64', 'float64']).columns
test_df = test_df[numeric_cols]
gc.collect()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
test_df = scaler.fit_transform(test_df)


In [13]:
test_df = pca.transform(test_df)
test_df = pd.DataFrame(test_df, columns=[f'PC{i+1}' for i in range(300)])
test_df = pd.concat([test_df_no_pca, test_df], axis=1)
del test_df_no_pca

In [14]:
test_df.to_parquet('cache/test_df_pca.parquet')


#### Load Test Data

In [3]:
# test
import joblib
test_df = pd.read_parquet('cache/test_df_pca.parquet')


### Data Preprocessing(2) : Encoding

In [4]:
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]

X = train_df[feature_cols].copy()
y = train_df["Segment"].copy()

# 타깃 라벨 인코딩
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

In [5]:
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

X_test = test_df.copy()

encoders = {}  # 각 컬럼별 encoder 저장

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train
    unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels_val:
        le_train.classes_ = np.append(le_train.classes_, list(unseen_labels_val))
    X_test[col] = le_train.transform(X_test[col])

In [6]:
gc.collect()

0

### Train

In [7]:
try:
    model = xgb.XGBClassifier(
        tree_method='gpu_hist',  # GPU 모드 설정
        gpu_id=0,
        random_state=42
    )
    print("GPU 사용 가능: gpu_hist 모드 적용")
    model.fit(X, y_encoded)
    
except Exception as e:
    print(e)
    model = xgb.XGBClassifier(
        random_state=42
    )
    print("GPU 사용 불가: CPU 모드 적용")
    model.fit(X, y_encoded)

GPU 사용 가능: gpu_hist 모드 적용



    E.g. tree_method = "hist", device = "cuda"



### Predict

In [8]:
X_test.drop(columns=['ID'],inplace=True)

In [9]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Columns: 357 entries, 가입통신회사코드 to PC300
dtypes: float64(309), int32(48)
memory usage: 1.5 GB


In [10]:
# row-level 예측 수행
y_test_pred = model.predict(X_test)
# 예측 결과를 변환
y_test_pred_labels = le_target.inverse_transform(y_test_pred)

# row 단위 예측 결과를 test_data에 추가
test_data = test_df.copy()  # 원본 유지
test_data["pred_label"] = y_test_pred_labels


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




### Submission

In [11]:
submission = test_data.groupby("ID")["pred_label"] \
    .agg(lambda x: x.value_counts().idxmax()) \
    .reset_index()

submission.columns = ["ID", "Segment"]

In [12]:
submission.to_csv('./base_submit.csv',index=False)