In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection      import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.preprocessing       import StandardScaler
from sklearn.impute              import SimpleImputer
from xgboost                     import XGBClassifier
from sklearn.metrics             import confusion_matrix, classification_report
from sklearn.pipeline            import Pipeline

# 경고 메시지 억제
import warnings
warnings.filterwarnings('ignore')  

In [2]:
# 데이터 로드 및 ID 제거
df = pd.read_csv("pca_features_with_segment.csv")
X = df.drop(columns=['Segment'])
y = df['Segment']

# E vs not-E 이진 타깃 생성
y_e = (y == 'E').astype(int)

# 학습/검증 분할 (20% 검증, E 분포 유지)
X_train, X_val, y_train_e, y_val_e, y_train, y_val = train_test_split(
    X, y_e, y, test_size=0.2, random_state=42, stratify=y_e
)

# 스케일러 학습 및 변환
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)

In [3]:
# 1. 모델1: E vs not-E 분류
model1 = XGBClassifier(eval_metric='logloss', random_state=42, verbose=-1, verbosity=0, tree_method='gpu_hist', predictor = 'gpu_predictor')

# E vs not-E
model1.fit(X_train_s, y_train_e)

# 검증셋 E vs not-E
y1_pred = model1.predict(X_val_s)

print("=== 모델1: E vs not-E ===")
print(confusion_matrix(y_val_e, y1_pred))
print(classification_report(y_val_e, y1_pred, target_names=['not-E','E']))

# 검증셋 DataFrame 및 pred1 추가
df_val = pd.DataFrame(X_val_s, columns=X.columns)
df_val['true_seg'] = y_val.values
df_val['pred1']    = y1_pred

=== 모델1: E vs not-E ===
[[10959  4973]
 [ 2771 61297]]
              precision    recall  f1-score   support

       not-E       0.80      0.69      0.74     15932
           E       0.92      0.96      0.94     64068

    accuracy                           0.90     80000
   macro avg       0.86      0.82      0.84     80000
weighted avg       0.90      0.90      0.90     80000



In [4]:
# 2. 모델2: A/B vs C/D 분류

# pred1 == 0 (not-E) 데이터
df_val_notE = df_val[df_val['pred1'] == 0].reset_index(drop=True)
X_val_notE  = df_val_notE[X.columns]

# A/B vs C/D 레이블 (A,B→1 / C,D→0)
y2_true = df_val_notE['true_seg'].apply(lambda s: 1 if s in ['A','B'] else 0)

# 학습셋 not-E 샘플 및 A/B vs C/D 타깃
mask_train_notE = (y_train_e == 0)
X_train_notE    = X_train_s[mask_train_notE]
y_train_notE    = y_train[mask_train_notE].apply(lambda s: 1 if s in ['A','B'] else 0)

# XGBoost 분류기 
model2 = XGBClassifier(eval_metric='logloss', random_state=42, verbose=-1, verbosity=0, tree_method='gpu_hist', predictor = 'gpu_predictor')

# A/B vs C/D
model2.fit(X_train_notE, y_train_notE)

# 검증셋 A/B vs C/D
y2_pred = model2.predict(X_val_notE)

print("\n=== 모델2: A/B vs C/D ===")
print(confusion_matrix(y2_true, y2_pred))
print(classification_report(y2_true, y2_pred, target_names=['C/D','A/B']))

# pred2 컬럼
df_val_notE['pred2'] = y2_pred


=== 모델2: A/B vs C/D ===
[[13691     2]
 [   33     4]]
              precision    recall  f1-score   support

         C/D       1.00      1.00      1.00     13693
         A/B       0.67      0.11      0.19        37

    accuracy                           1.00     13730
   macro avg       0.83      0.55      0.59     13730
weighted avg       1.00      1.00      1.00     13730



#### 모델2가 “A/B”로 예측한 6개 샘플에 대해서만 모델3을 돌려본 것

In [5]:
# 모델3: A vs B 분류
# pred2 == 1 (A/B로 예측된) 데이터
df_val_AB = df_val_notE[df_val_notE['pred2'] == 1].reset_index(drop=True)
X_val_AB  = df_val_AB[X.columns]

# A vs B 레이블 (B→1 / A→0)
y3_true = df_val_AB['true_seg'].apply(lambda s: 1 if s == 'B' else 0)

# 학습셋 A/B 샘플 및 A vs B 타깃
mask_train_AB = mask_train_notE & (y_train.isin(['A','B']))
X_train_AB    = X_train_s[mask_train_AB]
y_train_AB    = y_train[mask_train_AB].apply(lambda s: 1 if s == 'B' else 0)

# XGBoost 분류기 
model3 = XGBClassifier(eval_metric='logloss', random_state=42, verbose=-1, verbosity=0, tree_method='gpu_hist', predictor = 'gpu_predictor')

# A vs B
model3.fit(X_train_AB, y_train_AB)

# 검증셋 A vs B
y3_pred = model3.predict(X_val_AB)

print("\n=== 모델3: A vs B ===")
print(confusion_matrix(y3_true, y3_pred))
print(classification_report(y3_true, y3_pred, labels=[0,1], target_names=['A','B']))


=== 모델3: A vs B ===
[[6]]
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         6
           B       0.00      0.00      0.00         0

    accuracy                           1.00         6
   macro avg       0.50      0.50      0.50         6
weighted avg       1.00      1.00      1.00         6



#### 진짜 A/B인 37개 샘플에 대해서만 모델3을 돌려본 것

In [6]:
# 실제 A/B 전체를 추출
df_val_trueAB = df_val_notE[df_val_notE['true_seg'].isin(['A','B'])]

X_val_trueAB = df_val_trueAB[X.columns]
y3_true_all  = df_val_trueAB['true_seg'].apply(lambda s: 1 if s == 'B' else 0)

# model3에 전체 A/B 데이터로 평가
y3_pred_all = model3.predict(X_val_trueAB)

print(confusion_matrix(y3_true_all, y3_pred_all))
print(classification_report(y3_true_all, y3_pred_all, labels=[0,1], target_names=['A','B']))

[[31  1]
 [ 4  1]]
              precision    recall  f1-score   support

           A       0.89      0.97      0.93        32
           B       0.50      0.20      0.29         5

    accuracy                           0.86        37
   macro avg       0.69      0.58      0.61        37
weighted avg       0.83      0.86      0.84        37



#### 모델2가 “C/D”로 예측한 6개 샘플에 대해서만 모델4를 돌려본 것

In [7]:
# 4. 모델4: C vs D 분류
# pred2 == 0 (C/D로 예측된) 데이터
df_val_CD = df_val_notE[df_val_notE['pred2'] == 0].reset_index(drop=True)
X_val_CD  = df_val_CD[X.columns]

# C vs D 레이블 (D→1 / C→0)
y4_true = df_val_CD['true_seg'].apply(lambda s: 1 if s == 'D' else 0)

# 학습셋 C/D 샘플 및 C vs D 타깃
mask_train_CD = mask_train_notE & (y_train.isin(['C','D']))
X_train_CD    = X_train_s[mask_train_CD]
y_train_CD    = y_train[mask_train_CD].apply(lambda s: 1 if s == 'D' else 0)

# XGBoost 분류기 
model4 = XGBClassifier(eval_metric='logloss', random_state=42, verbose=-1, verbosity=0, tree_method='gpu_hist', predictor = 'gpu_predictor')

# C vs D
model4.fit(X_train_CD, y_train_CD)

# 검증셋 C vs D
y4_pred = model4.predict(X_val_CD)

print("\n=== 모델4: C vs D ===")
print(confusion_matrix(y4_true, y4_pred))
print(classification_report(y4_true, y4_pred, target_names=['C','D']))


=== 모델4: C vs D ===
[[2570 4020]
 [ 797 6337]]
              precision    recall  f1-score   support

           C       0.76      0.39      0.52      6590
           D       0.61      0.89      0.72      7134

    accuracy                           0.65     13724
   macro avg       0.69      0.64      0.62     13724
weighted avg       0.68      0.65      0.62     13724



#### 진짜 C/D 인 샘플에 대해서만 모델4를 돌려본 것

In [8]:
# 실제 C/D 레이블인 모든 검증셋 샘플
df_val_trueCD = df_val_notE[df_val_notE['true_seg'].isin(['C','D'])].reset_index(drop=True)
X_val_trueCD  = df_val_trueCD[X.columns]

# C vs D 레이블 (D→1 / C→0)
y4_true_all   = df_val_trueCD['true_seg'].apply(lambda s: 1 if s == 'D' else 0)

# model4로 전체 C/D 샘플 예측
y4_pred_all   = model4.predict(X_val_trueCD)

# 결과 출력
print("=== 모델4 standalone: 실제 C/D 전체 평가 ===")
print(confusion_matrix(y4_true_all, y4_pred_all))
print(classification_report(y4_true_all, y4_pred_all, labels=[0,1], target_names=['C','D']))

=== 모델4 standalone: 실제 C/D 전체 평가 ===
[[2264 1523]
 [ 798 6337]]
              precision    recall  f1-score   support

           C       0.74      0.60      0.66      3787
           D       0.81      0.89      0.85      7135

    accuracy                           0.79     10922
   macro avg       0.77      0.74      0.75     10922
weighted avg       0.78      0.79      0.78     10922



### 가중치 있는 버전

In [9]:
# 데이터 로드 및 ID 제거
df = pd.read_csv("pca_features_with_segment.csv")
X = df.drop(columns=['Segment'])
y = df['Segment']

# E vs not-E 이진 타깃 생성
y_e = (y == 'E').astype(int)

# 학습/검증 분할 (20% 검증, E 비율 유지)
X_train, X_val, y_train_e, y_val_e, y_train, y_val = train_test_split(
    X, y_e, y, test_size=0.2, random_state=42, stratify=y_e
)

# 스케일러 학습 및 변환
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)

In [10]:
# 1. 모델1: E vs not-E 분류 (가중치 있는)
# pos/neg 비율로 weight 설정
n_neg1 = (y_train_e == 0).sum()
n_pos1 = (y_train_e == 1).sum()
w1 = n_neg1 / n_pos1

# XGBoost with weight
model1_w = XGBClassifier(eval_metric='logloss', random_state=42, scale_pos_weight=w1, verbose=-1, verbosity=0, tree_method='gpu_hist', predictor = 'gpu_predictor')
# 학습·예측
model1_w.fit(X_train_s, y_train_e)
y1_pred_w = model1_w.predict(X_val_s)

print("=== 모델1 (weighted): E vs not-E ===")
print(confusion_matrix(y_val_e, y1_pred_w))
print(classification_report(y_val_e, y1_pred_w, labels=[0,1], target_names=['not-E','E']))

=== 모델1 (weighted): E vs not-E ===
[[13597  2335]
 [ 7551 56517]]
              precision    recall  f1-score   support

       not-E       0.64      0.85      0.73     15932
           E       0.96      0.88      0.92     64068

    accuracy                           0.88     80000
   macro avg       0.80      0.87      0.83     80000
weighted avg       0.90      0.88      0.88     80000



In [11]:
# 검증셋 DataFrame 및 pred1 추가
df_val = pd.DataFrame(X_val_s, columns=X.columns)
df_val['true_seg'] = y_val.values
df_val['pred1']    = y1_pred_w

# 2. 모델2: A/B vs C/D 분류
df_val_notE = df_val[df_val['pred1']==0].reset_index(drop=True)
X_val_notE  = df_val_notE[X.columns]
y2_true     = df_val_notE['true_seg'].apply(lambda s: 1 if s in ['A','B'] else 0)

mask_train_notE = (y_train_e == 0)
X_train_notE    = X_train_s[mask_train_notE]
y_train_notE    = y_train[mask_train_notE].apply(lambda s: 1 if s in ['A','B'] else 0)

n_neg2 = (y_train_notE == 0).sum()
n_pos2 = (y_train_notE == 1).sum()
w2 = n_neg2 / n_pos2

model2_w = XGBClassifier(eval_metric='logloss', random_state=42, scale_pos_weight=w2, verbose=-1, verbosity=0, tree_method='gpu_hist', predictor = 'gpu_predictor')
model2_w.fit(X_train_notE, y_train_notE)
y2_pred_w = model2_w.predict(X_val_notE)

print("\n=== 모델2 (weighted): A/B vs C/D ===")
print(confusion_matrix(y2_true, y2_pred_w))
print(classification_report(y2_true, y2_pred_w, labels=[0,1], target_names=['C/D','A/B']))

df_val_notE['pred2'] = y2_pred_w


=== 모델2 (weighted): A/B vs C/D ===
[[21095    16]
 [   27    10]]
              precision    recall  f1-score   support

         C/D       1.00      1.00      1.00     21111
         A/B       0.38      0.27      0.32        37

    accuracy                           1.00     21148
   macro avg       0.69      0.63      0.66     21148
weighted avg       1.00      1.00      1.00     21148



In [12]:
# 3. 모델3: A vs B 분류
df_val_AB = df_val_notE[df_val_notE['pred2']==1].reset_index(drop=True)
X_val_AB  = df_val_AB[X.columns]
y3_true   = df_val_AB['true_seg'].apply(lambda s: 1 if s=='B' else 0)

mask_train_AB = mask_train_notE & (y_train.isin(['A','B']))
X_train_AB    = X_train_s[mask_train_AB]
y_train_AB    = y_train[mask_train_AB].apply(lambda s: 1 if s=='B' else 0)

n_neg3 = (y_train_AB == 0).sum()
n_pos3 = (y_train_AB == 1).sum()
w3 = n_neg3 / n_pos3

model3_w = XGBClassifier(eval_metric='logloss', random_state=42, scale_pos_weight=w3, verbose=-1, verbosity=0, tree_method='gpu_hist', predictor = 'gpu_predictor')
model3_w.fit(X_train_AB, y_train_AB)
y3_pred_w = model3_w.predict(X_val_AB)

print("\n=== 모델3 (weighted): A vs B ===")
print(confusion_matrix(y3_true, y3_pred_w))
print(classification_report(y3_true, y3_pred_w, labels=[0,1], target_names=['A','B']))


=== 모델3 (weighted): A vs B ===
[[24  1]
 [ 1  0]]
              precision    recall  f1-score   support

           A       0.96      0.96      0.96        25
           B       0.00      0.00      0.00         1

    accuracy                           0.92        26
   macro avg       0.48      0.48      0.48        26
weighted avg       0.92      0.92      0.92        26



In [13]:
# 4. 모델4: C vs D 분류
df_val_CD = df_val_notE[df_val_notE['pred2']==0].reset_index(drop=True)
X_val_CD  = df_val_CD[X.columns]
y4_true   = df_val_CD['true_seg'].apply(lambda s: 1 if s=='D' else 0)

mask_train_CD = mask_train_notE & (y_train.isin(['C','D']))
X_train_CD    = X_train_s[mask_train_CD]
y_train_CD    = y_train[mask_train_CD].apply(lambda s: 1 if s=='D' else 0)

n_neg4 = (y_train_CD == 0).sum()
n_pos4 = (y_train_CD == 1).sum()
w4 = n_neg4 / n_pos4

model4_w = XGBClassifier(eval_metric='logloss',random_state=42, scale_pos_weight=w4, verbose=-1, verbosity=0, tree_method='gpu_hist', predictor = 'gpu_predictor')
model4_w.fit(X_train_CD, y_train_CD)
y4_pred_w = model4_w.predict(X_val_CD)

print("\n=== 모델4 (weighted): C vs D ===")
print(confusion_matrix(y4_true, y4_pred_w))
print(classification_report(y4_true, y4_pred_w, labels=[0,1], target_names=['C','D']))


=== 모델4 (weighted): C vs D ===
[[3633 7988]
 [1652 7849]]
              precision    recall  f1-score   support

           C       0.69      0.31      0.43     11621
           D       0.50      0.83      0.62      9501

    accuracy                           0.54     21122
   macro avg       0.59      0.57      0.52     21122
weighted avg       0.60      0.54      0.52     21122

