In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble            import RandomForestClassifier
from sklearn.impute              import SimpleImputer
from sklearn.metrics             import confusion_matrix, classification_report
from sklearn.model_selection     import train_test_split, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.pipeline            import Pipeline
from sklearn.preprocessing       import StandardScaler
from xgboost                     import XGBClassifier

# Suppress warning messages.
# NOTE(review): with no category/module filter this silences every warning for
# the rest of the session, including deprecation and convergence warnings from
# the modelling libraries — consider narrowing it once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')  

In [13]:
# Load the feature table; the label column ('Segment') and the row identifier
# ('ID') are both kept out of the predictor matrix.
df = pd.read_csv("final_features.csv")
y = df['Segment']
X = df.drop(columns=['Segment', 'ID'])

# Binary target for the first stage: 1 where the segment is 'E', else 0.
y_e = y.eq('E').astype(int)

# 80/20 train/validation split, stratified on the binary E flag so the E share
# is the same on both sides. The original multi-class label is split alongside
# so later stages can recover A/B/C/D for the same rows.
(X_train, X_val,
 y_train_e, y_val_e,
 y_train, y_val) = train_test_split(
    X, y_e, y,
    test_size=0.2,
    stratify=y_e,
    random_state=42,
)

# Standardise features: statistics are learned on the training rows only and
# then applied unchanged to the validation rows.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s   = scaler.transform(X_val)

In [None]:
# Hyper-parameter search for the E vs not-E random forest.
# GridSearchCV and RandomForestClassifier are imported in the notebook's top
# import cell, so this cell runs on a fresh kernel (Restart & Run All) instead
# of relying on names left over from an earlier session.
param_grid = {
    'max_depth': [5, 10, 15, 20, None],  # None lets each tree grow without a depth limit
    'n_estimators': [100],
}
model = RandomForestClassifier(random_state=42)
grid = GridSearchCV(model, param_grid, cv=3, scoring='f1_macro', verbose=2, n_jobs=-1)

# Fit against the binary (E vs not-E) target.
grid.fit(X_train_s, y_train_e)

# Report the depth that achieved the best cross-validated macro-F1.
print("최적 max_depth:", grid.best_params_['max_depth'])

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [None]:
# 1. Model 1: E vs not-E.
#
# The original call passed eval_metric / verbosity / tree_method / predictor
# (XGBoost options) and verbose=-1 to RandomForestClassifier. scikit-learn's
# constructor takes no such keyword arguments and raises TypeError, so only
# forest parameters are passed here. n_jobs=-1 spreads tree building over all
# CPU cores (the forest has no GPU backend); it does not change the fitted
# model for a fixed random_state.
model1 = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train on the binary target.
model1.fit(X_train_s, y_train_e)

# Predict E vs not-E on the validation set.
y1_pred = model1.predict(X_val_s)

print("=== 모델1: E vs not-E ===")
print(confusion_matrix(y_val_e, y1_pred))
# labels=[0, 1] keeps the report aligned with target_names even if one class
# happens to be absent from both y_true and y_pred.
print(classification_report(y_val_e, y1_pred, labels=[0, 1], target_names=['not-E','E']))

# Validation frame (scaled features) carrying the true segment and the
# stage-1 prediction; later stages filter on these columns.
df_val = pd.DataFrame(X_val_s, columns=X.columns)
df_val['true_seg'] = y_val.values
df_val['pred1']    = y1_pred

In [None]:
# 2. Model 2: A/B vs C/D

# Validation rows that model 1 predicted as not-E (pred1 == 0).
df_val_notE = df_val[df_val['pred1'] == 0].reset_index(drop=True)
X_val_notE  = df_val_notE[X.columns]

# Ground truth for this stage: A or B -> 1, anything else -> 0.
# Rows whose true segment is E but were predicted not-E also land in class 0.
y2_true = df_val_notE['true_seg'].isin(['A','B']).astype(int)

# Training rows that are truly not-E, with the A/B vs C/D target.
mask_train_notE = (y_train_e == 0)
X_train_notE    = X_train_s[mask_train_notE.to_numpy()]
y_train_notE    = y_train[mask_train_notE].isin(['A','B']).astype(int)

# Random forest classifier. The XGBoost-only keyword arguments from the
# original call (eval_metric, verbosity, tree_method, predictor) and
# verbose=-1 are not accepted by RandomForestClassifier and raised TypeError.
model2 = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train A/B vs C/D.
model2.fit(X_train_notE, y_train_notE)

# Predict on the validation subset. The model was fitted on a plain ndarray,
# so pass an ndarray here too instead of a DataFrame with column names.
y2_pred = model2.predict(X_val_notE.to_numpy())

print("\n=== 모델2: A/B vs C/D ===")
print(confusion_matrix(y2_true, y2_pred))
print(classification_report(y2_true, y2_pred, labels=[0, 1], target_names=['C/D','A/B']))

# Keep the stage-2 prediction for routing rows to model 3 / model 4.
df_val_notE['pred2'] = y2_pred

### 모델2가 “A/B”로 예측한 6개 샘플에 대해서만 모델3을 돌려본 것

In [None]:
# Model 3: A vs B

# Validation rows that model 2 predicted as A/B (pred2 == 1).
df_val_AB = df_val_notE[df_val_notE['pred2'] == 1].reset_index(drop=True)
X_val_AB  = df_val_AB[X.columns]

# Ground truth for this stage: B -> 1, anything else -> 0.
y3_true = df_val_AB['true_seg'].eq('B').astype(int)

# Training rows whose true segment is A or B, with the A vs B target.
mask_train_AB = mask_train_notE & (y_train.isin(['A','B']))
X_train_AB    = X_train_s[mask_train_AB.to_numpy()]
y_train_AB    = y_train[mask_train_AB].eq('B').astype(int)

# Random forest classifier. The XGBoost-only keyword arguments from the
# original call (eval_metric, verbosity, tree_method, predictor) and
# verbose=-1 are not accepted by RandomForestClassifier and raised TypeError.
model3 = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train A vs B.
model3.fit(X_train_AB, y_train_AB)

# Predict on the validation subset; ndarray in, matching how the model was fitted.
y3_pred = model3.predict(X_val_AB.to_numpy())

print("\n=== 모델3: A vs B ===")
print(confusion_matrix(y3_true, y3_pred))
print(classification_report(y3_true, y3_pred, labels=[0,1], target_names=['A','B']))

### 진짜 A/B인 37개 샘플에 대해서만 모델3을 돌려본 것

In [None]:
# Standalone look at model 3: keep only rows whose true segment is A or B.
# The rows are taken from df_val_notE, i.e. from the validation rows that
# model 1 predicted as not-E.
df_val_trueAB = df_val_notE.loc[df_val_notE['true_seg'].isin(['A','B'])]

# Feature columns and the A vs B target (B -> 1, A -> 0) for those rows.
X_val_trueAB = df_val_trueAB.loc[:, X.columns]
y3_true_all  = df_val_trueAB['true_seg'].map(lambda seg: 1 if seg == 'B' else 0)

# Score every true A/B row with model 3, regardless of model 2's routing.
y3_pred_all = model3.predict(X_val_trueAB)

print(
    confusion_matrix(y3_true_all, y3_pred_all)
)
print(
    classification_report(
        y3_true_all,
        y3_pred_all,
        labels=[0,1],
        target_names=['A','B'],
    )
)

### 모델2가 “C/D”로 예측한 6개 샘플에 대해서만 모델4를 돌려본 것

In [None]:
# 4. Model 4: C vs D

# Validation rows that model 2 predicted as C/D (pred2 == 0).
df_val_CD = df_val_notE[df_val_notE['pred2'] == 0].reset_index(drop=True)
X_val_CD  = df_val_CD[X.columns]

# Ground truth for this stage: D -> 1, anything else -> 0.
y4_true = df_val_CD['true_seg'].eq('D').astype(int)

# Training rows whose true segment is C or D, with the C vs D target.
mask_train_CD = mask_train_notE & (y_train.isin(['C','D']))
X_train_CD    = X_train_s[mask_train_CD.to_numpy()]
y_train_CD    = y_train[mask_train_CD].eq('D').astype(int)

# Random forest classifier. The XGBoost-only keyword arguments from the
# original call (eval_metric, verbosity, tree_method, predictor) and
# verbose=-1 are not accepted by RandomForestClassifier and raised TypeError.
model4 = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train C vs D.
model4.fit(X_train_CD, y_train_CD)

# Predict on the validation subset; ndarray in, matching how the model was fitted.
y4_pred = model4.predict(X_val_CD.to_numpy())

print("\n=== 모델4: C vs D ===")
print(confusion_matrix(y4_true, y4_pred))
# labels=[0, 1] keeps the report aligned with target_names even if one class
# is absent from this subset.
print(classification_report(y4_true, y4_pred, labels=[0, 1], target_names=['C','D']))

### 진짜 C/D인 샘플에 대해서만 모델4를 돌려본 것

In [None]:
# Standalone look at model 4: keep only rows whose true segment is C or D.
# The rows are taken from df_val_notE, i.e. from the validation rows that
# model 1 predicted as not-E.
df_val_trueCD = (
    df_val_notE
    .loc[df_val_notE['true_seg'].isin(['C','D'])]
    .reset_index(drop=True)
)
X_val_trueCD  = df_val_trueCD.loc[:, X.columns]

# C vs D target (D -> 1, C -> 0).
y4_true_all   = df_val_trueCD['true_seg'].map(lambda seg: 1 if seg == 'D' else 0)

# Score every true C/D row with model 4, regardless of model 2's routing.
y4_pred_all   = model4.predict(X_val_trueCD)

# Report.
print("=== 모델4 standalone: 실제 C/D 전체 평가 ===")
print(
    confusion_matrix(y4_true_all, y4_pred_all)
)
print(
    classification_report(
        y4_true_all,
        y4_pred_all,
        labels=[0,1],
        target_names=['C','D'],
    )
)

### 가중치 있는 버전

In [None]:
# Reload the feature table for the weighted variant; 'Segment' is the label
# and 'ID' is dropped from the predictors along with it.
df = pd.read_csv("final_features.csv")
X, y = df.drop(columns=['Segment', 'ID']), df['Segment']

# Binary E vs not-E target (E -> 1).
y_e = (y == 'E').astype(int)

# Same 20% validation hold-out as the unweighted run: identical seed,
# stratified on the E flag so its share is preserved in both parts.
X_train, X_val, y_train_e, y_val_e, y_train, y_val = train_test_split(
    X,
    y_e,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y_e,
)

# Fit the scaler on the training rows, then transform both splits with it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s, X_val_s = scaler.transform(X_train), scaler.transform(X_val)

In [None]:
# Depth sweep for the E vs not-E random forest (forest size held at 100 trees).
param_grid = dict(
    max_depth=[5, 10, 15, 20, None],
    n_estimators=[100],
)
model = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Search against the binary target.
grid.fit(X_train_s, y_train_e)

# Show the winning depth.
print("최적 max_depth:", grid.best_params_['max_depth'])

In [None]:
# 1. Model 1 (weighted): E vs not-E
# Weight for the positive class = (#negatives / #positives) in the training set.
n_neg1 = (y_train_e == 0).sum()
n_pos1 = (y_train_e == 1).sum()
w1 = n_neg1 / n_pos1

# Random forest with class weighting.
# scale_pos_weight (like eval_metric / verbosity / tree_method / predictor) is
# an XGBoost argument that RandomForestClassifier rejects with TypeError; the
# forest's way of up-weighting the positive class is class_weight.
model1_w = RandomForestClassifier(
    random_state=42,
    class_weight={0: 1.0, 1: w1},
    n_jobs=-1,
)
# Train and predict.
model1_w.fit(X_train_s, y_train_e)
y1_pred_w = model1_w.predict(X_val_s)

print("=== 모델1 (weighted): E vs not-E ===")
print(confusion_matrix(y_val_e, y1_pred_w))
print(classification_report(y_val_e, y1_pred_w, labels=[0,1], target_names=['not-E','E']))

In [None]:
# Validation frame (scaled features) with the true segment and the weighted
# stage-1 prediction attached.
df_val = pd.DataFrame(X_val_s, columns=X.columns)
df_val['true_seg'] = y_val.values
df_val['pred1']    = y1_pred_w

# 2. Model 2 (weighted): A/B vs C/D
# Validation rows predicted not-E by model 1; target is A or B -> 1, else 0.
df_val_notE = df_val[df_val['pred1']==0].reset_index(drop=True)
X_val_notE  = df_val_notE[X.columns]
y2_true     = df_val_notE['true_seg'].isin(['A','B']).astype(int)

# Training rows that are truly not-E, with the A/B vs C/D target.
mask_train_notE = (y_train_e == 0)
X_train_notE    = X_train_s[mask_train_notE.to_numpy()]
y_train_notE    = y_train[mask_train_notE].isin(['A','B']).astype(int)

# Positive-class weight = #negatives / #positives among the training rows.
n_neg2 = (y_train_notE == 0).sum()
n_pos2 = (y_train_notE == 1).sum()
w2 = n_neg2 / n_pos2

# scale_pos_weight and the other XGBoost-only keyword arguments are not
# accepted by RandomForestClassifier (TypeError); class_weight carries the
# same up-weighting of the positive class.
model2_w = RandomForestClassifier(
    random_state=42,
    class_weight={0: 1.0, 1: w2},
    n_jobs=-1,
)
model2_w.fit(X_train_notE, y_train_notE)
# ndarray in, matching how the model was fitted (no feature names).
y2_pred_w = model2_w.predict(X_val_notE.to_numpy())

print("\n=== 모델2 (weighted): A/B vs C/D ===")
print(confusion_matrix(y2_true, y2_pred_w))
print(classification_report(y2_true, y2_pred_w, labels=[0,1], target_names=['C/D','A/B']))

# Keep the stage-2 prediction for routing rows to model 3 / model 4.
df_val_notE['pred2'] = y2_pred_w

In [None]:
# 3. Model 3 (weighted): A vs B
# Validation rows that model 2 predicted as A/B; target is B -> 1, else 0.
df_val_AB = df_val_notE[df_val_notE['pred2']==1].reset_index(drop=True)
X_val_AB  = df_val_AB[X.columns]
y3_true   = df_val_AB['true_seg'].eq('B').astype(int)

# Training rows whose true segment is A or B, with the A vs B target.
mask_train_AB = mask_train_notE & (y_train.isin(['A','B']))
X_train_AB    = X_train_s[mask_train_AB.to_numpy()]
y_train_AB    = y_train[mask_train_AB].eq('B').astype(int)

# Positive-class weight = #negatives / #positives among the training rows.
n_neg3 = (y_train_AB == 0).sum()
n_pos3 = (y_train_AB == 1).sum()
w3 = n_neg3 / n_pos3

# scale_pos_weight and the other XGBoost-only keyword arguments are not
# accepted by RandomForestClassifier (TypeError); class_weight carries the
# same up-weighting of the positive class.
model3_w = RandomForestClassifier(
    random_state=42,
    class_weight={0: 1.0, 1: w3},
    n_jobs=-1,
)
model3_w.fit(X_train_AB, y_train_AB)
# ndarray in, matching how the model was fitted (no feature names).
y3_pred_w = model3_w.predict(X_val_AB.to_numpy())

print("\n=== 모델3 (weighted): A vs B ===")
print(confusion_matrix(y3_true, y3_pred_w))
print(classification_report(y3_true, y3_pred_w, labels=[0,1], target_names=['A','B']))

In [None]:
# 4. Model 4 (weighted): C vs D
# Validation rows that model 2 predicted as C/D; target is D -> 1, else 0.
df_val_CD = df_val_notE[df_val_notE['pred2']==0].reset_index(drop=True)
X_val_CD  = df_val_CD[X.columns]
y4_true   = df_val_CD['true_seg'].eq('D').astype(int)

# Training rows whose true segment is C or D, with the C vs D target.
mask_train_CD = mask_train_notE & (y_train.isin(['C','D']))
X_train_CD    = X_train_s[mask_train_CD.to_numpy()]
y_train_CD    = y_train[mask_train_CD].eq('D').astype(int)

# Positive-class weight = #negatives / #positives among the training rows.
n_neg4 = (y_train_CD == 0).sum()
n_pos4 = (y_train_CD == 1).sum()
w4 = n_neg4 / n_pos4

# scale_pos_weight and the other XGBoost-only keyword arguments are not
# accepted by RandomForestClassifier (TypeError); class_weight carries the
# same up-weighting of the positive class.
model4_w = RandomForestClassifier(
    random_state=42,
    class_weight={0: 1.0, 1: w4},
    n_jobs=-1,
)
model4_w.fit(X_train_CD, y_train_CD)
# ndarray in, matching how the model was fitted (no feature names).
y4_pred_w = model4_w.predict(X_val_CD.to_numpy())

print("\n=== 모델4 (weighted): C vs D ===")
print(confusion_matrix(y4_true, y4_pred_w))
print(classification_report(y4_true, y4_pred_w, labels=[0,1], target_names=['C','D']))