In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# 데이터
df = pd.read_csv('df.csv')

In [40]:
df['대분류'].unique()

array(['자동차', '광물/토사석', '회원권', '미분류기타', '컴퓨터/전기/통신기계', '건축자재및기계',
       '농/임/축산용기계', '선박', '이륜차', '재활용품', '기타권리및증권', '측정/실험및의료장비', '어업용기계',
       '기타기계기구', '산업위생', '공구/제조부품/물품취급장비', '인쇄,사진및시청각기기', '기타서비스기계/물품',
       '스포츠/레저', '임산물', '기계설비', '사무/가구/가전', '시계/귀금속', '유가증권', '철도',
       '산업기계', '광산기계', '의류/가방/개인용품', '농수축산물', '석유/화학/연료', '폐기물', '중장비부품',
       '교육시설/용품', '예술품/악기', '공공안전/치안장비', '회전기기및경전기', '종이및출판물', 'NPL',
       '식료품/의약품', '피혁/섬유/직물', '중기', '항공기', '시스템장비등', '판매권및광고권', '무형자산'],
      dtype=object)

In [41]:
df = df[df['대분류'] == '자동차']

In [42]:
X = df[['대분류', '중분류', '최저입찰가']]
y = df['이상치']

# 전처리: 범주형 One-Hot, 수치형 표준화
cat_features = ['대분류', '중분류']
num_features = ['최저입찰가']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_features),
        ('num', StandardScaler(), num_features)
    ]
)

In [43]:
number_of_positive = (y == 1).sum()
number_of_negative = (y == 0).sum()

In [44]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb

# 분류 모델 인스턴스
models = {
    'sgd'            : SGDClassifier(loss='log', penalty='elasticnet', class_weight='balanced', random_state=42),
    'random_forest'  : RandomForestClassifier(n_estimators=100,class_weight='balanced',random_state=42),
    'svc'            : SVC(kernel='rbf',probability=True,class_weight='balanced',random_state=42),
    'xgboost'        : xgb.XGBClassifier( n_estimators=100, eval_metric='logloss', scale_pos_weight=(number_of_negative / number_of_positive), random_state=42 ),
    'lightgbm'       : lgb.LGBMClassifier(n_estimators=100,class_weight='balanced',verbose=-1,random_state=42),
    'mlp'            : MLPClassifier(
                           hidden_layer_sizes=(512,256,128,64,32,32,16),
                           activation='relu',
                           solver='adam',
                           learning_rate_init=0.001,
                           alpha=1e-4,
                           early_stopping=True,
                           validation_fraction=0.1,
                           n_iter_no_change=20,
                           max_iter=100000,
                           tol=1e-4,
                           random_state=42
                       ),
}

# 파이프라인 생성 함수
def make_pipeline(selected_models):
    """
    selected_models: str 또는 list[str]
     - str: 단일 모델 키
     - list[str]: VotingClassifier 앙상블할 모델 키들의 리스트
    """
    if isinstance(selected_models, list):
        estimators = [(name, models[name]) for name in selected_models]
        clf = VotingClassifier(estimators=estimators, voting='soft')
        return Pipeline([
            ('preprocess', preprocessor),
            ('voting',    clf)
        ])
    else:
        return Pipeline([
            ('preprocess', preprocessor),
            ('model',      models[selected_models])
        ])

In [45]:
model_name = 'xgboost'
pipeline = make_pipeline(model_name)
print(f"선택된 단일 모델: {model_name}")

선택된 단일 모델: xgboost


In [46]:
model_names = ['xgboost','lightgbm', 'random_forest', 'svc']
pipeline = make_pipeline(model_names)
print(f"선택된 앙상블 모델: {model_names}")

선택된 앙상블 모델: ['xgboost', 'lightgbm', 'random_forest', 'svc']


In [47]:
# 학습/테스트 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# 모델 학습
pipeline.fit(X_train, y_train)

In [50]:
# 예측
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

# 평가
acc = accuracy_score(y_test, y_pred)
cm  = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"Test Accuracy: {acc:.4f}")
print(f"ROC AUC: {roc_auc:.4f}\n")
print("Confusion Matrix:")
print(cm)
print("\nClassification Report:")
print(report)

Test Accuracy: 0.8196
ROC AUC: 0.7769

Confusion Matrix:
[[5291  943]
 [ 299  350]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.85      0.89      6234
           1       0.27      0.54      0.36       649

    accuracy                           0.82      6883
   macro avg       0.61      0.69      0.63      6883
weighted avg       0.88      0.82      0.84      6883

