In [1]:
# 주요 라이브러리 import
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

In [3]:
# 데이터 로드
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

In [4]:
# demo data 만들기
train_data_demo = train_data.copy()

In [5]:
# 수치형 컬럼과 문자형 컬럼 분리
numeric_cols = train_data_demo.select_dtypes(include=[np.number]).columns
categorical_cols = train_data_demo.select_dtypes(include=['object']).columns

# 수치형 컬럼의 결측치를 평균으로 대체
train_data_demo[numeric_cols] = train_data_demo[numeric_cols].fillna(train_data_demo[numeric_cols].mean())

# 문자형 컬럼의 결측치는 'missing'으로 대체
train_data_demo[categorical_cols] = train_data_demo[categorical_cols].fillna('missing')

# 모든 값이 NaN인 열을 식별하고 삭제
train_data_demo = train_data_demo.dropna(axis=1, how='all')

In [6]:
train_data_demo

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


In [7]:
# 언더샘플링
normal_ratio = 1.0  # 1.0 means 1:1 ratio
df_normal = train_data_demo[train_data_demo["target"] == "Normal"]
df_abnormal = train_data_demo[train_data_demo["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total before undersampling: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

print("  Total after undersampling:")
print(df_concat.value_counts("target"))

  Total before undersampling: Normal: 38156, AbNormal: 2350
  Total after undersampling:
target
AbNormal    2350
Normal      2350
Name: count, dtype: int64


In [8]:
# 레이블 인코딩
le = LabelEncoder()
y_encoded = le.fit_transform(df_concat['target'])

In [9]:
# 데이터 분할
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=y_encoded,
    random_state=RANDOM_STATE,
)

In [10]:
# 숫자형 특성만 선택
features = []

for col in df_train.columns:
    if col != 'target':  # target 열은 제외
        try:
            df_train[col] = df_train[col].astype(float)
            features.append(col)
        except:
            continue

In [11]:
# 특성과 타겟 분리
X_train = df_train[features]
y_train = le.transform(df_train['target'])
X_val = df_val[features]
y_val = le.transform(df_val['target'])

print("Train data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)

print("Data distribution after splitting:")
print(f"Train set - Normal: {sum(y_train == 'Normal')}, AbNormal: {sum(y_train == 'AbNormal')}")
print(f"Validation set - Normal: {sum(y_val == 'Normal')}, AbNormal: {sum(y_val == 'AbNormal')}")

Train data shape: (3290, 149)
Validation data shape: (1410, 149)
Data distribution after splitting:
Train set - Normal: 0, AbNormal: 0
Validation set - Normal: 0, AbNormal: 0


In [12]:
# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [13]:
print("Scaled data shape - X_train:", X_train_scaled.shape, "X_val:", X_val_scaled.shape)

Scaled data shape - X_train: (3290, 149) X_val: (1410, 149)


In [14]:
# SMOTE 적용
smote = SMOTE(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [15]:
print("After SMOTE:")
print(f"Train set - Normal: {sum(y_train_resampled == 'Normal')}, AbNormal: {sum(y_train_resampled == 'AbNormal')}")

After SMOTE:
Train set - Normal: 0, AbNormal: 0


In [16]:
# 각 모델의 하이퍼파라미터 탐색 공간 정의
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

lgbm_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

catboost_params = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}

f1_scorer = make_scorer(f1_score, pos_label='AbNormal')

In [17]:
# RandomizedSearchCV를 이용한 모델 최적화
def optimize_model(model, params, X, y):
    random_search = RandomizedSearchCV(model, params, n_iter=20, cv=5, n_jobs=-1, scoring=f1_scorer, random_state=RANDOM_STATE)
    random_search.fit(X, y)
    return random_search.best_estimator_

In [18]:
# 각 모델 정의
xgb = XGBClassifier(random_state=RANDOM_STATE, use_label_encoder=False, eval_metric='logloss')
lgbm = LGBMClassifier(random_state=RANDOM_STATE)
catboost = CatBoostClassifier(random_state=RANDOM_STATE, verbose=0)

In [19]:
# 각 모델 최적화
xgb_optimized = optimize_model(xgb, xgb_params, X_train_resampled, y_train_resampled)

 nan nan]
Parameters: { "use_label_encoder" } are not used.



In [20]:
lgbm_optimized = optimize_model(lgbm, lgbm_params, X_train_resampled, y_train_resampled)

 nan nan]


[LightGBM] [Info] Number of positive: 1645, number of negative: 1645
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3335
[LightGBM] [Info] Number of data points in the train set: 3290, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [21]:
catboost_optimized = optimize_model(catboost, catboost_params, X_train_resampled, y_train_resampled)

15 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\99kih\AppData\Roaming\Python\Python311\site-packages\catboost\core.py", line 5220, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\99kih\AppData\Roaming\Python\Python311\site-packages\catboost\core.py", line 2400, in _fit
    self._train(
  File "C:\Users\99kih\AppData\Roaming\Python\Python311

In [22]:
# Stacking 앙상블 모델 구축
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_optimized),
        ('lgbm', lgbm_optimized),
        ('catboost', catboost_optimized)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

In [23]:
# 앙상블 모델 학습
stacking_model.fit(X_train_resampled, y_train_resampled)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1645, number of negative: 1645
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3335
[LightGBM] [Info] Number of data points in the train set: 3290, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1316, number of negative: 1316
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3281
[LightGBM] [Info] Number of data points in the train set: 2632, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1316, number of negative: 1316
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3270
[LightGBM] [Info] Number of data points in the train set: 2632, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1316, number of negative

In [24]:
# 검증 세트에 대한 예측
y_val_pred_ensemble = stacking_model.predict(X_val_scaled)

In [25]:
# 모델 성능 평가
accuracy = accuracy_score(y_val, y_val_pred_ensemble)
f1 = f1_score(y_val, y_val_pred_ensemble, average='macro')
print("\nStacking Ensemble Model Validation Accuracy:", accuracy)
print("Stacking Ensemble Model Validation F1 Score:", f1)
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_ensemble, target_names=le.classes_))


Stacking Ensemble Model Validation Accuracy: 0.6007092198581561
Stacking Ensemble Model Validation F1 Score: 0.6006993784364687

Classification Report:
              precision    recall  f1-score   support

    AbNormal       0.60      0.61      0.60       705
      Normal       0.60      0.60      0.60       705

    accuracy                           0.60      1410
   macro avg       0.60      0.60      0.60      1410
weighted avg       0.60      0.60      0.60      1410



In [26]:
# f1-Score: 0.6007092198581561 (Validation Set)

In [27]:
# 테스트 데이터 불러오기
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [28]:
# NaN값 제거
test_data_demo = test_data.copy()
test_data_demo = test_data_demo.dropna(axis=1, how='all')  # 모든 값이 NaN인 열 삭제

In [29]:
df_X_test = test_data_demo[features]

for col in df_X_test.columns:
    try:
        df_X_test.loc[:, col] = df_X_test[col].astype(float)
    except:
        continue

In [30]:
test_X_scaled = scaler.transform(df_X_test)

In [31]:
# 예측
test_pred = stacking_model.predict(test_X_scaled)
test_pred

array([0, 1, 0, ..., 1, 0, 1])

In [32]:
# 예측 결과를 원래의 레이블로 변환
test_pred_labels = le.inverse_transform(test_pred)

In [35]:
# 제출 데이터 읽어오기
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred_labels

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [36]:
# 예측 결과 확인
print("예측된 레이블 분포:")
print(pd.Series(test_pred_labels).value_counts())

예측된 레이블 분포:
Normal      10270
AbNormal     7091
Name: count, dtype: int64


In [34]:
# test set f1-score: .14983