In [1]:
# 주요 라이브러리 import
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

In [3]:
# 데이터 로드
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

In [4]:
# demo data 만들기
train_data_demo = train_data.copy()

In [5]:
# 수치형 컬럼과 문자형 컬럼 분리
numeric_cols = train_data_demo.select_dtypes(include=[np.number]).columns
categorical_cols = train_data_demo.select_dtypes(include=['object']).columns

# 수치형 컬럼의 결측치를 평균으로 대체
train_data_demo[numeric_cols] = train_data_demo[numeric_cols].fillna(train_data_demo[numeric_cols].mean())

# 문자형 컬럼의 결측치는 'missing'으로 대체
train_data_demo[categorical_cols] = train_data_demo[categorical_cols].fillna('missing')

# 모든 값이 NaN인 열을 식별하고 삭제
train_data_demo = train_data_demo.dropna(axis=1, how='all')

In [6]:
train_data_demo

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


In [7]:
# 언더샘플링
normal_ratio = 1.0  # 1.0 means 1:1 ratio
df_normal = train_data_demo[train_data_demo["target"] == "Normal"]
df_abnormal = train_data_demo[train_data_demo["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total before undersampling: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

print("  Total after undersampling:")
print(df_concat.value_counts("target"))

  Total before undersampling: Normal: 38156, AbNormal: 2350
  Total after undersampling:
target
AbNormal    2350
Normal      2350
Name: count, dtype: int64


In [8]:
# 데이터 분할
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)

In [9]:
# 숫자형 특성만 선택
features = []

for col in df_train.columns:
    if col != 'target':  # target 열은 제외
        try:
            df_train[col] = df_train[col].astype(int)
            features.append(col)
        except:
            continue

In [10]:
# 특성과 타겟 분리
X_train = df_train[features]
y_train = df_train['target']
X_val = df_val[features]
y_val = df_val['target']

print("Train data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)

print("Data distribution after splitting:")
print(f"Train set - Normal: {sum(y_train == 'Normal')}, AbNormal: {sum(y_train == 'AbNormal')}")
print(f"Validation set - Normal: {sum(y_val == 'Normal')}, AbNormal: {sum(y_val == 'AbNormal')}")

Train data shape: (3290, 149)
Validation data shape: (1410, 149)
Data distribution after splitting:
Train set - Normal: 1645, AbNormal: 1645
Validation set - Normal: 705, AbNormal: 705


In [11]:
# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [12]:
print("Scaled data shape - X_train:", X_train_scaled.shape, "X_val:", X_val_scaled.shape)

Scaled data shape - X_train: (3290, 149) X_val: (1410, 149)


In [13]:
# SMOTE 적용
smote = SMOTE(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [14]:
print("After SMOTE:")
print(f"Train set - Normal: {sum(y_train_resampled == 'Normal')}, AbNormal: {sum(y_train_resampled == 'AbNormal')}")

After SMOTE:
Train set - Normal: 1645, AbNormal: 1645


In [15]:
# 각 모델의 하이퍼파라미터 탐색 공간 정의
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# svm_params = {
#     'C': [0.1, 1, 10],
#     'kernel': ['rbf', 'poly'],
#     'gamma': ['scale', 'auto', 0.1, 1]
# }

lr_params = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

f1_scorer = make_scorer(f1_score, pos_label='AbNormal')

In [16]:
# BayesSearchCV를 이용한 모델 최적화 함수
def optimize_model(model, params, X, y):
    random_search = RandomizedSearchCV(model, params, n_iter=20, cv=5, n_jobs=-1, scoring=f1_scorer, random_state=RANDOM_STATE)
    random_search.fit(X, y)
    return random_search.best_estimator_

In [17]:
# 각 모델 최적화
rf_optimized = optimize_model(RandomForestClassifier(random_state=RANDOM_STATE), rf_params, X_train_resampled, y_train_resampled)

In [18]:
gb_optimized = optimize_model(GradientBoostingClassifier(random_state=RANDOM_STATE), gb_params, X_train_resampled, y_train_resampled)

In [19]:
lr_optimized = optimize_model(LogisticRegression(random_state=RANDOM_STATE), lr_params, X_train_resampled, y_train_resampled)



In [None]:
# svm_optimized = optimize_model(SVC(probability=True, random_state=RANDOM_STATE), svm_params, X_train_resampled, y_train_resampled)
# -> 시간이 너무 오래 걸려서 제외

In [21]:
# 앙상블 모델 구축
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', rf_optimized),
        ('gb', gb_optimized),
        # ('svm', svm_optimized),
        ('lr', lr_optimized)
    ],
    voting='soft'
)

In [22]:
# 앙상블 모델 학습
ensemble_model.fit(X_train_resampled, y_train_resampled)

In [23]:
# 검증 세트에 대한 예측
y_val_pred_ensemble = ensemble_model.predict(X_val_scaled)

In [24]:
# 모델 성능 평가
accuracy = accuracy_score(y_val, y_val_pred_ensemble)
f1 = f1_score(y_val, y_val_pred_ensemble, average='macro')
print("\nEnsemble Model Validation Accuracy:", accuracy)
print("Ensemble Model Validation F1 Score:", f1)
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_ensemble))


Ensemble Model Validation Accuracy: 0.5971631205673759
Ensemble Model Validation F1 Score: 0.5969806763285024

Classification Report:
              precision    recall  f1-score   support

    AbNormal       0.60      0.58      0.59       705
      Normal       0.59      0.62      0.61       705

    accuracy                           0.60      1410
   macro avg       0.60      0.60      0.60      1410
weighted avg       0.60      0.60      0.60      1410



In [25]:
# f1-Score: 0.5969806763285024 (Validation Set)

In [26]:
# 테스트 데이터 불러오기
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [27]:
# NaN값 제거
test_data_demo = test_data.copy()
test_data_demo = test_data_demo.dropna(axis=1, how='all')  # 모든 값이 NaN인 열 삭제

In [28]:
df_X_test = test_data_demo[features]

for col in df_X_test.columns:
    try:
        df_X_test.loc[:, col] = df_X_test[col].astype(int)
    except:
        continue

In [29]:
test_X_scaled = scaler.transform(df_X_test)

In [30]:
# 예측
test_pred = ensemble_model.predict(test_X_scaled)
test_pred

array(['AbNormal', 'Normal', 'AbNormal', ..., 'Normal', 'Normal',
       'Normal'], dtype=object)

In [31]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [32]:
# test set f1-score: ??? (제출 제한 때문에 확인 아직 못 함)