# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier

from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    
    make_scorer
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 2022

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


### 데이터 결측치 확인

In [3]:
# demo data 만들기
train_data_demo = train_data.copy()

In [4]:
# 수치형 컬럼과 문자형 컬럼 분리
numeric_cols = train_data_demo.select_dtypes(include=[np.number]).columns
categorical_cols = train_data_demo.select_dtypes(include=['object']).columns

# 수치형 컬럼의 결측치를 평균으로 대체
train_data_demo[numeric_cols] = train_data_demo[numeric_cols].fillna(train_data_demo[numeric_cols].mean())

# 문자형 컬럼의 결측치는 다른 방법으로 대체하거나 제거
train_data_demo[categorical_cols] = train_data_demo[categorical_cols].fillna('missing')

# 모든 값이 NaN인 열을 식별하고 삭제
train_data_demo = train_data_demo.dropna(axis=1, how='all')  # 모든 값이 NaN인 열 삭제

In [5]:
train_data_demo

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


### 언더 샘플링


데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.


In [6]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = train_data_demo[train_data_demo["target"] == "Normal"]
df_abnormal = train_data_demo[train_data_demo["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
Name: count, dtype: int64

In [7]:
# 레이블 인코딩
le = LabelEncoder()
y_encoded = le.fit_transform(df_concat['target'])

### 데이터 분할


In [8]:
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal}")


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 1645, AbNormal: 1645 ratio: 1.0
  Total: Normal: 705, AbNormal: 705 ratio: 1.0


In [9]:
features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train[features]
train_y = df_train["target"]

In [10]:
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)

In [11]:
# 라벨 인코딩한 train dataset
train_y_encoded = le.transform(df_train['target'])

In [12]:
# df_val 활용
features = []

for col in df_val.columns:
    try:
        df_val[col] = df_val[col].astype(int)
        features.append(col)
    except:
        continue

val_x = df_val[features]
val_y_encoded = le.transform(df_val["target"])

scaler = StandardScaler()
val_x = scaler.fit_transform(val_x)

## 3. 모델 학습


###### 모델 정의 - 0.136 앙상블 기법 적용 -> 0.152 GridSearch 기법 적용 -> logistic regression 모델 추가하여 0.155 -> stacking으로 0.12ㅜ -> blending 시도할 예정


In [13]:
param_grid_rf = {
    'n_estimators': [100,200,300],
    'max_depth': [10,20,30],
    'min_samples_split':[2,5,10]
}

param_grid_svc = {
    'C': [0.1, 1.0, 10],
}

param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3,5,7]
}

param_grid_logistic = {
    'C': [0.1, 1.0, 10],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [1000, 2000, 3000]
}

rf = RandomForestClassifier(random_state=RANDOM_STATE)
svc = SVC(random_state=RANDOM_STATE, probability=True, kernel='linear')
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
logistic = LogisticRegression(random_state=RANDOM_STATE)

f1_scorer = make_scorer(f1_score, pos_label='AbNormal')

grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring=f1_scorer, cv=5, n_jobs=-1,verbose=3)
grid_search_svc = GridSearchCV(estimator=svc, param_grid=param_grid_svc, scoring=f1_scorer, cv=5, n_jobs=-1,verbose=3)
grid_search_gb = GridSearchCV(estimator=gb, param_grid=param_grid_gb, scoring=f1_scorer, cv=5, n_jobs=-1,verbose=3)
grid_search_logistic = GridSearchCV(estimator=logistic, param_grid=param_grid_logistic, scoring=f1_scorer, cv=5, n_jobs=-1,verbose=3)


In [14]:
train_y_encoded

array([1, 1, 1, ..., 1, 0, 1])

### 모델 학습


In [31]:
grid_search_rf.fit(train_x, train_y_encoded)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/elicer/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_

In [16]:
grid_search_svc.fit(train_x, train_y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [17]:
grid_search_gb.fit(train_x, train_y)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [18]:
grid_search_logistic.fit(train_x, train_y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.602 total time=   0.5s
[CV 5/5] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.556 total time=   0.4s
[CV 4/5] END max_depth=10, min_samples_split=2, n_estimators=200;, score=0.583 total time=   0.9s
[CV 3/5] END max_depth=10, min_samples_split=2, n_estimators=300;, score=0.608 total time=   1.3s
[CV 2/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=0.578 total time=   0.5s
[CV 4/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=0.590 total time=   0.4s
[CV 2/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=0.568 total time=   0.9s
[CV 1/5] END max_depth=10, min_samples_split=5, n_estimators=300;, score=0.589 total time=   1.3s
[CV 5/5] END max_depth=10, min_samples_split=5, n_estimators=300;, score=0.563 total time=   1.3s
[CV 4/5] END max_depth=10, min_samples_split=10, n_estima

[CV 2/5] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.560 total time=   0.5s
[CV 1/5] END max_depth=10, min_samples_split=2, n_estimators=200;, score=0.599 total time=   0.9s
[CV 5/5] END max_depth=10, min_samples_split=2, n_estimators=200;, score=0.562 total time=   0.9s
[CV 4/5] END max_depth=10, min_samples_split=2, n_estimators=300;, score=0.580 total time=   1.4s
[CV 5/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=0.576 total time=   0.4s
[CV 3/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=0.585 total time=   0.9s
[CV 2/5] END max_depth=10, min_samples_split=5, n_estimators=300;, score=0.553 total time=   1.3s
[CV 1/5] END max_depth=10, min_samples_split=10, n_estimators=100;, score=0.603 total time=   0.4s
[CV 3/5] END max_depth=10, min_samples_split=10, n_estimators=100;, score=0.600 total time=   0.4s
[CV 1/5] END max_depth=10, min_samples_split=10, n_estimators=200;, score=0.592 total time=   0.9s
[CV 5/5] END max_

In [19]:
best_rf = grid_search_rf.best_estimator_
best_svc = grid_search_svc.best_estimator_
best_gb = grid_search_gb.best_estimator_
best_logistic = grid_search_logistic.best_estimator_

In [20]:
# 특성 중요도 추출을 위한 train 데이터셋 dataframe으로 변환
if isinstance(train_x, np.ndarray):
    train_x = pd.DataFrame(train_x, columns=features)

In [21]:
# 중요도 계산 및 상위 n개의 중요한 특성 선택
rf_importances = best_rf.feature_importances_
gb_importances = best_gb.feature_importances_
svc_importances = np.abs(best_svc.coef_).flatten()
logistic_importances = np.abs(best_logistic.coef_).flatten()

# 각 모델의 중요도 평균 계산
average_importances = np.mean([rf_importances, gb_importances, svc_importances, logistic_importances], axis=0)

# 중요도 상위 n개의 특성 선택
n = 100
important_indices = np.argsort(average_importances)[-n:]
important_features = train_x.columns[important_indices]

print("선택된 중요 특성들:", important_features)

# 선택된 중요한 특성들만 사용하여 데이터셋 축소
train_x_important = train_x.iloc[:, important_indices]

선택된 중요 특성들: Index(['CURE END POSITION X Collect Result_Dam',
       'CURE START POSITION Θ Collect Result_Dam',
       'CURE END POSITION Θ Collect Result_Dam',
       '1st Pressure 1st Pressure Unit Time_AutoClave',
       'PalletID Collect Result_Fill2',
       'Chamber Temp. Collect Result_AutoClave',
       'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
       'Machine Tact time Collect Result_Fill2',
       'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
       'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
       'Dispense Volume(Stage3) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
       'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
       'WorkMode Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
       'Dispense Volume(Stage1) Collect Result_Fill1',
       'Stage1 Line1 Distance Speed Collect Result_Dam',
       'Stage1 Line3 Distance Spe

In [23]:
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', best_rf),
        ('svc', best_svc),
        ('gb', best_gb),
        ('logistic', best_logistic)
    ],
    voting='soft',
    n_jobs=-1,
    verbose=2
)

ensemble_model.fit(train_x_important, train_y)

In [23]:
if isinstance(val_x, np.ndarray):
    val_x = pd.DataFrame(val_x, columns=features)

In [26]:
# validation으로 성능 평가
# 검증 세트에 대한 예측
val_x_important = val_x.iloc[:, important_indices]

y_val_pred_ensemble = ensemble_model.predict(val_x_important)

# 모델 성능 평가
abnormal_label_index = 'Abnormal'
f1 = f1_score(val_y, y_val_pred_ensemble, pos_label=abnormal_label_index)
print("Ensemble Model Validation F1 Score:", f1)
print("\nClassification Report:")
print(classification_report(val_y, y_val_pred_ensemble, target_names='Abnormal'))

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[0 1] and y_pred=['AbNormal' 'Normal']. Make sure that the predictions provided by the classifier coincides with the true labels.

### Blending 시도할 예정

## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [24]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [25]:
# 내가 추가함. NaN값 제거
test_data_demo = test_data.copy()
test_data_demo = test_data_demo.dropna(axis=1, how='all')  # 모든 값이 NaN인 열 삭제

In [26]:
df_test_x = test_data_demo[features]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

In [27]:
if isinstance(val_x, np.ndarray):
    df_test_x = pd.DataFrame(df_test_x, columns=features)

In [28]:
df_test_x_important = test_data_demo[important_features]

In [29]:
test_pred = ensemble_model.predict(df_test_x_important)
test_pred

array(['AbNormal', 'AbNormal', 'AbNormal', ..., 'AbNormal', 'AbNormal',
       'AbNormal'], dtype=object)

### 제출 파일 작성


In [30]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

[CV 5/5] END C=10, max_iter=2000, solver=liblinear;, score=0.564 total time=   0.5s
[CV 1/5] END .C=10, max_iter=2000, solver=lbfgs;, score=0.619 total time=   0.5s
[CV 4/5] END .C=10, max_iter=2000, solver=lbfgs;, score=0.583 total time=   0.6s
[CV 5/5] END .C=10, max_iter=2000, solver=lbfgs;, score=0.561 total time=   0.4s
[CV 2/5] END .C=10, max_iter=3000, solver=lbfgs;, score=0.553 total time=   0.5s
[CV 3/5] END .C=10, max_iter=3000, solver=lbfgs;, score=0.585 total time=   0.6s
[Voting] ...................... (2 of 4) Processing svc, total=  53.0s
[Voting] ................. (4 of 4) Processing logistic, total=   0.6s
[CV 2/5] END C=10, max_iter=2000, solver=liblinear;, score=0.553 total time=   0.7s
[CV 1/5] END C=10, max_iter=3000, solver=liblinear;, score=0.619 total time=   0.7s
[CV 2/5] END C=10, max_iter=3000, solver=liblinear;, score=0.553 total time=   0.6s
[CV 4/5] END .C=10, max_iter=3000, solver=lbfgs;, score=0.583 total time=   0.7s
[Voting] ................. (4 of 4) 

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
