# 데이터 로드

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

## train 데이터

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train_df = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_df

X_train = train_df.iloc[:,:-1]
y_train = train_df.iloc[:,-1]

In [3]:
df_train = train_df

df_train

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


## test 데이터

In [4]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
df = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
X_test = df.drop(columns = ['Set ID'])
X_test

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1,OK,1000.0,,,...,195,,,1,,,0,,,
1,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1,OK,1000.0,,,...,14,,,256,,,1,,,
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,1,OK,240.0,,,...,98,,,1,,,0,,,
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1,OK,1000.0,,,...,14,,,0,,,1,,,
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,1,OK,240.0,,,...,1,,,215,,,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1,OK,1000.0,,,...,14,,,131,,,1,,,
17357,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1,OK,1000.0,,,...,12,,,279,,,1,,,
17358,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,1,OK,240.0,,,...,4,,,66,,,1,,,
17359,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,1,OK,240.0,,,...,117,,,1,,,0,,,


# 데이터 전처리

## 오입력 값 결측치로 바꾸기

In [5]:
X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam']= X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK',np.nan)
X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam']= X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK',np.nan)


In [6]:
X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1']= X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK',np.nan)
X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1']= X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK',np.nan)

In [7]:
X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']= X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK',np.nan)
X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']= X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK',np.nan)

In [8]:
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam']= df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK',np.nan)
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1']= df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK',np.nan)
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']= df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK',np.nan)

## 결측치 처리

### 결측치 비율이 50%가 넘는 컬럼 제거

In [9]:
def highly_null(df, threshold = 0.5):
    df_copy = df.copy()
    missing_ratio = df_copy.isnull().mean()
    
    null_columns = df_copy.columns[missing_ratio > threshold]
    df_copy.drop(columns = null_columns, inplace = True)    
    return df_copy

X_train = highly_null(X_train)
X_test = highly_null(X_test)

In [10]:
df_train = highly_null(df_train)

In [11]:
df_train

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


In [12]:
X_train

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,2.5,-90,...,119,50.0,91.8,270,50,114.612,19.9,7,127,1
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,19.600,7.0,185,1,0
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,114.612,19.8,10,73,1
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,12.5,90,...,50,91.8,270.0,50,85,19.900,12.0,268,1,0
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,19.700,8.0,121,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,19.200,1.0,318,1,0
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,114.612,20.5,14,197,1
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,2.5,-90,...,119,50.0,91.8,270,50,85.000,19.7,1,27,1
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,12.5,90,...,50,91.8,270.0,50,85,20.100,13.0,117,1,0


In [13]:
X_test

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1,OK,1000.0,12.5,90,...,50,91.8,270.0,50,85,19.8,13.0,195,1,0
1,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.0,19.8,14,256,1
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,19.7,1.0,98,1,0
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.0,20.0,14,0,1
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,1,OK,240.0,2.5,-90,...,119,50.0,91.8,270,50,85.0,19.8,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.0,19.5,14,131,1
17357,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.0,19.8,12,279,1
17358,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,1,OK,240.0,2.5,-90,...,119,50.0,91.8,270,50,85.0,20.5,4,66,1
17359,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,18.9,1.0,117,1,0


## train, validation 데이터 나누기

In [14]:
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size = 0.1, random_state = 42)

In [15]:
df_train = pd.concat([X_train, y_train], axis = 1)

df_train

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
564,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XE078-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.500,14.0,183,1,0,Normal
20342,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3M1XC487-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,85.000,20.9,3,98,1,Normal
7616,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,3J1XF276-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.800,11.0,395,1,0,Normal
33493,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4C1XD474-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,85.000,19.8,11,262,1,Normal
39919,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XA635-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,85.000,19.9,14,36,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3G1XC498-1,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,18.700,7.0,217,1,0,Normal
11284,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,3J1XB576-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.500,14.0,235,1,0,Normal
38158,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,4E1XA077-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,24.3,12,276,6,Normal
860,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3M1XF537-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,85.000,20.1,1,180,1,AbNormal


## 이상치 제거(train 데이터에만)

In [16]:
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

def isolation_forest(df):
    # 숫자형 데이터만 선택
    df_numeric = df.select_dtypes(include=['number'])

    # Isolation Forest 모델 초기화
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    iso_forest.fit(df_numeric)

    y_pred = iso_forest.predict(df_numeric)

    # 원래 데이터프레임에 예측 결과 추가
    df['Prediction'] = y_pred

    # 이상치와 정상 데이터로 구분
    df_normal = df[df['Prediction'] == 1]
    df_outliers = df[df['Prediction'] == -1]
    
    return df_normal


In [17]:
df_train_outlier = isolation_forest(df_train)



In [18]:
df_train_outlier

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target,Prediction
564,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XE078-1,1,OK,1000.0,12.5,90,...,270.0,50,85,19.500,14.0,183,1,0,Normal,1
20342,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3M1XC487-1,1,OK,240.0,2.5,-90,...,91.8,270,50,85.000,20.9,3,98,1,Normal,1
7616,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,3J1XF276-1,1,OK,1000.0,12.5,90,...,270.0,50,85,19.800,11.0,395,1,0,Normal,1
33493,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4C1XD474-1,1,OK,1000.0,12.5,90,...,91.8,270,50,85.000,19.8,11,262,1,Normal,1
39919,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XA635-1,1,OK,1000.0,12.5,90,...,91.8,270,50,85.000,19.9,14,36,1,Normal,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-1,1,OK,240.0,2.5,-90,...,270.0,50,85,20.100,7.0,368,1,0,Normal,1
6265,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3G1XC498-1,1,OK,240.0,2.5,-90,...,270.0,50,85,18.700,7.0,217,1,0,Normal,1
38158,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,4E1XA077-1,1,OK,1000.0,12.5,90,...,91.8,270,50,114.612,24.3,12,276,6,Normal,1
860,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3M1XF537-1,1,OK,240.0,2.5,-90,...,91.8,270,50,85.000,20.1,1,180,1,AbNormal,1


In [19]:
df_train_outlier.drop(columns = ['Prediction'], inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_outlier.drop(columns = ['Prediction'], inplace = True)


In [20]:
df_train_outlier

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
564,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XE078-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.500,14.0,183,1,0,Normal
20342,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3M1XC487-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,85.000,20.9,3,98,1,Normal
7616,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,3J1XF276-1,1,OK,1000.0,12.5,90,...,91.8,270.0,50,85,19.800,11.0,395,1,0,Normal
33493,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4C1XD474-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,85.000,19.8,11,262,1,Normal
39919,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XA635-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,85.000,19.9,14,36,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-1,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,20.100,7.0,368,1,0,Normal
6265,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3G1XC498-1,1,OK,240.0,2.5,-90,...,91.8,270.0,50,85,18.700,7.0,217,1,0,Normal
38158,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,4E1XA077-1,1,OK,1000.0,12.5,90,...,50.0,91.8,270,50,114.612,24.3,12,276,6,Normal
860,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3M1XF537-1,1,OK,240.0,2.5,-90,...,50.0,91.8,270,50,85.000,20.1,1,180,1,AbNormal


In [21]:
X_train_outlier = df_train_outlier.iloc[:,:-1]
y_train_outlier = df_train_outlier.iloc[:,-1]

In [22]:
X_train_outlier

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
564,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XE078-1,1,OK,1000.0,12.5,90,...,50,91.8,270.0,50,85,19.500,14.0,183,1,0
20342,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3M1XC487-1,1,OK,240.0,2.5,-90,...,119,50.0,91.8,270,50,85.000,20.9,3,98,1
7616,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,3J1XF276-1,1,OK,1000.0,12.5,90,...,50,91.8,270.0,50,85,19.800,11.0,395,1,0
33493,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4C1XD474-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.000,19.8,11,262,1
39919,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XA635-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.000,19.9,14,36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-1,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,20.100,7.0,368,1,0
6265,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3G1XC498-1,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,18.700,7.0,217,1,0
38158,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,4E1XA077-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,114.612,24.3,12,276,6
860,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3M1XF537-1,1,OK,240.0,2.5,-90,...,119,50.0,91.8,270,50,85.000,20.1,1,180,1


## 데이터 스케일링(Robust Scaler)

In [23]:
from sklearn.preprocessing import RobustScaler

X_numerical_train_outlier = X_train_outlier[X_train_outlier.select_dtypes(include=['number']).columns]
X_categorical_train_outlier = X_train_outlier[X_train_outlier.select_dtypes(include=['object', 'category']).columns]
display(X_numerical_train_outlier)
X_numerical_val_outlier = X_val[X_val.select_dtypes(include=['number']).columns]
X_categorical_val_outlier = X_val[X_val.select_dtypes(include=['object', 'category']).columns]

X_numerical_test = X_test[X_test.select_dtypes(include=['number']).columns]
X_categorical_test = X_test[X_test.select_dtypes(include=['object', 'category']).columns]
display(X_numerical_test)
scaler_outlier = RobustScaler()
scaler_outlier.fit(X_numerical_train_outlier)
X_numerical_train_outlier_scaled = scaler_outlier.transform(X_numerical_train_outlier)
X_numerical_val_outlier_scaled = scaler_outlier.transform(X_numerical_val_outlier)
X_numerical_test_outlier_scaled = scaler_outlier.transform(X_numerical_test)

X_numerical_train_outlier_scaled_df = pd.DataFrame(X_numerical_train_outlier_scaled,
                                          columns = X_numerical_train_outlier.columns)

X_numerical_val_outlier_scaled_df = pd.DataFrame(X_numerical_val_outlier_scaled,
                                        columns = X_numerical_val_outlier.columns)

X_numerical_test_outlier_scaled_df = pd.DataFrame(X_numerical_test_outlier_scaled,
                                        columns = X_numerical_test.columns)

X_categorical_train_outlier_df = pd.DataFrame(X_categorical_train_outlier, 
                                   columns=X_categorical_train_outlier.columns)
X_categorical_val_outlier_df = pd.DataFrame(X_categorical_val_outlier, 
                                 columns=X_categorical_val_outlier.columns)
X_categorical_test_outlier_df = pd.DataFrame(X_categorical_test, 
                                 columns=X_categorical_test.columns)

X_train_outlier_scaled = pd.concat([X_numerical_train_outlier_scaled_df, 
                            X_categorical_train_outlier_df.reset_index(drop=True)], axis=1)
X_val_outlier_scaled = pd.concat([X_numerical_val_outlier_scaled_df, 
                          X_categorical_val_outlier_df.reset_index(drop=True)], axis=1)
X_test_outlier_scaled = pd.concat([X_numerical_test_outlier_scaled_df, 
                          X_categorical_test_outlier_df.reset_index(drop=True)], axis=1)


Unnamed: 0,Insp. Seq No._Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE STANDBY POSITION X Collect Result_Dam,CURE STANDBY POSITION Z Collect Result_Dam,CURE STANDBY POSITION Θ Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Z Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
564,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,50,91.8,270.0,50,85,19.500,14.0,183,1,0
20342,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,119,50.0,91.8,270,50,85.000,20.9,3,98,1
7616,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,50,91.8,270.0,50,85,19.800,11.0,395,1,0
33493,1,1000.0,12.5,90,105,1150,33.5,0,280,33.5,...,119,50.0,91.8,270,50,85.000,19.8,11,262,1
39919,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,119,50.0,91.8,270,50,85.000,19.9,14,36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,50,91.8,270.0,50,85,20.100,7.0,368,1,0
6265,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,50,91.8,270.0,50,85,18.700,7.0,217,1,0
38158,1,1000.0,12.5,90,105,1150,33.5,0,280,33.5,...,119,50.0,91.8,270,50,114.612,24.3,12,276,6
860,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,119,50.0,91.8,270,50,85.000,20.1,1,180,1


Unnamed: 0,Insp. Seq No._Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE STANDBY POSITION X Collect Result_Dam,CURE STANDBY POSITION Z Collect Result_Dam,CURE STANDBY POSITION Θ Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Z Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,50,91.8,270.0,50,85,19.8,13.0,195,1,0
1,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,119,50.0,91.8,270,50,85.0,19.8,14,256,1
2,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,50,91.8,270.0,50,85,19.7,1.0,98,1,0
3,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,119,50.0,91.8,270,50,85.0,20.0,14,0,1
4,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,119,50.0,91.8,270,50,85.0,19.8,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,119,50.0,91.8,270,50,85.0,19.5,14,131,1
17357,1,1000.0,12.5,90,70,1150,33.5,0,280,33.5,...,119,50.0,91.8,270,50,85.0,19.8,12,279,1
17358,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,119,50.0,91.8,270,50,85.0,20.5,4,66,1
17359,1,240.0,2.5,-90,70,1150,33.5,0,1030,33.5,...,50,91.8,270.0,50,85,18.9,1.0,117,1,0


## 문자형 데이터를 숫자형으로 변경(Ordinal Encoder)

In [24]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# 범주형 컬럼만 선택
train_outlier_categorical_cols = X_train_outlier_scaled.select_dtypes(include=['object']).columns
val_outlier_categorical_cols = X_val_outlier_scaled.select_dtypes(include=['object']).columns
test_outlier_categorical_cols = X_test_outlier_scaled.select_dtypes(include=['object']).columns

# 범주형 데이터에만 LabelEncoder 적용
X_train_outlier_scaled[train_outlier_categorical_cols] = encoder.fit_transform(X_train_outlier_scaled[train_outlier_categorical_cols])
    
X_val_outlier_scaled[val_outlier_categorical_cols] = encoder.transform(X_val_outlier_scaled[val_outlier_categorical_cols])

X_test_outlier_scaled[test_outlier_categorical_cols] = encoder.transform(X_test_outlier_scaled[test_outlier_categorical_cols])


# SMOTE 샘플링

In [25]:

y_train_outlier.replace({'Normal':0, 'AbNormal':1}, inplace = True)

y_val.replace({'Normal':0, 'AbNormal':1}, inplace = True)

In [26]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 0)
X_resampled_outlier, y_resampled_outlier = smote.fit_resample(X_train_outlier_scaled, y_train_outlier)


In [27]:
y_train.value_counts()

target
Normal      34332
AbNormal     2123
Name: count, dtype: int64

In [28]:
y_resampled_outlier.value_counts()

target
0    32602
1    32602
Name: count, dtype: int64

## PCA

In [29]:
from sklearn.cluster import KMeans

# 예시로 3개의 클러스터로 데이터 클러스터링
kmeans = KMeans(n_clusters=3, random_state=42)
X_clustered = kmeans.fit_predict(X_resampled_outlier)

# 클러스터 레이블을 원래 데이터에 추가
X_resampled_with_cluster = np.c_[X_resampled_outlier, X_clustered]




In [30]:
from sklearn.decomposition import PCA

# 예시로 2개의 주성분으로 차원 축소
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_resampled_with_cluster)

X_val_clustered = np.c_[X_val_outlier_scaled, kmeans.predict(X_val_outlier_scaled)]
X_val_pca = pca.transform(X_val_clustered)

X_test_clustered = np.c_[X_test_outlier_scaled, kmeans.predict(X_test_outlier_scaled)]
X_test_pca = pca.transform(X_test_clustered)

## LDA

In [31]:
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# # y_resampled에서 클래스 수 확인
# n_classes = len(np.unique(y_resampled_outlier))

# # 피처 수 및 LDA의 최대 가능한 차원 수 계산
# n_features = X_resampled_with_cluster.shape[1]
# max_components = min(n_features, n_classes - 1)

# # LDA로 차원 축소 (n_components는 max_components 이하로 설정)
# lda = LDA(n_components=max_components)
# X_pca = lda.fit_transform(X_resampled_with_cluster, y_resampled_outlier)

# X_val_clustered = np.c_[X_val_outlier_scaled, kmeans.predict(X_val_outlier_scaled)]
# X_val_pca = lda.transform(X_val_clustered)

# X_test_clustered = np.c_[X_test_outlier_scaled, kmeans.predict(X_test_outlier_scaled)]
# X_test_pca = lda.transform(X_test_clustered)

# 모델 학습

## 평가지표

In [32]:
def get_clf_eval(y_test, pred=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}'.format(accuracy, precision, recall, f1))

## 하이퍼 파라미터 튜닝

In [33]:
from hyperopt import hp

# max_depth는 5에서 20까지 1간격으로, min_child_weight는 1에서 2까지 1간격으로
# colsample_bytree는 0.5에서 1사이, learning_rate는 0.01에서 0.2사이 정규 분포된 값으로 검색. 
lgbm_search_space = {'max_depth': hp.quniform('max_depth', 1, 50, 1),
                    'num_leaves':hp.quniform('num_leaves', 30,150,1),
                    'min_child_weight': hp.quniform('min_child_weight', 1, 10 , 1),
                    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
                    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
               }

In [34]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from lightgbm import LGBMClassifier
from hyperopt import STATUS_OK

# fmin()에서 입력된 search_space값으로 입력된 모든 값은 실수형임. 
# XGBClassifier의 정수형 하이퍼 파라미터는 정수형 변환을 해줘야 함. 
# 정확도는 높은 수록 더 좋은 수치임. -1* 정확도를 곱해서 큰 정확도 값일 수록 최소가 되도록 변환
def objective_func(search_space):
    # 수행 시간 절약을 위해 n_estimators는 100으로 축소
    lgbm_clf = LGBMClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
                            min_child_weight=int(search_space['min_child_weight']),
                            learning_rate=search_space['learning_rate'],
                            colsample_bytree=search_space['colsample_bytree'],
                            num_leaves = int(search_space['num_leaves']),
                            eval_metric='logloss')
    skf = StratifiedKFold(n_splits=3)
    accuracy = cross_val_score(lgbm_clf, X_pca, y_resampled_outlier, scoring='accuracy', cv=skf)
        
    # accuracy는 cv=3 개수만큼의 정확도 결과를 가지므로 이를 평균해서 반환하되 -1을 곱해줌. 
    return {'loss':-1 * np.mean(accuracy), 'status': STATUS_OK}

In [35]:
from hyperopt import fmin, tpe, Trials

trial_val = Trials()
best = fmin(fn=objective_func,
            space=lgbm_search_space,
            algo=tpe.suggest,
            max_evals=50, # 최대 반복 횟수를 지정합니다.
            trials=trial_val, rstate=np.random.default_rng(seed=9))
print('best:', best)

[LightGBM] [Info] Number of positive: 21735, number of negative: 21734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510                      
[LightGBM] [Info] Number of data points in the train set: 43469, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500012 -> initscore=0.000046
[LightGBM] [Info] Start training from score 0.000046  
[LightGBM] [Info] Number of positive: 21734, number of negative: 21735
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510                      
[LightGBM] [Info] Number of data points in the train set: 43469, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499988 -> initscore=-0.000046
[LightGBM] [Info] Start tr

In [36]:
print('colsample_bytree:{0}, learning_rate:{1}, max_depth:{2}, min_child_weight:{3}, num_leaves:{4}'.format(
                        round(best['colsample_bytree'], 5), round(best['learning_rate'], 5),
                        int(best['max_depth']), int(best['min_child_weight']), int(best['num_leaves'])))

colsample_bytree:0.85696, learning_rate:0.19864, max_depth:43, min_child_weight:5, num_leaves:145


In [37]:
pip install --upgrade lightgbm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [38]:
import lightgbm

print(lightgbm.__version__)

4.5.0


In [39]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_estimators=1000, learning_rate=round(best['learning_rate'], 5), 
                            max_depth=int(best['max_depth']), min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5), num_leaves = int(best['num_leaves']))

evals = [(X_val_pca, y_val)]
lgbm.fit(X_pca, y_resampled_outlier, eval_metric="logloss", 
                 eval_set=evals)
preds = lgbm.predict(X_val_pca)

[LightGBM] [Info] Number of positive: 32602, number of negative: 32602
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000804 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 65204, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [40]:
get_clf_eval(y_val, preds)

오차 행렬
[[2693 1131]
 [ 115  112]]
정확도: 0.6924, 정밀도: 0.0901, 재현율: 0.4934,    F1: 0.1524


# 제출하기

In [41]:
test_pred = lgbm.predict(X_test_pca)

test_pred = np.where(test_pred == 0, 'Normal', 'AbNormal')

np.sum(test_pred == 'AbNormal')

5504

In [42]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)