# 데이터 로드

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

## Train 데이터

In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load the raw training table; train_df stays as the untouched raw reference.
train_df = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

# Work on a copy: later cells overwrite columns of df_train, and a plain alias
# (df_train = train_df) would silently apply those edits to train_df as well.
df_train = train_df.copy()

## Test 데이터

In [3]:
# Load the test table. ROOT_DIR (and RANDOM_STATE) are defined once in the
# train-loading cell; they are not repeated here so the configuration cannot
# drift between two copies.
df = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

# 'Set ID' is removed before the remaining columns are used as model inputs.
X_test = df.drop(columns = ['Set ID'])
X_test

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1,OK,1000.0,,,...,195,,,1,,,0,,,
1,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1,OK,1000.0,,,...,14,,,256,,,1,,,
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,1,OK,240.0,,,...,98,,,1,,,0,,,
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1,OK,1000.0,,,...,14,,,0,,,1,,,
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,1,OK,240.0,,,...,1,,,215,,,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1,OK,1000.0,,,...,14,,,131,,,1,,,
17357,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1,OK,1000.0,,,...,12,,,279,,,1,,,
17358,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,1,OK,240.0,,,...,4,,,66,,,1,,,
17359,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,1,OK,240.0,,,...,117,,,1,,,0,,,


# 데이터 전처리

## 오입력 값 결측치로 바꾸기

In [4]:
# These three coordinate columns can contain the literal string 'OK', which this
# section treats as a mis-entered value; it is replaced by NaN so it counts as missing.
ok_polluted_columns = (
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
)
for column in ok_polluted_columns:
    df_train[column] = df_train[column].replace('OK', np.nan)

In [5]:
# Same clean-up as for the train frame: the literal string 'OK' in these three
# coordinate columns is replaced by NaN so it counts as missing.
ok_polluted_columns = (
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
)
for column in ok_polluted_columns:
    X_test[column] = X_test[column].replace('OK', np.nan)

In [6]:
# 1. Fraction of missing values per column
train_null_ratio = df_train.isnull().mean()
test_null_ratio = X_test.isnull().mean()

# 2. Columns whose missing fraction is strictly greater than 50%
train_columns_with_high_null = train_null_ratio.index[train_null_ratio > 0.5]
test_columns_with_high_null = test_null_ratio.index[test_null_ratio > 0.5]

# 3. Compare the two selections as sets
train_high_null = set(train_columns_with_high_null)
test_high_null = set(test_columns_with_high_null)
columns_in_both = train_high_null.intersection(test_high_null)
columns_in_train_only = train_high_null - test_high_null
columns_in_test_only = test_high_null - train_high_null

# 4. Report: shared columns first, then the ones specific to each split
print("Columns with >50% nulls in both train and test:", columns_in_both)
print("Columns with >50% nulls in train only:", columns_in_train_only)
print("Columns with >50% nulls in test only:", columns_in_test_only)

Columns with >50% nulls in both train and test: {'Head Zero Position Z Unit Time_Dam', 'Head Purge Position X Unit Time_Fill1', 'DISCHARGED TIME OF RESIN(Stage2) Judge Value_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Unit Time_Fill2', 'Head Purge Position X Judge Value_Fill1', 'Head Purge Position Y Unit Time_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Judge Value_Dam', 'THICKNESS 3 Judge Value_Dam', 'CURE SPEED Unit Time_Fill2', 'CURE START POSITION X Judge Value_Dam', 'Head Zero Position Y Unit Time_Dam', 'HEAD Standby Position Y Judge Value_Fill1', 'CURE START POSITION Z Judge Value_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Unit Time_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Unit Time_Fill1', 'Machine Tact time Judge Value_Dam', 'Stage3 Line1 Distance Speed Unit Time_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Unit Time_Fill2', 'Head Clean Position Y Judge Value_Fill2', 'Stage2 Line2 Distance Speed Judge Value_Dam', 'Head Purge Position Z Unit Time_Dam', 'HEAD NORMAL

## 결측치 처리

### 결측치 비율이 50%가 넘는 컬럼 제거

In [7]:
def highly_null(df, threshold = 0.5):
    """Return a new frame without the mostly-missing columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.5
        A column is removed when its fraction of null values is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining columns in their original order.
    """
    null_fraction = df.isnull().mean()
    too_sparse = df.columns[null_fraction > threshold]
    # drop() without inplace returns a new frame, so the caller's df stays untouched.
    return df.drop(columns=too_sparse)

# Each frame is filtered on its own missing-value ratios, so the two results are
# not guaranteed to keep the same columns.
df_train, X_test = highly_null(df_train), highly_null(X_test)

In [8]:
# Count the column names that survive in both frames after the high-null filter.
train_columns, test_columns = df_train.columns, X_test.columns
columns_in_both = set(train_columns) & set(test_columns)
print(len(columns_in_both))

177


In [9]:
# Display X_test as it stands after the high-null columns were removed.
X_test

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1,OK,1000.0,12.5,90,...,50,91.8,270.0,50,85,19.8,13.0,195,1,0
1,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.0,19.8,14,256,1
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,19.7,1.0,98,1,0
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.0,20.0,14,0,1
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,1,OK,240.0,2.5,-90,...,119,50.0,91.8,270,50,85.0,19.8,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.0,19.5,14,131,1
17357,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1,OK,1000.0,12.5,90,...,119,50.0,91.8,270,50,85.0,19.8,12,279,1
17358,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,1,OK,240.0,2.5,-90,...,119,50.0,91.8,270,50,85.0,20.5,4,66,1
17359,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,1,OK,240.0,2.5,-90,...,50,91.8,270.0,50,85,18.9,1.0,117,1,0


## 같은 데이터로만 구성된 컬럼 제거

In [10]:
def remove_all_same(df):
    """Return a new frame without columns that hold exactly one distinct value.

    Distinct values are counted with ``DataFrame.nunique()`` and its default
    arguments, which leave NaN out of the count: a column with one repeated
    value plus NaNs is removed, while an all-NaN column (zero distinct values)
    is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining columns in their original order.
    """
    distinct_counts = df.nunique()
    constant_columns = df.columns[distinct_counts == 1]
    # drop() without inplace returns a new frame, so the caller's df stays untouched.
    return df.drop(columns=constant_columns)

In [11]:
# Constant columns are detected separately in each frame.
df_train_same, X_test_same = remove_all_same(df_train), remove_all_same(X_test)

In [12]:
# Compare the surviving column names of both frames after the constant-column filter.
train_columns, test_columns = df_train_same.columns, X_test_same.columns
columns_in_both = set(train_columns) & set(test_columns)
print(
    f"same columns: {len(columns_in_both)}",
    f"train columns: {len(train_columns)}",
    f"test columns: {len(test_columns)}",
    sep="\n",
)

same columns: 142
train columns: 143
test columns: 142


## null 개수확인

In [13]:
# Total number of missing cells left in each frame.
train_null_total = df_train_same.isnull().sum().sum()
test_null_total = X_test_same.isnull().sum().sum()
print(f"train null : {train_null_total}")
print(f"test null : {test_null_total}")

train null : 0
test null : 0


## train데이터 X, y 분리

In [14]:
# df_train_same

In [15]:
# Select the label by name instead of by position, so a different column order
# cannot silently turn a feature into the label (or the label into a feature).
X_train = df_train_same.drop(columns=['target'])
# copy(): the labels are re-coded in a later cell, and that must not write back
# into df_train_same.
y_train = df_train_same['target'].copy()
X_test = X_test_same

## 데이터 스케일링

In [16]:
from sklearn.preprocessing import RobustScaler

# Split both frames by dtype. Columns that are neither numeric nor
# object/category are picked up by neither selection and are therefore not
# carried over into the rebuilt frames.
X_numerical_train = X_train.select_dtypes(include=['number'])
X_categorical_train = X_train.select_dtypes(include=['object', 'category'])
X_numerical_test = X_test.select_dtypes(include=['number'])
X_categorical_test = X_test.select_dtypes(include=['object', 'category'])

# The scaler is fitted on the train split only and then reused for the test split.
scaler = RobustScaler()
X_numerical_train_scaled = scaler.fit_transform(X_numerical_train)
X_numerical_test_scaled = scaler.transform(X_numerical_test)

# The scaler returns plain arrays, so rebuild frames with the original column names.
X_numerical_train_scaled_df = pd.DataFrame(X_numerical_train_scaled,
                                           columns=X_numerical_train.columns)
X_numerical_test_scaled_df = pd.DataFrame(X_numerical_test_scaled,
                                          columns=X_numerical_test.columns)

# The categorical parts are passed through unchanged.
X_categorical_train_df = X_categorical_train
X_categorical_test_df = X_categorical_test

# The rebuilt numeric frames carry a fresh 0..n-1 index; the categorical side is
# reset the same way so the column-wise concat lines the rows up by position.
X_train = pd.concat(
    [X_numerical_train_scaled_df, X_categorical_train_df.reset_index(drop=True)],
    axis=1,
)
X_test = pd.concat(
    [X_numerical_test_scaled_df, X_categorical_test_df.reset_index(drop=True)],
    axis=1,
)


## 문자형 데이터 숫자형으로 변경

### y값 변경

In [17]:
# Re-code the label strings as integers (Normal -> 0, AbNormal -> 1).
# Reassigning instead of using inplace=True avoids mutating a Series that was
# sliced out of df_train_same, and the explicit astype(int) guarantees an integer
# dtype instead of relying on replace() to downcast the object column implicitly.
# Re-running the cell is harmless: already-encoded 0/1 values pass through unchanged.
y_train = y_train.replace({'Normal':0, 'AbNormal':1}).astype(int)

### X값 변경

### Ordinal Encoder

In [18]:
from sklearn.preprocessing import OrdinalEncoder

# Categories that were not seen during fit (possible in the test split) are
# encoded as -1 instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# The encoder is fitted on the train object-dtype columns, so the test frame is
# transformed with exactly that column list (same names, same order) rather than
# with an independently detected one.
train_categorical_cols = X_train.select_dtypes(include=['object']).columns
test_categorical_cols = train_categorical_cols

# Encode into copies: X_train / X_test keep their string columns, so this cell
# can be re-run without failing on frames that were already encoded.
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

# Apply the OrdinalEncoder to the categorical columns only.
X_train_encoded[train_categorical_cols] = encoder.fit_transform(X_train[train_categorical_cols])
X_test_encoded[test_categorical_cols] = encoder.transform(X_test[test_categorical_cols])

### Target Encoder

In [19]:
# from sklearn.model_selection import KFold
# from category_encoders import TargetEncoder
# import numpy as np

# # 범주형 컬럼만 선택
# train_categorical_cols = X_train.select_dtypes(include=['object']).columns
# val_categorical_cols = X_val.select_dtypes(include=['object']).columns
# test_categorical_cols = X_test.select_dtypes(include=['object']).columns

# # KFold 설정
# n_splits = 5
# kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# # 타겟 인코딩을 적용할 컬럼과 결과를 담을 데이터프레임 준비
# X_train_encoded = X_train.copy()
# X_val_encoded = X_val.copy()
# X_test_encoded = X_test.copy()

# # 각 폴드에서 인코딩 수행
# for train_index, valid_index in kf.split(X_train):
#     X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
#     y_train_fold = y_train.iloc[train_index]

#     # Target Encoder 적용
#     encoder = TargetEncoder()
#     encoder.fit(X_train_fold[train_categorical_cols], y_train_fold)
#     X_train_encoded.iloc[valid_index, X_train_encoded.columns.get_indexer(train_categorical_cols)] = encoder.transform(X_valid_fold[train_categorical_cols]).values

# # 2. X_val와 X_test에 타겟 인코딩 적용
# encoder = TargetEncoder()
# encoder.fit(X_train[train_categorical_cols], y_train)

# X_train_encoded[train_categorical_cols] = encoder.transform(X_train[train_categorical_cols]).values
# X_val_encoded[val_categorical_cols] = encoder.transform(X_val[val_categorical_cols]).values
# X_test_encoded[test_categorical_cols] = encoder.transform(X_test[test_categorical_cols]).values

# # 결과 확인
# print("X_train_encoded dtypes:\n", X_train_encoded.dtypes)
# print("X_val_encoded dtypes:\n", X_val_encoded.dtypes)
# print("X_test_encoded dtypes:\n", X_test_encoded.dtypes)


In [20]:
# from sklearn.preprocessing import OrdinalEncoder

# encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# # 범주형 컬럼만 선택
# train_categorical_cols = X_train.select_dtypes(include=['object']).columns
# val_categorical_cols = X_val.select_dtypes(include=['object']).columns
# test_categorical_cols = X_test.select_dtypes(include=['object']).columns

# # 범주형 데이터에만 LabelEncoder 적용
# X_train[train_categorical_cols] = encoder.fit_transform(X_train[train_categorical_cols])
    
# X_val[val_categorical_cols] = encoder.transform(X_val[val_categorical_cols])

# X_test[test_categorical_cols] = encoder.transform(X_test[test_categorical_cols])


## 샘플링

In [21]:
from imblearn.over_sampling import SMOTE
# Apply SMOTE to the encoded training features and labels.

# NOTE: this cell overwrites y_train with the resampled labels while
# X_train_encoded keeps its original length, so re-running this cell on its own
# passes mismatched inputs to fit_resample; re-run from the cell that builds
# y_train instead.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_encoded, y_train)


## KMeans 클러스터링 및 PCA

In [22]:
from sklearn.cluster import KMeans

# Cluster the resampled training rows into 6 groups.
kmeans = KMeans(n_clusters=6, random_state=42)
X_clustered = kmeans.fit(X_train).labels_

# Append the cluster label to the feature matrix as one extra trailing column.
X_resampled_with_cluster = np.c_[X_train, X_clustered]




In [23]:
# import matplotlib.pyplot as plt
# from sklearn.decomposition import PCA

# # PCA 수행
# pca = PCA()
# pca.fit(X_resampled_with_cluster)

# # 누적 설명 분산 비율 계산
# cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# # 누적 설명 분산 비율 시각화
# plt.figure(figsize=(8, 6))
# plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.title('Cumulative Explained Variance by PCA')
# plt.grid()
# plt.show()

In [24]:
# plt.figure(figsize=(8, 6))
# plt.plot(range(1, len(pca.explained_variance_) + 1), pca.explained_variance_, marker='o', linestyle='--')
# plt.xlabel('Principal Component')
# plt.ylabel('Eigenvalue')
# plt.title('Scree Plot')
# plt.grid()
# plt.show()


In [25]:
from sklearn.decomposition import PCA

# Reduce the cluster-augmented feature matrix to 4 principal components.
# random_state is pinned because, with the default svd_solver='auto',
# scikit-learn may choose a randomized SVD solver for an input of this shape;
# without a seed the projected features could differ between runs.
pca = PCA(n_components=4, random_state=RANDOM_STATE)
X_train = pca.fit_transform(X_resampled_with_cluster)

# The test rows go through the already-fitted KMeans and PCA (no refitting):
# predict the cluster label, append it as the trailing column, then project.
X_test_clustered = np.c_[X_test_encoded, kmeans.predict(X_test_encoded)]
X_test = pca.transform(X_test_clustered)

# 모델 학습

## 평가지표

In [26]:
def get_clf_eval(y_test, pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    All metrics are computed by the scikit-learn functions imported at the top
    of the notebook, called with their default arguments.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels. The ``None`` default is kept only so the signature
        stays unchanged; an actual prediction array is required.

    Raises
    ------
    ValueError
        If ``pred`` is None.
    """
    if pred is None:
        raise ValueError("pred must be an array of predicted labels, got None")

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('오차 행렬')
    print(confusion)
    # Adjacent string literals are joined by the parser, so the source
    # indentation no longer leaks into the printed line (a backslash
    # continuation inside the literal used to embed four extra spaces before "F1").
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, '
          'F1: {3:.4f}'.format(accuracy, precision, recall, f1))

## LGBM

### 하이퍼 파라미터 튜닝

In [27]:
# from hyperopt import hp

# # n_estimators는 100에서 1000까지 10간격으로, num_leaves는 20에서 150까지 1간격으로, max_depth는 3에서 15까지 1간격으로 검색.
# # learning_rate는 0.005에서 0.2 사이의 균등 분포(hp.uniform)에서 검색.
# lgbm_search_space = {
#     'n_estimators': hp.quniform('n_estimators', 100, 1000, 10),  # 트리의 개수
#     'num_leaves': hp.quniform('num_leaves', 20, 150, 1),  # 하나의 트리가 가질 수 있는 최대 리프 노드 개수
#     'learning_rate': hp.uniform('learning_rate', 0.005, 0.2),  # 학습률
#     'max_depth': hp.quniform('max_depth', 3, 15, 1),  # 트리의 최대 깊이
# }

In [28]:
# from sklearn.model_selection import cross_val_score, StratifiedKFold
# from lightgbm import LGBMClassifier
# from hyperopt import STATUS_OK

# # fmin()에서 입력된 search_space값으로 입력된 모든 값은 실수형임. 
# # LGBMClassifier의 정수형 하이퍼 파라미터는 정수형 변환을 해줘야 함.
# # f1_weighted 점수는 높을수록 더 좋은 수치임. -1을 곱해서 점수가 클수록 loss가 최소가 되도록 변환
# def objective_func(search_space):
#     # LightGBMClassifier with hyperparameters from search_space
#     lgbm_clf = LGBMClassifier(
#         n_estimators=int(search_space['n_estimators']),    
#         learning_rate=search_space['learning_rate'],
#         num_leaves=int(search_space['num_leaves']),
#         max_depth=int(search_space['max_depth']),
#         metric='logloss'
#     )

#     skf = StratifiedKFold(n_splits=5)
#     accuracy = cross_val_score(lgbm_clf, X_train, y_train, scoring='f1_weighted', cv=skf)
        
#     # cross_val_score는 폴드 수(n_splits=5)만큼의 f1_weighted 점수를 반환하므로 이를 평균해서 반환하되 -1을 곱해줌.
#     return {'loss':-1 * np.mean(accuracy), 'status': STATUS_OK}

In [29]:
# from hyperopt import fmin, tpe, Trials

# trial_val = Trials()
# best = fmin(fn=objective_func,
#             space=lgbm_search_space,
#             algo=tpe.suggest,
#             max_evals=20, # 최대 반복 횟수를 지정합니다.
#             trials=trial_val, rstate=np.random.default_rng(seed=9))
# print('best:', best)

In [30]:
# from lightgbm import LGBMClassifier
# from sklearn.metrics import precision_recall_curve

# # 최적 하이퍼파라미터로 LGBMClassifier 생성
# model = LGBMClassifier(
#     n_estimators=int(best['n_estimators']),
#     max_depth=int(best['max_depth']),
#     num_leaves=int(best['num_leaves']),
#     learning_rate=best['learning_rate'],
#     random_state=42
# )
# model.fit(X_train, y_train)

# preds = model.predict(X_val)

In [31]:
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

# Seed every base learner with the notebook-wide RANDOM_STATE so that a fresh
# Restart & Run All fits the same ensemble; unseeded, the random forest in
# particular produces a different model on each run.
model1 = LGBMClassifier(random_state=RANDOM_STATE)
model2 = XGBClassifier(random_state=RANDOM_STATE)
model3 = RandomForestClassifier(random_state=RANDOM_STATE)

# Soft voting combines the predicted class probabilities of the three models;
# LightGBM and the random forest are weighted 2, XGBoost 1.
ensemble_model = VotingClassifier(
    estimators=[('lgbm', model1), ('xgb', model2), ('rf', model3)],
    voting='soft',
    weights=[2, 1, 2],
)
ensemble_model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 38156, number of negative: 38156
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 76312, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


# 제출하기

In [32]:
# Predict class ids for the test rows and map them back to the label strings:
# 0 -> 'Normal', anything else -> 'AbNormal'.
test_pred = np.where(ensemble_model.predict(X_test) == 0, 'Normal', 'AbNormal')

# Number of test rows predicted as 'AbNormal'.
(test_pred == 'AbNormal').sum()

1500

In [33]:
# Read the submission file, fill its "target" column with the predicted labels
# and write it back to the same path.
# NOTE(review): predictions are assigned by position, which presumably relies on
# the submission rows being in the same order as test.csv - confirm.
submission_path = "submission.csv"
df_sub = pd.read_csv(submission_path)
df_sub["target"] = test_pred

# Save the submission file without the index column.
df_sub.to_csv(submission_path, index=False)