# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [15]:
import os
from pprint import pprint
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 데이터 읽어오기

In [16]:
# Shared settings: directory holding the CSV files and the seed reused for
# sampling, splitting and the model.
ROOT_DIR = "data"
RANDOM_STATE = 110

# Read the training set (comma is already the default separator of read_csv).
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

### 언더 샘플링

데이터 불균형을 해결하기 위해 언더 샘플링을 진행합니다.

In [22]:
# Under-sample the majority "Normal" class so that the balanced frame holds
# normal_ratio Normal rows per AbNormal row.
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# Sampling without replacement cannot return more rows than exist, so cap the
# request at the number of Normal rows instead of letting sample() raise when
# normal_ratio is increased.
num_normal_sampled = min(num_normal, int(num_abnormal * normal_ratio))
df_normal = df_normal.sample(n=num_normal_sampled, replace=False, random_state=RANDOM_STATE)

# Stack both classes and renumber the rows from 0.
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
Name: count, dtype: int64

### 데이터 분할

In [23]:
# Hold out 30% of the balanced frame for validation. Stratifying on the
# "target" column keeps the class proportions equal in both splits, and the
# fixed random_state makes the split reproducible.
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)

def print_stats(df: pd.DataFrame):
    """Print the Normal / AbNormal row counts of ``df`` and their ratio.

    Args:
        df: Frame with a ``target`` column containing the labels
            ``"Normal"`` and ``"AbNormal"``.

    The ratio is AbNormal divided by Normal. When ``df`` contains no Normal
    rows the ratio is printed as ``n/a`` instead of raising
    ``ZeroDivisionError``.
    """
    labels = df["target"]
    num_normal = int((labels == "Normal").sum())
    num_abnormal = int((labels == "AbNormal").sum())

    # Guard the division: a frame without Normal rows has no defined ratio.
    ratio = num_abnormal / num_normal if num_normal else "n/a"

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {ratio}")

# Report the class balance of the training split, then the validation split.
# NOTE(review): the header names the columns "Abnormal, Normal", while each
# print_stats line lists Normal first - confirm the intended header.
print("  \tAbnormal\tNormal")
for split_df in (df_train, df_val):
    print_stats(split_df)

  	Abnormal	Normal
  Total: Normal: 1645, AbNormal: 1645 ratio: 1.0
  Total: Normal: 705, AbNormal: 705 ratio: 1.0


Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3G1XB278-1,1,OK,240.0,,,...,127,,,1,,,0,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XB274-1,1,OK,1000.0,,,...,157,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3G1X8298-1,1,OK,240.0,,,...,242,,,1,,,0,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1X9671-2,1,OK,1000.0,,,...,16,,,528,,,1,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XC129-1,1,OK,240.0,,,...,329,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4695,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4B1XE009-1,1,OK,240.0,,,...,1,,,9,,,1,,,AbNormal
4696,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3G1XC632-1,1,OK,240.0,,,...,60,,,1,,,0,,,AbNormal
4697,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334502,3K1X7331-1,1,OK,1000.0,,,...,164,,,1,,,0,,,AbNormal
4698,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3G1XA615-2,1,OK,240.0,,,...,252,,,1,,,0,,,AbNormal


## 3. 모델 학습


### 모델 정의


In [12]:
# Random forest with default hyperparameters; seeded with RANDOM_STATE so
# repeated runs build the same trees.
model = RandomForestClassifier(random_state=RANDOM_STATE)

### 모델 학습


In [13]:
# Use every column that can be cast to int as a model feature.
features = []

# Work on an explicit copy so the column assignments below cannot trigger
# pandas chained-assignment warnings if df_train is a slice of another frame.
df_train = df_train.copy()

for col in df_train.columns:
    try:
        # NOTE(review): astype(int) truncates fractional values - confirm that
        # losing the decimals of float measurements is intended.
        df_train[col] = df_train[col].astype(int)
    except (ValueError, TypeError):
        # The column cannot be represented as integers (for example text
        # labels, or values pandas refuses to cast such as missing data), so
        # it is not used as a feature.
        continue
    features.append(col)

train_x = df_train[features]
train_y = df_train["target"]

model.fit(train_x, train_y)

# Notebook display of the selected feature names.
features

['Insp. Seq No._Dam',
 'CURE END POSITION X Collect Result_Dam',
 'CURE END POSITION Z Collect Result_Dam',
 'CURE END POSITION Θ Collect Result_Dam',
 'CURE SPEED Collect Result_Dam',
 'CURE STANDBY POSITION X Collect Result_Dam',
 'CURE STANDBY POSITION Z Collect Result_Dam',
 'CURE STANDBY POSITION Θ Collect Result_Dam',
 'CURE START POSITION X Collect Result_Dam',
 'CURE START POSITION Z Collect Result_Dam',
 'CURE START POSITION Θ Collect Result_Dam',
 'DISCHARGED SPEED OF RESIN Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
 'Dispense Volume(Stage1) Collect Result_Dam',
 'Dispense Volume(Stage2) Collect Result_Dam',
 'Dispense Volume(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'

## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [8]:
# Read the test set from the same data directory as the training set.
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [9]:
# Keep only the columns the model was trained on. Copy the selection so the
# casts below write to an independent frame rather than to a subset of
# test_data.
df_test_x = test_data[features].copy()

for col in df_test_x.columns:
    try:
        # Replace the whole column so its dtype really becomes int, as in
        # training.
        df_test_x[col] = df_test_x[col].astype(int)
    except (ValueError, TypeError):
        # Best effort: a column that cannot be cast is left unchanged.
        # NOTE(review): such a column was castable in the training data, so a
        # failure here means train and test differ - consider reporting it.
        continue

# Notebook display of the prepared test features.
df_test_x

Unnamed: 0,Insp. Seq No._Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE STANDBY POSITION X Collect Result_Dam,CURE STANDBY POSITION Z Collect Result_Dam,CURE STANDBY POSITION Θ Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Z Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,1,1000.0,12.0,90,70,1150,33.0,0,280,33.0,...,50,91.0,270.0,50,85,19.0,13.0,195,1,0
1,1,1000.0,12.0,90,70,1150,33.0,0,280,33.0,...,119,50.0,91.0,270,50,85.0,19.0,14,256,1
2,1,240.0,2.0,-90,70,1150,33.0,0,1030,33.0,...,50,91.0,270.0,50,85,19.0,1.0,98,1,0
3,1,1000.0,12.0,90,70,1150,33.0,0,280,33.0,...,119,50.0,91.0,270,50,85.0,20.0,14,0,1
4,1,240.0,2.0,-90,70,1150,33.0,0,1030,33.0,...,119,50.0,91.0,270,50,85.0,19.0,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1,1000.0,12.0,90,70,1150,33.0,0,280,33.0,...,119,50.0,91.0,270,50,85.0,19.0,14,131,1
17357,1,1000.0,12.0,90,70,1150,33.0,0,280,33.0,...,119,50.0,91.0,270,50,85.0,19.0,12,279,1
17358,1,240.0,2.0,-90,70,1150,33.0,0,1030,33.0,...,119,50.0,91.0,270,50,85.0,20.0,4,66,1
17359,1,240.0,2.0,-90,70,1150,33.0,0,1030,33.0,...,50,91.0,270.0,50,85,18.0,1.0,117,1,0


In [10]:
# Predict a label for every test row with the fitted model; the bare name on
# the next line displays the prediction array in the notebook.
test_pred = model.predict(df_test_x)
test_pred

array(['AbNormal', 'Normal', 'AbNormal', ..., 'AbNormal', 'Normal',
       'AbNormal'], dtype=object)

### 제출 파일 작성


In [11]:
# Load the submission file (original note: df_test holds the preprocessed
# data) and fill its "target" column with the predictions.
submission_path = "submission.csv"
df_sub = pd.read_csv(submission_path)
df_sub["target"] = test_pred

# Save the submission file, overwriting the file that was just read.
df_sub.to_csv(submission_path, index=False)

In [None]:
# pip install pycaret

In [None]:
# from pycaret.classification import *
# import pandas as pd

# data = get_data(train_data)

데이터 로드 및 확인   

결측치 처리 (Null 값 처리)   

중복된 데이터 및 단일 값으로 구성된 피처 제거   

문자형 데이터를 숫자형 데이터로 변환   

이상치 탐지 및 처리 (Optional)   

데이터 스케일링 및 정규화 (Optional)   

SMOTE(Synthetic Minority Over-sampling Technique) 샘플링

피처 중요도 판단 및 피처 선택

Isolation Forest / One-Class SVM / Local Outlier Factor / IQR

권순재 - lightgbm
고정현 - randomforest
박선유 - xgb
최지영 - logistic regression

In [None]:
# Remove columns that consist only of nulls

def remove_null_columns(df):
    """Return a copy of ``df`` without the columns whose values are all null.

    The input frame is left untouched.
    """
    is_all_null = df.isnull().all()
    # Drop from a copy so the result never shares data with the caller's frame.
    return df.copy().drop(columns=df.columns[is_all_null])

# Replace df with the version that has its all-null columns removed.
# NOTE(review): no visible cell assigns `df` before this line - presumably it
# should start from train_data; confirm.
df = remove_null_columns(df)

In [None]:
# Show the columns that contain missing values with their missing percentage,
# largest first.

missing_ratio = (
    (df.isnull().mean() * 100)
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)
print(missing_ratio)

In [None]:
# Remove columns whose missing-value ratio exceeds 50%

def highly_null(df, threshold=0.5):
    """Return a copy of ``df`` without its mostly-null columns.

    Args:
        df: Frame to filter; it is not modified.
        threshold: Null fraction (0-1). Columns whose fraction is strictly
            greater than this value are removed.
    """
    null_fraction = df.isnull().mean()
    too_sparse = df.columns[null_fraction > threshold]
    # Drop from a copy so the result never shares data with the caller's frame.
    return df.copy().drop(columns=too_sparse)

# Drop the columns that are more than 50% null (highly_null's default threshold).
df = highly_null(df)

In [None]:
# Remove rows that have a null in any string (object-dtype) column

# The column list must exist before dropna can use it.
string_columns = df.select_dtypes(include=['object']).columns
df = df.dropna(subset=string_columns)

# Number of nulls left in the string columns after the drop.
print(df[string_columns].isnull().sum().sum())

In [None]:
# Replace the nulls of the remaining (less than 50% missing) columns with the
# column mean

def lower_null(df):
    """Return a copy of ``df`` with nulls in numeric columns mean-imputed.

    Only columns whose dtype is ``float64`` or ``int64`` are touched; each
    null is replaced by the mean of that column. The input frame is not
    modified.

    The filled column is assigned back explicitly. Calling
    ``fillna(..., inplace=True)`` on ``df_copy[col]`` is deprecated by pandas
    and leaves the frame unchanged when Copy-on-Write is enabled.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        column = df_copy[col]
        if column.dtype in ('float64', 'int64'):
            df_copy[col] = column.fillna(column.mean())
    return df_copy

# Fill the remaining nulls of the float64/int64 columns with the column mean.
df = lower_null(df)

In [None]:
# Convert the target labels to numbers: Normal -> 0, AbNormal -> 1.
# replace() leaves any other value as it is, so re-running this cell on an
# already converted column is harmless.

df['target'] = df['target'].replace({'Normal':0, 'AbNormal':1})
display(df['target'])

In [None]:
# Remove columns that consist of a single value

def same_data_columns(df):
    """Return a copy of ``df`` without its constant columns.

    A column is dropped when ``nunique()`` reports exactly one distinct value.
    Nulls are not counted, so an all-null column is kept. The input frame is
    not modified.
    """
    constant_columns = [
        col for col, n_unique in df.nunique().items() if n_unique == 1
    ]
    # Drop from a copy so the result never shares data with the caller's frame.
    return df.copy().drop(columns=constant_columns)

# Drop the columns that hold only one distinct value.
df = same_data_columns(df)

In [None]:
# Separate string features from numeric features (preprocessing seems harder
# while they are mixed together)

# Column names per kind: object/category first, then numeric.
categorical_features, numerical_features = (
    df.select_dtypes(include=kinds).columns
    for kinds in (['object', 'category'], ['number'])
)
df_categorical, df_numerical = df[categorical_features], df[numerical_features]
display(df_categorical)

In [None]:
# Convert string data to numeric data (one-hot encoding)

from sklearn.preprocessing import OneHotEncoder

def onehotencoder(df, categorical_features):
    """One-hot encode the given columns of ``df``.

    Args:
        df: Frame containing the columns to encode; it is not modified.
        categorical_features: Labels of the columns to encode.

    Returns:
        A DataFrame of the encoded columns, indexed like ``df``. The first
        category of every feature is dropped (``drop='first'``).

    The function also prints the number of encoded columns and displays the
    encoded frame.
    """
    df_copy = df.copy()
    try:
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``;
        # the old ``sparse`` keyword was removed in 1.4.
        oh_encoder = OneHotEncoder(sparse_output = False, drop = 'first')
    except TypeError:
        # Older scikit-learn releases only accept the original keyword.
        oh_encoder = OneHotEncoder(sparse = False, drop = 'first')
    encoded_features = oh_encoder.fit_transform(df_copy[categorical_features])
    encoded_feature_names = oh_encoder.get_feature_names_out(input_features = categorical_features)
    print("인코딩된 열수 : ", encoded_features.shape[1])
    print("인코딩된 열 이름 수 : ", len(encoded_feature_names))
    # Reuse the input index so the encoded rows line up with the source rows.
    encoded_df = pd.DataFrame(encoded_features, columns = encoded_feature_names, index = df_copy.index)
    display(encoded_df)
    return encoded_df

# One-hot encode the categorical frame; every column listed in
# categorical_features is encoded.
df_categorical_encoded = onehotencoder(df_categorical, categorical_features)

In [None]:
# Outlier handling for the numeric data

from sklearn.ensemble import IsolationForest

# Configure the Isolation Forest and fit it on the numeric features.
iso_forest = IsolationForest(contamination=0.05, random_state=42).fit(df_numerical)

# Predict a label per row (1: inlier, -1: outlier)
pred = iso_forest.predict(df_numerical)

# Split the rows by predicted label.
outliers_isoforest = df_numerical[pred == -1]
# NOTE: despite its name, this frame holds the rows predicted as inliers.
df_numerical_outlier = df_numerical[pred == 1]

print("Isolation Forest를 이용한 이상치 : ")
print(outliers_isoforest)
display(df_numerical_outlier)

In [None]:
# Scale first, before SMOTE

from sklearn.preprocessing import RobustScaler

# Learn the scaling on the training split only, then reuse it for validation.
scaler = RobustScaler()
X_numerical_train_scaled = scaler.fit_transform(X_numerical_train)
X_numerical_val_scaled = scaler.transform(X_numerical_val)

# --- training split -------------------------------------------------------
X_train_numerical_scaled_df = pd.DataFrame(
    X_numerical_train_scaled, columns=X_numerical.columns
)
X_train_categorical = pd.DataFrame(X_categorical_train, columns=X_categorical.columns)
# The scaled frame has a fresh 0..n-1 index, so the categorical part is
# re-indexed the same way before the column-wise concat.
X_train_scaled = pd.concat(
    [X_train_numerical_scaled_df, X_train_categorical.reset_index(drop=True)],
    axis=1,
)

# --- validation split -----------------------------------------------------
X_val_numerical_scaled_df = pd.DataFrame(
    X_numerical_val_scaled, columns=X_numerical.columns
)
X_val_categorical = pd.DataFrame(X_categorical_val, columns=X_categorical.columns)
X_val_scaled = pd.concat(
    [X_val_numerical_scaled_df, X_val_categorical.reset_index(drop=True)],
    axis=1,
)

# NOTE(review): dropna() can remove rows here while the matching labels are
# not filtered in this cell - confirm X and y stay aligned afterwards.
X_train = X_train_scaled.dropna()
X_val = X_val_scaled.dropna()

In [15]:
# SMOTE sampling: resample the training features and labels with
# imbalanced-learn's SMOTE.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'X_train' is not defined" - the cell that assigns X_train
# (and y_train) must be run first.

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 0)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

NameError: name 'X_train' is not defined