# 데이터 로드

In [44]:
import os
from pprint import pprint
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %mathplotlib inline
import pandas as pd
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

## train 데이터

In [45]:
ROOT_DIR = "data"
RANDOM_STATE = 42

train_df = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

df_train = train_df
# display(df_train)

X_train = train_df.iloc[:,:-1]
# display(X_train)

y_train = train_df.iloc[:,-1]
# display(y_train)

## test 데이터

In [46]:
df = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
X_test = df.drop(columns = ['Set ID'])
# display(X_test)

# 데이터 전처리
## 오입력 값 결측치로 바꾸기

In [47]:
X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK',np.nan)
X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK',np.nan)

X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] = X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK',np.nan)
X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] = X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK',np.nan)

X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = X_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK',np.nan)
X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = X_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK',np.nan)

df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK',np.nan)
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK',np.nan)
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK',np.nan)

## 결측치 처리
### 결측치 비율이 50%가 넘는 컬럼 제거

In [48]:
def highly_null(df, threshold=0.3):
    df_copy = df.copy()
    missing_ratio = df_copy.isnull().mean()
    
    null_columns = df_copy.columns[missing_ratio > threshold]
    df_copy.drop(columns = null_columns, inplace = True)    
    return df_copy

X_train = highly_null(X_train)
# display(X_train)

X_test = highly_null(X_test)
# display(X_test)

df_train = highly_null(df_train)
# display(df_train)

## train, validation 데이터 나누기

In [49]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=RANDOM_STATE)

df_train = pd.concat([X_train, y_train], axis=1)
# display(df_train)

## 이상치 제거(train 데이터에만)

In [17]:
# def isolation_forest(df):

#     # 숫자형 데이터만 선택
#     df_numeric = df.select_dtypes(include=['number'])

#     # Isolation Forest 모델 초기화
#     iso_forest = IsolationForest(contamination=0.05, random_state=42)
#     iso_forest.fit(df_numeric)
#     y_pred = iso_forest.predict(df_numeric)

#     # 원래 데이터프레임에 예측 결과 추가
#     df['Prediction'] = y_pred

#     # 이상치와 정상 데이터로 구분
#     df_normal = df[df['Prediction'] == 1]
#     df_outliers = df[df['Prediction'] == -1]
    
#     return df_normal

# df_train_outlier = isolation_forest(df_train)
# # display(df_train_outlier)

In [50]:
df_train_outlier = df_train

#df_train_outlier.drop(columns = ['Prediction'], inplace = True)
# display(df_train_outlier)

X_train_outlier = df_train_outlier.iloc[:,:-1]
# display(X_train_outlier)

y_train_outlier = df_train_outlier.iloc[:,-1]
# display(y_train_outlier)

## 데이터 스케일링(Robust Scaler)

In [51]:
X_numerical_train_outlier = X_train_outlier[X_train_outlier.select_dtypes(include=['number']).columns]
X_categorical_train_outlier = X_train_outlier[X_train_outlier.select_dtypes(include=['object', 'category']).columns]
# display(X_numerical_train_outlier)

X_numerical_val_outlier = X_val[X_val.select_dtypes(include=['number']).columns]
X_categorical_val_outlier = X_val[X_val.select_dtypes(include=['object', 'category']).columns]

X_numerical_test = X_test[X_test.select_dtypes(include=['number']).columns]
X_categorical_test = X_test[X_test.select_dtypes(include=['object', 'category']).columns]
# display(X_numerical_test)

scaler_outlier = RobustScaler()
scaler_outlier.fit(X_numerical_train_outlier)

X_numerical_train_outlier_scaled = scaler_outlier.transform(X_numerical_train_outlier)
X_numerical_val_outlier_scaled = scaler_outlier.transform(X_numerical_val_outlier)
X_numerical_test_outlier_scaled = scaler_outlier.transform(X_numerical_test)

X_numerical_train_outlier_scaled_df = pd.DataFrame(X_numerical_train_outlier_scaled,
                                          columns = X_numerical_train_outlier.columns)
X_numerical_val_outlier_scaled_df = pd.DataFrame(X_numerical_val_outlier_scaled,
                                        columns = X_numerical_val_outlier.columns)
X_numerical_test_outlier_scaled_df = pd.DataFrame(X_numerical_test_outlier_scaled,
                                        columns = X_numerical_test.columns)

X_categorical_train_outlier_df = pd.DataFrame(X_categorical_train_outlier, 
                                   columns=X_categorical_train_outlier.columns)
X_categorical_val_outlier_df = pd.DataFrame(X_categorical_val_outlier, 
                                 columns=X_categorical_val_outlier.columns)
X_categorical_test_outlier_df = pd.DataFrame(X_categorical_test, 
                                 columns=X_categorical_test.columns)

X_train_outlier_scaled = pd.concat([X_numerical_train_outlier_scaled_df, 
                            X_categorical_train_outlier_df.reset_index(drop=True)], axis=1)
X_val_outlier_scaled = pd.concat([X_numerical_val_outlier_scaled_df, 
                          X_categorical_val_outlier_df.reset_index(drop=True)], axis=1)
X_test_outlier_scaled = pd.concat([X_numerical_test_outlier_scaled_df, 
                          X_categorical_test_outlier_df.reset_index(drop=True)], axis=1)


## 문자형 데이터를 숫자형으로 변경(Ordinal Encoder)

In [52]:
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# 범주형 컬럼만 선택
train_outlier_categorical_cols = X_train_outlier_scaled.select_dtypes(include=['object']).columns
val_outlier_categorical_cols = X_val_outlier_scaled.select_dtypes(include=['object']).columns
test_outlier_categorical_cols = X_test_outlier_scaled.select_dtypes(include=['object']).columns

# 범주형 데이터에만 LabelEncoder 적용
X_train_outlier_scaled[train_outlier_categorical_cols] = encoder.fit_transform(X_train_outlier_scaled[train_outlier_categorical_cols])
X_val_outlier_scaled[val_outlier_categorical_cols] = encoder.transform(X_val_outlier_scaled[val_outlier_categorical_cols])
X_test_outlier_scaled[test_outlier_categorical_cols] = encoder.transform(X_test_outlier_scaled[test_outlier_categorical_cols])

# SMOTE 샘플링

In [53]:
y_train_outlier.replace({'Normal':0, 'AbNormal':1}, inplace=True)
y_val.replace({'Normal':0, 'AbNormal':1}, inplace=True)

In [54]:
smote = SMOTE(random_state=0, sampling_strategy='not minority')
# smote = SMOTE(random_state=0)
X_resampled_outlier, y_resampled_outlier = smote.fit_resample(X_train_outlier_scaled, y_train_outlier)

In [192]:
y_train.value_counts()

target
Normal      34332
AbNormal     2123
Name: count, dtype: int64

In [193]:
y_resampled_outlier.value_counts()

target
0    32611
1     2021
Name: count, dtype: int64

## PCA

In [60]:
# 예시로 3개의 클러스터로 데이터 클러스터링
kmeans = KMeans(n_clusters=5, random_state=RANDOM_STATE)
X_clustered = kmeans.fit_predict(X_resampled_outlier)

# 클러스터 레이블을 원래 데이터에 추가
X_resampled_with_cluster = np.c_[X_resampled_outlier, X_clustered]

In [61]:
# 예시로 2개의 주성분으로 차원 축소
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_resampled_with_cluster)

X_val_clustered = np.c_[X_val_outlier_scaled, kmeans.predict(X_val_outlier_scaled)]
X_val_pca = pca.transform(X_val_clustered)

X_test_clustered = np.c_[X_test_outlier_scaled, kmeans.predict(X_test_outlier_scaled)]
X_test_pca = pca.transform(X_test_clustered)

## LDA

In [31]:
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# # y_resampled에서 클래스 수 확인
# n_classes = len(np.unique(y_resampled_outlier))

# # 피처 수 및 LDA의 최대 가능한 차원 수 계산
# n_features = X_resampled_with_cluster.shape[1]
# max_components = min(n_features, n_classes - 1)

# # LDA로 차원 축소 (n_components는 max_components 이하로 설정)
# lda = LDA(n_components=max_components)
# X_pca = lda.fit_transform(X_resampled_with_cluster, y_resampled_outlier)

# X_val_clustered = np.c_[X_val_outlier_scaled, kmeans.predict(X_val_outlier_scaled)]
# X_val_pca = lda.transform(X_val_clustered)

# X_test_clustered = np.c_[X_test_outlier_scaled, kmeans.predict(X_test_outlier_scaled)]
# X_test_pca = lda.transform(X_test_clustered)

# 모델 학습
## 평가지표

In [57]:
def get_clf_eval(y_test, pred=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}\n정밀도: {1:.4f}\n재현율: {2:.4f}\n  F1: {3:.4f}'.format(accuracy, precision, recall, f1))

## 랜덤포레스트

In [62]:
# n_estimators : 트리의 개수 (기본값 100)
# criterion : 불순도 계산 방법 (기본값 gini) (gini, entropy, log_loss)
# max_depth : 트리의 최대 깊이 (기본값 None) none이면 모든 리프가 순수해질 때까지 모든 리프에 min_samples_split개수보다 적은 샘플이 남을 때까지 확장
# min_samples_split : 노드를 분할하기 위한 최소한의 샘플 데이터 수 (기본값 2)
# min_samples_leaf : 리프 노드가 되기 위해 필요한 최소한의 샘플 데이터 수 (기본값 1)
RF = RandomForestClassifier(
    random_state=42, 
    n_jobs=-1,
    n_estimators=200,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1
)
RF.fit(X_pca, y_resampled_outlier)

# preds = RF.predict(X_val_pca)

probs = RF.predict_proba(X_val_pca)[:,1]
preds = (probs > 0.3).astype(int)

get_clf_eval(y_val, preds)

오차 행렬
[[3663  161]
 [ 181   46]]
정확도: 0.9156
정밀도: 0.2222
재현율: 0.2026
  F1: 0.2120


# 제출하기

In [32]:
# test_pred = RF.predict(X_test_pca)

test_prob = RF.predict_proba(X_test_pca)[:,1]
test_pred = (test_prob > 0.3).astype(int)

test_pred = np.where(test_pred==0, 'Normal', 'AbNormal')
np.sum(test_pred == 'AbNormal')

985

In [141]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
len(df_sub)
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

## Grid Search

In [None]:
params = {  
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

model = RandomForestClassifier(random_state=RANDOM_STATE)
grid_search = GridSearchCV(estimator=model, param_grid=params, 
                           scoring='f1_weighted', cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_pca, y_resampled_outlier)

print('최적 하이퍼 파라미터:\n', grid_search.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_search.best_score_))

best_model = grid_search.best_estimator_
display(best_model)