In [1]:
from typing import Union
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import SGDOneClassSVM
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler

In [2]:
RANDOM_SEED = 42
DATA_PATH = Path("../data")

np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)


In [3]:
train_data = pd.read_csv(DATA_PATH / "train.csv")
test_data = pd.read_csv(DATA_PATH / "test.csv")

In [8]:
print(train_data['xmeas_39'].min(),train_data['xmeas_39'].max()) 0.06~0.14
print(train_data['xmeas_40'].min(),train_data['xmeas_40'].max()) 51~56
print(train_data['xmeas_41'].min(),train_data['xmeas_41'].max()) 41~56
print(train_data['xmeas_14'].min(),train_data['xmeas_14'].max()) 20~30
print(train_data['xmeas_5'].min(),train_data['xmeas_5'].max()) 25~28
print(train_data['xmeas_6'].min(),train_data['xmeas_6'].max()) 41~44




0.060641 0.13718
51.564 55.773
41.768 45.979
20.752 29.855
0.060641 0.13718
25.951 27.818
41.394 43.257


In [9]:
train_data.columns

Index(['faultNumber', 'simulationRun', 'sample', 'xmeas_1', 'xmeas_2',
       'xmeas_3', 'xmeas_4', 'xmeas_5', 'xmeas_6', 'xmeas_7', 'xmeas_8',
       'xmeas_9', 'xmeas_10', 'xmeas_11', 'xmeas_12', 'xmeas_13', 'xmeas_14',
       'xmeas_15', 'xmeas_16', 'xmeas_17', 'xmeas_18', 'xmeas_19', 'xmeas_20',
       'xmeas_21', 'xmeas_22', 'xmeas_23', 'xmeas_24', 'xmeas_25', 'xmeas_26',
       'xmeas_27', 'xmeas_28', 'xmeas_29', 'xmeas_30', 'xmeas_31', 'xmeas_32',
       'xmeas_33', 'xmeas_34', 'xmeas_35', 'xmeas_36', 'xmeas_37', 'xmeas_38',
       'xmeas_39', 'xmeas_40', 'xmeas_41', 'xmv_1', 'xmv_2', 'xmv_3', 'xmv_4',
       'xmv_5', 'xmv_6', 'xmv_7', 'xmv_8', 'xmv_9', 'xmv_10', 'xmv_11'],
      dtype='object')

In [4]:
def process_data(df) -> pd.DataFrame:
    numeric_cols = [
       'xmeas_1', 'xmeas_2',
       'xmeas_3', 'xmeas_4', 'xmeas_5', 'xmeas_6', 'xmeas_7', 'xmeas_8',
       'xmeas_9', 'xmeas_10', 'xmeas_11', 'xmeas_12', 'xmeas_13', 'xmeas_14',
       'xmeas_15', 'xmeas_16', 'xmeas_17', 'xmeas_18', 'xmeas_19', 'xmeas_20',
       'xmeas_21', 'xmeas_22', 'xmeas_23', 'xmeas_24', 'xmeas_25', 'xmeas_26',
       'xmeas_27', 'xmeas_28', 'xmeas_29', 'xmeas_30', 'xmeas_31', 'xmeas_32',
       'xmeas_33', 'xmeas_34', 'xmeas_35', 'xmeas_36', 'xmeas_37', 'xmeas_38',
       'xmeas_39', 'xmeas_40', 'xmeas_41', 'xmv_1', 'xmv_2', 'xmv_3', 'xmv_4',
       'xmv_5', 'xmv_6', 'xmv_7', 'xmv_8', 'xmv_9', 'xmv_10', 'xmv_11'
    ]
    return df[numeric_cols]

In [9]:
train_df = process_data(train_data)
test_df = process_data(test_data)


In [6]:
class HybridAnomalyDetector:
    def __init__(self):
        self.iso_forest = IsolationForest(contamination=0.05, random_state=42)
        self.sgd_svm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.05)
        self.scaler = StandardScaler()
        self.feature_stats = {}  # 범위 기준 저장

    def fit(self, data: pd.DataFrame):
        # 1. 상관계수 계산
        corr_matrix = data.corr()
        low_corr_features = [col for col in data.columns if corr_matrix[col].abs().max() < 0.3]

        # 2. 범위 기반 이상 탐지 기준 저장 (평균 ± 3표준편차)
        for feature in low_corr_features:
            mean = data[feature].mean()
            std = data[feature].std()
            self.feature_stats[feature] = (mean, std)

        # 3. 범위 기반 피처 제거 후 나머지로 모델 학습
        data_for_model = data.drop(columns=low_corr_features)
        scaled_data = self.scaler.fit_transform(data_for_model)

        # IsolationForest와 SGDOneClassSVM 학습
        self.iso_forest.fit(scaled_data)
        self.sgd_svm.fit(scaled_data)

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        # 1. 범위 기반 예측 수행
        range_outliers = pd.DataFrame(index=data.index)
        for feature, (mean, std) in self.feature_stats.items():
            lower_bound = mean - 3 * std
            upper_bound = mean + 3 * std
            range_outliers[feature + '_Outlier'] = (~data[feature].between(lower_bound, upper_bound)).astype(int)

        # 2. 나머지 피처에 대해 IsolationForest와 SGDOneClassSVM 적용
        data_for_model = data.drop(columns=self.feature_stats.keys(), errors='ignore')
        scaled_data = self.scaler.transform(data_for_model)

        iso_labels = (self.iso_forest.predict(scaled_data) == -1).astype(int)  # 1: 이상, 0: 정상
        svm_labels = (self.sgd_svm.predict(scaled_data) == -1).astype(int)  # 1: 이상, 0: 정상

        # 3. 결과 통합 (다수결 방식)
        results = pd.DataFrame({
            'IsolationForest': iso_labels,
            'SGDOneClassSVM': svm_labels
        }, index=data.index)

        results = pd.concat([results, range_outliers], axis=1)
        results['Final_Label'] = (results.mean(axis=1) >= 0.5).astype(int)  # 다수결

        return results[['Final_Label']]

In [8]:
if __name__ == "__main__":
    # 학습 데이터 준비
    np.random.seed(42)
    train_data = train_df

    # 모델 생성 및 학습
    model = HybridAnomalyDetector()
    model.fit(train_data)

    # 새로운 데이터에 대한 예측
    new_data = test_df
    predictions = model.predict(new_data)


NameError: name 'test_df' is not defined

In [10]:
predictions = model.predict(test_df)

In [12]:
predictions.to_csv('multi.csv')