In [1]:
import os
import sys
from pathlib import Path

SRC_PATH = Path(__name__).resolve().parents[1]

sys.path.append(str(SRC_PATH))

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score


In [2]:
normal = pd.read_csv("../data/normal.csv")
abnormal = pd.read_csv("../data/abnormal.csv")

In [3]:
normal.describe()

Unnamed: 0,frequency,voltage,current,phase_angle,label
count,12000.0,12000.0,12000.0,12000.0,12000.0
mean,60.0,1.000006,0.949989,-2.4e-05,0.0
std,0.001001,0.001503,0.001492,0.001503,0.0
min,59.995498,0.993688,0.943928,-0.006606,0.0
25%,59.999325,0.998998,0.948986,-0.001036,0.0
50%,59.999992,1.000003,0.949992,-3.1e-05,0.0
75%,60.000665,1.001026,0.951004,0.001,0.0
max,60.003868,1.005905,0.955494,0.005474,0.0


In [4]:
abnormal.describe()

Unnamed: 0,frequency,voltage,current,phase_angle,label
count,11650.0,11645.0,11645.0,11644.0,12000.0
mean,59.99988,0.999747,0.949987,0.000259,0.005
std,0.003142,0.008906,0.001509,0.008887,0.070537
min,59.904699,0.699213,0.944323,-0.006929,0.0
25%,59.999325,0.998985,0.948968,-0.001008,0.0
50%,59.99999,0.999988,0.949986,-5e-06,0.0
75%,60.000661,1.001017,0.951004,0.001026,0.0
max,60.004315,1.005295,0.956996,0.301163,1.0


In [None]:

from scipy.spatial.distance import mahalanobis

class BayesianCovariance(object):
    def __init__(self, num_features:int):
        
        self.num_features = num_features
        self.means = np.zeros(num_features)  # 평균 벡터 초기화
        self.covariance = np.eye(num_features)  # 단위 공분산 행렬로 초기화
        self.n_total = 0  # 전체 데이터 개수
        
    def fit(self, x: np.ndarray):
        """
        초기 정상 데이터를 사용하여 평균 벡터와 공분산 행렬 계산
        - normal_data: 정상 데이터 배열 (shape: [num_samples, num_features])
        """
        self.means = np.mean(x, axis=0)
        self.covariance = np.cov(x.T)  # 공분산 행렬 계산
        self.n_total = x.shape[0]  # 정상 데이터 샘플 개수 저장
    
    def update_covariance(self, x):
        """
        윈도우 데이터에 기반하여 다변수 평균, 분산 및 임계값 업데이트
        - window_data: 윈도우 내 데이터 배열 (shape: [window_size, num_features])
        """
        n_w = len(x)  # 윈도우 크기
        mean_w = np.mean(x, axis=0)  # 윈도우 내 각 특성에 대한 평균
        variance_w = np.var(x, axis=0)  # 윈도우 내 각 특성에 대한 분산

        if self.n_total > 0:
            # 각 특성에 대한 평균 및 분산 업데이트
            new_means = (self.n_total * self.means + n_w * mean_w) / (self.n_total + n_w)
            new_variances = (
                (self.n_total - 1) * self.variances +
                (n_w - 1) * variance_w +
                (self.n_total * n_w) / (self.n_total + n_w) * (self.means - mean_w) ** 2
            ) / (self.n_total + n_w)

            # 업데이트
            self.means = new_means
            self.variances = new_variances
        else:
            # 첫 윈도우일 경우, 윈도우의 평균과 분산으로 초기화
            self.means = mean_w
            self.variances = variance_w

        # 데이터 개수 업데이트
        self.n_total += n_w

        # 각 특성에 대해 임계값 동적 업데이트 (평균 + 3 표준편차)
        self.thresholds = self.means + 3 * np.sqrt(self.variances)
    
    def detect(self, x: np.ndarray, threhold=3):
        """
        마할라노비스 거리를 이용한 이상 탐지 수행
        """
        anomalies = []
        inv_covariance = np.linalg.inv(self.covariance)  # 공분산 행렬의 역행렬
        for i, data_point in enumerate(x):
            dist = mahalanobis(data_point, self.means, inv_covariance)  # 마할라노비스 거리 계산
            if dist > threhold:  # 임계값 (마할라노비스 거리가 3 이상일 경우 이상으로 간주)
                anomalies.append(i)
        return anomalies    
    

In [38]:
import numpy as np
from scipy.spatial.distance import mahalanobis

class BayesianWindowCovarianceUpdater:
    def __init__(self, num_features, threshold=3.0):
        """
        윈도우 단위로 데이터를 처리하고, 변동이 심한 윈도우는 업데이트를 하지 않는 클래스
        - num_features: 데이터의 특성 개수
        - threshold: 마할라노비스 거리 또는 Z-Score에 기반한 임계값
        """
        self.num_features = num_features
        self.n_total = 0  # 전체 데이터 개수
        self.mean = np.zeros(num_features)  # 초기 평균
        self.covariance = np.eye(num_features)  # 초기 공분산 행렬 (단위 행렬로 초기화)
        self.threshold = threshold  # 변동성에 대한 임계값
        self.anomaly_flags = []

    def update_window(self, window_data):
        """
        윈도우 데이터로 평균과 공분산 행렬을 업데이트, 변동이 크면 업데이트하지 않음
        - window_data: 윈도우 내 데이터 배열 (shape: [window_size, num_features])
        """
        window_data = np.asarray(window_data)  # 윈도우 데이터를 numpy 배열로 변환
        window_mean = np.mean(window_data, axis=0)  # 윈도우 내 평균 계산
        window_cov = np.cov(window_data.T)  # 윈도우 내 공분산 행렬 계산

        if self.n_total > 0:
            # 윈도우 데이터의 중심화된 벡터
            inv_covariance = np.linalg.inv(self.covariance)
            
            for data_point in window_data:
                # 마할라노비스 거리 계산 (변동성 평가)
                dist = mahalanobis(data_point, self.mean, inv_covariance)
                
                # if dist > self.threshold:
                #     print(f"Skipping update due to high variance (Mahalanobis distance: {dist})")
                #     return  # 변동성이 너무 큰 경우 업데이트 중단
                            # 변동성이 임계값을 초과하는 경우 이상(True), 그렇지 않으면 정상(False)
                            
                if dist > self.threshold:
                    self.anomaly_flags.append(True)
                    print(f"Skipping update due to high variance (Mahalanobis distance: {dist})")
                else:
                    self.anomaly_flags.append(False)

                # 변동성이 적절한 경우 업데이트 수행
                if self.n_total > 0:
                    old_mean = self.mean.copy()
                    self.mean = (self.n_total * self.mean + len(window_data) * window_mean) / (self.n_total + len(window_data))
                    centralized_data = window_data - old_mean

                    # 공분산 행렬 업데이트
                    self.covariance = ((self.n_total - 1) / self.n_total) * self.covariance + \
                                    np.dot(centralized_data.T, centralized_data) / (self.n_total + len(window_data))
                else:
                    # 첫 윈도우일 경우 초기화
                    self.mean = window_mean
                    self.covariance = window_cov
                    

        # 전체 데이터 개수 업데이트
        self.n_total += len(window_data)

# 예시: 4개의 특성을 가진 데이터를 윈도우 단위로 처리하는 경우
num_features = 4
updater = BayesianWindowCovarianceUpdater(num_features=num_features, threshold=3.0)

In [39]:
from src.data_handler import handle_missing_values

abnormal = handle_missing_values(abnormal)
abnormal_select = abnormal[["frequency", "voltage", "current", "phase_angle"]]

In [40]:
window_size = 120


for window in range(0, abnormal_select.shape[0], window_size):
    start = window
    end = window+window_size
    
    updater.update_window(abnormal_select[start:end])
    print(f"Updated Mean: {updater.mean}")
    print(f"Updated Covariance:\n{updater.covariance}")


Updated Mean: [0. 0. 0. 0.]
Updated Covariance:
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
Skipping update due to high variance (Mahalanobis distance: 60.01500972919429)
Skipping update due to high variance (Mahalanobis distance: 30.00783538231462)
Skipping update due to high variance (Mahalanobis distance: 15.004237441202031)
Skipping update due to high variance (Mahalanobis distance: 7.502107981324811)
Skipping update due to high variance (Mahalanobis distance: 3.750475953307717)
Updated Mean: [5.99999905e+01 1.00002654e+00 9.50285687e-01 5.67692413e-05]
Updated Covariance:
[[8.89463997e+02 1.48186887e+01 1.40816109e+01 8.38299003e-04]
 [1.48186887e+01 6.13392215e-01 2.34697443e-01 1.13007408e-05]
 [1.40816109e+01 2.34697443e-01 5.89460848e-01 2.21320147e-05]
 [8.38299003e-04 1.13007408e-05 2.21320147e-05 3.66431030e-01]]
Updated Mean: [ 6.00000863e+01  1.00031266e+00  9.49795711e-01 -7.97006203e-05]
Updated Covariance:
[[5.38923986e+02 8.97859912e+00 8.53201831e+00

In [41]:
abnormal["pred"] = 0

In [42]:
abnormal["pred"].iloc[window_size:] = updater.anomaly_flags

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  abnormal["pred"].iloc[window_size:] = updater.anomaly_flags
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abn

In [43]:
abnormal["pred"] = abnormal["pred"].astype(int)

In [44]:
from sklearn.metrics import confusion_matrix

confusion_matrix(abnormal["label"], abnormal["pred"])

array([[11935,     5],
       [   60,     0]])