In [None]:
# 라이브러리
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import plotly.express as px
import numpy as np

In [1]:
train = pd.read_csv("data/train_data.csv")
test = pd.read_csv("data/test_data.csv")
train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0
4,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0


In [5]:
# train 데이터에 스케일러 적용
col = list(train.columns)
x_train = MinMaxScaler().fit_transform(train)
pd.DataFrame(x_train, columns = col).head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,0.213922,0.272396,0.0,0.227873,0.202708,0.275531,0.048241,0.0
1,0.448217,0.827513,0.0,0.49595,0.821413,0.813992,0.089447,0.0
2,0.268251,0.402672,0.0,0.290879,0.347863,0.401857,0.057789,0.0
3,0.34635,0.586092,0.0,0.379388,0.552264,0.579576,0.071357,0.0
4,0.266553,0.400243,0.0,0.289679,0.345324,0.399536,0.057789,0.0


In [7]:
### test 데이터에 스케일러 적용
col = list(test.columns)
x_test = MinMaxScaler().fit_transform(test)
pd.DataFrame(x_test, columns = col).head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,0.367797,0.633592,1.0,0.395891,0.618443,0.599312,0.351981,0.0
1,0.39322,0.69217,1.0,0.423721,0.684856,0.653738,0.37296,0.0
2,0.233898,0.318589,1.0,0.245665,0.261844,0.306225,0.242424,0.0
3,0.315254,0.510161,1.0,0.337025,0.478849,0.484517,0.310023,0.0
4,0.291525,0.454274,1.0,0.31036,0.415398,0.43228,0.289044,0.0


In [8]:
### 주성분 개수 지정
pca = PCA()
pca.fit(x_train)
exp = np.cumsum(pca.explained_variance_ratio_)
px.area(x=range(1, exp.shape[0] + 1), y=exp, labels = {'x' : '# of components', 'y':'explained variance'})

In [9]:
### PCA 적용
pca = PCA(n_components=3, svd_solver='full')
pca_train = pca.fit_transform(x_train)
pca_train = pd.DataFrame(pca_train)
pca_train.head()

Unnamed: 0,0,1,2
0,-0.485624,-0.372119,-0.152992
1,0.559136,-0.453403,-0.047198
2,-0.240654,-0.391253,-0.127804
3,0.104618,-0.418024,-0.093289
4,-0.245319,-0.391001,-0.127766


In [10]:
### test 데이터에 적용
pca_test = pca.transform(x_test) ### fit은 하지 않는다
pca_test = pd.DataFrame(pca_test)
pca_test.head()

Unnamed: 0,0,1,2
0,0.199622,-0.427552,-0.106669
1,0.310141,-0.436217,-0.097221
2,-0.393793,-0.380748,-0.159154
3,-0.032781,-0.409246,-0.127219
4,-0.138351,-0.400896,-0.136502


In [11]:
# 참고 : https://towardsdatascience.com/machine-learning-for-anomaly-detection-and-condition-monitoring-d4614e7de770
def MD(inv_cov_matrix, mean_distr, data, verbose = False):
    inv_covariance_matrix = inv_cov_matrix
    vars_mean = mean_distr
    diff = data - vars_mean
    md = []
    for i in range(len(diff)):
        md.append(np.sqrt(diff[i].dot(inv_covariance_matrix).dot(diff[i])))
    return md

In [12]:
def MD_detectOutliers(dist, extreme=False, verbose=False):
    k = 3. if extreme else 2.
    threshold = np.mean(dist) * k
    outliers = []
    for i in range(len(dist)):
        if dist[i] >= threshold:
            outliers.append(i)  # index of the outlier
    return np.array(outliers)

In [13]:
def MD_threshold(dist, extreme = False, verbose = False):
    k = 2.
    threshold = np.mean(dist) * k
    return threshold

In [14]:
def is_pos_def(A):
    if np.allclose(A, A.T):
        try:
            np.linalg.cholesky(A)
            return True
        except np.linalg.LinAlgError:
            return False
    else:
        return False

In [15]:
def cov_matrix(data, verbose=False):
    covariance_matrix = np.cov(data, rowvar=False)
    if is_pos_def(covariance_matrix):
        inv_covariance_matrix = np.linalg.inv(covariance_matrix)
        if is_pos_def(inv_covariance_matrix):
            return covariance_matrix, inv_covariance_matrix
        else:
            print("Error: Inverse of Covariance Matrix is not positive definite!")
    else:
        print("Error: Covariance Matrix is not positive definite!")

In [16]:
train_data = np.array(pca_train.values)
test_data = np.array(pca_test.values)

cov_matrix, inv_cov_matrix = cov_matrix(train_data) #3개 주성분 간 공분산 행렬
mean_distr = train_data.mean(axis = 0) #3개 주성분의 평균

In [17]:
dist_train = MD(inv_cov_matrix, mean_distr, train_data, verbose=False)
dist_test = MD(inv_cov_matrix, mean_distr, test_data, verbose=False)

In [18]:
threshold = MD_threshold(dist_train, extreme=True)

anomaly_train = pd.DataFrame()
anomaly_train['Mob dist'] = dist_train
anomaly_train['Threshold'] = threshold
anomaly_train['Anomaly'] = anomaly_train['Mob dist'] > anomaly_train['Threshold'] #threshold 값 이하이면 이상치
anomaly_train.head()

Unnamed: 0,Mob dist,Threshold,Anomaly
0,1.681194,3.302146,False
1,1.795085,3.302146,False
2,1.453652,3.302146,False
3,1.396096,3.302146,False
4,1.455862,3.302146,False


In [19]:
anomaly_train.Anomaly.value_counts() # train 데이터는 모두 정상이어야 한다

False    2451
True       12
Name: Anomaly, dtype: int64

In [20]:
anomaly_test = pd.DataFrame()
anomaly_test['Mob dist'] = dist_test
anomaly_test['Threshold'] = threshold
anomaly_test['Anomaly'] = anomaly_test['Mob dist'] > anomaly_test['Threshold'] #threshold 값 이하이면 이상치
anomaly_test.head()

Unnamed: 0,Mob dist,Threshold,Anomaly
0,1.484783,3.302146,False
1,1.562263,3.302146,False
2,1.622552,3.302146,False
3,1.425867,3.302146,False
4,1.44874,3.302146,False


In [21]:
anomaly_test.Anomaly.value_counts()

False    7308
True       81
Name: Anomaly, dtype: int64

In [22]:
answer = pd.read_csv("data/answer_sample.csv")
answer

Unnamed: 0,type,label
0,0,-1
1,0,-1
2,0,-1
3,0,-1
4,0,-1
...,...,...
7384,7,-1
7385,7,-1
7386,7,-1
7387,7,-1


In [23]:
# 정상 0, 이상 1
anomaly_test['label'] = [0 if i == False else 1 for i in anomaly_test['Anomaly']]
anomaly_test.head()

Unnamed: 0,Mob dist,Threshold,Anomaly,label
0,1.484783,3.302146,False,0
1,1.562263,3.302146,False,0
2,1.622552,3.302146,False,0
3,1.425867,3.302146,False,0
4,1.44874,3.302146,False,0


In [24]:
answer['label'] = anomaly_test['label']
answer.head()

Unnamed: 0,type,label
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [25]:
answer.to_csv("data/PCA test.csv", index = False)