In [43]:
import pandas as pd
import numpy as np
from sklearn.covariance import EllipticEnvelope
from tabgan.sampler import OriginalGenerator, GANGenerator
from imblearn.combine import *
from imblearn.over_sampling import *
import warnings
warnings.filterwarnings(action='ignore')

In [44]:
train = pd.read_csv("data/train_data.csv")
train2 = pd.read_csv("data/test_data.csv")
test = pd.read_csv("data/test_data.csv")

train = pd.concat([train, train2]) #train, test 합치기
train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0
4,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0


In [46]:
# air_end_temp / motor_rpm 열 추가
train['temp_divide_rpm'] = (train['air_end_temp'] ) / (train['motor_rpm'])
test['temp_divide_rpm'] = (test['air_end_temp'] ) / (test['motor_rpm'])

In [47]:
def get_pred_label(model_pred):
    # (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [48]:
# 일단 모델에 적용해서 label을 만들자
val_contamination = 0.05 #휴리스틱하게 결정

model = EllipticEnvelope(support_fraction = 0.994, contamination = val_contamination, random_state = 42)
train_pred = model.fit_predict(train)
train_pred = get_pred_label(train_pred)
train['label'] = train_pred #train+test 데이터에 label 컬럼 추가

In [49]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1].reset_index()
y.drop("index", axis=1, inplace=True)

In [50]:
X

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,temp_divide_rpm
0,1.59,41.00,0.7,20.53,1680.0,58.67,2.93,0,0.024405
1,2.97,59.28,0.7,38.40,3142.0,74.91,3.75,0,0.018867
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0,0.022388
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0,0.020483
4,1.90,45.21,0.7,24.65,2017.0,62.41,3.12,0,0.022414
...,...,...,...,...,...,...,...,...,...
7384,2.12,48.08,0.7,27.45,2246.0,64.96,3.25,7,0.021407
7385,1.48,39.63,0.7,19.19,1570.0,57.44,2.87,7,0.025242
7386,1.56,40.61,0.7,20.15,1649.0,58.32,2.92,7,0.024627
7387,1.59,40.99,0.7,20.52,1679.0,58.66,2.93,7,0.024413


In [51]:
y

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
9847,0
9848,0
9849,0
9850,0


In [52]:
new_train, new_target = GANGenerator().generate_data_pipe(X, y, test, )

Fitting CTGAN transformers for each column:   0%|          | 0/10 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]

In [53]:
new_train['type'].value_counts()

0    1721
1    1459
2    1435
4    1214
3    1203
5     995
6     993
7     742
Name: type, dtype: int64

In [54]:
new_train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,temp_divide_rpm
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0,0.024405
1,4.51,52.74,0.7,52.0,2579.0,69.66,4.63,3,0.02045
2,4.99,56.16,0.7,58.0,2853.0,72.7,4.79,3,0.019685
3,5.02,56.33,0.7,58.0,2866.0,72.84,4.79,3,0.019655
4,3.52,45.64,0.7,41.0,2011.0,63.34,4.32,3,0.022695


In [55]:
### GANgenerator로 생성된 데이터 추가해서 학습
new_train['label'] = new_target
df = pd.concat([train, new_train]) # new 학습 데이터

In [56]:
df.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,temp_divide_rpm,label
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0,0.024405,0
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0,0.018867,0
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0,0.022388,0
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0,0.020483,0
4,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0,0.022414,0


In [57]:
df['label'].value_counts()

0    18736
1      878
Name: label, dtype: int64

In [59]:
# label 컬럼 삭제
df.drop(['label'], axis=1, inplace=True)

In [60]:
val_contamination = 0.03 #휴리스틱하게 결정

model = EllipticEnvelope(support_fraction = 0.994, contamination = val_contamination, random_state = 42)
model.fit(df)

In [61]:
test_pred = model.predict(test)
test_pred = get_pred_label(test_pred)
test['label'] = test_pred

In [62]:
test['label'].value_counts()

0    7093
1     296
Name: label, dtype: int64

In [63]:
answer = pd.read_csv("data/answer_sample.csv")
answer['label'] = test['label']
answer.to_csv("temp_divide_rpm + EllipticEnvelope + GAN + EllipticEnvelope.csv", index = False)