In [19]:
# 라이브러리
import pandas as pd
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from tabgan.sampler import OriginalGenerator, GANGenerator
from imblearn.combine import *
from imblearn.over_sampling import *
import optuna

import warnings
warnings.filterwarnings(action='ignore')

In [20]:
train = pd.read_csv("data/train_data.csv")
train2 = pd.read_csv("data/test_data.csv")
test = pd.read_csv("data/test_data.csv")

train = pd.concat([train, train2]) #train, test 합치기
train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0
4,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0


In [22]:
def get_pred_label(model_pred):
    # (1:정상, -1:불량(사기)) 이므로 (0:정상, 1:불량(사기))로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [24]:
# 모델 1 하이퍼 파라미터 튜닝
model = EllipticEnvelope()
param_grid = {'support_fraction': [0.8, 0.9, 0.99], 'contamination': [0.02, 0.03, 0.04, 0.05]}

grid_search = GridSearchCV(model, param_grid=param_grid, scoring='recall', cv=5)
grid_search.fit(train)
print('Best hyperparameters:', grid_search.best_params_)

Best hyperparameters: {'contamination': 0.02, 'support_fraction': 0.8}


In [25]:
optimized_model = EllipticEnvelope(support_fraction = grid_search.best_params_['support_fraction'], 
                                   contamination = grid_search.best_params_['contamination'], random_state=42)
train_pred = optimized_model.fit_predict(train)
train_pred = get_pred_label(train_pred)
train['label'] = train_pred #train+test 데이터에 label 컬럼 추가

In [26]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1].reset_index()
y.drop("index", axis=1, inplace=True)

In [27]:
new_train, new_target = GANGenerator().generate_data_pipe(X, y, test, )

Fitting CTGAN transformers for each column:   0%|          | 0/10 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]

In [28]:
new_train['type'].value_counts()

0    1721
1    1459
2    1435
4    1214
3    1203
5     995
6     993
7     742
Name: type, dtype: int64

In [29]:
new_train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,temp_divide_rpm
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0,0.024405
1,3.04,42.2,0.7,35.0,1736.0,60.29,4.16,3,0.024309
2,2.53,38.55,0.7,29.0,1444.0,57.04,4.0,3,0.026697
3,3.56,45.9,0.7,41.0,2032.0,63.58,4.33,3,0.022589
4,2.71,39.84,0.7,31.0,1547.0,58.19,4.06,3,0.025753


In [30]:
### GANgenerator로 생성된 데이터 추가해서 학습
new_train['label'] = new_target
df = pd.concat([train, new_train]) # new 학습 데이터

In [31]:
df['label'].value_counts()

0    19288
1      326
Name: label, dtype: int64

In [32]:
X_train = df.drop(['label'], axis=1)
y_train = df['label']

In [14]:
# 모델 2 하이퍼 파라미터 튜닝
model2 = EllipticEnvelope()
param_dist = {'support_fraction': [0.7, 0.8, 0.9, 0.999], 'contamination': [0.01, 0.02, 0.03, 0.04, 0.05]}

random_search = RandomizedSearchCV(model2, param_distributions=param_dist, n_iter = 100, scoring='recall', cv=5)
random_search.fit(X_train, y_train)
print('Best hyperparameters:', random_search.best_params_)

Best hyperparameters: {'contamination': 0.01, 'support_fraction': 0.7}


In [33]:
optimized_model = EllipticEnvelope(support_fraction = grid_search.best_params_['support_fraction'], contamination = grid_search.best_params_['contamination'], random_state = 42)
optimized_model.fit(X_train)

In [34]:
# 최적 파라미터 값으로 test predict
test_pred = optimized_model.predict(test)
test_pred = get_pred_label(test_pred)
test['label'] = test_pred

In [35]:
test['label'].value_counts()

0    7192
1     197
Name: label, dtype: int64

In [36]:
answer = pd.read_csv("data/answer_sample.csv")
answer['label'] = test['label']
answer.to_csv("GridsearchCV + RandomsearchCV.csv", index = False)