In [1]:
# 라이브러리
import pandas as pd
import numpy as np
from sklearn.covariance import EllipticEnvelope
from imblearn.combine import *
from imblearn.over_sampling import *
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
train = pd.read_csv("data/train_data.csv")
val = pd.read_csv("data/test_data.csv")
test = pd.read_csv("data/test_data.csv")

train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0
4,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0


In [5]:
train['data'] = 'train'
val['data'] = 'valid'

df = pd.concat([train, val])
df.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,data
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0,train
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0,train
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0,train
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0,train
4,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0,train


In [4]:
def get_pred_label(model_pred):
    # (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

Model define & fit

In [6]:
df.set_index('data', drop=True, inplace=True)
df.head() # data 구분을 인덱스로 삼음

Unnamed: 0_level_0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
train,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0
train,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0
train,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0
train,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0
train,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0


In [7]:
val_contamination = 0.05 #휴리스틱하게 결정

In [8]:
model = EllipticEnvelope(support_fraction = 0.994, contamination = val_contamination, random_state = 42)
train_pred = model.fit_predict(df) #train+test 데이터에 모델 적용

In [9]:
train_pred = get_pred_label(train_pred)
df['label'] = train_pred #train+test 데이터에 label 컬럼 추가

In [10]:
df.head()

Unnamed: 0_level_0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,label
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
train,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0,0
train,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0,0
train,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0,0
train,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0,0
train,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0,0


In [11]:
df[df.index=='train']['label'].value_counts() # train 데이터 분류 확인

0    2414
1      49
Name: label, dtype: int64

In [12]:
df[df.index=='valid']['label'].value_counts() # validation 데이터 분류 확인

0    6945
1     444
Name: label, dtype: int64

In [13]:
new_df = df.reset_index(drop=True).copy() #new_df : train+test with label

In [22]:
new_df.shape

(9852, 9)

Oversampling : Smotetomek

In [18]:
X = new_df.iloc[:,:-1]
y = new_df.iloc[:,-1]

In [19]:
X_new, y_new = SMOTETomek(random_state=42).fit_resample(X, y)
X_new['label'] = y_new #train + test oversampling

In [20]:
X_new.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,label
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0,0
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0,0
2,1.91,45.29,0.7,24.73,2023.0,62.48,3.12,0,0
3,2.37,51.33,0.7,30.63,2506.0,67.84,3.39,0,0
4,1.9,45.21,0.7,24.65,2017.0,62.41,3.12,0,0


In [21]:
X_new.shape #약 2배 증가

(18624, 9)

AutoML

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, shuffle=True, test_size=0.2)

In [25]:
automl = AutoML()
settings = {"time_budget": 60, "metric": "f1", "task":"classification"}
automl.fit(X_train, y_train, **settings)
print(automl.model.estimator)

[flaml.automl.logger: 04-14 11:29:04] {1768} INFO - task = classification
[flaml.automl.logger: 04-14 11:29:04] {1775} INFO - Data split method: stratified
[flaml.automl.logger: 04-14 11:29:04] {1778} INFO - Evaluation method: cv
[flaml.automl.logger: 04-14 11:29:04] {1891} INFO - Minimizing error metric: 1-f1
[flaml.automl.logger: 04-14 11:29:04] {2011} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 04-14 11:29:04] {2341} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 04-14 11:29:04] {2479} INFO - Estimated sufficient time budget=2252s. Estimated necessary time budget=55s.
[flaml.automl.logger: 04-14 11:29:04] {2526} INFO -  at 0.3s,	estimator lgbm's best error=0.0000,	best estimator lgbm's best error=0.0000
[flaml.automl.logger: 04-14 11:29:04] {2341} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-14 11:29:04] {2526} INFO -  at 0.4s,	estimator lgbm's best 

In [26]:
pred = automl.predict(X_test)
print(f1_score(y_test, pred))

1.0


In [35]:
test['label'] = df[df.index=='valid']['label'].reset_index(drop=True)
test['label']

0       0
1       0
2       0
3       0
4       0
       ..
7384    0
7385    0
7386    0
7387    0
7388    0
Name: label, Length: 7389, dtype: int32

In [36]:
test_pred = automl.predict(test)
test['label'] = test_pred
test.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type,label
0,2.51,53.28,0.7,32.54,2662.0,69.58,3.48,0,0
1,2.66,55.24,0.7,34.45,2819.0,71.32,3.57,0,0
2,1.72,42.74,0.7,22.23,1819.0,60.21,3.01,0,0
3,2.2,49.15,0.7,28.5,2332.0,65.91,3.3,0,0
4,2.06,47.28,0.7,26.67,2182.0,64.24,3.21,0,0


In [37]:
test['label'].value_counts()

0    6945
1     444
Name: label, dtype: int64