# 1. 분석을 위한 Dataset 생성
* 목적
    * 이후 진행될 모델링 및 분석 과정을 원활하게 수행하기 위한 Dataset을 생성

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.pipeline import Pipeline



In [2]:
def split_data_keep_binary_label_ratio(X:pd.DataFrame, y: pd.Series, split_ratio: float = 0.7,
                                       random_state: 'int | None' = None)\
    -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series):
    '''
    Split a dataset into train/test parts while keeping the ratio of the
    binary labels (0 and 1) the same in both parts.

    Rows are selected by index label, so ``X`` and ``y`` must share the same
    index. Rows whose label is neither 0 nor 1 end up in neither part.

    Parameters
    ----------
    X : feature table.
    y : binary labels (0 / 1) aligned with ``X`` by index.
    split_ratio : fraction of each label group assigned to the train part;
        must lie in [0, 1]. The per-label train count is truncated with ``int``.
    random_state : optional seed for a reproducible split. When ``None`` the
        global ``np.random`` state is used, exactly as before this parameter
        existed.

    Returns
    -------
    (X_train, y_train, X_test, y_test) with rows in shuffled order; the X and
    y of each part share the same index order.

    Raises
    ------
    ValueError : if ``split_ratio`` is outside [0, 1].
    '''
    if not 0 <= split_ratio <= 1:
        raise ValueError(f'split_ratio must be within [0, 1], got {split_ratio!r}')

    # A dedicated generator keeps a seeded call from touching global state;
    # without a seed we fall back to the module-level functions.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    train_parts = []
    test_parts = []
    # Sample each label group separately so both parts keep the label ratio.
    for label in (0, 1):
        label_idx = y[y == label].index.to_numpy()
        train_cnt = int(len(label_idx) * split_ratio)
        picked = rng.choice(label_idx, train_cnt, replace=False)
        train_parts.append(picked)
        test_parts.append(np.setdiff1d(label_idx, picked))

    # union1d returns sorted labels, so shuffle to mix the two label groups.
    train_idx = rng.permutation(np.union1d(*train_parts))
    test_idx = rng.permutation(np.union1d(*test_parts))

    # Use explicit label-based selection for both X and y.
    X_train = X.loc[train_idx, :]
    y_train = y.loc[train_idx]
    X_test = X.loc[test_idx, :]
    y_test = y.loc[test_idx]

    return X_train, y_train, X_test, y_test

In [None]:
# Load the labeled set and discard exact duplicate rows right after reading.
labeled_dir = './../dataset/original/labeled.csv'
labeled_data = pd.read_csv(labeled_dir).drop_duplicates()

# Load the unlabeled set as-is (no de-duplication is applied to it).
unlabeled_dir = './../dataset/original/unlabeled.csv'
unlabeled_data = pd.read_csv(unlabeled_dir)


### Cleansing
- Labeled data & Unlabeled data의 EDA에서 확인된 동일한 값(0)을 가지고 있는 변수 제거
- Y column (PassOrFail) 값 숫자 변환

In [None]:
# Columns to remove from both datasets.
drop_cols = [
    'Mold_Temperature_1',
    'Mold_Temperature_2',
    'Mold_Temperature_5',
    'Mold_Temperature_6',
    'Mold_Temperature_7',
    'Mold_Temperature_8',
    'Mold_Temperature_9',
    'Mold_Temperature_10',
    'Mold_Temperature_11',
    'Mold_Temperature_12',
]
labeled_data = labeled_data.drop(columns=drop_cols)
# The unlabeled data additionally carries an 'Unnamed: 0' column that is dropped.
unlabeled_data = unlabeled_data.drop(columns=['Unnamed: 0'] + drop_cols)

# Encode the target: 'Y' -> 0, 'N' -> 1.
labeled_data['PassOrFail'] = labeled_data['PassOrFail'].map({'Y': 0, 'N': 1})

# Save the cleansed datasets (row index is not written).
labeled_data.to_csv('./dataset/labeled_new.csv', index=False)
unlabeled_data.to_csv('./dataset/unlabeled_new.csv', index=False)

### Split Dataset
1. X, y 데이터 분리
2. label 0, 1별로 index 분리
3. 각 라벨에서 일정한 비율로 train, test index 랜덤하게 추출
4. label0 train index + label1 train index -> train X, y data -> 같은 방식으로 train / validation 분리
5. label0 test index + label1 test index -> test X, y data

In [None]:
# Features are every column from position 9 onward; the target is PassOrFail.
# NOTE(review): assumes the first 9 columns are non-feature columns in both
# frames (and that PassOrFail is among them for the labeled data) — confirm.
labeled_X = labeled_data.iloc[:, 9:]
labeled_y = labeled_data['PassOrFail']
unlabeled_X = unlabeled_data.iloc[:, 9:]

# First split: (train + valid) vs. test, using the function's default ratio.
_train_test_parts = split_data_keep_binary_label_ratio(labeled_X, labeled_y)
labeled_X_train, labeled_y_train, labeled_X_test, labeled_y_test = _train_test_parts

# Second split: the train part is split again into train vs. valid.
_train_valid_parts = split_data_keep_binary_label_ratio(labeled_X_train, labeled_y_train)
labeled_X_train, labeled_y_train, labeled_X_valid, labeled_y_valid = _train_valid_parts
# Row counts noted by the author: train 3916 / valid 1680 / test 2400

# save created data
def save_created_dataset(data, file_name, save_dir='./dataset'):
    """Write ``data`` as a CSV file named ``file_name`` inside ``save_dir``.

    Parameters
    ----------
    data : object exposing ``to_csv`` (e.g. a pandas DataFrame or Series).
    file_name : name of the CSV file to create.
    save_dir : target directory; defaults to ``'./dataset'``. It is created
        (including parents) when it does not exist yet.

    The index of ``data`` is not written to the file.
    """
    # Create the directory up front so a fresh checkout does not fail on save.
    os.makedirs(save_dir, exist_ok=True)
    data.to_csv(os.path.join(save_dir, file_name), index=False)

# Persist every split, X before y, in train -> valid -> test order.
_split_outputs = [
    (labeled_X_train, 'labeled_X_train.csv'),
    (labeled_y_train, 'labeled_y_train.csv'),
    (labeled_X_valid, 'labeled_X_valid.csv'),
    (labeled_y_valid, 'labeled_y_valid.csv'),
    (labeled_X_test, 'labeled_X_test.csv'),
    (labeled_y_test, 'labeled_y_test.csv'),
]
for _split_data, _split_file in _split_outputs:
    save_created_dataset(_split_data, _split_file)


### Scaling

In [None]:
# Two-step scaling pipeline: Normalizer followed by MinMaxScaler.
pipeline = Pipeline(steps=[
    ('normalizer', Normalizer()),
    ('scaler', MinMaxScaler()),
])

# The pipeline is fitted on the unlabeled features only; the labeled splits
# are transformed with that same fitted pipeline.
pipeline.fit(unlabeled_X)


def _transform_to_frame(features):
    # transform() output is wrapped in a fresh DataFrame, so the result keeps
    # the column names but gets a new default index.
    return pd.DataFrame(pipeline.transform(features), columns=features.columns)


unlabeled_X_scaled = _transform_to_frame(unlabeled_X)
labeled_X_train_scaled = _transform_to_frame(labeled_X_train)
labeled_X_valid_scaled = _transform_to_frame(labeled_X_valid)
labeled_X_test_scaled = _transform_to_frame(labeled_X_test)

### 데이터 저장

In [None]:
# Persist the scaled feature sets.
_scaled_outputs = [
    (unlabeled_X_scaled, 'unlabeled_X_scaled.csv'),
    (labeled_X_train_scaled, 'labeled_X_train_scaled.csv'),
    (labeled_X_valid_scaled, 'labeled_X_valid_scaled.csv'),
    (labeled_X_test_scaled, 'labeled_X_test_scaled.csv'),
]
for _scaled_data, _scaled_file in _scaled_outputs:
    save_created_dataset(_scaled_data, _scaled_file)

# Keep the fitted pipeline so the same scaling can be re-applied later.
with open('./dataset/scaling_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

