## 1. Load Required Libraries

In [1]:
# import packages
import pandas as pd
import numpy as np
import src.util as utils
import joblib
import yaml
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE


## 2. Import Configuration and Load Dataset

In [2]:
config = utils.load_config()

In [3]:
def load_dataset(config: dict):
        # Load every set of data
    X_train = utils.pickle_load(config["train_set_path"][0])
    y_train = utils.pickle_load(config["train_set_path"][1])

    X_valid = utils.pickle_load(config["valid_set_path"][0])
    y_valid = utils.pickle_load(config["valid_set_path"][1])

    X_test = utils.pickle_load(config["test_set_path"][0])
    y_test = utils.pickle_load(config["test_set_path"][1])

    # Concatenate x and y each set
    train_set = pd.concat([X_train, y_train], axis = 1)
    valid_set = pd.concat([X_valid, y_valid], axis = 1)
    test_set = pd.concat([X_test, y_test], axis = 1)

    # Return 3 set of data
    return train_set, valid_set, test_set

In [4]:
train_set, valid_set, test_set = load_dataset(config)

## 3. Check or Handling Missing Values

In [5]:
train_set.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
valid_set.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
test_set.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## 4. Balancing Label

In [8]:
train_set.Outcome.value_counts()

0    350
1    187
Name: Outcome, dtype: int64

### 4.1 Undersampling

In [9]:
def rus_fit_resample(set_data: pd.DataFrame) -> pd.DataFrame:
    # Create copy of set data
    set_data = set_data.copy()

    # Create sampling object
    rus = RandomUnderSampler(random_state = 123)

    # Balancing set data
    X_rus, y_rus = rus.fit_resample(set_data.drop(columns = config["label"]),
                                 set_data[config["label"]])

    # Concatenate balanced data
    set_data_rus = pd.concat([X_rus, y_rus], axis = 1)

    # Return balanced data
    return set_data_rus

In [10]:
train_set_rus = rus_fit_resample(train_set)

In [11]:
train_set_rus

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,100,70,52,57,40.5,0.677,25,0
1,5,136,82,0,0,0.0,0.640,69,0
2,2,85,65,0,0,39.6,0.930,27,0
3,8,126,88,36,108,38.5,0.349,49,0
4,1,109,38,18,120,23.1,0.407,26,0
...,...,...,...,...,...,...,...,...,...
369,4,146,78,0,0,38.5,0.520,67,1
370,0,107,62,30,74,36.6,0.757,25,1
371,3,80,82,31,70,34.2,1.292,27,1
372,10,161,68,23,132,25.5,0.326,47,1


### 4.2 Oversampling

In [12]:
def ros_fit_resample(set_data: pd.DataFrame) -> pd.DataFrame:
    # Create copy of set data
    set_data = set_data.copy()

    # Create sampling object
    ros = RandomOverSampler(random_state = 123)

    # Balancing set data
    X_ros, y_ros = ros.fit_resample(set_data.drop(columns = config["label"]),
                                 set_data[config["label"]])

    # Concatenate balanced data
    set_data_ros = pd.concat([X_ros, y_ros], axis = 1)

    # Return balanced data
    return set_data_ros

In [13]:
train_set_ros = ros_fit_resample(train_set)

In [14]:
train_set_ros

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,13,152,90,33,29,26.8,0.731,43,1
1,0,104,64,37,64,33.6,0.510,22,1
2,5,137,108,0,0,48.8,0.227,37,1
3,0,111,65,0,0,24.6,0.660,31,0
4,6,105,70,32,68,30.8,0.122,37,0
...,...,...,...,...,...,...,...,...,...
695,7,161,86,0,0,30.4,0.165,47,1
696,3,173,78,39,185,33.8,0.970,31,1
697,5,139,80,35,160,31.6,0.361,25,1
698,0,146,70,0,0,37.9,0.334,28,1


### 4.3 SMOTE

In [15]:
def sm_fit_resample(set_data: pd.DataFrame) -> pd.DataFrame:
    # Create copy of set data
    set_data = set_data.copy()

    # Create sampling object
    sm = SMOTE(random_state = 123)

    # Balancing set data
    X_sm, y_sm = sm.fit_resample(set_data.drop(columns = config["label"]),
                                 set_data[config["label"]])

    # Concatenate balanced data
    set_data_sm = pd.concat([X_sm, y_sm], axis = 1)

    # Return balanced data
    return set_data_sm

In [16]:
train_set_sm = sm_fit_resample(train_set)

In [17]:
train_set_sm

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,13,152,90,33,29,26.800000,0.731000,43,1
1,0,104,64,37,64,33.600000,0.510000,22,1
2,5,137,108,0,0,48.800000,0.227000,37,1
3,0,111,65,0,0,24.600000,0.660000,31,0
4,6,105,70,32,68,30.800000,0.122000,37,0
...,...,...,...,...,...,...,...,...,...
695,7,161,85,0,0,30.639206,0.163306,46,1
696,0,184,98,25,0,38.253784,0.536265,33,1
697,5,126,71,0,0,27.922621,0.411182,27,1
698,0,127,55,31,166,37.781807,1.027951,33,1


In [18]:
train_set_sm.Outcome.value_counts()

1    350
0    350
Name: Outcome, dtype: int64

## 5. Handling Outliers

In [19]:
def remove_outliers(set_data):
    set_data = set_data.copy()
    list_of_set_data = list()

    for col_name in set_data.columns[:-1]:
        q1 = set_data[col_name].quantile(0.25)
        q3 = set_data[col_name].quantile(0.75)
        iqr = q3 - q1
        set_data_cleaned = set_data[~((set_data[col_name] < (q1 - 1.5 * iqr)) | (set_data[col_name] > (q3 + 1.5 * iqr)))].copy()
        list_of_set_data.append(set_data_cleaned.copy())
    
    set_data_cleaned = pd.concat(list_of_set_data)
    count_duplicated_index = set_data_cleaned.index.value_counts()
    used_index_data = count_duplicated_index[count_duplicated_index == (set_data.shape[1]-1)].index
    set_data_cleaned = set_data_cleaned.loc[used_index_data].drop_duplicates()

    return set_data_cleaned

In [20]:
# remove outlier set rus
train_set_rus_bal_cleaned = remove_outliers(train_set_rus)

In [21]:
train_set_rus_bal_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,100,70,52,57,40.5,0.677,25,0
243,8,197,74,0,0,25.9,1.191,39,1
240,6,147,80,0,0,29.5,0.178,50,1
239,4,148,60,27,318,30.9,0.150,29,1
238,9,145,80,46,130,37.9,0.637,40,1
...,...,...,...,...,...,...,...,...,...
154,1,86,66,52,65,41.3,0.917,29,0
111,6,144,72,27,228,33.9,0.255,40,0
110,2,127,46,21,335,34.4,0.176,22,0
109,1,89,76,34,37,31.2,0.192,23,0


In [22]:
# remove outlier set ros
train_set_ros_bal_cleaned = remove_outliers(train_set_ros)

In [23]:
train_set_ros_bal_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,13,152,90,33,29,26.8,0.731,43,1
440,2,98,60,17,120,34.7,0.198,22,0
431,3,112,74,30,0,31.6,0.197,25,1
432,7,129,68,49,125,38.5,0.439,43,1
433,4,128,70,0,0,34.3,0.303,24,0
...,...,...,...,...,...,...,...,...,...
234,9,57,80,37,0,32.8,0.096,41,0
233,1,109,60,8,182,25.4,0.947,21,0
230,7,62,78,0,0,32.6,0.391,41,0
231,7,133,84,0,0,40.2,0.696,37,0


In [24]:
# remove outlier set sm
train_set_sm_bal_cleaned = remove_outliers(train_set_sm)

In [25]:
train_set_sm_bal_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,13,152,90,33,29,26.8,0.731,43,1
456,4,154,62,31,284,32.8,0.237,23,0
444,2,99,70,16,44,20.4,0.235,27,0
445,7,161,86,0,0,30.4,0.165,47,1
446,2,108,64,0,0,30.8,0.158,21,0
...,...,...,...,...,...,...,...,...,...
213,6,104,74,18,156,29.9,0.722,41,1
212,3,162,52,38,0,37.2,0.652,24,1
308,1,115,70,30,96,34.6,0.529,32,1
275,1,100,66,15,56,23.6,0.666,26,0


## 6. Dump Dataset

In [26]:
x_train = {
    "Undersampling" : train_set_rus_bal_cleaned.drop(columns = "Outcome"),
    "Oversampling" : train_set_ros_bal_cleaned.drop(columns = "Outcome"),
    "SMOTE" : train_set_sm_bal_cleaned.drop(columns = "Outcome")
}

y_train = {
    "Undersampling" : train_set_rus_bal_cleaned.Outcome,
    "Oversampling" : train_set_ros_bal_cleaned.Outcome,
    "SMOTE" : train_set_sm_bal_cleaned.Outcome
}

In [27]:
utils.pickle_dump(x_train, "data/processed/X_train_feng.pkl")
utils.pickle_dump(y_train, "data/processed/y_train_feng.pkl")

utils.pickle_dump(valid_set.drop(columns = "Outcome"), "data/processed/X_valid_feng.pkl")
utils.pickle_dump(valid_set.Outcome, "data/processed/y_valid_feng.pkl")

utils.pickle_dump(test_set.drop(columns = "Outcome"), "data/processed/X_test_feng.pkl")
utils.pickle_dump(test_set.Outcome, "data/processed/y_test_feng.pkl")