# Import Libraries

In [28]:
import pandas as pd
import numpy as np
import src.util as utils
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE 

# Load Config

In [29]:
# Load the project configuration; later cells read the dataset pickle paths from it
config = utils.load_config()

# Load Dataset

In [30]:
def load_dataset(config_data: dict):
    """Load the feature/target pickles of the train, valid and test splits.

    Every ``*_set_path`` entry of ``config_data`` is indexed with ``[0]`` for the
    features file and with ``[1]`` for the target file; each file is read with
    ``utils.pickle_load``.

    Returns a 6-tuple ordered as
    ``X_train, y_train, X_valid, y_valid, X_test, y_test``.
    """
    loaded_objects = []

    for path_key in ('train_set_path', 'valid_set_path', 'test_set_path'):
        # position 0 -> features, position 1 -> target
        for position in (0, 1):
            loaded_objects.append(utils.pickle_load(config_data[path_key][position]))

    return tuple(loaded_objects)

In [31]:
# Unpack the six objects returned by load_dataset: features and target of train, valid and test
X_train, y_train, X_valid, y_valid, X_test, y_test = load_dataset(config)

In [32]:
# Sanity check: shape of (features, target) for each split, separated by blank lines
print(f"{(X_train.shape, y_train.shape)} \n")
print(f"{(X_valid.shape, y_valid.shape)} \n")
print(f"{(X_test.shape, y_test.shape)}")

((700, 3), (700,)) 

((150, 3), (150,)) 

((150, 3), (150,))


In [33]:
# check dtype: summary of column dtypes and non-null counts of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 700 entries, 308 to 29
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   amount         700 non-null    float64
 1   merchant_type  700 non-null    object 
 2   device_type    700 non-null    object 
dtypes: float64(1), object(2)
memory usage: 21.9+ KB


In [34]:
# Same summary (dtype and non-null count) for the training target
y_train.info()

<class 'pandas.core.series.Series'>
Index: 700 entries, 308 to 29
Series name: label
Non-Null Count  Dtype
--------------  -----
700 non-null    int64
dtypes: int64(1)
memory usage: 10.9 KB


Findings from the EDA:
1. The categorical features have more than two unique categories, so they are encoded with one-hot encoding (OHE).
2. The labels/targets are imbalanced, so we will experiment with both the original data and SMOTE-resampled data.
3. The numeric feature (`amount`) has outliers, so we will experiment with both the original data and log-transformed data.

# Check Missing value and Data Duplicates

In [35]:
# Put each target next to its features so that the checks cover every column
train_set = pd.concat(objs=[X_train, y_train], axis='columns')
valid_set = pd.concat(objs=[X_valid, y_valid], axis='columns')
test_set = pd.concat(objs=[X_test, y_test], axis='columns')

# Report the number of missing values per column for each split
print(f'Check missing in set Train: \n {train_set.isna().sum()}\n')
print(f'Check missing in set Valid: \n {valid_set.isna().sum()}\n')
print(f'Check missing in set test: \n {test_set.isna().sum()}')

Check missing in set Train: 
 amount           0
merchant_type    0
device_type      0
label            0
dtype: int64

Check missing in set Valid: 
 amount           0
merchant_type    0
device_type      0
label            0
dtype: int64

Check missing in set test: 
 amount           0
merchant_type    0
device_type      0
label            0
dtype: int64


In [36]:
# Share of rows left after dropping exact duplicate rows, per split
# (1.0 means the split has no duplicated rows).
# The labels now say "duplicates": they previously said "missing", copied from the missing-value cell.
print(f'Check duplicates in set Train: \n {len(train_set.drop_duplicates()) / len(train_set)}\n')
print(f'Check duplicates in set Valid: \n {len(valid_set.drop_duplicates()) / len(valid_set)}\n')
print(f'Check duplicates in set test: \n {len(test_set.drop_duplicates()) / len(test_set)}')

Check missing in set Train: 
 1.0

Check missing in set Valid: 
 1.0

Check missing in set test: 
 1.0


If the ratio is 1.0, the set contains no duplicate rows.

# Categorical Handling

In [37]:
def _encode_with_fitted_ohe(ohe_encoder, X_df, categorical_cols):
    """Return ``X_df`` with ``categorical_cols`` replaced by the output of an already fitted encoder."""
    encoded_df = pd.DataFrame(
        ohe_encoder.transform(X_df[categorical_cols]),
        columns=ohe_encoder.get_feature_names_out(categorical_cols),
        # reuse the row labels of the input so that the concat below aligns row by row
        index=X_df.index,
    )

    # drop() already returns a new frame, so the caller's DataFrame is left untouched
    return pd.concat([X_df.drop(columns=categorical_cols), encoded_df], axis=1)


def apply_ohe_preprocessing(X_train_df, X_valid_df, X_test_df, categorical_cols):
    """One-hot encode ``categorical_cols`` in the train, valid and test features.

    The encoder is fitted on the train features only; the valid and test features
    are transformed with that fitted encoder.

    Parameters
    ----------
    X_train_df, X_valid_df, X_test_df : pandas.DataFrame
        Feature frames of the three splits; each must contain ``categorical_cols``.
    categorical_cols : list of str
        Names of the columns to encode.

    Returns
    -------
    tuple
        ``(X_train_cleaned, X_valid_cleaned, X_test_cleaned)``: new frames in which
        the non-encoded columns come first, followed by the encoded columns.
    """
    # NOTE(review): with drop='first' together with handle_unknown='ignore', a category
    # unseen during fit is presumably encoded as all zeros, i.e. the same as the dropped
    # first category -- confirm against the scikit-learn documentation.
    ohe_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

    # Fit on the train split only, so nothing is learned from valid/test
    ohe_encoder.fit(X_train_df[categorical_cols])

    X_train_cleaned = _encode_with_fitted_ohe(ohe_encoder, X_train_df, categorical_cols)
    X_valid_cleaned = _encode_with_fitted_ohe(ohe_encoder, X_valid_df, categorical_cols)
    X_test_cleaned = _encode_with_fitted_ohe(ohe_encoder, X_test_df, categorical_cols)

    return X_train_cleaned, X_valid_cleaned, X_test_cleaned

In [38]:
# Categorical columns to one-hot encode
ohe_cols = ['merchant_type', 'device_type']

# Encode the three splits (the encoder inside is fitted on the train features)
X_train_processed, X_valid_processed, X_test_processed = apply_ohe_preprocessing(
    X_train_df=X_train,
    X_valid_df=X_valid,
    X_test_df=X_test,
    categorical_cols=ohe_cols,
)

# checkpoint: the three splits should end up with the same number of columns
print(f"X_train_processed shape: {X_train_processed.shape}")
print(f"X_valid_processed shape: {X_valid_processed.shape}")
print(f"X_test_processed shape: {X_test_processed.shape}")

X_train_processed shape: (700, 7)
X_valid_processed shape: (150, 7)
X_test_processed shape: (150, 7)


In [39]:
# Preview the first rows of the encoded training features
X_train_processed.head()

Unnamed: 0,amount,merchant_type_electronics,merchant_type_groceries,merchant_type_others,merchant_type_travel,device_type_mobile,device_type_tablet
308,158.57,0.0,0.0,1.0,0.0,1.0,0.0
628,3.74,0.0,0.0,0.0,0.0,0.0,1.0
362,61.3,1.0,0.0,0.0,0.0,1.0,0.0
775,16.24,0.0,0.0,0.0,1.0,0.0,1.0
276,160.09,0.0,0.0,0.0,1.0,0.0,1.0


In [40]:
# Preview the first rows of the encoded validation features
X_valid_processed.head()

Unnamed: 0,amount,merchant_type_electronics,merchant_type_groceries,merchant_type_others,merchant_type_travel,device_type_mobile,device_type_tablet
425,79.75,1.0,0.0,0.0,0.0,0.0,1.0
265,135.0,0.0,0.0,1.0,0.0,0.0,0.0
664,10.73,0.0,0.0,1.0,0.0,0.0,1.0
677,50.73,0.0,0.0,1.0,0.0,1.0,0.0
488,300.4,0.0,0.0,1.0,0.0,1.0,0.0


In [41]:
# Preview the first rows of the encoded test features
X_test_processed.head()

Unnamed: 0,amount,merchant_type_electronics,merchant_type_groceries,merchant_type_others,merchant_type_travel,device_type_mobile,device_type_tablet
171,1.67,0.0,0.0,1.0,0.0,0.0,0.0
815,73.16,0.0,0.0,0.0,0.0,1.0,0.0
969,64.64,0.0,0.0,0.0,1.0,1.0,0.0
763,136.24,0.0,1.0,0.0,0.0,0.0,1.0
468,25.31,0.0,0.0,1.0,0.0,0.0,0.0


Everything looks fine: the three sets share the same encoded columns.

# Outlier Handling

In [42]:
def transforming_log(data, column:str):
    """Return the natural logarithm of ``1 + data[column]``.

    Parameters
    ----------
    data : object indexable by column name (a pandas DataFrame in this notebook).
    column : str
        Name of the column to transform.

    Returns
    -------
    The element-wise ``log(1 + x)`` of the selected column; ``data`` is not modified.
    """
    # log1p evaluates log(1 + x) without first rounding 1 + x, so values very close
    # to zero keep their precision (np.log(x + 1) collapses them to 0.0).
    # NOTE(review): values <= -1 give -inf/NaN instead of raising -- presumably the
    # column is non-negative; confirm against the data.
    transform_data = np.log1p(data[column])

    return transform_data

# Build the log-transformed variant of each split as a new frame:
# append `log_amount`, then remove the raw `amount` column.
# The *_processed frames themselves are left unchanged.
X_train_processed_log = (
    X_train_processed
    .assign(log_amount=transforming_log(X_train_processed, 'amount'))
    .drop(columns=['amount'])
)
X_valid_processed_log = (
    X_valid_processed
    .assign(log_amount=transforming_log(X_valid_processed, 'amount'))
    .drop(columns=['amount'])
)
X_test_processed_log = (
    X_test_processed
    .assign(log_amount=transforming_log(X_test_processed, 'amount'))
    .drop(columns=['amount'])
)

In [43]:
# checkpoint: the frame without the log transform should still hold the raw `amount` column
X_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 700 entries, 308 to 29
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   amount                     700 non-null    float64
 1   merchant_type_electronics  700 non-null    float64
 2   merchant_type_groceries    700 non-null    float64
 3   merchant_type_others       700 non-null    float64
 4   merchant_type_travel       700 non-null    float64
 5   device_type_mobile         700 non-null    float64
 6   device_type_tablet         700 non-null    float64
dtypes: float64(7)
memory usage: 43.8 KB


In [44]:
# checkpoint: the log variant should hold `log_amount` in place of `amount`
X_train_processed_log.info()

<class 'pandas.core.frame.DataFrame'>
Index: 700 entries, 308 to 29
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   merchant_type_electronics  700 non-null    float64
 1   merchant_type_groceries    700 non-null    float64
 2   merchant_type_others       700 non-null    float64
 3   merchant_type_travel       700 non-null    float64
 4   device_type_mobile         700 non-null    float64
 5   device_type_tablet         700 non-null    float64
 6   log_amount                 700 non-null    float64
dtypes: float64(7)
memory usage: 43.8 KB


# SMOTE (Imbalance Handling)

In [45]:
# Class proportions of the training target (how imbalanced the labels are)
y_train.value_counts(normalize=True)

label
0    0.95
1    0.05
Name: proportion, dtype: float64

In [46]:
# Initialise SMOTE with a fixed seed; only the train split is resampled.
# NOTE(review): SMOTE builds synthetic rows by interpolation, so the one-hot columns of
# synthetic rows are presumably not restricted to 0/1 -- check whether SMOTENC fits better.
smote = SMOTE(random_state=42)

# variant with the raw `amount` column
X_train_processed_SMOTE, y_train_processed_SMOTE = smote.fit_resample(
    X_train_processed, y_train
)

# variant with the `log_amount` column
X_train_processed_log_SMOTE, y_train_processed_log_SMOTE = smote.fit_resample(
    X_train_processed_log, y_train
)

In [47]:
# checkpoint: row count and dtypes of the resampled training features (raw `amount` variant)
X_train_processed_SMOTE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330 entries, 0 to 1329
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   amount                     1330 non-null   float64
 1   merchant_type_electronics  1330 non-null   float64
 2   merchant_type_groceries    1330 non-null   float64
 3   merchant_type_others       1330 non-null   float64
 4   merchant_type_travel       1330 non-null   float64
 5   device_type_mobile         1330 non-null   float64
 6   device_type_tablet         1330 non-null   float64
dtypes: float64(7)
memory usage: 72.9 KB


In [48]:
# checkpoint: row count and dtypes of the resampled training features (`log_amount` variant)
X_train_processed_log_SMOTE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330 entries, 0 to 1329
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   merchant_type_electronics  1330 non-null   float64
 1   merchant_type_groceries    1330 non-null   float64
 2   merchant_type_others       1330 non-null   float64
 3   merchant_type_travel       1330 non-null   float64
 4   device_type_mobile         1330 non-null   float64
 5   device_type_tablet         1330 non-null   float64
 6   log_amount                 1330 non-null   float64
dtypes: float64(7)
memory usage: 72.9 KB


In [49]:
# Column names and order of the resampled frame (raw `amount` variant)
X_train_processed_SMOTE.columns

Index(['amount', 'merchant_type_electronics', 'merchant_type_groceries',
       'merchant_type_others', 'merchant_type_travel', 'device_type_mobile',
       'device_type_tablet'],
      dtype='object')

In [50]:
# Column names and order of the resampled frame (`log_amount` variant)
X_train_processed_log_SMOTE.columns

Index(['merchant_type_electronics', 'merchant_type_groceries',
       'merchant_type_others', 'merchant_type_travel', 'device_type_mobile',
       'device_type_tablet', 'log_amount'],
      dtype='object')

# Dump Dataset

In [51]:
# One row per dataset to persist: (config key, features, target).
# The first four rows are the variants without the log transform, the last four
# the log-transformed variants; rows are written in this order.
dump_plan = [
    ('train_processed_set_path', X_train_processed, y_train),
    ('train_processed_SMOTE_set_path', X_train_processed_SMOTE, y_train_processed_SMOTE),
    ('valid_processed_set_path', X_valid_processed, y_valid),
    ('test_processed_set_path', X_test_processed, y_test),
    ('train_processed_log_set_path', X_train_processed_log, y_train),
    ('train_processed_log_SMOTE_set_path', X_train_processed_log_SMOTE, y_train_processed_log_SMOTE),
    ('valid_processed_log_set_path', X_valid_processed_log, y_valid),
    ('test_processed_log_set_path', X_test_processed_log, y_test),
]

for path_key, features, target in dump_plan:
    # position 0 of the config entry is the features path, position 1 the target path
    utils.pickle_dump(features, config[path_key][0])
    utils.pickle_dump(target, config[path_key][1])