# Import Libraries

In [37]:
import pandas as pd
import src.util as utils
from sklearn.model_selection import train_test_split

# Load Config File

In [38]:
# Load the project configuration via the helper in src.util. Later cells index
# the result like a dict (e.g. config['data_path'], config['float_columns']).
config = utils.load_config()

In [39]:
# Read the transactions table from the CSV path stored under config['data_path'].
dataset = pd.read_csv(config['data_path'])
# Preview the first rows to eyeball the columns and values.
dataset.head()

Unnamed: 0,transaction_id,amount,merchant_type,device_type,label
0,1,46.93,travel,tablet,0
1,2,301.01,groceries,desktop,0
2,3,131.67,others,tablet,0
3,4,91.29,electronics,desktop,0
4,5,16.96,others,mobile,0


# Data Validation

In [4]:
# Summarize column dtypes, non-null counts and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transaction_id  1000 non-null   int64  
 1   amount          1000 non-null   float64
 2   merchant_type   1000 non-null   object 
 3   device_type     1000 non-null   object 
 4   label           1000 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB


In [5]:
# Count missing values per column.
dataset.isnull().sum()

transaction_id    0
amount            0
merchant_type     0
device_type       0
label             0
dtype: int64

In [6]:
# Descriptive statistics for the numeric columns (transposed: one row per column).
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
transaction_id,1000.0,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
amount,1000.0,97.25056,97.250965,0.46,26.915,68.68,136.385,817.24
label,1000.0,0.05,0.218054,0.0,0.0,0.0,0.0,1.0


In [7]:
# Descriptive statistics for the object-dtype columns (transposed: one row per column).
dataset.describe(include='object').T

Unnamed: 0,count,unique,top,freq
merchant_type,1000,5,others,214
device_type,1000,3,mobile,346


In [8]:
# Drop transaction_id: it looks like a per-row identifier (1..1000 in the
# summary statistics), not a predictive feature.
# `columns=` already selects the axis, so the redundant `axis=1` is removed.
# errors='ignore' keeps this cell idempotent: `dataset` is rebound here, so
# re-running the cell would otherwise raise KeyError on the already-dropped column.
dataset = dataset.drop(columns=['transaction_id'], errors='ignore')
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   amount         1000 non-null   float64
 1   merchant_type  1000 non-null   object 
 2   device_type    1000 non-null   object 
 3   label          1000 non-null   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 31.4+ KB


# Data Defense

In [None]:
# Float-typed columns actually present in the data; check_data expects this
# set to equal set(config['float_columns']).
set(dataset.select_dtypes('float').columns.to_list())

{'amount'}

In [None]:
# Float columns declared in the config, for comparison with the data's float columns.
set(config['float_columns'])

{'amount'}

In [27]:
def check_data(input_data, config):
    """Validate a DataFrame against the schema and value ranges declared in ``config``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Data to validate.
    config : mapping
        Must provide ``float_columns``, ``int_columns`` and
        ``categorical_columns`` (iterables of column names). May optionally
        provide ``range_<column>`` entries that unpack to ``(min, max)`` for
        any float or int column; columns without such an entry are not
        range-checked.

    Raises
    ------
    AssertionError
        If the float / int / object columns found in ``input_data`` differ
        from the configured sets, or if a range-checked column has values that
        ``Series.between(min, max)`` does not accept.
    """
    n_rows = len(input_data)

    # Schema check: the set of columns of each dtype family must match the config exactly.
    dtype_checks = (
        ('float', 'float_columns', 'an error occurs in float columns'),
        ('int', 'int_columns', 'an error occurs in int columns'),
        ('object', 'categorical_columns', 'an error occurs in categorical columns'),
    )
    for dtype_name, config_key, message in dtype_checks:
        found_columns = set(input_data.select_dtypes(dtype_name).columns.to_list())
        assert found_columns == set(config[config_key]), message

    # Range check: float columns first, then int columns, each only when the
    # config carries a matching "range_<column>" entry.
    for config_key in ('float_columns', 'int_columns'):
        for col in config[config_key]:
            range_key = f"range_{col}"
            if range_key not in config:
                continue

            min_val, max_val = config[range_key]

            # `between` flags the rows inside the range; subtracting that count
            # from the total row count gives the number of offending rows.
            in_range = input_data[col].between(min_val, max_val)
            out_of_range_count = n_rows - in_range.sum()

            assert out_of_range_count == 0, (
                f"Error in {col} range. {out_of_range_count} values are outside the defined range [{min_val}, {max_val}]."
            )

In [28]:
# Validate the cleaned dataset against the config; check_data raises
# AssertionError on any schema or range mismatch and is silent otherwise.
check_data(dataset, config)

# Data Splitting

In [29]:
# Show which column the config designates as the target.
config['label']

'label'

In [30]:
# Model inputs are the float and categorical columns named in the config;
# the target is the column named by config['label'].
feature_columns = config['float_columns'] + config['categorical_columns']

# Copy so that X and y are independent objects, not views onto `dataset`.
X = dataset.loc[:, feature_columns].copy()
y = dataset.loc[:, config['label']].copy()

In [31]:
# Preview the feature frame.
X.head()

Unnamed: 0,amount,merchant_type,device_type
0,46.93,travel,tablet
1,301.01,groceries,desktop
2,131.67,others,tablet
3,91.29,electronics,desktop
4,16.96,others,mobile


In [32]:
# Preview the target series.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [33]:
# Confirm the feature frame holds only the expected columns, with no nulls.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   amount         1000 non-null   float64
 1   merchant_type  1000 non-null   object 
 2   device_type    1000 non-null   object 
dtypes: float64(1), object(2)
memory usage: 23.6+ KB


In [34]:
# Confirm the target series has the same number of rows as X and no nulls.
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1000 entries, 0 to 999
Series name: label
Non-Null Count  Dtype
--------------  -----
1000 non-null   int64
dtypes: int64(1)
memory usage: 7.9 KB


In [35]:
# First split: keep 70% of the rows for training and hold out 30%, stratified
# on the label; the fixed random_state makes the split repeatable.
X_train, X_pretest, y_train, y_pretest = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Second split: divide the held-out 30% in half into validation and test sets,
# again stratified on the label.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_pretest, y_pretest, test_size=0.5, random_state=42, stratify=y_pretest
)

In [36]:
# Fail fast if the three splits do not partition the dataset. An assertion stops
# Run All before inconsistent splits get pickled, whereas a printed warning
# would let the notebook carry on.
n_split_rows = len(X_train) + len(X_valid) + len(X_test)
assert n_split_rows == len(dataset), "There is something wrong in splitting"

# Each feature split must line up row-for-row with its label split.
assert (len(y_train), len(y_valid), len(y_test)) == (len(X_train), len(X_valid), len(X_test)), \
    "Features and labels have different lengths in at least one split"

print("The splitting is correct")

The splitting is correct


# Dump dataset with pickle

In [40]:
# Persist the cleaned full dataset via the project's pickle_dump helper.
utils.pickle_dump(dataset, config["dataset_fraud_detection_cleaned_path"])

# Persist each split. Every *_set_path config entry is indexed as
# [0] -> path for the features, [1] -> path for the labels.
split_outputs = (
    ("train_set_path", X_train, y_train),
    ("valid_set_path", X_valid, y_valid),
    ("test_set_path", X_test, y_test),
)
for path_key, features, labels in split_outputs:
    utils.pickle_dump(features, config[path_key][0])
    utils.pickle_dump(labels, config[path_key][1])