In [1]:
import os
import pickle
import copy
import pandas as pd
import numpy as np
from collections import OrderedDict
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

import matplotlib.pyplot as plt

In [2]:
class label_encoder(object):
    def fit_pd(self,df,cols=[]):
        '''
        fit all columns in the df or specific list. 
        generate a dict:
        {feature1:{label1:1,label2:2}, feature2:{label1:1,label2:2}...}
        '''
        if len(cols) == 0:
            cols = df.columns
        self.class_index = {}
        for f in cols:
            uf = df[f].unique()
            self.class_index[f] = {}
            index = 1
            for item in uf:
                self.class_index[f][item] = index
                index += 1
    
    def fit_transform_pd(self,df,cols=[]):
        '''
        fit all columns in the df or specific list and return an update dataframe.
        '''
        if len(cols) == 0:
            cols = df.columns
        newdf = copy.deepcopy(df)
        self.class_index = {}
        for f in cols:
            uf = df[f].unique()
            self.class_index[f] = {}
            index = 1
            for item in uf:
                self.class_index[f][item] = index
                index += 1
                
            newdf[f] = df[f].apply(lambda d: self.update_label(f,d))
        return newdf
    
    def transform_pd(self,df,cols=[]):
        '''
        transform all columns in the df or specific list from lable to index, return an update dataframe.
        '''
        newdf = copy.deepcopy(df)
        if len(cols) == 0:
            cols = df.columns
        for f in cols:
            if f in self.class_index:
                newdf[f] = df[f].apply(lambda d: self.update_label(f,d))
        return newdf
                
    def update_label(self,f,x):
        '''
        update the label to index, if not found in the dict, add and update the dict.
        '''
        try:
            return self.class_index[f][x]
        except:
            self.class_index[f][x] = max(self.class_index[f].values())+1
            return self.class_index[f][x]

In [3]:
def min_max_scaler(df, numerical_columns, scaler_path):
    """
    Description:
        Dataframe column 중 numerical columns에 해당하는 DataFrame에 min max normalize 을 수행
        수행 된 normalize 결과를 pickle 에 담아 저장함.
        test 시에는 저장된 min max normalize 을 활용해 numerical column 에 적용.

    Args:
        :param df: DataFrame,
        :param numerical_columns: list, numerical column 이 저장되어 있는 정보
        :param scaler_path: str, pickle 정보가 저장되는 경로

    Returns:
        df: DataFrame, numerical column에 Min max scaling 이 적용된 dataframe
        scalers, Dict, {column_name : MinMaxScaler instance} 로 저장
    """
    df = df.loc[:, numerical_columns].copy()
    scalers = {}
    mm_sclaers = []
    # Set Minmax scaler
    if not os.path.exists(scaler_path):
        print('Make a New Min Max Scaler')

        # Scale for each Numeric Columns
        for col in numerical_columns:
            mm_sclaer = MinMaxScaler()

            # Fit the scaler
            mm_sclaer.fit(df.loc[:, [col]])
            mm_sclaers.append(mm_sclaer)

            # Normalize each value between 0~1 using the scaler
            df.loc[:, col] = mm_sclaer.transform(df.loc[:, [col]])
            scalers[col] = mm_sclaer

        # Store the encoder
        f = open(scaler_path, 'wb')
        pickle.dump(scalers, f)

    else:
        print('Normalize with existing Min Max Scaler')
        # load scalers
        f = open(scaler_path, 'rb')
        scalers = pickle.load(f)

        # Normalize each value between 0~1 using loaded scaler
        for col in numerical_columns:
            mm_scaler = scalers[col]
            df.loc[:, col] = mm_scaler.transform(df.loc[:, [col]])

    return df, scalers

## 1. Load Dataset  & concatenation

In [4]:
df_1 = pd.read_csv('./result_syn/df_syn_train_eng.csv', encoding='utf-8-sig')

In [5]:
df_2 = pd.read_csv('./result_syn/df_syn_valid_eng.csv', encoding='utf-8-sig')

In [6]:
df = pd.concat([df_1, df_2]) 

## 2. Preprocess Data

In [7]:
import datetime
d = datetime.datetime.now()

In [8]:
df['Date']=df['Date'].astype(str)
df['Date']=pd.to_datetime(df['Date'])

In [9]:
df.sort_values('Date', axis=0, ascending=True, inplace=False)

Unnamed: 0,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,...,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price,Fraud,Critical Fraud
0,2020-01-01,32,B,88,12,11,10,CDO1QN6,QD86XU7,8QOEWI1,...,9503003700,CN,CN,8.0,A,G,2.3,18.17,1,1
18,2020-01-01,40,B,22,21,11,10,5PDPMM1,UYKDFV4,818J1DT,...,8203209000,CN,CN,0.0,FCN1,Y,527.4,202310.64,0,0
17,2020-01-01,33,B,11,21,12,10,E3Q8ABV,QZ3FFBX,IXHGLHQ,...,2701110000,CN,CN,0.0,FCN1,B,300000.0,11610000.00,0,0
16,2020-01-01,20,B,11,21,11,9,GO1EBDS,IWX4YYM,IWSDF6W,...,6206401000,CN,CN,6.5,E1,B,300.0,5730.00,0,0
15,2020-01-01,29,B,91,21,11,40,D88DTE8,BIHPWOB,QQFGD9M,...,8414599000,CN,CN,8.0,A,Y,80.0,5128.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8394,2021-03-31,39,B,10,21,10,9,SC03Z0S,7KGLWW5,78IE02Z,...,6110909000,FR,FR,13.0,A,G,6.0,22.80,0,0
8393,2021-03-31,20,B,11,21,43,10,BWN4U9D,CADKHI0,43FIP7U,...,8703239020,DE,DE,0.0,FEU1,G,2075.0,20127.50,1,1
8392,2021-03-31,20,B,80,21,13,10,JIHZF3K,QAQOV3K,XRLSNK8,...,8517699000,US,US,0.0,CIT,Y,40.0,88712.00,0,0
8405,2021-03-31,20,B,11,21,11,40,5KNH8JV,CF2A7E2,AJP9FWI,...,9603500000,CN,CN,8.0,A,S,17.0,5120.40,0,0


In [10]:
# Check null value
df.isnull().sum()

Date                               0
Office ID                          0
Process Type                       0
Import Type                        0
Import Use                         0
Payment Type                       0
Mode of Transport                  0
Declarant ID                       0
Importer ID                        0
Seller ID                       4682
Courier ID                     37021
HS10 Code                          0
Country of Departure               1
Country of Origin                  1
Tax Rate                           0
Tax Type                           0
Country of Origin Indicator        0
Net Mass                           0
Item Price                         0
Fraud                              0
Critical Fraud                     0
dtype: int64

In [11]:
# Handle Missing value
df[['Seller ID']] = np.where(df[['Seller ID']].isnull(),0,1)
df[['Courier ID']] = np.where(df[['Courier ID']].isnull(),0,1)

In [12]:
df.isnull().sum()

Date                           0
Office ID                      0
Process Type                   0
Import Type                    0
Import Use                     0
Payment Type                   0
Mode of Transport              0
Declarant ID                   0
Importer ID                    0
Seller ID                      0
Courier ID                     0
HS10 Code                      0
Country of Departure           1
Country of Origin              1
Tax Rate                       0
Tax Type                       0
Country of Origin Indicator    0
Net Mass                       0
Item Price                     0
Fraud                          0
Critical Fraud                 0
dtype: int64

In [13]:
df = df.dropna(axis=0)

In [14]:
df['Fraud'].value_counts()

0    35399
1     9673
Name: Fraud, dtype: int64

In [15]:
df['Item Price']=df['Item Price'].astype('int')
df['Net Mass']=df['Net Mass'].astype('int')

## Split data to train and validation data

In [16]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index(df['Date'])
df = df.sort_index()

In [17]:
train = df['2020-01-01':'2020-12-31']
train = train.reset_index(drop=True)
train

Unnamed: 0,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,...,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price,Fraud,Critical Fraud
0,2020-01-01,32,B,88,12,11,10,CDO1QN6,QD86XU7,1,...,9503003700,CN,CN,8.0,A,G,2,18,1,1
1,2020-01-01,29,B,12,21,11,40,K3E28IQ,MEDCNNU,1,...,3926909000,CN,CN,6.5,C,B,2,1,0,0
2,2020-01-01,40,B,10,21,40,39,00RURK1,92X3AXX,1,...,7318190000,CN,CN,8.0,A,B,2,190,0,0
3,2020-01-01,14,B,97,21,10,10,1XCM1XF,XE08QFH,1,...,3926909000,CN,CN,6.5,C,B,40,24,0,0
4,2020-01-01,42,B,11,21,11,39,6V8QPPW,LD8GM65,0,...,7102390000,CN,HK,5.0,A,G,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36587,2020-12-31,40,B,10,21,11,40,DB7JDWC,GVK70MT,1,...,8473302000,US,US,0.0,FUS1,E,1,3363,0,0
36588,2020-12-31,20,B,11,21,11,10,DGIRSXL,BMH6J4M,1,...,7318152000,CN,CN,0.0,FCN1,B,9225,20295,0,0
36589,2020-12-31,20,B,10,21,11,10,Q9ZG6R5,NAUQTEB,1,...,8419200000,LU,FR,0.0,C,E,45,165370,0,0
36590,2020-12-31,40,B,11,21,11,10,4OSUO77,MIBBA9T,1,...,9109900000,CN,CN,0.0,FCN1,G,65,210627,0,0


In [18]:
valid = df['2021-01-01':'2021-03-31']
valid = valid.reset_index(drop=True)
valid

Unnamed: 0,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,...,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price,Fraud,Critical Fraud
0,2021-01-01,13,B,11,21,11,10,L77JJEG,HQ0W7JA,1,...,8407210000,JP,JP,8.0,A,B,1262,1437418,0,0
1,2021-01-01,20,B,11,21,11,40,749ECRU,Y2FJ95S,1,...,8479909070,CN,DE,8.0,A,S,10880,14163584,0,0
2,2021-01-01,19,B,11,21,11,9,OZB7KED,446O4EE,1,...,6211111000,HK,IT,13.0,A,E,1,5142,0,0
3,2021-01-01,31,B,88,21,11,40,SM89FYN,OOWNJY9,1,...,8207191000,CN,CN,8.0,A,E,57,63372,1,1
4,2021-01-01,20,B,10,21,11,39,DM8H44I,MU3B4MR,1,...,3304999000,JP,JP,6.5,C,G,0,107,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8475,2021-03-31,39,B,11,21,11,10,6D4DNS7,8TW53B3,1,...,6907210000,CN,CN,8.0,A,G,1219,8167,0,0
8476,2021-03-31,40,B,11,21,12,10,3BTA0QN,USP5TV1,1,...,4911100000,EE,PE,0.0,A,Y,0,0,1,1
8477,2021-03-31,13,B,11,21,41,40,NULLLLS,XE08QFH,1,...,8530900000,DE,FR,0.0,FEU1,G,1,3268,1,1
8478,2021-03-31,41,B,10,21,10,10,KWSE1HX,JUJJ961,0,...,1207400000,CN,CN,0.0,FCN6,E,54000,39447000,0,0


## 3. Lable Encoding

In [19]:
df.columns

Index(['Date', 'Office ID', 'Process Type', 'Import Type', 'Import Use',
       'Payment Type', 'Mode of Transport', 'Declarant ID', 'Importer ID',
       'Seller ID', 'Courier ID', 'HS10 Code', 'Country of Departure',
       'Country of Origin', 'Tax Rate', 'Tax Type',
       'Country of Origin Indicator', 'Net Mass', 'Item Price', 'Fraud',
       'Critical Fraud'],
      dtype='object')

In [20]:
train.to_csv('./label_syn/df_enc_train_eng.csv', index = False)
valid.to_csv('./label_syn/df_enc_valid_eng.csv', index = False)

In [27]:
# Numeric dataset preprocessing
numerical_columns = ['Net Mass', 'Item Price']
train_numerical_xs, scaler = min_max_scaler(train, numerical_columns, scaler_path='./encoder_syn/min_max_scaler_eng.pkl')
valid_numerical_xs, _ = min_max_scaler(valid, numerical_columns, scaler_path='./encoder_syn/min_max_scaler_eng.pkl')

Normalize with existing Min Max Scaler
Normalize with existing Min Max Scaler


In [31]:
train.columns

Index(['Date', 'Office ID', 'Process Type', 'Import Type', 'Import Use',
       'Payment Type', 'Mode of Transport', 'Declarant ID', 'Importer ID',
       'Seller ID', 'Courier ID', 'HS10 Code', 'Country of Departure',
       'Country of Origin', 'Tax Rate', 'Tax Type',
       'Country of Origin Indicator', 'Net Mass', 'Item Price', 'Fraud',
       'Critical Fraud'],
      dtype='object')

In [32]:
X_train = train[['Office ID', 'Process Type', 'Import Type', 'Import Use',
               'Payment Type', 'Mode of Transport', 'Declarant ID', 'Importer ID',
               'Seller ID', 'Courier ID', 'HS10 Code', 'Country of Departure',
               'Country of Origin', 'Tax Rate', 'Tax Type',
               'Country of Origin Indicator']]

X_valid = valid[['Office ID', 'Process Type', 'Import Type', 'Import Use',
               'Payment Type', 'Mode of Transport', 'Declarant ID', 'Importer ID',
               'Seller ID', 'Courier ID', 'HS10 Code', 'Country of Departure',
               'Country of Origin', 'Tax Rate', 'Tax Type',
               'Country of Origin Indicator']]



y_train = train['Fraud']
y_valid = valid['Fraud']

In [33]:
# Label encoder
encoder = label_encoder()
encoder.fit_pd(X_train)
encoder.transform_pd(X_train)

Unnamed: 0,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,Courier ID,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,2,1,2,2,1,2,2,2,1,1,2,1,1,2,2,2
2,3,1,3,2,2,3,3,3,1,2,3,1,1,1,1,2
3,4,1,4,2,3,1,4,4,1,1,2,1,1,2,2,2
4,5,1,5,2,1,3,5,5,2,1,4,1,2,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36587,3,1,3,2,1,2,49,1358,1,1,1491,5,6,4,7,3
36588,7,1,5,2,1,1,365,5922,1,2,525,1,1,4,3,2
36589,7,1,3,2,1,1,134,1046,1,1,1545,44,16,4,2,3
36590,3,1,5,2,1,1,76,8381,1,1,3571,1,1,4,3,1


In [34]:
#save the encoder file
filename = './encoder_syn/multiencoder_eng.pkl'

output = open(filename,'wb')
pickle.dump(encoder, output)
output.close()

In [35]:
#load the encoder file
filename = './encoder_syn/multiencoder_eng.pkl'

pkl_file = open(filename, 'rb')
new_encoder = pickle.load(pkl_file) 
pkl_file.close()

In [36]:
# valid, test label encoder
encoding_train = new_encoder.transform_pd(X_train)
encoding_valid = new_encoder.transform_pd(X_valid)
# encoding_test = new_encoder.transform(X_test)

In [37]:
# concaternate ( label encoding data + MinMax scaling data)

X_train_concat = pd.concat([encoding_train,train_numerical_xs ],axis=1)
X_valid_concat = pd.concat([encoding_valid,valid_numerical_xs ],axis=1)


In [40]:
X_train_concat.to_csv('./label_syn/encoding_train_eng.csv', index=False)
X_valid_concat.to_csv('./label_syn/encoding_valid_eng.csv', index=False)

In [41]:
X_train.to_csv('./label_syn/X_train_eng.csv', index=False)
X_valid.to_csv('./label_syn/X_valid_eng.csv', index=False)

In [42]:
y_train.to_csv('./label_syn/y_train_eng.csv', index=False)
y_valid.to_csv('./label_syn/y_valid_eng.csv', index=False)