In [3]:
import os
import copy
import pickle
import pandas as pd
import numpy as np
from collections import OrderedDict
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [4]:
class label_encoder(object):
    """Dictionary-based label encoder for pandas DataFrame columns.

    For every fitted column, ``self.class_index[column]`` maps each distinct
    value to an integer index. Indices start at 1 and follow the order in
    which ``Series.unique()`` returns the values. Values that were not seen
    while fitting are given the next free index the first time they are
    transformed, so transforming can grow the mapping.
    """

    def fit_pd(self, df, cols=None):
        """Build the value -> index mapping for the selected columns.

        Args:
            df: DataFrame holding the columns to fit.
            cols: Column names to fit. ``None`` or an empty sequence means
                every column of ``df``.
        """
        if cols is None or len(cols) == 0:
            cols = df.columns
        # Any previously fitted mapping is discarded.
        self.class_index = {}
        for f in cols:
            self.class_index[f] = {
                item: index
                for index, item in enumerate(df[f].unique(), start=1)
            }

    def fit_transform_pd(self, df, cols=None):
        """Fit on ``df`` and return an encoded copy of it.

        Args:
            df: DataFrame holding the columns to fit and encode.
            cols: Column names to process. ``None`` or an empty sequence
                means every column of ``df``.

        Returns:
            A deep copy of ``df`` whose selected columns hold integer indices.
        """
        if cols is None or len(cols) == 0:
            cols = df.columns
        # Reuse fit_pd so the fitting logic lives in exactly one place.
        self.fit_pd(df, cols)
        newdf = copy.deepcopy(df)
        for f in cols:
            newdf[f] = df[f].apply(lambda d, f=f: self.update_label(f, d))
        return newdf

    def transform_pd(self, df, cols=None):
        """Return an encoded copy of ``df`` using the fitted mapping.

        Columns without a fitted mapping are copied through unchanged.
        Unseen values extend the mapping (see ``update_label``).

        Args:
            df: DataFrame to encode.
            cols: Column names to consider. ``None`` or an empty sequence
                means every column of ``df``.

        Returns:
            A deep copy of ``df`` with the fitted columns replaced by indices.
        """
        newdf = copy.deepcopy(df)
        if cols is None or len(cols) == 0:
            cols = df.columns
        for f in cols:
            if f in self.class_index:
                newdf[f] = df[f].apply(lambda d, f=f: self.update_label(f, d))
        return newdf

    def update_label(self, f, x):
        """Return the index of value ``x`` in column ``f``, registering it if new.

        Args:
            f: Name of a fitted column.
            x: Value to look up.

        Returns:
            The integer index assigned to ``x``.

        Raises:
            KeyError: If column ``f`` has no fitted mapping.
        """
        mapping = self.class_index[f]
        try:
            return mapping[x]
        except KeyError:
            # Unseen value: hand out the next index. default=0 makes the first
            # index 1 when the column was fitted on no values at all.
            mapping[x] = max(mapping.values(), default=0) + 1
            return mapping[x]

In [5]:
def min_max_scaler(df, numerical_columns, scaler_path):
    """Min-max scale the given numeric columns, fitting or reusing saved scalers.

    If ``scaler_path`` does not exist, one ``MinMaxScaler`` per column is
    fitted on ``df`` and the ``{column: scaler}`` dict is pickled to that
    path. If the file exists, the scalers are loaded from it instead and
    nothing is refitted.

    Args:
        df: DataFrame containing ``numerical_columns``. It is not modified.
        numerical_columns: Names of the columns to scale.
        scaler_path: Path of the pickle file holding the scalers.

    Returns:
        A tuple ``(scaled_df, scalers)``: a new DataFrame restricted to
        ``numerical_columns`` with scaled values, and the dict of scalers used.

    Raises:
        KeyError: If a loaded scaler dict has no entry for a requested column.
    """
    df = df.loc[:, numerical_columns].copy()

    if not os.path.exists(scaler_path):
        print('Make a New Min Max Scaler')

        # Fit one independent scaler per numeric column.
        scalers = {}
        for col in numerical_columns:
            scaler = MinMaxScaler()
            scaler.fit(df.loc[:, [col]])
            scalers[col] = scaler

        # Store the scalers; the context manager guarantees flush and close.
        with open(scaler_path, 'wb') as f:
            pickle.dump(scalers, f)

    else:
        print('Normalize with existing Min Max Scaler')
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(scaler_path, 'rb') as f:
            scalers = pickle.load(f)

    # Scale each column with its scaler. Replacing the whole column (instead of
    # writing through .loc) lets an integer column become float cleanly.
    for col in numerical_columns:
        scaled = scalers[col].transform(df.loc[:, [col]])
        df[col] = np.asarray(scaled).reshape(-1)

    return df, scalers

## 1. Load Dataset & Concatenation

In [6]:
# Read the CSV named as the train split; 'utf-8-sig' also accepts a file that starts with a BOM.
df_1 = pd.read_csv('./result_syn/df_syn_train_eng.csv', encoding='utf-8-sig')

In [7]:
# Read the CSV named as the test split; 'utf-8-sig' also accepts a file that starts with a BOM.
df_2 = pd.read_csv('./result_syn/df_syn_test_eng.csv', encoding='utf-8-sig')

In [8]:
# Stack the two frames row-wise. ignore_index is left at its default, so each
# frame keeps its own row labels and the combined index can contain repeats.
df=pd.concat([df_1, df_2]) 

## 2. Preprocess Data

In [9]:
# import datetime
# d = datetime.datetime.now()
# print (d)

In [9]:
# Cast to str first so every value goes through string date parsing
# (presumably guards against numerically typed dates -- TODO confirm).
df['Date'] = pd.to_datetime(df['Date'].astype(str))

In [10]:
# Preview the rows ordered by date. The sorted frame is only displayed;
# `df` itself is not reassigned here.
df.sort_values(by='Date')

Unnamed: 0,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,...,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price,Fraud,Critical Fraud
0,2020-01-01,32,B,88,12,11,10,CDO1QN6,QD86XU7,8QOEWI1,...,9503003700,CN,CN,8.0,A,G,2.3,18.17,1,1
18,2020-01-01,40,B,22,21,11,10,5PDPMM1,UYKDFV4,818J1DT,...,8203209000,CN,CN,0.0,FCN1,Y,527.4,202310.64,0,0
17,2020-01-01,33,B,11,21,12,10,E3Q8ABV,QZ3FFBX,IXHGLHQ,...,2701110000,CN,CN,0.0,FCN1,B,300000.0,11610000.00,0,0
16,2020-01-01,20,B,11,21,11,9,GO1EBDS,IWX4YYM,IWSDF6W,...,6206401000,CN,CN,6.5,E1,B,300.0,5730.00,0,0
15,2020-01-01,29,B,91,21,11,40,D88DTE8,BIHPWOB,QQFGD9M,...,8414599000,CN,CN,8.0,A,Y,80.0,5128.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8853,2021-06-30,40,B,11,21,11,40,7OB24ZT,LTHSUPK,3UKD5U6,...,3924909000,CN,CN,6.5,C,E,272.0,299.20,0,0
8852,2021-06-30,40,B,11,21,11,39,NJVSADX,5J7FWJW,O1FP6N2,...,4016991090,JP,JP,8.0,A,Y,8.5,1150.05,0,0
8851,2021-06-30,12,B,17,21,11,10,BZ072US,Q3O1BL4,CU53IM2,...,4820100000,CN,CN,0.0,C,G,1.2,23.16,0,0
8861,2021-06-30,39,B,11,21,11,11,E5BS5N9,I8ZXXUY,WJDBGJ1,...,3307490000,GB,GB,0.0,FGB1,E,150.0,945.00,1,1


In [11]:
# Count the missing values in each column.
df.isnull().sum()

Date                               0
Office ID                          0
Process Type                       0
Import Type                        0
Import Use                         0
Payment Type                       0
Mode of Transport                  0
Declarant ID                       0
Importer ID                        0
Seller ID                       4575
Courier ID                     37347
HS10 Code                          0
Country of Departure               2
Country of Origin                  2
Tax Rate                           0
Tax Type                           0
Country of Origin Indicator        0
Net Mass                           0
Item Price                         0
Fraud                              0
Critical Fraud                     0
dtype: int64

In [12]:
# Handle Missing value
df[['Seller ID']] = np.where(df[['Seller ID']].isnull(),0,1)
df[['Courier ID']] = np.where(df[['Courier ID']].isnull(),0,1)

In [13]:
# Re-check the missing-value counts after the ID columns became presence flags.
df.isnull().sum()

Date                           0
Office ID                      0
Process Type                   0
Import Type                    0
Import Use                     0
Payment Type                   0
Mode of Transport              0
Declarant ID                   0
Importer ID                    0
Seller ID                      0
Courier ID                     0
HS10 Code                      0
Country of Departure           2
Country of Origin              2
Tax Rate                       0
Tax Type                       0
Country of Origin Indicator    0
Net Mass                       0
Item Price                     0
Fraud                          0
Critical Fraud                 0
dtype: int64

In [14]:
# Drop every row that still has a null in any column (dropna's default how='any').
df = df.dropna(axis=0)

In [16]:
# Show how many rows carry each value of the 'Fraud' label.
df['Fraud'].value_counts()

0    35710
1     9808
Name: Fraud, dtype: int64

In [17]:
# Cast both numeric features to integers.
# NOTE(review): astype('int') drops the fractional part (e.g. 2.3 -> 2), so
# small values collapse to 0 before scaling -- confirm this is intended.
df = df.astype({'Item Price': 'int', 'Net Mass': 'int'})

## 3. Split Data into Train and Test Sets

In [18]:
# Index the rows by date (keeping 'Date' as a regular column as well) and
# order them chronologically so date ranges can be sliced from the index.
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date', drop=False).sort_index()

In [19]:
# Training window: rows dated 2020-01-01 through 2020-12-31 (both ends
# inclusive), renumbered from 0.
train = df.loc['2020-01-01':'2020-12-31'].reset_index(drop=True)
train

Unnamed: 0,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,...,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price,Fraud,Critical Fraud
0,2020-01-01,32,B,88,12,11,10,CDO1QN6,QD86XU7,1,...,9503003700,CN,CN,8.0,A,G,2,18,1,1
1,2020-01-01,29,B,12,21,11,40,K3E28IQ,MEDCNNU,1,...,3926909000,CN,CN,6.5,C,B,2,1,0,0
2,2020-01-01,40,B,10,21,40,39,00RURK1,92X3AXX,1,...,7318190000,CN,CN,8.0,A,B,2,190,0,0
3,2020-01-01,14,B,97,21,10,10,1XCM1XF,XE08QFH,1,...,3926909000,CN,CN,6.5,C,B,40,24,0,0
4,2020-01-01,42,B,11,21,11,39,6V8QPPW,LD8GM65,0,...,7102390000,CN,HK,5.0,A,G,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36587,2020-12-31,40,B,10,21,11,40,DB7JDWC,GVK70MT,1,...,8473302000,US,US,0.0,FUS1,E,1,3363,0,0
36588,2020-12-31,20,B,11,21,11,10,DGIRSXL,BMH6J4M,1,...,7318152000,CN,CN,0.0,FCN1,B,9225,20295,0,0
36589,2020-12-31,20,B,10,21,11,10,Q9ZG6R5,NAUQTEB,1,...,8419200000,LU,FR,0.0,C,E,45,165370,0,0
36590,2020-12-31,40,B,11,21,11,10,4OSUO77,MIBBA9T,1,...,9109900000,CN,CN,0.0,FCN1,G,65,210627,0,0


In [20]:
# Test window: rows dated 2021-04-01 through 2021-06-30 (both ends
# inclusive), renumbered from 0.
# NOTE(review): any rows dated 2021-01-01..2021-03-31 fall in neither
# `train` nor `test` -- confirm this gap is intended.
test = df.loc['2021-04-01':'2021-06-30'].reset_index(drop=True)
test

Unnamed: 0,Date,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,...,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price,Fraud,Critical Fraud
0,2021-04-01,29,B,91,21,11,41,575N8BW,PEJWA0Y,1,...,8481201000,CN,CN,2.4,FCN1,Y,623,303587,1,1
1,2021-04-01,21,B,10,21,11,40,8ZM6GUW,9DIRDSY,1,...,4407299000,VN,VN,0.0,FVN2,S,23078,2734743,0,0
2,2021-04-01,39,B,12,21,10,9,1XCM1XF,SRCDUMH,1,...,710807000,CN,CN,27.0,C,E,60000,15912000,0,0
3,2021-04-01,15,B,11,21,11,10,KEGR4JZ,XSK62NY,1,...,4202999000,VN,VN,8.0,A,B,10,97,1,1
4,2021-04-01,30,B,88,21,19,39,607KRHF,DRMMKS4,1,...,8711301000,TH,TH,8.0,A,Y,195,580242,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8921,2021-06-30,30,B,11,21,11,9,QM7LO7M,LKVEEMK,1,...,8518109090,DE,DE,0.0,FEU1,G,128,22707,1,1
8922,2021-06-30,12,B,11,21,11,41,DO8IOFX,RALHUGK,1,...,7326909000,HK,CN,8.0,A,G,4,6,0,0
8923,2021-06-30,39,B,11,21,11,40,9O034UC,ML9KFEZ,1,...,8517629000,US,TW,0.0,CIT,G,1,2188,0,0
8924,2021-06-30,40,B,12,21,11,10,FXK30O6,YIIADKE,1,...,9503003919,CN,CN,4.8,FCN1,G,1365,91318,0,0


## 4. Label Encoding

In [21]:
# List the column names available for feature selection.
df.columns

Index(['Date', 'Office ID', 'Process Type', 'Import Type', 'Import Use',
       'Payment Type', 'Mode of Transport', 'Declarant ID', 'Importer ID',
       'Seller ID', 'Courier ID', 'HS10 Code', 'Country of Departure',
       'Country of Origin', 'Tax Rate', 'Tax Type',
       'Country of Origin Indicator', 'Net Mass', 'Item Price', 'Fraud',
       'Critical Fraud'],
      dtype='object')

In [22]:
# Persist the date-split frames as CSV (row index omitted).
# NOTE(review): despite the 'enc' in the file names, `train` and `test` have
# not been label-encoded at this point.
train.to_csv('./label_syn/df_enc_train2_eng.csv', index = False)
test.to_csv('./label_syn/df_enc_test_eng.csv', index = False)

In [25]:
# Min-max scale the numeric features. Both calls share one scaler file: a call
# that finds the file missing fits and saves the scalers, and any call that
# finds it present reuses the saved ones.
numerical_columns = ['Net Mass', 'Item Price']
scaler_file = './encoder_syn/min_max_scaler_eng.pkl'
train_numerical_xs, _ = min_max_scaler(train, numerical_columns, scaler_path=scaler_file)
test_numerical_xs, _ = min_max_scaler(test, numerical_columns, scaler_path=scaler_file)

Normalize with existing Min Max Scaler
Normalize with existing Min Max Scaler


In [27]:
# Columns to be label-encoded; the same selection is applied to both splits.
feature_columns = [
    'Office ID', 'Process Type', 'Import Type', 'Import Use',
    'Payment Type', 'Mode of Transport', 'Declarant ID', 'Importer ID',
    'Seller ID', 'Courier ID', 'HS10 Code', 'Country of Departure',
    'Country of Origin', 'Tax Rate', 'Tax Type',
    'Country of Origin Indicator',
]

X_train = train[feature_columns]
X_test = test[feature_columns]

# Target: the 'Fraud' column of each split.
y_train = train['Fraud']
y_test = test['Fraud']

In [32]:
# Load the previously saved encoder object from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
filename = './encoder_syn/multiencoder_eng.pkl'

# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as pkl_file:
    new_encoder = pickle.load(pkl_file)

In [33]:
# Label-encode the train and test feature frames with the loaded encoder.
# If new_encoder is a label_encoder as defined in this notebook, transform_pd
# assigns a new index to any value it has not seen before, so these calls can
# extend the encoder's in-memory mapping.
encoding_train = new_encoder.transform_pd(X_train)
encoding_test = new_encoder.transform_pd(X_test)

In [34]:
# Concatenate column-wise: label-encoded features + min-max scaled numeric features.

X_train_concat = pd.concat([encoding_train,train_numerical_xs ],axis=1)
X_test_concat = pd.concat([encoding_test,test_numerical_xs ],axis=1)

In [35]:
# Preview the first rows of the combined training features.
X_train_concat.head()

Unnamed: 0,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,Courier ID,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3.331945e-08,5.289262e-09
1,2,1,2,2,1,2,2,2,1,1,2,1,1,2,2,2,3.331945e-08,2.938479e-10
2,3,1,3,2,2,3,3,3,1,2,3,1,1,1,1,2,3.331945e-08,5.58311e-08
3,4,1,4,2,3,1,4,4,1,1,2,1,1,2,2,2,6.66389e-07,7.052349e-09
4,5,1,5,2,1,3,5,5,2,1,4,1,2,3,1,1,0.0,0.0


In [36]:
# Preview the first rows of the combined test features.
X_test_concat.head()

Unnamed: 0,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,Courier ID,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price
0,2,1,7,2,1,6,430,3539,1,1,1660,1,1,11,3,4,1.037901e-05,8.920839e-05
1,14,1,3,2,1,2,448,4899,1,1,1051,11,12,4,30,5,0.0003844731,0.0008035984
2,13,1,2,2,3,4,4,1499,1,1,39,1,1,8,2,3,0.0009995835,0.004675707
3,23,1,5,2,1,1,759,10151,1,1,181,11,12,1,1,2,1.665973e-07,2.850324e-08
4,12,1,1,2,10,3,693,14725,1,1,3572,33,32,1,1,4,3.248646e-06,0.0001705029


In [38]:
# Save the combined (label-encoded + scaled) feature frames as CSV, without the row index.
X_train_concat.to_csv('./label_syn/encoding_train1_eng.csv', index=False)
X_test_concat.to_csv('./label_syn/encoding_test_eng.csv', index=False)

In [39]:
# Save the selected feature columns as they were before label encoding, without the row index.
X_train.to_csv('./label_syn/X_train1_eng.csv', index=False)
X_test.to_csv('./label_syn/X_test_eng.csv', index=False)

In [40]:
# Save the 'Fraud' target of each split as CSV, without the row index.
y_train.to_csv('./label_syn/y_train1_eng.csv', index=False)
y_test.to_csv('./label_syn/y_test_eng.csv', index=False)