In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from collections import OrderedDict
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

## 1. Load Dataset 

In [2]:
# Feature tables for the train and test splits (file names suggest they are
# already encoded — TODO confirm against the preprocessing notebook).
X_train = pd.read_csv('./label_syn/encoding_train1_eng.csv')
X_test = pd.read_csv('./label_syn/encoding_test_eng.csv')

In [3]:
# Label tables matching the feature splits; the utf-8-sig codec drops a
# leading byte-order mark if the CSV starts with one.
y_train = pd.read_csv('./label_syn/y_train1_eng.csv', encoding='utf-8-sig')
y_test = pd.read_csv('./label_syn/y_test_eng.csv', encoding='utf-8-sig')

In [4]:
# Count how many test rows fall into each label value (class balance check).
y_test.value_counts()

Fraud
0        6968
1        1958
dtype: int64

## 2. Oversampling

In [9]:
from imblearn.over_sampling import RandomOverSampler

In [10]:
# Resample the training split with RandomOverSampler; the fixed random_state
# makes the resampled rows reproducible between runs.
over_sampler = RandomOverSampler(random_state=11)
X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)

In [11]:
# Apply the same resampling (fresh sampler, same seed) to the test split.
# NOTE(review): the evaluation cells further down score the model on
# X_test_over / y_test_over, i.e. on a resampled copy of the test data rather
# than on its original class distribution — confirm this is intended.
over_sampler = RandomOverSampler(random_state=11)
X_test_over, y_test_over = over_sampler.fit_resample(X_test, y_test)

## 3. Select XGBoost parameters

In [12]:
import warnings
# Installs a global "ignore" filter: every warning category is suppressed
# from this point on, including deprecation notices from the ML libraries.
warnings.filterwarnings('ignore')

from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [14]:
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without an argument (or with a falsy one), returns the current
    ``datetime`` so the caller can keep it as the start mark.  Called with a
    start mark, prints the elapsed hours, minutes and seconds and returns
    ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None

    # No start mark supplied: hand back "now" to be used as one.
    return datetime.now()

In [13]:
from scipy import stats
from scipy.stats import randint

In [15]:
# Explicit XGBoost configuration, one keyword per line for easier diffing.
# NOTE(review): the later cells shown in this notebook score a pickled model
# and never fit or use xgb_model — presumably these are the hyper-parameters
# that model was trained with; confirm.
xgb_model = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.6,
    gamma=1,
    gpu_id=-1,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.5994130001745845,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=2,
    monotone_constraints='()',
    n_estimators=424,
    n_jobs=20,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1.0,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
    objective='binary:logistic',
    eval_metric='logloss',
)

## 4. Evaluate Performance

In [16]:
from sklearn.metrics import confusion_matrix,precision_score,accuracy_score,recall_score,f1_score,roc_auc_score

def get_clf_eval2(y_test, pred=None, pred_proba=None):
    """Print a confusion matrix and summary classification metrics.

    Accuracy is reported as-is; precision, recall and F1 are macro-averaged.
    The function only prints and returns ``None``.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels to compare against ``y_test``.
        pred_proba: Accepted so existing calls keep working; it is not used
            by any of the metrics computed here.
    """
    macro = {'average': 'macro'}

    cm = confusion_matrix(y_test, pred)
    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, **macro)
    rec = recall_score(y_test, pred, **macro)
    f1_macro = f1_score(y_test, pred, **macro)

    summary = "accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f}, F1:{3:.4f}\n"
    print("confusion\n", cm)
    print(summary.format(acc, prec, rec, f1_macro))
    

In [17]:
file_name = "./model_syn/xgb_reg_eng.pkl"

# load
# Open the file in a context manager so the handle is closed as soon as the
# model has been deserialised (a bare open() inside pickle.load leaks it).
# WARNING: unpickling can execute arbitrary code — only load model files that
# come from a trusted source.
with open(file_name, "rb") as model_file:
    w_xgb_model = pickle.load(model_file)

In [18]:
# Hard class predictions on the oversampled test features.
w_preds = w_xgb_model.predict(X_test_over)
# Second column of predict_proba, kept as a 1-D array of per-row scores.
w_pred_proba = w_xgb_model.predict_proba(X_test_over)[:, 1]

In [19]:
# Print the confusion matrix and macro-averaged metrics for the oversampled
# test split. w_pred_proba is passed through, but get_clf_eval2 does not use it.
get_clf_eval2(y_test_over, w_preds, w_pred_proba)

confusion
 [[5728 1240]
 [3245 3723]]
accuracy: 0.6782, precision: 0.6943, recall: 0.6782, F1:0.6714



## 5. Compare predicted output vs ground truth

In [20]:
# Test rows read from a separate CSV; the comparison table built below takes
# its identifier columns and the 'Fraud' label from this frame.
# presumably this is the pre-encoding version of X_test — TODO confirm.
X_org = pd.read_csv('./label_syn/df_enc_test_eng.csv', encoding='utf-8-sig')

In [21]:
# Display the test labels for a visual check.
y_test

Unnamed: 0,Fraud
0,1
1,0
2,0
3,1
4,0
...,...
8921,1
8922,0
8923,0
8924,0


In [22]:
# Display the test feature table for a visual check.
X_test

Unnamed: 0,Office ID,Process Type,Import Type,Import Use,Payment Type,Mode of Transport,Declarant ID,Importer ID,Seller ID,Courier ID,HS10 Code,Country of Departure,Country of Origin,Tax Rate,Tax Type,Country of Origin Indicator,Net Mass,Item Price
0,2,1,7,2,1,6,430,3539,1,1,1660,1,1,11,3,4,1.037901e-05,8.920839e-05
1,14,1,3,2,1,2,448,4899,1,1,1051,11,12,4,30,5,3.844731e-04,8.035984e-04
2,13,1,2,2,3,4,4,1499,1,1,39,1,1,8,2,3,9.995835e-04,4.675707e-03
3,23,1,5,2,1,1,759,10151,1,1,181,11,12,1,1,2,1.665973e-07,2.850324e-08
4,12,1,1,2,10,3,693,14725,1,1,3572,33,32,1,1,4,3.248646e-06,1.705029e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8921,12,1,5,2,1,4,363,3640,1,2,108,10,11,4,6,1,2.132445e-06,6.672404e-06
8922,27,1,5,2,1,6,219,11506,1,1,133,8,1,1,1,1,6.663890e-08,1.763087e-09
8923,13,1,5,2,1,2,1010,4178,1,2,409,5,9,4,8,1,1.665973e-08,6.429391e-07
8924,3,1,2,2,1,1,229,10067,1,1,419,1,1,6,3,1,2.274052e-05,2.683360e-05


In [24]:
# Predict on the original (non-resampled) test features, giving one
# prediction per row of X_test.
pred_critical = w_xgb_model.predict(X_test)

In [25]:
# Put the predictions into a single-column frame so they can be joined onto
# the identifier columns later. `a` wraps the predictions in an extra leading
# axis and a[0] takes that row back out.
a = np.array([pred_critical])
pred_df = pd.DataFrame({'Fraud_Prediction': a[0]})
pred_df

Unnamed: 0,Fraud_Prediction
0,1
1,0
2,0
3,0
4,1
...,...
8921,1
8922,1
8923,0
8924,1


In [26]:
# Keep only the identifier columns and the true label for the comparison table.
sample = X_org[[
    'Office ID',
    'Declarant ID',
    'Importer ID',
    'HS10 Code',
    'Fraud',
]]

In [27]:
# Attach the predictions next to the true labels.
# DataFrame.join aligns on the index, so this assumes X_org and X_test list
# the same rows in the same order — TODO confirm against how both CSVs
# were produced.
dfa = pd.DataFrame(sample)
final_df = dfa.join(pred_df)
final_df

Unnamed: 0,Office ID,Declarant ID,Importer ID,HS10 Code,Fraud,Fraud_Prediction
0,29,575N8BW,PEJWA0Y,8481201000,1,1
1,21,8ZM6GUW,9DIRDSY,4407299000,0,0
2,39,1XCM1XF,SRCDUMH,710807000,0,0
3,15,KEGR4JZ,XSK62NY,4202999000,1,0
4,30,607KRHF,DRMMKS4,8711301000,0,1
...,...,...,...,...,...,...
8921,30,QM7LO7M,LKVEEMK,8518109090,1,1
8922,12,DO8IOFX,RALHUGK,7326909000,0,1
8923,39,9O034UC,ML9KFEZ,8517629000,0,0
8924,40,FXK30O6,YIIADKE,9503003919,0,1


In [28]:
# Write the comparison table without the row index. index=False is the
# documented boolean form; the previous index=None only worked because None
# is falsy.
# '949' is Python's codec alias for cp949.
# NOTE(review): the input CSVs are read as utf-8-sig — confirm that consumers
# of this file really expect cp949.
final_df.to_csv('./label_syn/predict_evaluation_test_eng.csv', index=False, encoding='949')