In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from collections import OrderedDict
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

## 1. Load Dataset 

In [2]:
# Load the feature tables for the train and test splits
# (file names suggest the features are already encoded — TODO confirm).
X_train = pd.read_csv('../label_syn/encoding_train1_eng.csv')
X_test = pd.read_csv('../label_syn/encoding_test_eng.csv')

In [3]:
# Load the label tables for the same two splits.
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if the file has one.
y_train= pd.read_csv('../label_syn/y_train1_eng.csv', encoding='utf-8-sig')
y_test= pd.read_csv('../label_syn/y_test_eng.csv', encoding='utf-8-sig')

In [4]:
# Class balance of the test labels before any resampling is applied.
y_test.value_counts()

Fraud
0        6968
1        1958
dtype: int64

## 2. Over sampling

In [5]:
from imblearn.over_sampling import RandomOverSampler

In [6]:
# Balance the training split by random over-sampling; the fixed random_state
# makes the resampled rows reproducible.
# NOTE(review): X_train_over / y_train_over are not referenced again in the
# cells visible here — confirm whether they are still needed.
over_sampler = RandomOverSampler(random_state = 11)
X_train_over,y_train_over = over_sampler.fit_resample(X_train,y_train)

In [7]:
# Apply the same random over-sampling to the test split. The balanced copies
# (X_test_over, y_test_over) are what the evaluation cells below score against.
# NOTE(review): metrics computed on a resampled test set do not reflect the
# original class distribution (6968 vs 1958 in the value_counts output above) —
# confirm this is the intended evaluation protocol.
over_sampler = RandomOverSampler(random_state = 11)
X_test_over,y_test_over = over_sampler.fit_resample(X_test,y_test)

## 3. Select xgboost parameters

In [8]:
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [9]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since ``start_time``.

    Called with no argument (or a falsy one), returns the current
    ``datetime`` so it can be passed back in later. Called with a start
    time, prints the elapsed hours, minutes and seconds and returns None.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))

In [10]:
from scipy import stats
from scipy.stats import randint

In [11]:
# XGBoost classifier with fixed hyperparameters (binary logistic objective,
# log-loss as the evaluation metric).
# NOTE(review): this estimator is neither fitted nor used in the cells visible
# here — the evaluation below runs on a model loaded from a pickle file.
xgb_model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5994130001745845, max_delta_step=0, max_depth=4,
              min_child_weight=2, monotone_constraints='()',
              n_estimators=424, n_jobs=20, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
              tree_method='exact', validate_parameters=1, verbosity=None, objective= 'binary:logistic', eval_metric='logloss')

## 4. Evaluate Performance

In [12]:
from sklearn.metrics import confusion_matrix,precision_score,accuracy_score,recall_score,f1_score,roc_auc_score

def get_clf_eval2(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary classification metrics.

    Parameters
    ----------
    y_test : true labels.
    pred : predicted labels, compared against ``y_test``.
    pred_proba : accepted for call compatibility; not used by this function.

    Prints accuracy plus macro-averaged precision, recall and F1.
    Returns None.
    """
    matrix = confusion_matrix(y_test, pred)
    acc = accuracy_score(y_test, pred)
    # Precision, recall and F1 all use the same macro averaging.
    macro_precision, macro_recall, macro_f1 = (
        scorer(y_test, pred, average ='macro')
        for scorer in (precision_score, recall_score, f1_score)
    )

    print("confusion\n", matrix)
    print("accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f}, F1:{3:.4f}\n".format(acc, macro_precision, macro_recall, macro_f1))
    

In [14]:
file_name = "../model_syn/xgb_reg_eng.pkl"

# load
# The context manager closes the file handle as soon as the model is read.
# NOTE(security): unpickling can execute arbitrary code — only load model
# files that come from a trusted source.
with open(file_name, "rb") as model_file:
    w_xgb_model = pickle.load(model_file)

In [15]:
# Score the over-sampled test set with the loaded model: hard class predictions,
# plus column 1 of predict_proba (the second class, presumably Fraud = 1 — TODO confirm).
w_preds = w_xgb_model.predict(X_test_over)
w_pred_proba = w_xgb_model.predict_proba(X_test_over)[:,1]

In [16]:
# Report the confusion matrix and macro-averaged metrics on the over-sampled test set.
# w_pred_proba is passed along, but get_clf_eval2 does not use its pred_proba argument.
get_clf_eval2(y_test_over, w_preds, w_pred_proba)

confusion
 [[5728 1240]
 [3245 3723]]
accuracy: 0.6782, precision: 0.6943, recall: 0.6782, F1:0.6714



In [28]:
# calculate precision of top n% suspicious items
def precision_top_n(y_test, pred_proba, percentage):
    """Print the precision among the highest-scored ``percentage`` of items.

    Parameters
    ----------
    y_test : array-like
        True labels, expected to be 0/1 so that their mean is the share of
        positives. Converted with ``np.asarray`` so selection is positional.
    pred_proba : array-like
        One score per item, same length as ``y_test``; higher means more
        suspicious.
    percentage : float
        Fraction of items to inspect (0.05 means the top 5%). The group
        size is ``int(percentage * len(y_test))``.

    Raises
    ------
    ValueError
        If the two inputs differ in length, or if the requested fraction
        selects no items or more items than are available.
    """
    labels = np.asarray(y_test)
    scores = np.asarray(pred_proba)
    if len(labels) != len(scores):
        raise ValueError(
            "y_test and pred_proba must have the same length, got {} and {}".format(
                len(labels), len(scores)))

    top_n = int(percentage * len(labels))
    # With top_n == 0 the slice [-0:] would select *every* item and silently
    # report the overall positive rate, so reject an empty group explicitly.
    if not 1 <= top_n <= len(labels):
        raise ValueError(
            "percentage={} selects {} of {} items; expected between 1 and {}".format(
                percentage, top_n, len(labels), len(labels)))

    # argpartition places the top_n largest scores in the last top_n slots
    # without fully sorting the array.
    top_idx = np.argpartition(scores, -top_n)[-top_n:]
    y_test_top_n = labels[top_idx]
    precision = round(float(np.sum(y_test_top_n)) / len(y_test_top_n), 4)
    # round() before int(): 0.29 * 100 is 28.999999999999996 in floating point
    # and plain int() would label the group "28%".
    print("precision for {}% suspicious group : {}".format(int(round(percentage * 100)), precision))
    
# Precision within the 5% and 10% highest-scored items of the over-sampled test set.
precision_top_n(np.array(y_test_over["Fraud"]), w_pred_proba, 0.05)
precision_top_n(np.array(y_test_over["Fraud"]), w_pred_proba, 0.1)

precision for 5% suspicious group : 0.8994
precision for 10% suspicious group : 0.883


## 5. Compare predicted output vs ground truth

In [None]:
# Load the test table that still carries the identifier columns and the Fraud
# label, which are selected further down for the side-by-side comparison.
X_org = pd.read_csv('../label_syn/df_enc_test_eng.csv', encoding='utf-8-sig')

In [None]:
# Display the (not over-sampled) test labels for a visual check.
y_test

In [None]:
# Display the (not over-sampled) test features for a visual check.
X_test

In [None]:
# Predict on the original test features (X_test, not the over-sampled copy).
pred_critical = w_xgb_model.predict(X_test)

In [None]:
# Put the predictions into a one-column DataFrame.
# np.array([pred_critical]) adds a leading axis of length 1 and a[0] removes it
# again, so the column holds the predictions unchanged.
a = np.array([pred_critical])
pred_df = pd.DataFrame({'Fraud_Prediction':a[0]})
pred_df

In [None]:
# Keep only the identifier columns and the Fraud label from the original test table.
sample=X_org[['Office ID','Declarant ID','Importer ID','HS10 Code','Fraud']]

In [None]:
# Place the Fraud label and the model prediction side by side.
# DataFrame.join aligns rows on the index, so this assumes X_org and X_test
# list the same rows in the same order — TODO confirm.
dfa = pd.DataFrame(sample)
final_df = dfa.join(pred_df)
final_df

In [None]:
# Save the comparison table without the index column. to_csv documents `index`
# as a bool, so pass False explicitly instead of relying on None being falsy.
# '949' is Python's alias for the cp949 codec; note the input CSVs above are
# read as UTF-8, so readers of this file must expect a different encoding.
final_df.to_csv('../label_syn/predict_evaluation_test_eng.csv', index=False, encoding='949')