In [1]:
import os
import pickle
import pandas as pd
import numpy as np

## 1. Load Dataset 

In [6]:
X_train = pd.read_csv('./label_syn/encoding_train1.csv', encoding='utf-8-sig')
X_valid = pd.read_csv('./label_syn/encoding_test.csv', encoding='utf-8-sig')

In [7]:
y_train= pd.read_csv('./label_syn/y_train1.csv', encoding='utf-8-sig')
y_valid= pd.read_csv('./label_syn/y_test.csv', encoding='utf-8-sig')

## 2. Over sampling

In [14]:
from imblearn.over_sampling import RandomOverSampler

In [15]:
over_sampler = RandomOverSampler(random_state = 11)
X_train_over,y_train_over = over_sampler.fit_resample(X_train,y_train)
print('Oversampler 적용 전 학습용 피처/레이블 데이터 세트: ', X_train.shape, y_train.shape)
print('Oversampler 적용 후 학습용 피처/레이블 데이터 세트: ', X_train_over.shape, y_train_over.shape)
print('Oversampler 적용 후 레이블 값 분포: \n', y_train_over.value_counts())

Oversampler 적용 전 학습용 피처/레이블 데이터 세트:  (36592, 18) (36592, 1)
Oversampler 적용 후 학습용 피처/레이블 데이터 세트:  (57484, 18) (57484, 1)
Oversampler 적용 후 레이블 값 분포: 
 우범여부
0       28742
1       28742
dtype: int64


In [16]:
over_sampler = RandomOverSampler(random_state = 11)
X_valid_over,y_valid_over = over_sampler.fit_resample(X_valid,y_valid)
print('Oversampler 적용 전 학습용 피처/레이블 데이터 세트: ', X_valid.shape, y_train.shape)
print('Oversampler 적용 후 학습용 피처/레이블 데이터 세트: ', X_valid_over.shape, y_valid_over.shape)
print('Oversampler 적용 후 레이블 값 분포: \n', y_valid_over.value_counts())

Oversampler 적용 전 학습용 피처/레이블 데이터 세트:  (8926, 18) (36592, 1)
Oversampler 적용 후 학습용 피처/레이블 데이터 세트:  (13936, 18) (13936, 1)
Oversampler 적용 후 레이블 값 분포: 
 우범여부
0       6968
1       6968
dtype: int64


## 3. xgboost 파라미터 선정

In [18]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from xgboost import XGBClassifier

In [19]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

## 3. 모델학습

### 3.1 학습/테스트 데이터 학습(xgboost) 성능

In [21]:
from sklearn.metrics import confusion_matrix,precision_score,accuracy_score,recall_score,f1_score,roc_auc_score

def get_clf_eval2(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average ='macro')
    recall = recall_score(y_test, pred, average ='macro')
    f1 = f1_score(y_test, pred, average ='macro')
    
    print("오차행렬\n", confusion)
    print("정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}\n".format(accuracy, precision, recall, f1))
    
# 모델 학습 함수



def get_model_train_eval2(model, ftr_train = None, ftr_test = None, tgt_train = None, tgt_test = None):
    model.fit(ftr_train, tgt_train)
    pred = model.predict(ftr_test)
    pred_proba = model.predict_proba(ftr_test)[:, 1]
    get_clf_eval2(tgt_test, pred, pred_proba)

In [22]:
xgb_model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5994130001745845, max_delta_step=0, max_depth=4,
              min_child_weight=2, monotone_constraints='()',
              n_estimators=424, n_jobs=20, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
              tree_method='exact', validate_parameters=1, verbosity=None, objective= 'binary:logistic', eval_metric='logloss')

In [23]:
start_time = timer(None) 
get_model_train_eval2(xgb_model, ftr_train = X_train_over, ftr_test = X_valid_over, 
                      tgt_train = y_train_over, tgt_test = y_valid_over)
timer(start_time) 

오차행렬
 [[5767 1201]
 [3619 3349]]
정확도: 0.6541, 정밀도: 0.6752, 재현율: 0.6541, F1:0.6434


 Time taken: 0 hours 0 minutes and 2.24 seconds.


### 3.2 기존 학습(xgboost) 모델을 가져와서 검증

In [25]:
from sklearn.metrics import confusion_matrix,precision_score,accuracy_score,recall_score,f1_score,roc_auc_score

def get_clf_eval2(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average ='macro')
    recall = recall_score(y_test, pred, average ='macro')
    f1 = f1_score(y_test, pred, average ='macro')
    
    print("오차행렬\n", confusion)
    print("정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}\n".format(accuracy, precision, recall, f1))
    
# 모델 학습 함수
def get_model_train_eval2(model, ftr_train = None, ftr_test = None, tgt_train = None, tgt_test = None):
    model.fit(ftr_train, tgt_train)
    pred = model.predict(ftr_test)
    pred_proba = model.predict_proba(ftr_test)[:, 1]
    get_clf_eval2(tgt_test, pred, pred_proba)

In [26]:
# 모델 불러오기

file_name = "./model/xgb_reg.pkl"

# load
w_xgb_model = pickle.load(open(file_name, "rb"))

In [27]:
# predict, predict_proba 추출

w_preds = w_xgb_model.predict(X_valid_over)
w_pred_proba = w_xgb_model.predict_proba(X_valid_over)[:,1]

In [28]:
# get_clf_eval()를 이용해 사키릿런 래퍼 XGBoost로 만들어진 모델 예측 성능 평가

get_clf_eval2(y_valid_over, w_preds, w_pred_proba)

오차행렬
 [[5767 1201]
 [3619 3349]]
정확도: 0.6541, 정밀도: 0.6752, 재현율: 0.6541, F1:0.6434



In [None]:
# calculate precision of top n% suspicious items
def precision_top_n(y_test, pred_proba, percentage):
    top_n = int(percentage * len(y_test))
    y_test_top_n = y_test[np.argpartition(pred_proba, -(top_n))[-top_n:]]
    precision_top_n = round(np.sum(y_test_top_n) / len(y_test_top_n), 4)
    print("precision for {}% suspicious group : {}".format(int(percentage*100), precision_top_n))
    
precision_top_n(np.array(y_test_over["Fraud"]), w_pred_proba, 0.05)
precision_top_n(np.array(y_test_over["Fraud"]), w_pred_proba, 0.1)

## 6. 실제 /예측값 비교

In [31]:
X_org = pd.read_csv('./label_syn/df_enc_test.csv', encoding='utf-8-sig')

In [34]:
pred_critical = w_xgb_model.predict(X_valid)

In [35]:
a = np.array([pred_critical])
pred_df = pd.DataFrame({'우범여부_예측':a[0]})
pred_df

Unnamed: 0,우범여부_예측
0,1
1,0
2,0
3,0
4,0
...,...
8921,1
8922,1
8923,0
8924,1


In [36]:
sample=X_org[['신고세관부호','신고인부호','수입자','HS10단위부호','우범여부']]

In [37]:
dfa = pd.DataFrame(sample)
final_df = dfa.join(pred_df)
final_df

Unnamed: 0,신고세관부호,신고인부호,수입자,HS10단위부호,우범여부,우범여부_예측
0,29,575N8BW,PEJWA0Y,8481201000,1,1
1,21,8ZM6GUW,9DIRDSY,4407299000,0,0
2,39,1XCM1XF,SRCDUMH,710807000,0,0
3,15,KEGR4JZ,XSK62NY,4202999000,1,0
4,30,607KRHF,DRMMKS4,8711301000,0,0
...,...,...,...,...,...,...
8921,30,QM7LO7M,LKVEEMK,8518109090,1,1
8922,12,DO8IOFX,RALHUGK,7326909000,0,1
8923,39,9O034UC,ML9KFEZ,8517629000,0,0
8924,40,FXK30O6,YIIADKE,9503003919,0,1


In [39]:
# 파일 저장 #
final_df.to_csv('./label_syn/predict_evaluation_test.csv', index=None, encoding='949')