# 基于Intel® Distribution of Modin 和 Intel® Extension for Scikit-learn 及 Intel® DAAL加速的信用卡交易欺诈检测

整体介绍：使用modin和sklearnex加速数据的读取、处理和模型的训练，采用XGBoost Optimized for Intel® Architecture 作为分类模型，最后使用daal4py加快预测速度

In [1]:
import os
import numpy as np
import warnings

warnings.filterwarnings("ignore")

导入modin和sklearnex，设置 HDK 作为后端计算引擎，使用patch_sklearn()进行加速

In [2]:
import modin.pandas as pd

import modin.config as cfg
cfg.StorageFormat.put('hdk')

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn import config_context
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, average_precision_score
from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


读入数据文件（数据文件放在同目录下）

In [3]:
import time
dt_start = time.time()

df = pd.read_csv('creditcard.csv')
print("read_csv time: ", time.time() - dt_start)

read_csv time:  3.59470272064209


划分数据集，预处理不平衡数据（SMOTE）

In [4]:
dt_start = time.time()

X = df.drop(columns=["Class"], axis = 1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 21)

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)#
X_res, y_res = sm.fit_resample(X_train, y_train)

X_train = X_res
y_train = y_res

print(X_train.columns.tolist())

print("SMOTE time: ", time.time() - dt_start)

['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
SMOTE time:  1.1676709651947021


训练XGBoost

In [5]:
dt_start = time.time()
from xgboost import XGBClassifier
import daal4py as d4p
 
xgb = XGBClassifier(eval_metric='mlogloss')
xgb.fit(X_train, y_train)
print("Training time: ", time.time() - dt_start)

Training time:  36.793660402297974


直接用xgboost预测，预测时间较长。根据题目要求，使用F1分数和AUPRC作为评价依据。

In [6]:
#不使用daal加速预测，预测时间较长

dt_start = time.time()

xgb_prediction = xgb.predict(X_test)

print("Prediction time: ", time.time() - dt_start)

print("f1 score: ", f1_score(y_test, xgb_prediction))

print("AUPRC: ", average_precision_score(y_test, xgb_prediction))

Prediction time:  0.1606733798980713
f1 score:  0.825938566552901
AUPRC:  0.6827009327829815


使用daal4py加速，预测时间大大缩短，精度没有损失

In [7]:
#使用daal加速，预测时间显著缩短

daal_model = d4p.get_gbt_model_from_xgboost(xgb.get_booster())

dt_start = time.time()

daal_prediction = d4p.gbt_classification_prediction(nClasses = 2, resultsToEvaluate = "computeClassLabels|computeClassProbabilities").compute(X_test, daal_model)

print("Prediction time: ", time.time() - dt_start)

print("f1 score: ", f1_score(y_test, daal_prediction.prediction))

print("AUPRC: ", average_precision_score(y_test, daal_prediction.prediction))

Prediction time:  0.026927709579467773
f1 score:  0.825938566552901
AUPRC:  0.6827009327829815


使用网格搜索+交叉验证的随机森林

In [8]:
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

dt_start = time.time()

param_grid = {
#   训练时长太长，为重跑快一点直接把找到的最好参数写上去了
    'min_samples_split': [2],
    'n_estimators' : [11],
    'max_depth': [15],
    'max_features': [8]
}

scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    'average_precision_score': make_scorer(average_precision_score)
}

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(criterion = 'entropy',random_state = 42)


def grid_search_wrapper(refit_score='accuracy_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(classifier, param_grid, refit=refit_score,
                           cv=skf, return_train_score=True,scoring=scorers, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # make the predictions
    y_pred = grid_search.predict(X_test)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    print("f1 score: ",f1_score(y_test, y_pred))
    print("AUPRC: ", average_precision_score(y_test, y_pred))
    return grid_search

grid_search_clf = grid_search_wrapper(refit_score='average_precision_score')

print("Training time: ", time.time() - dt_start)

Best params for average_precision_score
{'max_depth': 15, 'max_features': 8, 'min_samples_split': 2, 'n_estimators': 11}

Confusion matrix of Random Forest optimized for average_precision_score on the test data:
     pred_neg  pred_pos
neg     85267        27
pos        25       124
f1 score:  0.8266666666666668
AUPRC:  0.6837007441960327
Training time:  81.75254321098328


In [9]:
dt_start = time.time()
 
xgb_1 = XGBClassifier(
    learning_rate = 0.4,
    eval_metric='mlogloss'
)
xgb_1.fit(X_train, y_train)
print("Training time: ", time.time() - dt_start)

Training time:  33.207133769989014


In [10]:
daal_model = d4p.get_gbt_model_from_xgboost(xgb_1.get_booster())

dt_start = time.time()

daal_prediction = d4p.gbt_classification_prediction(nClasses = 2, resultsToEvaluate = "computeClassLabels|computeClassProbabilities").compute(X_test, daal_model)

print("Prediction time: ", time.time() - dt_start)

print("f1 score: ", f1_score(y_test, daal_prediction.prediction))

print("AUPRC: ", average_precision_score(y_test, daal_prediction.prediction))

Prediction time:  0.0382230281829834
f1 score:  0.8367346938775511
AUPRC:  0.7005588671327658
