In [1]:
#导入所需库
import numpy as np
import pandas as pd
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import recall_score, precision_recall_curve, auc, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV

import modin.pandas as mpd  

from lightgbm import LGBMClassifier

from sklearnex import patch_sklearn
patch_sklearn()

import daal4py as d4p

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
#读取数据
df = pd.read_csv('creditcard.csv')
df = mpd.DataFrame(df) 

In [3]:
#分割特征和标签
X = df.drop('Class', axis=1)
y = df['Class']

#划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
#过采样正样本
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

# LightGBM模型训练

In [5]:

lgb = LGBMClassifier(random_state=42)

start = time.time()
lgb.fit(X_train, y_train)  
end = time.time()

print('LightGBM model training time:', end - start, 'sec')

[LightGBM] [Info] Number of positive: 199008, number of negative: 199008
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7646
[LightGBM] [Info] Number of data points in the train set: 398016, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM model training time: 1.4185025691986084 sec


In [6]:
# 模型评估
y_pred = lgb.predict(X_test)

# 计算召回率
recall = recall_score(y_test, y_pred) 

# 计算F1
f1 = f1_score(y_test, y_pred)

# 计算AUPRC
precision, recall_for_auprc, thresholds = precision_recall_curve(y_test, y_pred)
auprc = auc(recall_for_auprc, precision)

print('Recall score:', round(recall, 3))
print('F1 score:', round(f1, 3))
print('AUPRC:', round(auprc, 3))

Recall score: 0.846
F1 score: 0.855
AUPRC: 0.855


# 使用DAAL加速预测

In [7]:

daal_model = d4p.get_gbt_model_from_lightgbm(lgb.booster_)
start = time.time()
y_pred_daal = d4p.gbt_classification_prediction(nClasses=2).compute(X_test, daal_model).prediction
end = time.time()
daal_time = end - start

#不使用DAAL
start = time.time()  
y_pred_original = lgb.predict(X_test)
end = time.time()
original_time = end - start

print('Prediction time with DAAL:', daal_time, 'sec')
print('Prediction time without DAAL:', original_time, 'sec')
print('DAAL obtained {:.1f}x speedup'.format(original_time / daal_time))

Prediction time with DAAL: 0.11145949363708496 sec
Prediction time without DAAL: 0.07911443710327148 sec
DAAL obtained 0.7x speedup


# 利用patch_sklearn加速随机森林训练

In [9]:
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import time


rf = RandomForestClassifier(random_state=42)

start = time.time()
rf.fit(X_train, y_train)
end = time.time()

print(f"RandomForest training time with acceleration: {end - start:.2f} sec")

y_pred = rf.predict(X_test)
f1 = f1_score(y_test, y_pred)
print(f"F1 score: {f1:.3f}")


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


RandomForest training time with acceleration: 3.24 sec
F1 score: 0.890


In [11]:
import daal4py as d4p
import time

# 配置随机森林训练参数
rf_params = {
    'nClasses': 2,
    'nTrees': 100,
    'featuresPerNode': X_train.shape[1],
    'maxTreeDepth': 10,
    'minObservationsInLeafNode': 5,
    # 其他参数...
}

# 使用oneDAL的随机森林训练模型
daal_rf = d4p.decision_forest_classification_training(**rf_params)

start = time.time()
rf_model = daal_rf.compute(X_train, y_train)
end = time.time()

print(f"RandomForest training time with oneDAL: {end - start:.2f} sec")

# 使用训练好的模型进行预测
n_classes = 2

# 使用训练好的模型进行预测
predict_algorithm = d4p.decision_forest_classification_prediction(nClasses=n_classes)

# 使用测试数据进行预测
y_pred_daal = predict_algorithm.compute(X_test, rf_model.model).prediction.ravel()

# 计算 F1 分数
f1 = f1_score(y_test, y_pred_daal)
print(f"F1 score with oneDAL: {f1:.3f}")


RandomForest training time with oneDAL: 30.83 sec
F1 score with oneDAL: 0.267


# XGBoost

In [12]:
import xgboost as xgb
from sklearn.metrics import f1_score
import time

# 模型参数
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'tree_method': 'hist',  # 使用直方图优化的版本
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

start = time.time()
bst = xgb.train(xgb_params, dtrain, num_boost_round=100)
end = time.time()

print(f"XGBoost training time with histogram optimization: {end - start:.2f} sec")

y_pred_xgb = bst.predict(dtest)
y_pred_xgb = (y_pred_xgb > 0.5).astype(int)  # 转换为类别

f1 = f1_score(y_test, y_pred_xgb)
print(f"F1 score with XGBoost: {f1:.3f}")


XGBoost training time with histogram optimization: 1.60 sec
F1 score with XGBoost: 0.885


总结：
LightGBM
训练较快，召回率0.846，较高，意味着模型能够识别出大部分的正类样本（欺诈交易）。F1分数：0.855较高，平衡了精确度和召回率。AUPRC 0.855，表明模型在处理不平衡数据集时表现良好。

DAAL (oneDAL) 加速预测
预测时间：0.111秒；原始预测时间：0.079秒，加速比0.7倍。
在这个特定场景下，DAAL未能提供预测加速，反而比原生LightGBM慢了一些。
Scikit-learn-intelex 加速训练
随机森林训练时间（加速后）：3.24秒。
F1分数0.890，较高，表明模型在预测欺诈交易时效果很好。
oneDAL 加速训练
随机森林训练时间（oneDAL）：30.83秒，比Scikit-learn-intelex慢很多，可能是由于数据传输或参数配置不当。
F1分数（oneDAL）：0.267，这个分数明显低于预期，表明可能存在问题，需要进一步调查和优化。
XGBoost
训练时间：1.60秒，和LightGBM相当，证明XGBoost也是一个快速的训练算法。
F1分数：0.885，这个分数与Scikit-learn-intelex加速后的随机森林相近，表明XGBoost在此问题上表现良好。