In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [None]:
#模型融合之Stacking 

In [2]:
# 数据集预览
data = pd.read_csv('./data/data_2.csv',encoding='gbk')
# 去重
data.drop_duplicates(inplace=True)
print(data.shape)
data.head()

(4455, 81)


Unnamed: 0,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,transd_mcc,trans_days_interval_filter,trans_days_interval,regional_mobility,...,loans_max_limit,loans_avg_limit,consfin_credit_limit,consfin_credibility,consfin_org_count_current,consfin_product_count,consfin_max_limit,consfin_avg_limit,latest_query_day,loans_latest_day
0,0.01,0.99,0,0.9,0.55,0.313,17.0,27.0,26.0,3.0,...,2900.0,1688.0,1200.0,75.0,1.0,2.0,1200.0,1200.0,12.0,18.0
1,0.02,0.94,2000,1.28,1.0,0.458,19.0,30.0,14.0,4.0,...,3500.0,1758.0,15100.0,80.0,5.0,6.0,22800.0,9360.0,4.0,2.0
2,0.04,0.96,0,1.0,1.0,0.114,13.0,68.0,22.0,1.0,...,1600.0,1250.0,4200.0,87.0,1.0,1.0,4200.0,4200.0,2.0,6.0
3,0.0,0.96,2000,0.13,0.57,0.777,22.0,14.0,6.0,3.0,...,3200.0,1541.0,16300.0,80.0,5.0,5.0,30000.0,12180.0,2.0,4.0
4,0.01,0.99,0,0.46,1.0,0.175,13.0,66.0,42.0,1.0,...,2300.0,1630.0,8300.0,79.0,2.0,2.0,8400.0,8250.0,22.0,120.0


In [3]:
# 划分训练集测试集
from sklearn.model_selection import train_test_split
y = data['status']
X = data.drop(['status'],axis = 1)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2018)

In [4]:
X.shape

(4455, 80)

In [5]:
# 数据归一化
from sklearn.preprocessing import minmax_scale
X_train = minmax_scale(X_train)
X_test =  minmax_scale(X_test)

In [7]:
#模型评估
from sklearn.metrics import accuracy_score, roc_auc_score

def model_metrics(clf, X_train, X_test, y_train, y_test):
    # 预测
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    
    y_train_proba = clf.predict_proba(X_train)[:,1]
    y_test_proba = clf.predict_proba(X_test)[:,1]
    
    # 准确率
    print('[准确率]', end = ' ')
    print('训练集：', '%.4f'%accuracy_score(y_train, y_train_pred), end = ' ')
    print('测试集：', '%.4f'%accuracy_score(y_test, y_test_pred))
    
    # auc取值：用roc_auc_score或auc
    print('[auc值]', end = ' ')
    print('训练集：', '%.4f'%roc_auc_score(y_train, y_train_proba), end = ' ')
    print('测试集：', '%.4f'%roc_auc_score(y_test, y_test_proba))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

from mlxtend.classifier import StackingClassifier

In [9]:
# 模型调优后得到的参数
lr = LogisticRegression(C = 0.1, penalty = 'l1')
svm_linear = svm.SVC(C = 0.01, kernel = 'linear', probability=True)
svm_poly =  svm.SVC(C = 0.01, kernel = 'poly', probability=True)
svm_rbf =  svm.SVC(gamma = 0.01, C =0.01 , probability=True)
svm_sigmoid =  svm.SVC(C = 0.01, kernel = 'sigmoid',probability=True)
dt = DecisionTreeClassifier(max_depth=11,min_samples_split=550,min_samples_leaf=80,max_features=19)
xgb = XGBClassifier(learning_rate =0.01, n_estimators=180, max_depth=3, min_child_weight=5, 
                    gamma=0.4, subsample=0.5, colsample_bytree=0.9, reg_alpha=1, 
                    objective= 'binary:logistic', nthread=4,scale_pos_weight=1, seed=27)
lgb = LGBMClassifier(learning_rate =0.1, n_estimators=60, max_depth=3, min_child_weight=11, 
                    gamma=0.1, subsample=0.5, colsample_bytree=0.8, reg_alpha=1e-5, 
                    nthread=4,scale_pos_weight=1, seed=27)

In [None]:
#使用4种SVM、决策树、XGBoost和LightGBM作为初级分类器，LR作为次级分类器。

In [10]:
#1、将初级分类器产生的类别标签作为新特征
sclf = StackingClassifier(classifiers=[svm_linear, svm_poly, svm_rbf, 
                                       svm_sigmoid, dt, xgb, lgb], meta_classifier=lr)

In [11]:
sclf.fit(X_train, y_train.values)
model_metrics(sclf, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8291 测试集： 0.7876
[auc值] 训练集： 0.7014 测试集： 0.7277


In [12]:
#2、将初级分类器产生的输出类概率作为新特征
#对输出概率use_probas=True，有两种不同的处理方式。
sclf = StackingClassifier(classifiers=[svm_linear, svm_poly, svm_rbf, svm_sigmoid, dt, xgb, lgb], 
                            meta_classifier=lr, use_probas=True,average_probas=False)

In [13]:
sclf.fit(X_train, y_train.values)
model_metrics(sclf, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8364 测试集： 0.7704
[auc值] 训练集： 0.8722 测试集： 0.8082
