使用Stacking方法来完成模型融合
---

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 读入数据
train=pd.read_csv('../data/train_data.csv')
test=pd.read_csv('../data/test_data.csv')

In [3]:
# 训练集与测试集的导入
train_y=train['status']
train_X=train.drop(['status'],axis=1)

test_y=test['status']
test_X=test.drop(['status'],axis=1)

In [4]:
# 数据归一化操作
from sklearn.preprocessing import minmax_scale
train_X=minmax_scale(train_X)
test_X=minmax_scale(test_X)

## 使用模型来拟合数据并且查看准确率

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score

In [6]:
# 使用逻辑回归运行五折交叉验证，网格搜索来获取最优参数
from sklearn.model_selection import GridSearchCV

# C为正则化系数λ的倒数，必须为正数，默认为1，值越小，代表正则化越强。一般来说，只需要调节这个参数
grid_lr = GridSearchCV(LogisticRegression(), param_grid={"C":[0.01,0.05,0.1,0.2,0.5, 1, 10]}, cv=5)
grid_lr.fit(train_X,train_y)
print("The best parameters are %s with a score of %0.2f"
      % (grid_lr.best_params_, grid_lr.best_score_))

The best parameters are {'C': 10} with a score of 0.80


In [7]:
# 使用SVM运行五折交叉验证，网格搜索来获取最优参数

# 惩罚系数C,核函数参数gamma,可以调节这两个餐素
grid_svm=GridSearchCV(SVC(probability=True),param_grid={"C":[0.1,0.5, 1, 10,20], "gamma": [1, 0.5,0.1, 0.01]})
grid_svm.fit(train_X,train_y)
print("The best parameters are %s with a score of %0.2f"
      % (grid_svm.best_params_, grid_svm.best_score_))

The best parameters are {'C': 10, 'gamma': 0.1} with a score of 0.79


In [8]:
# 使用决策树运行五折交叉验证，网格搜索来获取最优参数

# 决策树的模型一般这是需要调节最大深度即可
grid_dt=GridSearchCV(DecisionTreeClassifier(),param_grid={"max_depth":[i for i in range(1,10)]},cv=5)
grid_dt.fit(train_X,train_y)
print("The best parameters are %s with a score of %0.2f"
      % (grid_dt.best_params_, grid_dt.best_score_))

The best parameters are {'max_depth': 4} with a score of 0.76


In [9]:
# GBDT进行调参，这个过程可以分为如下进行

# 1。先对步长和迭代次数进行调参
param_test1 = {'n_estimators':range(20,81,10),'learning_rate':[0.05,0.1,0.2,0.5]}
grid_gbdt = GridSearchCV(estimator = GradientBoostingClassifier(min_samples_split=300,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10), 
                       param_grid = param_test1, scoring='roc_auc',iid=False,cv=5)
grid_gbdt.fit(train_X,train_y)
grid_gbdt.best_params_, grid_gbdt.best_score_

({'learning_rate': 0.05, 'n_estimators': 80}, 0.8007820916826951)

In [10]:
# 2.对其弱分类器决策树进行调参
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(100,801,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=80, min_samples_leaf=20, 
      max_features='sqrt', subsample=0.8, random_state=10), 
   param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(train_X,train_y)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 5, 'min_samples_split': 300}, 0.8028316783445085)

In [11]:
# 3.对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。
param_test3 = {'min_samples_split':range(800,1900,200), 'min_samples_leaf':range(60,101,10)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=80,max_depth=13,
                                     max_features='sqrt', subsample=0.8, random_state=10), 
                       param_grid = param_test3, scoring='roc_auc',iid=False, cv=5)
gsearch3.fit(train_X,train_y)
gsearch3.best_params_, gsearch3.best_score_

({'min_samples_leaf': 70, 'min_samples_split': 800}, 0.7992744969014101)

In [12]:
# 对于RF，也可以按照上面的流程进行调参
# 1。先对步长和迭代次数进行调参
param_test1 = {'n_estimators':range(10,71,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10), 
                       param_grid = param_test1, scoring='roc_auc',cv=5)
gsearch1.fit(train_X,train_y)
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 70}, 0.7935017280083538)

In [13]:
# 2.对其弱分类器决策树进行调参
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(50,201,20)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                  min_samples_leaf=20,max_features='sqrt' ,oob_score=True, random_state=10),
   param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(train_X,train_y)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 5, 'min_samples_split': 70}, 0.7947707409289428)

In [14]:
# 3.对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。
param_test3 = {'min_samples_split':range(80,150,20), 'min_samples_leaf':range(10,60,10)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 70, max_depth=13,
                                  max_features='sqrt' ,oob_score=True, random_state=10),
   param_grid = param_test3, scoring='roc_auc',iid=False, cv=5)
gsearch3.fit(train_X,train_y)
gsearch3.best_params_, gsearch3.best_score_

({'min_samples_leaf': 20, 'min_samples_split': 80}, 0.7966553035766241)

In [15]:
# KNN,主要是k值的选择，以及停止建子树的叶子节点阈值leaf_size的参数的选择
params={'n_neighbors':[i for i in range(1,20,2)],'leaf_size':[i for i in range(10,100,10)]}
gsearch4 = GridSearchCV(estimator = KNeighborsClassifier(),param_grid = params,cv=5)
gsearch4.fit(train_X,train_y)
gsearch4.best_params_, gsearch4.best_score_


({'leaf_size': 10, 'n_neighbors': 15}, 0.7558139534883721)

In [16]:
# 使用上面使用的最优的参数进行建模
#  LogisticRegression
lr=LogisticRegression(C=1)
lr.fit(train_X,train_y)

# SVM
svm=SVC(C=10,gamma=0.1,probability=True)
svm.fit(train_X,train_y)

# DecisionTree
dt=DecisionTreeClassifier(max_depth=4)
dt.fit(train_X,train_y)

# RF
rf=RandomForestClassifier(n_estimators=60,max_depth=13,min_samples_split=80,min_samples_leaf=10)
rf.fit(train_X,train_y)

# GBDT
gbdt=GradientBoostingClassifier(learning_rate=0.01,n_estimators=80,max_depth=13,min_samples_split=300,min_samples_leaf=60)
gbdt.fit(train_X,train_y)

# KNN
knn=KNeighborsClassifier(n_neighbors=19,leaf_size=10)
knn.fit(train_X,train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=19, p=2,
           weights='uniform')

In [17]:
# 当存在多个模型的时候，我们可以一起输出准确率，尝试使用列表的形式
names=["LR","SVM","DecisionTree","RF","KNN","GBDT"]
models=[lr,svm,dt,rf,knn,gbdt]

In [18]:
import numpy as np
df_list=[]
for name,model in zip(names,models):
    y_train_pred=model.predict(train_X)
    y_test_pred=model.predict(test_X)
    
    # accuracy
    train_accuracy=model.score(train_X,y_train_pred)
    test_accuracy=model.score(test_X,y_test_pred)
    
    # precision
    train_precision=precision_score(train_y,y_train_pred)
    test_precision=precision_score(test_y,y_test_pred)
    
    # recall
    train_recall=recall_score(train_y,y_train_pred)
    test_recall=recall_score(test_y,y_test_pred)
    
    # f1
    train_f1=f1_score(train_y,y_train_pred)
    test_f1=f1_score(test_y,y_test_pred)
    
    # auc
    y_train_pred=model.predict_proba(train_X)[:,1]
    y_test_pred=model.predict_proba(test_X)[:,1]
    
    train_auc=roc_auc_score(train_y,y_train_pred)
    test_auc=roc_auc_score(test_y,y_test_pred)
    
    print('{} 训练集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,train_accuracy,train_precision,train_recall,train_f1,train_auc))
    print('{} 测试集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,test_accuracy,test_precision,test_recall,test_f1,test_auc))
    print('\n')
    df = pd.DataFrame(np.array([train_accuracy,train_precision,train_recall,train_f1,train_auc,test_accuracy,test_precision,test_recall,test_f1,test_auc]).reshape(2,-1),
                  index = ['train','test'],
                  columns = ['Accuracy','Precision','Recall','F1-Score','AUC-Score'])
    df_list.append(df)
pd.concat(df_list,axis=0,keys=names)

LR 训练集： accuracy:1.0,precision:0.744, recall:0.34, f1:0.467, auc:0.81
LR 测试集： accuracy:1.0,precision:0.669, recall:0.291, f1:0.405, auc:0.772


SVM 训练集： accuracy:1.0,precision:0.838, recall:0.346, f1:0.489, auc:0.864
SVM 测试集： accuracy:1.0,precision:0.632, recall:0.284, f1:0.392, auc:0.756


DecisionTree 训练集： accuracy:1.0,precision:0.749, recall:0.334, f1:0.462, auc:0.785
DecisionTree 测试集： accuracy:1.0,precision:0.577, recall:0.294, f1:0.389, auc:0.686


RF 训练集： accuracy:1.0,precision:0.865, recall:0.339, f1:0.487, auc:0.894
RF 测试集： accuracy:1.0,precision:0.695, recall:0.334, f1:0.451, auc:0.769


KNN 训练集： accuracy:1.0,precision:0.721, recall:0.166, f1:0.27, auc:0.77
KNN 测试集： accuracy:1.0,precision:0.621, recall:0.169, f1:0.265, auc:0.693


GBDT 训练集： accuracy:1.0,precision:0.895, recall:0.129, f1:0.225, auc:0.861
GBDT 测试集： accuracy:1.0,precision:0.829, recall:0.0906, f1:0.163, auc:0.747




Unnamed: 0,Unnamed: 1,Accuracy,Precision,Recall,F1-Score,AUC-Score
LR,train,1.0,0.743802,0.340479,0.467128,0.809639
LR,test,1.0,0.669065,0.290625,0.405229,0.772362
SVM,train,1.0,0.83792,0.345523,0.489286,0.863781
SVM,test,1.0,0.631944,0.284375,0.392241,0.756191
DecisionTree,train,1.0,0.748588,0.334174,0.462075,0.785285
DecisionTree,test,1.0,0.576687,0.29375,0.389234,0.686473
RF,train,1.0,0.864952,0.339218,0.487319,0.893763
RF,test,1.0,0.694805,0.334375,0.451477,0.7692
KNN,train,1.0,0.721311,0.166456,0.270492,0.770401
KNN,test,1.0,0.62069,0.16875,0.265356,0.692568


In [19]:
# 导入相关的stacking的工具，然后将比较好的模型来做一个融合
from mlxtend.classifier import StackingClassifier
# stacking融合算法的目标是在每个子模块1、子模块2的设计选择过程中要尽可能的保证：
# high biase
# low var
# 在子模块meta_classifier的时候，要保证：

# low biase
# high var

In [20]:
sclf = StackingClassifier(classifiers=[lr, gbdt,rf,svm], meta_classifier=svm)
sclf1 = StackingClassifier(classifiers=[gbdt,svm,knn], meta_classifier=knn)
sclf2 = StackingClassifier(classifiers=[gbdt,svm,dt], meta_classifier=lr)
sclf3 = StackingClassifier(classifiers=[svm,dt], meta_classifier=rf)

In [21]:
stack_names=['sclf','sclf1','sclf2','sclf3']
stack_models=[sclf,sclf1,sclf2,sclf3]

In [24]:
stack_df_list=[]
for name,model in zip(stack_names,stack_models):
    model.fit(train_X, train_y)  
    y_train_pred=model.predict(train_X)
    y_test_pred=model.predict(test_X)
    
    # accuracy
    train_accuracy=model.score(train_X,y_train_pred)
    test_accuracy=model.score(test_X,y_test_pred)
    
    # precision
    train_precision=precision_score(train_y,y_train_pred)
    test_precision=precision_score(test_y,y_test_pred)
    
    # recall
    train_recall=recall_score(train_y,y_train_pred)
    test_recall=recall_score(test_y,y_test_pred)
    
    # f1
    train_f1=f1_score(train_y,y_train_pred)
    test_f1=f1_score(test_y,y_test_pred)
    
    # auc
    y_train_pred=model.predict_proba(train_X)[:,1]
    y_test_pred=model.predict_proba(test_X)[:,1]
    
    train_auc=roc_auc_score(train_y,y_train_pred)
    test_auc=roc_auc_score(test_y,y_test_pred)
    
    print('{} 训练集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,train_accuracy,train_precision,train_recall,train_f1,train_auc))
    print('{} 测试集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,test_accuracy,test_precision,test_recall,test_f1,test_auc))
    print('\n')
    df = pd.DataFrame(np.array([train_accuracy,train_precision,train_recall,train_f1,train_auc,test_accuracy,test_precision,test_recall,test_f1,test_auc]).reshape(2,-1),
                  index = ['train','test'],
                  columns = ['Accuracy','Precision','Recall','F1-Score','AUC-Score'])
    stack_df_list.append(df)
pd.concat(stack_df_list,axis=0,keys=names)

sclf 训练集： accuracy:1.0,precision:0.836, recall:0.406, f1:0.547, auc:0.695
sclf 测试集： accuracy:1.0,precision:0.611, recall:0.344, f1:0.44, auc:0.642


sclf1 训练集： accuracy:1.0,precision:0.838, recall:0.346, f1:0.489, auc:0.667
sclf1 测试集： accuracy:1.0,precision:0.632, recall:0.284, f1:0.392, auc:0.629


sclf2 训练集： accuracy:1.0,precision:0.761, recall:0.456, f1:0.571, auc:0.707
sclf2 测试集： accuracy:1.0,precision:0.547, recall:0.384, f1:0.451, auc:0.648


sclf3 训练集： accuracy:1.0,precision:0.761, recall:0.456, f1:0.571, auc:0.707
sclf3 测试集： accuracy:1.0,precision:0.547, recall:0.384, f1:0.451, auc:0.648




Unnamed: 0,Unnamed: 1,Accuracy,Precision,Recall,F1-Score,AUC-Score
LR,train,1.0,0.836364,0.406053,0.546689,0.694525
LR,test,1.0,0.611111,0.34375,0.44,0.641516
SVM,train,1.0,0.83792,0.345523,0.489286,0.667489
SVM,test,1.0,0.631944,0.284375,0.392241,0.629193
DecisionTree,train,1.0,0.760504,0.456494,0.570528,0.706954
DecisionTree,test,1.0,0.546667,0.384375,0.451376,0.647691
RF,train,1.0,0.760504,0.456494,0.570528,0.707264
RF,test,1.0,0.546667,0.384375,0.451376,0.648104


In [None]:
# 可以看到，stacking方法还是对提升准确率有一定效果的，但是也可以分析，某些stacking方法做起来之后，效果却不好了，
# 要懂原理再去使用stacking，这样才能最好的达到模型的效果
# reference:https://zhuanlan.zhihu.com/p/56086368