使用五折交叉验证法，GridSearch来寻找模型的最优参数
---

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 读入数据
train=pd.read_csv('../data/train_data.csv')
test=pd.read_csv('../data/test_data.csv')

In [4]:
# 训练集与测试集的导入
train_y=train['status']
train_X=train.drop(['status'],axis=1)

test_y=test['status']
test_X=test.drop(['status'],axis=1)

In [5]:
# 数据归一化操作
from sklearn.preprocessing import minmax_scale
train_X=minmax_scale(train_X)
test_X=minmax_scale(test_X)

## 使用模型来拟合数据并且查看准确率

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score

In [9]:
# 使用逻辑回归运行五折交叉验证，网格搜索来获取最优参数
from sklearn.model_selection import GridSearchCV

# C为正则化系数λ的倒数，必须为正数，默认为1，值越小，代表正则化越强。一般来说，只需要调节这个参数
grid_lr = GridSearchCV(LogisticRegression(), param_grid={"C":[0.01,0.05,0.1,0.2,0.5, 1, 10]}, cv=5)
grid_lr.fit(train_X,train_y)
print("The best parameters are %s with a score of %0.2f"
      % (grid_lr.best_params_, grid_lr.best_score_))

The best parameters are {'C': 1} with a score of 0.80


In [11]:
# 使用SVM运行五折交叉验证，网格搜索来获取最优参数

# 惩罚系数C,核函数参数gamma,可以调节这两个餐素
grid_svm=GridSearchCV(SVC(probability=True),param_grid={"C":[0.1,0.5, 1, 10,20], "gamma": [1, 0.5,0.1, 0.01]})
grid_svm.fit(train_X,train_y)
print("The best parameters are %s with a score of %0.2f"
      % (grid_svm.best_params_, grid_svm.best_score_))

The best parameters are {'C': 10, 'gamma': 0.1} with a score of 0.79


In [13]:
# 使用决策树运行五折交叉验证，网格搜索来获取最优参数

# 决策树的模型一般这是需要调节最大深度即可
grid_dt=GridSearchCV(DecisionTreeClassifier(),param_grid={"max_depth":[i for i in range(1,10)]},cv=5)
grid_dt.fit(train_X,train_y)
print("The best parameters are %s with a score of %0.2f"
      % (grid_dt.best_params_, grid_dt.best_score_))

The best parameters are {'max_depth': 4} with a score of 0.78


In [19]:
# GBDT进行调参，这个过程可以分为如下进行

# 1。先对步长和迭代次数进行调参
param_test1 = {'n_estimators':range(20,81,10),'learning_rate':[0.05,0.1,0.2,0.5]}
grid_gbdt = GridSearchCV(estimator = GradientBoostingClassifier(min_samples_split=300,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10), 
                       param_grid = param_test1, scoring='roc_auc',iid=False,cv=5)
grid_gbdt.fit(train_X,train_y)
grid_gbdt.best_params_, grid_gbdt.best_score_

({'learning_rate': 0.05, 'n_estimators': 80}, 0.7947963783367844)

In [18]:
# 2.对其弱分类器决策树进行调参
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(100,801,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=80, min_samples_leaf=20, 
      max_features='sqrt', subsample=0.8, random_state=10), 
   param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(train_X,train_y)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 13, 'min_samples_split': 300}, 0.7981856750464383)

In [20]:
# 3.对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。
param_test3 = {'min_samples_split':range(800,1900,200), 'min_samples_leaf':range(60,101,10)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=80,max_depth=13,
                                     max_features='sqrt', subsample=0.8, random_state=10), 
                       param_grid = param_test3, scoring='roc_auc',iid=False, cv=5)
gsearch3.fit(train_X,train_y)
gsearch3.best_params_, gsearch3.best_score_

({'min_samples_leaf': 60, 'min_samples_split': 800}, 0.7877742460823469)

In [23]:
# 对于RF，也可以按照上面的流程进行调参
# 1。先对步长和迭代次数进行调参
param_test1 = {'n_estimators':range(10,71,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10), 
                       param_grid = param_test1, scoring='roc_auc',cv=5)
gsearch1.fit(train_X,train_y)
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 60}, 0.7890529667962056)

In [24]:
# 2.对其弱分类器决策树进行调参
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(50,201,20)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                  min_samples_leaf=20,max_features='sqrt' ,oob_score=True, random_state=10),
   param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(train_X,train_y)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 13, 'min_samples_split': 70}, 0.7900313074352703)

In [26]:
# 3.对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。
param_test3 = {'min_samples_split':range(80,150,20), 'min_samples_leaf':range(10,60,10)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 70, max_depth=13,
                                  max_features='sqrt' ,oob_score=True, random_state=10),
   param_grid = param_test3, scoring='roc_auc',iid=False, cv=5)
gsearch3.fit(train_X,train_y)
gsearch3.best_params_, gsearch3.best_score_

({'min_samples_leaf': 10, 'min_samples_split': 80}, 0.7891455636034637)

In [28]:
# KNN,主要是k值的选择，以及停止建子树的叶子节点阈值leaf_size的参数的选择
params={'n_neighbors':[i for i in range(1,20,2)],'leaf_size':[i for i in range(10,100,10)]}
gsearch4 = GridSearchCV(estimator = KNeighborsClassifier(),param_grid = params,cv=5)
gsearch4.fit(train_X,train_y)
gsearch4.best_params_, gsearch4.best_score_


({'leaf_size': 10, 'n_neighbors': 19}, 0.7645348837209303)

In [29]:
# 使用上面使用的最优的参数进行建模
#  LogisticRegression
lr=LogisticRegression(C=1)
lr.fit(train_X,train_y)

# SVM
svm=SVC(C=10,gamma=0.1,probability=True)
svm.fit(train_X,train_y)

# DecisionTree
dt=DecisionTreeClassifier(max_depth=4)
dt.fit(train_X,train_y)

# RF
rf=RandomForestClassifier(n_estimators=60,max_depth=13,min_samples_split=80,min_samples_leaf=10)
rf.fit(train_X,train_y)

# GBDT
gbdt=GradientBoostingClassifier(learning_rate=0.01,n_estimators=80,max_depth=13,min_samples_split=300,min_samples_leaf=60)
gbdt.fit(train_X,train_y)

# KNN
knn=KNeighborsClassifier(n_neighbors=19,leaf_size=10)
knn.fit(train_X,train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=19, p=2,
                     weights='uniform')

In [30]:
# 当存在多个模型的时候，我们可以一起输出准确率，尝试使用列表的形式
names=["LR","SVM","DecisionTree","RF","KNN","GBDT"]
models=[lr,svm,dt,rf,knn,gbdt]

In [31]:
import numpy as np
df_list=[]
for name,model in zip(names,models):
    y_train_pred=model.predict(train_X)
    y_test_pred=model.predict(test_X)
    
    # accuracy
    train_accuracy=model.score(train_X,y_train_pred)
    test_accuracy=model.score(test_X,y_test_pred)
    
    # precision
    train_precision=precision_score(train_y,y_train_pred)
    test_precision=precision_score(test_y,y_test_pred)
    
    # recall
    train_recall=recall_score(train_y,y_train_pred)
    test_recall=recall_score(test_y,y_test_pred)
    
    # f1
    train_f1=f1_score(train_y,y_train_pred)
    test_f1=f1_score(test_y,y_test_pred)
    
    # auc
    y_train_pred=model.predict_proba(train_X)[:,1]
    y_test_pred=model.predict_proba(test_X)[:,1]
    
    train_auc=roc_auc_score(train_y,y_train_pred)
    test_auc=roc_auc_score(test_y,y_test_pred)
    
    print('{} 训练集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,train_accuracy,train_precision,train_recall,train_f1,train_auc))
    print('{} 测试集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,test_accuracy,test_precision,test_recall,test_f1,test_auc))
    print('\n')
    df = pd.DataFrame(np.array([train_accuracy,train_precision,train_recall,train_f1,train_auc,test_accuracy,test_precision,test_recall,test_f1,test_auc]).reshape(2,-1),
                  index = ['train','test'],
                  columns = ['Accuracy','Precision','Recall','F1-Score','AUC-Score'])
    df_list.append(df)
pd.concat(df_list,axis=0,keys=names)

LR 训练集： accuracy:1.0,precision:0.748, recall:0.291, f1:0.419, auc:0.801
LR 测试集： accuracy:1.0,precision:0.573, recall:0.536, f1:0.554, auc:0.798


SVM 训练集： accuracy:1.0,precision:0.845, recall:0.299, f1:0.441, auc:0.851
SVM 测试集： accuracy:1.0,precision:0.569, recall:0.559, f1:0.564, auc:0.795


DecisionTree 训练集： accuracy:1.0,precision:0.727, recall:0.353, f1:0.475, auc:0.778
DecisionTree 测试集： accuracy:1.0,precision:0.6, recall:0.443, f1:0.51, auc:0.739


RF 训练集： accuracy:1.0,precision:0.854, recall:0.313, f1:0.458, auc:0.896
RF 测试集： accuracy:1.0,precision:0.701, recall:0.278, f1:0.398, auc:0.789


KNN 训练集： accuracy:1.0,precision:0.745, recall:0.141, f1:0.237, auc:0.767
KNN 测试集： accuracy:1.0,precision:0.626, recall:0.165, f1:0.261, auc:0.716


GBDT 训练集： accuracy:1.0,precision:0.917, recall:0.086, f1:0.157, auc:0.862
GBDT 测试集： accuracy:1.0,precision:0.878, recall:0.104, f1:0.187, auc:0.779




Unnamed: 0,Unnamed: 1,Accuracy,Precision,Recall,F1-Score,AUC-Score
LR,train,1.0,0.748322,0.290743,0.418779,0.800847
LR,test,1.0,0.572755,0.536232,0.553892,0.797769
SVM,train,1.0,0.845018,0.298566,0.441233,0.850689
SVM,test,1.0,0.569322,0.55942,0.564327,0.795425
DecisionTree,train,1.0,0.726542,0.353325,0.475439,0.777887
DecisionTree,test,1.0,0.6,0.443478,0.51,0.739426
RF,train,1.0,0.854093,0.312907,0.458015,0.896399
RF,test,1.0,0.70073,0.278261,0.39834,0.789437
KNN,train,1.0,0.744828,0.140808,0.236842,0.767217
KNN,test,1.0,0.626374,0.165217,0.261468,0.715873
