In [1]:
import pandas as pd

# 原始数据导入

In [2]:
train_org = pd.read_csv('./train.csv')
test_org = pd.read_csv('./test.csv')
print(train_org.shape, test_org.shape)

(210000, 146) (90000, 146)


In [3]:
train_org.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210000 entries, 0 to 209999
Columns: 146 entries, user_pin to label
dtypes: int64(145), object(1)
memory usage: 233.9+ MB


In [4]:
# Split each frame into a feature matrix and the target column 'label'.
# The slice 1:-2 skips the first column (user_pin, per train_org.info())
# and the last two columns (the last one being 'label').
# NOTE(review): the second-to-last column is dropped as well — presumably it
# is not a feature; confirm against the dataset schema.
train_x = train_org.iloc[:, 1:-2]
train_y = train_org['label']
test_x = test_org.iloc[:, 1:-2]
test_y = test_org['label']

In [5]:
train_x.shape

(210000, 143)

In [6]:
train_y.sum()

69857

In [7]:
test_y.sum()

30143

# 数据预处理

## 每个特征缺失值比例查看

In [8]:
name_list = train_x.columns.values.tolist()

In [9]:
# 查看单个特征缺失值比例
def get_null_percent(chara_name, frame=None):
    """Return the fraction of missing values in a single column.

    Parameters
    ----------
    chara_name : str
        Name of the column to inspect.
    frame : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``train_org`` so existing one-argument calls keep working.

    Returns
    -------
    float
        Share of entries for which ``isnull()`` is true, between 0 and 1.
        An empty column yields 0.0.
    """
    if frame is None:
        frame = train_org
    null_mask = frame[chara_name].isnull()
    if len(null_mask) == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    # The mask holds booleans, so its mean is the missing-value ratio.
    # (Comparing each entry with the string 'true' never matches a bool,
    # which made the previous implementation always return 0.0.)
    return float(null_mask.mean())

In [10]:
get_null_percent(name_list[0])

0.0

In [11]:
# 查看所有特征缺失值比例
def get_total_null_character(character_name_list):
    """Build a dict mapping each listed feature name to its missing-value ratio.

    The ratio for every name is obtained from ``get_null_percent``; the
    result preserves the order of ``character_name_list``.
    """
    return {
        feature: get_null_percent(feature)
        for feature in character_name_list
    }

In [12]:
# 打印出有缺失值的特征
def get_null_character(dic_chara):
    """Print the name of every feature whose recorded ratio is non-zero.

    ``dic_chara`` maps feature names to missing-value ratios; one name is
    printed per line, in the dict's iteration order. Returns None.
    """
    names_with_nulls = [
        name for name, ratio in dic_chara.items() if not ratio == 0.0
    ]
    for name in names_with_nulls:
        print(name)

In [13]:
dic_chara = get_total_null_character(name_list)
get_null_character(dic_chara)

## 去掉取值变化小的特征

In [14]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
result = sel.fit_transform(train_x)
result.shape

(210000, 17)

In [15]:
# Apply the selector that was fitted on the training features. Refitting on
# test_x could keep a different subset of columns, silently misaligning the
# train and test matrices.
test_result = sel.transform(test_x)
test_result.shape

(90000, 17)

## 特征选择

### 单个特征的差异性

In [16]:
from sklearn.feature_selection import SelectKBest

In [17]:
from sklearn.feature_selection import chi2

In [18]:
X_new = SelectKBest(chi2, k=10).fit_transform(result, train_y)

In [19]:
X_new.shape

(210000, 10)

In [20]:
X_new.shape

(210000, 10)

In [21]:
# Choose the test columns with a selector fitted on the training data only.
# Fitting on (test_result, test_y) would leak test labels into feature
# selection and could pick columns different from the ones behind X_new.
test_X_new = SelectKBest(chi2, k=10).fit(result, train_y).transform(test_result)
test_X_new.shape

(90000, 10)

# 模型选择

## SVM

In [32]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### Baseline fit (default parameters)

In [29]:
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_new, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [34]:
y_pred = svm_model.predict(test_X_new)
print(accuracy_score(y_pred, test_y))

0.9957222222222222


### Hyperparameter tuning

In [25]:
tuned_parameters = [{'kernel': ['rbf'], 
                     'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 
                     'C': [1, 10, 100, 1000]}]

In [36]:
clf = GridSearchCV(svm.SVC(), 
                    tuned_parameters, 
                    cv=5)
clf.fit(X_new, train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [40]:
print(clf.best_params_)
print(clf.score(test_X_new, test_y))
print(clf.best_score_)

{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.9957222222222222
0.9958952380952381


In [41]:
# Re-fit an SVC with the parameters reported by the grid search
# (kernel='rbf', C=1, gamma=1e-3) and check its accuracy on the test set.
svm_model = svm.SVC(kernel='rbf', C=1, gamma=1e-3)
svm_model.fit(X_new, train_y)
y_pred = svm_model.predict(test_X_new)
# NOTE(review): accuracy_score is documented as (y_true, y_pred); the
# arguments are swapped here, which does not change the accuracy value.
print(accuracy_score(y_pred, test_y))

0.9957222222222222


## GBDT

In [48]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline

### Baseline fit (default parameters)

In [49]:
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X_new, train_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=10,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [50]:
gbm_predict = gbm0.predict(test_X_new)
print(accuracy_score(gbm_predict, test_y))

0.9957222222222222


### Hyperparameter tuning

In [51]:
# Step 1: with the learning rate fixed at 0.1, grid-search the number of
# boosting iterations (n_estimators) over 20, 30, ..., 80 using 5-fold CV
# scored by ROC AUC.
param_test1 = {'n_estimators':range(20,81,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(
    learning_rate=0.1, 
    min_samples_split=300,
    min_samples_leaf=20,
    max_depth=8,
    max_features='sqrt', 
    subsample=0.8,
    random_state=10), 
param_grid = param_test1, scoring='roc_auc',iid=False,cv=5)
gsearch1.fit(X_new,train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=20, min_sa...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'n_estimators': range(20, 81, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [52]:
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_



({'mean_fit_time': array([ 4.358741  ,  6.93688679,  8.95388484, 10.8306365 , 12.6355412 ,
         13.73697839, 15.37028689]),
  'std_fit_time': array([0.24748577, 0.32585169, 0.30424309, 0.3178031 , 0.43785209,
         0.41203316, 0.16170451]),
  'mean_score_time': array([0.04823494, 0.06865683, 0.0884645 , 0.10067286, 0.11768489,
         0.13009386, 0.13669858]),
  'std_score_time': array([0.0021378 , 0.00423454, 0.00589026, 0.00403248, 0.00695108,
         0.00385075, 0.00615505]),
  'param_n_estimators': masked_array(data=[20, 30, 40, 50, 60, 70, 80],
               mask=[False, False, False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'n_estimators': 20},
   {'n_estimators': 30},
   {'n_estimators': 40},
   {'n_estimators': 50},
   {'n_estimators': 60},
   {'n_estimators': 70},
   {'n_estimators': 80}],
  'split0_test_score': array([0.99944974, 0.99946696, 0.99947648, 0.99946812, 0.99946882,
         0.99945214, 0.99945221]),

In [None]:
# 'n_estimators': 50

In [54]:
# Step 2: grid-search max_depth and min_samples_split together.
# NOTE(review): n_estimators is fixed at 60 here, while the note recorded
# after the previous search says 50 — confirm which value is intended.
# NOTE(review): the stored output of this cell shows a different param_grid
# (max_depth range(5, 11, 2), min_samples_split range(10, 101, 20)), so the
# saved results do not correspond to this code; re-run before trusting them.
param_test2 = {'max_depth':range(3,11,2), 
               'min_samples_split':range(5,21,3)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(
    learning_rate=0.1, 
    n_estimators=60, 
    min_samples_leaf=20, 
    max_features='sqrt', 
    subsample=0.8, 
    random_state=10), 
param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(X_new,train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=20, min_sa...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'max_depth': range(5, 11, 2), 'min_samples_split': range(10, 101, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [55]:
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_



({'mean_fit_time': array([ 9.04895172,  9.08649316,  8.65672936,  9.31379056,  9.1121151 ,
         11.86969857, 11.91869721, 11.31046348, 11.59165564, 11.43051066,
         15.46212687, 15.52702885, 15.09704871, 14.67758851, 14.82068086]),
  'std_fit_time': array([0.76377392, 0.76657051, 0.12296478, 0.9128356 , 0.21404454,
         0.90654399, 0.68222452, 0.41032738, 0.40530391, 0.33649583,
         0.80638798, 0.35113152, 0.54572749, 0.74851858, 0.37376082]),
  'mean_score_time': array([0.09466801, 0.08225927, 0.09327164, 0.09126544, 0.08619986,
         0.10407605, 0.11376152, 0.10728116, 0.10327373, 0.1144824 ,
         0.12649374, 0.11648417, 0.1224875 , 0.12789159, 0.11888576]),
  'std_score_time': array([0.00940054, 0.00240188, 0.01198254, 0.00928908, 0.00645841,
         0.00740852, 0.0134478 , 0.00292812, 0.00891402, 0.01328483,
         0.01115572, 0.00500745, 0.01100993, 0.01568836, 0.0031895 ]),
  'param_max_depth': masked_array(data=[5, 5, 5, 5, 5, 7, 7, 7, 7, 7, 9, 9, 9, 

In [None]:
# 'max_depth': 5, 'min_samples_split': 10

In [57]:
# Step 3: grid-search min_samples_split and min_samples_leaf together.
# NOTE(review): max_depth is fixed at 7 and n_estimators at 60, whereas the
# notes recorded after the earlier searches say max_depth 5 and
# n_estimators 50 — confirm which values are intended.
param_test3 = {'min_samples_split':range(10,101,20), 
               'min_samples_leaf':range(10,101,20)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(
    learning_rate=0.1, 
    n_estimators=60,
    max_depth=7,
    max_features='sqrt', 
    subsample=0.8, 
    random_state=10), 
param_grid = param_test3, scoring='roc_auc',iid=False, cv=5)
gsearch3.fit(X_new,train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sam...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'min_samples_split': range(10, 101, 20), 'min_samples_leaf': range(10, 101, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [58]:
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_



({'mean_fit_time': array([11.16550465, 10.90639195, 10.75821609, 10.7023767 , 10.67936087,
         10.71338596, 10.73480105, 10.78683715, 10.53946209, 10.63392558,
         10.61591249, 10.67264199, 10.69096889, 10.65713778, 10.59129124,
         10.66402278, 10.63152647, 10.61670947, 10.66313863, 10.61030464,
         10.50283599, 10.49082398, 10.43958488, 10.71663504, 10.32483249]),
  'std_fit_time': array([0.21169001, 0.27971752, 0.26935297, 0.3227956 , 0.29711346,
         0.14781354, 0.14379762, 0.12937096, 0.30105891, 0.2696234 ,
         0.12087571, 0.13442665, 0.23058649, 0.19797884, 0.17405035,
         0.17624167, 0.13305607, 0.12474601, 0.17168817, 0.11212271,
         0.13096405, 0.21684752, 0.14564345, 0.43003954, 0.15319588]),
  'mean_score_time': array([0.10367527, 0.10127344, 0.10507636, 0.11128078, 0.10047193,
         0.10547557, 0.102074  , 0.10147319, 0.10007186, 0.11087952,
         0.10487566, 0.10387487, 0.10687737, 0.10567989, 0.10427885,
         0.10587687, 0

In [None]:
# 'min_samples_leaf': 90, 'min_samples_split': 10

In [59]:
# Intermediate check: fit a model with the values recorded so far
# (n_estimators 50, max_depth 5, min_samples_leaf 90, min_samples_split 10)
# and report its accuracy on the test set.
gbm1 = GradientBoostingClassifier(learning_rate=0.1, 
                                  n_estimators=50,
                                  max_depth=5, 
                                  min_samples_leaf =90, 
                                  min_samples_split =10, 
                                  max_features='sqrt', 
                                  subsample=0.8, 
                                  random_state=10)
gbm1.fit(X_new, train_y)
y_pred = gbm1.predict(test_X_new)
print("Accuracy : %.4g" % metrics.accuracy_score(test_y, y_pred))

Accuracy : 0.9957


In [64]:
# Step 4: grid-search the maximum number of features per split
# (max_features) over 3, 5, 7, 9.
# NOTE(review): max_depth is 7 here but 5 in the gbm1 check above — confirm
# which depth the search is meant to build on.
param_test4 = {'max_features':range(3,10,2)}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(
    learning_rate=0.1, 
    n_estimators=50,
    max_depth=7, 
    min_samples_leaf =90, 
    min_samples_split =10, 
    subsample=0.8, 
    random_state=10), 
param_grid = param_test4, scoring='roc_auc',iid=False, cv=5)
gsearch4.fit(X_new,train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=90, min_samp...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'max_features': range(3, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [65]:
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_



({'mean_fit_time': array([9.62074418, 8.03120537, 8.75163922, 8.78343496]),
  'std_fit_time': array([0.57009488, 0.47005877, 0.59398091, 0.44577684]),
  'mean_score_time': array([0.09146585, 0.06704903, 0.07505431, 0.05824232]),
  'std_score_time': array([0.00634758, 0.00978417, 0.03421812, 0.0080164 ]),
  'param_max_features': masked_array(data=[3, 5, 7, 9],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'max_features': 3},
   {'max_features': 5},
   {'max_features': 7},
   {'max_features': 9}],
  'split0_test_score': array([0.99946515, 0.99946379, 0.99945864, 0.99946271]),
  'split1_test_score': array([0.99953026, 0.99946821, 0.99945127, 0.99944732]),
  'split2_test_score': array([0.9996133 , 0.99959496, 0.99960746, 0.99960044]),
  'split3_test_score': array([0.99947271, 0.99950329, 0.99950767, 0.99949854]),
  'split4_test_score': array([0.99943359, 0.99944619, 0.99944337, 0.99944777]),
  'mean_test_score': array(

In [66]:
# Step 5: grid-search the row subsampling ratio (subsample).
# NOTE(review): the fixed values here (n_estimators 60, min_samples_leaf 60,
# min_samples_split 1200) differ from the ones used in the previous search
# cell (50, 90, 10) — confirm they are intentional.
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(
    learning_rate=0.1, 
    n_estimators=60,
    max_depth=7, 
    min_samples_leaf =60, 
    min_samples_split =1200, 
    max_features=9, 
    random_state=10), 
param_grid = param_test5, scoring='roc_auc',iid=False, cv=5)
gsearch5.fit(X_new,train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features=9, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=60, min_samples...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [67]:
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_



({'mean_fit_time': array([ 9.2664834 ,  9.98852768, 11.41073852, 11.42534032, 10.84621248,
         10.81431661]),
  'std_fit_time': array([0.95863216, 0.56368619, 0.64458005, 0.34469778, 0.58669094,
         0.83463582]),
  'mean_score_time': array([0.06124434, 0.06310782, 0.07445602, 0.0678484 , 0.0662478 ,
         0.0652494 ]),
  'std_score_time': array([0.00278776, 0.00354351, 0.01743235, 0.00830759, 0.00624643,
         0.00240009]),
  'param_subsample': masked_array(data=[0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
               mask=[False, False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'subsample': 0.6},
   {'subsample': 0.7},
   {'subsample': 0.75},
   {'subsample': 0.8},
   {'subsample': 0.85},
   {'subsample': 0.9}],
  'split0_test_score': array([0.99946808, 0.99946727, 0.9994687 , 0.99946498, 0.99947625,
         0.99946617]),
  'split1_test_score': array([0.99945742, 0.99944878, 0.99944634, 0.99946193, 0.99943203,
         0.

In [69]:
# Halve the learning rate (0.1 -> 0.05) and double the number of
# iterations (60 -> 120), then report test-set accuracy.
gbm2 = GradientBoostingClassifier(
    learning_rate=0.05, 
    n_estimators=120,
    max_depth=7, 
    min_samples_leaf =60, 
    min_samples_split =10, 
    max_features=9, 
    subsample=0.6, 
    random_state=10)
gbm2.fit(X_new,train_y)
y_pred = gbm2.predict(test_X_new)
print("Accuracy : %.4g" % metrics.accuracy_score(test_y, y_pred))

Accuracy : 0.9957


In [70]:
# Shrink the learning rate a further 5x (0.05 -> 0.01) and raise the number
# of iterations 5x (120 -> 600), then report test-set accuracy.
# NOTE(review): subsample is 0.7 here but 0.6 in gbm2 — confirm intended.
gbm3 = GradientBoostingClassifier(
    learning_rate=0.01, 
    n_estimators=600,
    max_depth=7, 
    min_samples_leaf =60, 
    min_samples_split =10, 
    max_features=9, 
    subsample=0.7, 
    random_state=10)
gbm3.fit(X_new,train_y)
y_pred = gbm3.predict(test_X_new)
print("Accuracy : %.4g" % metrics.accuracy_score(test_y, y_pred))

Accuracy : 0.9957


In [71]:
# Halve the learning rate again (0.01 -> 0.005) and double the number of
# iterations (600 -> 1200), then report test-set accuracy.
gbm4 = GradientBoostingClassifier(
    learning_rate=0.005, 
    n_estimators=1200,
    max_depth=7, 
    min_samples_leaf =60, 
    min_samples_split =10, 
    max_features=9, 
    subsample=0.7, 
    random_state=10)
gbm4.fit(X_new,train_y)
y_pred = gbm4.predict(test_X_new)
print("Accuracy : %.4g" % metrics.accuracy_score(test_y, y_pred))

Accuracy : 0.9957


## RandomForestClassifier

In [27]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

### Baseline fit (default parameters)

In [28]:
rfc = RandomForestClassifier()
rfc.fit(X_new, train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:
rfc_y_predict = rfc.predict(test_X_new)
rfc_y_predict.sum()

30521

In [30]:
print(accuracy_score(rfc_y_predict, test_y))

0.9956666666666667


In [31]:
print(rfc.score(test_X_new, test_y))

0.9956666666666667


In [32]:
print(classification_report(test_y, rfc_y_predict, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     59857
           1       0.99      1.00      0.99     30143

   micro avg       1.00      1.00      1.00     90000
   macro avg       0.99      1.00      1.00     90000
weighted avg       1.00      1.00      1.00     90000



### Hyperparameter tuning

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics

import matplotlib.pylab as plt
%matplotlib inline

In [40]:
# Step 1: grid-search n_estimators (10, 20, ..., 100) to find the best
# number of trees, using 5-fold CV scored by ROC AUC.
# NOTE: this rebinds param_test1 / gsearch1 from the GBDT section.
param_test1 = {'n_estimators':range(10,101,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,
                                                           min_samples_leaf=20,
                                                           max_depth=8,
                                                           max_features='sqrt',
                                                           random_state=10), 
                                                           param_grid = param_test1, 
                                                           scoring='roc_auc',cv=5)
gsearch1.fit(X_new, train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(10, 101, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [41]:
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_



({'mean_fit_time': array([0.62796159, 1.20210514, 1.72704363, 2.29552846, 2.86190038,
         3.4879992 , 3.93832574, 4.39359508, 4.93154502, 5.45520644]),
  'std_fit_time': array([0.00879683, 0.03333474, 0.00706163, 0.03451269, 0.01421934,
         0.15356639, 0.04845475, 0.07327373, 0.0721551 , 0.02950087]),
  'mean_score_time': array([0.04619908, 0.08047771, 0.11163511, 0.1413825 , 0.17681546,
         0.20938001, 0.23835807, 0.27423224, 0.30156469, 0.33152795]),
  'std_score_time': array([0.00205187, 0.00626464, 0.00429324, 0.00158087, 0.00365319,
         0.00319323, 0.00184544, 0.00552753, 0.00466671, 0.00237498]),
  'param_n_estimators': masked_array(data=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
               mask=[False, False, False, False, False, False, False, False,
                     False, False],
         fill_value='?',
              dtype=object),
  'params': [{'n_estimators': 10},
   {'n_estimators': 20},
   {'n_estimators': 30},
   {'n_estimators': 40},
   {'n_e

In [None]:
# 'n_estimators': 60

In [None]:
# Step 2: grid-search the maximum tree depth (max_depth) together with the
# minimum number of samples required to split an internal node
# (min_samples_split).
param_test2 = {'max_depth':range(3,11,2), 'min_samples_split':range(10,51,10)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                                           min_samples_leaf=20,
                                                           max_features='sqrt',
                                                           oob_score=True, 
                                                           random_state=10),
param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(X_new, train_y)

In [None]:
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [44]:
# Check the out-of-bag (OOB) score of the model with the current settings.
rf1 = RandomForestClassifier(n_estimators= 60, 
                             max_depth=7, 
                             min_samples_split=50,
                             min_samples_leaf=20,
                             max_features='sqrt',
                             oob_score=True, 
                             random_state=10)
rf1.fit(X_new, train_y)
print(rf1.oob_score_)

0.9958952380952381


In [None]:
# Step 3: tune min_samples_split (minimum samples to split an internal
# node) and min_samples_leaf (minimum samples per leaf) together.
# NOTE(review): max_depth is fixed at 13, which lies outside the depths
# searched in step 2 (3, 5, 7, 9) and differs from rf1 (7) — confirm.

param_test3 = {'min_samples_split':range(80,150,20), 'min_samples_leaf':range(10,60,10)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                                           max_depth=13,
                                                           max_features='sqrt',
                                                           oob_score=True, 
                                                           random_state=10),
param_grid = param_test3, scoring='roc_auc',iid=False, cv=5)
gsearch3.fit(X_new, train_y)

In [None]:
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Step 4: tune the maximum number of features considered per split
# (max_features) over 3, 5, 7, 9.
param_test4 = {'max_features':range(3,11,2)}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                                           max_depth=13, 
                                                           min_samples_split=120,
                                                           min_samples_leaf=20 ,
                                                           oob_score=True, 
                                                           random_state=10),
param_grid = param_test4, scoring='roc_auc',iid=False, cv=5)
gsearch4.fit(X_new, train_y)

In [None]:
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Fit the final model with the chosen parameters and print its
# out-of-bag (OOB) score.
# NOTE(review): the search cells that would justify these values have no
# stored output in this notebook — confirm the parameters after re-running.
rf2 = RandomForestClassifier(n_estimators= 60, 
                             max_depth=13, 
                             min_samples_split=120,
                             min_samples_leaf=20,
                             max_features=7 ,
                             oob_score=True, 
                             random_state=10)
rf2.fit(X_new, train_y)
print(rf2.oob_score_)