# 再一次调整n_estimators

这一步的关键是调小学习率，增大弱学习器的数目

In [4]:
from xgboost import XGBClassifier
import xgboost as xgb

import pandas as pd 
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

# Directory holding the feature-engineered CSV files (relative to the working directory).
path = 'Desktop/RentListingInquries/code/data/'
train = pd.read_csv(path + 'RentListingInquries_FE_train.csv')
test = pd.read_csv(path + 'RentListingInquries_FE_test.csv')

# Split the training frame into the target column and the remaining feature columns.
y_train = train['interest_level']
x_train = train.drop(['interest_level'], axis=1)

# Seeded, shuffled 5-fold stratified splitter; passed to modelfit as cv_folds further down.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
def modelfit(alg, X_train, y_train, cv_folds=None, early_stopping_rounds=10):
    """Choose the number of boosting rounds with ``xgb.cv`` and refit ``alg``.

    Steps, in order:
      1. Take the booster parameters from ``alg`` and set ``num_class`` to 3.
      2. Run ``xgb.cv`` with the 'mlogloss' metric, using ``alg``'s current
         ``n_estimators`` as the maximum number of rounds.
      3. Write the resulting CV table to '2_nestimators.csv'.
      4. Set ``alg``'s ``n_estimators`` to the number of rows in that table
         and fit ``alg`` on the full training data.

    Parameters
    ----------
    alg : estimator providing get_xgb_params, get_params, set_params and fit.
    X_train, y_train : training features and labels.
    cv_folds : forwarded to ``xgb.cv`` as ``folds``.
    early_stopping_rounds : forwarded to ``xgb.cv``.

    Returns
    -------
    None. ``alg`` is updated and fitted in place.
    """
    booster_params = alg.get_xgb_params()
    booster_params['num_class'] = 3  # three-class classification problem

    dtrain = xgb.DMatrix(X_train, label=y_train)
    max_rounds = alg.get_params()['n_estimators']

    # Use xgboost's own cv routine directly.
    cv_table = xgb.cv(
        booster_params,
        dtrain,
        num_boost_round=max_rounds,
        folds=cv_folds,
        metrics='mlogloss',
        early_stopping_rounds=early_stopping_rounds,
    )
    cv_table.to_csv('2_nestimators.csv', index_label='n_estimators')

    # The row count of the CV table becomes the number of trees for the final fit.
    alg.set_params(n_estimators=cv_table.shape[0])
    alg.fit(X_train, y_train, eval_metric='mlogloss')
    
# Classifier with a small learning rate and a large n_estimators ceiling;
# modelfit replaces n_estimators with the round count selected by cross-validation.
xgb4 = XGBClassifier(
        learning_rate = 0.02,
        n_estimators=1800,      # learning_rate was lowered, so raise n_estimators
        max_depth=5,
        min_child_weight=1,
        gamma = 0,
        objective='multi:softprob',
        seed=3
)
modelfit(xgb4,x_train,y_train,cv_folds=kfold)

# Reload the CV table that modelfit wrote. DataFrame.from_csv was removed in
# pandas 1.0; read_csv with index_col=0 is the documented replacement.
cvresult = pd.read_csv('2_nestimators.csv', index_col=0)
print(cvresult)

              test-mlogloss-mean  test-mlogloss-std  train-mlogloss-mean  \
n_estimators                                                               
0                       1.086267           0.000037             1.086127   
1                       1.074347           0.000081             1.074061   
2                       1.062829           0.000123             1.062397   
3                       1.051686           0.000147             1.051115   
4                       1.040902           0.000182             1.040195   
5                       1.030468           0.000214             1.029624   
6                       1.020357           0.000239             1.019382   
7                       1.010586           0.000269             1.009476   
8                       1.001116           0.000312             0.999870   
9                       0.991938           0.000335             0.990553   
10                      0.983036           0.000368             0.981522   
11          



In [6]:
# Show the booster parameters of xgb4; n_estimators reflects the value modelfit set from the CV table.
xgb4.get_xgb_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.02,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 1610,
 'nthread': 1,
 'objective': 'multi:softprob',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 3,
 'silent': 1,
 'subsample': 1}

通过交叉验证（early stopping）得到的最佳 n_estimators 为 1610

保存模型，并进行预测

In [13]:
# Preview the first rows of the test frame before predicting on it.
test.head()

Unnamed: 0,bathrooms,bedrooms,price,price_bathrooms,price_bedrooms,room_diff,room_num,Year,Month,Day,...,virtual,walk,walls,war,washer,water,wheelchair,wifi,windows,work
0,1.0,1,2950,1475.0,1475.0,0.0,2.0,2016,6,11,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2,2850,1425.0,950.0,-1.0,3.0,2016,6,24,...,0,0,0,1,0,0,0,0,0,0
2,1.0,1,3758,1879.0,1879.0,0.0,2.0,2016,6,3,...,0,0,0,0,0,0,0,0,0,0
3,1.0,2,3300,1650.0,1100.0,-1.0,3.0,2016,6,11,...,0,0,0,0,0,0,1,0,0,0
4,2.0,2,4900,1633.333333,1633.333333,0.0,4.0,2016,4,12,...,0,0,0,1,0,0,0,0,0,0


In [16]:
# Predict class probabilities for the test set and write them to result.csv.
# NOTE(review): the column labels assume predict_proba returns the classes in the
# order high, medium, low — confirm against the encoding used for interest_level.
y_pred = pd.DataFrame(
    xgb4.predict_proba(test),
    columns=["high", "medium", "low"],
)
y_pred.to_csv("result.csv", index=False)

In [17]:
# Print the table of predicted class probabilities.
print(y_pred)

           high    medium       low
0      0.094963  0.376525  0.528511
1      0.250027  0.423065  0.326908
2      0.051716  0.109980  0.838304
3      0.049186  0.342774  0.608040
4      0.071391  0.247828  0.680780
5      0.003263  0.099394  0.897344
6      0.023500  0.305606  0.670894
7      0.118151  0.533122  0.348728
8      0.083946  0.456714  0.459340
9      0.062358  0.275904  0.661738
10     0.004559  0.041413  0.954027
11     0.049609  0.466949  0.483442
12     0.067255  0.384020  0.548725
13     0.008663  0.035522  0.955815
14     0.008356  0.065403  0.926241
15     0.011502  0.115606  0.872892
16     0.090720  0.394696  0.514583
17     0.000318  0.007151  0.992531
18     0.007926  0.077327  0.914747
19     0.113369  0.471466  0.415165
20     0.034955  0.256890  0.708156
21     0.015509  0.165372  0.819119
22     0.086026  0.344184  0.569790
23     0.281905  0.490924  0.227171
24     0.007416  0.180890  0.811694
25     0.178922  0.550274  0.270803
26     0.013526  0.182144  0