# 模型调优过程

In [1]:
import sys
sys.path.append('/home/aistudio/external-libraries')

## 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

# import lightgbm as lgb
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

In [2]:
train = pd.read_csv('../training/train_data_180.csv')
test = pd.read_csv('../training/test_data_180.csv')

train_X = train.drop(['price','SaleID'], axis=1)
train_y = train['price']
test_X = test.drop(['SaleID'], axis=1)

feat_name = train_X.columns

# 交叉验证折数
S_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)
kfold = KFold(n_splits=5, shuffle=True, random_state=33)

print(train_X.shape)
print(test_X.shape)

In [4]:
# lightGBM这里的y，用的是log1p变换后的y
tr_X,val_X,tr_y,val_y = train_test_split(train_X,train_y,test_size=0.2, random_state=2000)

# 用lightGBM模型时，数据需要转换成Dataset类型
train_data_lgb = lgbm.Dataset(tr_X, label=tr_y, free_raw_data=True)
val_data_lgb = lgbm.Dataset(val_X, label=val_y, free_raw_data=True)

## 定义了一个统计函数，方便后续信息统计
def Sta_inf(data):
    print('_min',np.min(data))
    print('_max:',np.max(data))
    print('_mean',np.mean(data))
    print('_ptp',np.ptp(data))
    print('_std',np.std(data))
    print('_var',np.var(data))

In [None]:
# xgboost这里的y，用的是原始y
tr_X,val_X,tr_y,val_y = train_test_split(train_X,np.expm1(train_y),test_size=0.2, random_state=2000)

# 用XGBoost模型时，数据需要转换成DMatrix类型
train_data_xgb = xgb.DMatrix(tr_X, label=tr_y)
val_data_xgb = xgb.DMatrix(val_X, label=val_y)

In [5]:
train_X.shape + test_X.shape

(150000, 62, 50000, 62)

# 模型一: LinearRegression

In [None]:
model_lr = linear_model.LinearRegression()
model_lr = model_lr.fit(train_X, train_y)

feat_name = train_X.columns

print('intercept:'+ str(model_lr.intercept_))
sorted(dict(zip(feat_name, model_lr.coef_)).items(), key=lambda x:x[1], reverse=True)

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,  make_scorer

scores = cross_val_score(model_lr, X=train_X, y=train_y, verbose=1, cv = 5, scoring=make_scorer((mean_absolute_error)))
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.2s finished


array([2388.74333435, 2407.09506836, 2425.05182767, 2375.28063236,
       2398.05819435])

In [20]:
print('AVG:', np.mean(scores))

scores = pd.DataFrame(scores.reshape(1,-1))
scores.columns = ['cv' + str(x) for x in range(1, 6)]
scores.index = ['MAE']
scores

AVG: 2398.845811420034


Unnamed: 0,cv1,cv2,cv3,cv4,cv5
MAE,2388.743334,2407.095068,2425.051828,2375.280632,2398.058194


In [21]:
split_point = len(train) // 5 * 4
train = train.loc[:split_point].dropna()
val = train.loc[split_point:].dropna()

train_X = train.drop(['price','train'], axis=1)
train_y = train['price']
val_X = val.drop(['price','train'], axis=1)
val_y = val['price']

model_LR = linear_model.LinearRegression()

model_LR = model_LR.fit(train_X, train_y)
mean_absolute_error(val_y, model_LR.predict(val_X))

762.6133193969727

# 模型二：随机森林

In [26]:
from sklearn.ensemble import RandomForestRegressor
RF1 = RandomForestRegressor()

# 交叉验证用于评估模型性能和进行参数调优（模型选择）
#分类任务中交叉验证缺省是采用StratifiedKFold
#数据集不大，采用3折交叉验证
mae = cross_val_score(RF1, train_X, train_y, cv=5, scoring=make_scorer(mean_absolute_error))
print ('MAE of each fold is: ', mae)
print('cv MAE is:', mae.mean())

MAE of each fold is:  [685.07658014 673.26679167 676.18096667 679.44129583 684.36042917]
cv MAE is: 679.6652126951655


从结果看，默认的随机森林MAE分值（679.665）比线性模型（726.613）低

In [27]:
result = pd.DataFrame(mae)
result.index = ['cv' + str(x) for x in range(1, 6)]
result

Unnamed: 0,0
cv1,685.07658
cv2,673.266792
cv3,676.180967
cv4,679.441296
cv5,684.360429


In [28]:
mae = pd.DataFrame(mae.reshape(1,-1))
mae.columns = ['cv' + str(x) for x in range(1, 6)]
mae.index = ['MAE']
mae

Unnamed: 0,cv1,cv2,cv3,cv4,cv5
MAE,685.07658,673.266792,676.180967,679.441296,684.360429


## 随机森林超参数调优

随机森林的超参会有很多：
Bagging参数：
1. 树的数目n_estimators

与决策树的共同的超参数：
1. max_depth（树的深度）或max_leaf_nodes（叶子结点的数目）；
2. min_samples_leaf（叶子结点的最小样本数）、min_samples_split（中间结点的最小样本树）、min_weight_fraction_leaf（叶子节点的样本权重占总权重的比例）
3. max_features（最大特征数目）。与决策树max_features通常越大模型性能越好不同，随机森林中max_features较小，每个基学习器之间的相关性更小，集成模型的性能可能反而会更好；

另外在随机森林中，由于学习每个基学习器只用了一部分样本，可用其余样本（包外样本）做校验，从而不必显式进行交叉验证。
设置参数oob_score=True

### n_estimators

In [None]:
#需要调优的参数 n_estimators
tuned_n_estimators = range(50,300,50)
accuracy_s = np.zeros(len(tuned_n_estimators))

#初始max_depth设为单棵树的max_depth，max_features比推荐值sqrt(D)略大,
#min_samples_leaf比单棵树的min_samples_leaf略小（详见CART参数调优）
for j, one_n_estimators in enumerate(tuned_n_estimators):
    RF2 = RandomForestRegressor(criterion= 'mae', n_estimators = one_n_estimators, max_depth =10, max_features = 10, min_samples_leaf=2,oob_score=True,n_jobs=-1,random_state=33)
    RF2.fit(train_X, train_y)
    accuracy_s[j] = RF2.oob_score_ 

In [None]:
plt.plot(tuned_n_estimators, accuracy_s)
plt.xlabel('n_estimators' )                                                                                                      
plt.ylabel('R2 score' )

### max_features(在上一步参数确定的前提下)

In [None]:
turned_params = range(10, 40, 2)
r2_scores = np.zeros(len(turned_params))

for j, one_para in enumerate(turned_params):
    RF2 = RandomForestRegressor(n_estimators = 200, criterion = 'mae', max_depth=10, min_sample_leaf=2, oob_score=True, 
                               verbose=10, n_jobs=-1, random_state=33)
    RF2.fit(train_X, train_y)
    r2_scores[j] = RF2.oob_score_ 
    
plt.plot(turned_params, r2_scores)
plt.xlabel('max_features')
plt.ylabel('R2 score')

### max_depth (在上一步参数确定的前提下)

In [None]:
turned_params = range(10, 50, 5)
r2_scores = np.zeros(len(turned_params))

for j, one_para in enumerate(turned_params):
    RF2 = RandomForestRegressor(n_estimators=200, max_features=, min_sample_leaf=20, oob_socre=True, 
                                n_jobs=-1, random_state=33, verbose=10)
    RF2.fit(train_X, train_y)
    r2_scores[j] = oob_score_

### min_samples_leaf (在上一步参数确定的前提下)

In [None]:
turned_params = range(1, 10, 2)
r2_scores = np.zeros(len(turned_params))

for j, one_para in enumerate(turned_params):
    RF2 = RandomForestRegressor(n_estimators=200, max_features=, max_depth = , oob_socre=True, 
                                n_jobs=-1, random_state=33, verbose=10)
    RF2.fit(train_X, train_y)
    r2_scores[j] = oob_score_

### 用最佳参数组合，训练模型

In [None]:
RF2 = RandomForestRegressor(n_estimators=200, max_features=, max_depth = , oob_socre=True, 
                                n_jobs=-1, random_state=33, verbose=10)
RF2.fit(train_X, train_y)
feat_names = X_train.columns 

import pickle
pickle.dump(RF2, open('./RandomForestRegressor_V1.pkl', 'wb'))

In [None]:
RF2.feature_importances_

In [None]:
df = pd.DataFrame(data = {'columns': list(feat_names), 'importance': list(RF2.feature_importances_.T)})
df = df.sort_values(by=['importance'], ascending=False)
plt.bar(range(len(RF2.feature_importances_)), RF2.feature_importances_)
plt.show()

# 模型三： XGBoost

XGBoost的主要超参数包括：
1. 树的数目n_estimators 和 学习率 learning_rate
2. 树的最大深度max_depth
3. 叶子结点的最小样本数:min_child_weight
4. 每棵树的列采样比例：colsample_bytree
5. 每棵树的行采样比例：subsample
6. 正则化参数lambda_l1(reg_alpha), lambda_l2(reg_lambda)

对n_estimators，XGBoost学习的过程内嵌了cv，速度快。
其他参数用GridSearchCV。

### step1：学习率为0.1，粗调基学习器的数目n_estimators

In [None]:
MAX_ROUNDS = 10000
params = {
          'eval_metric':'mae'，
          'objective': 'reg:squarederror',
          'learning_rate': 0.1,
          #'n_estimators': 1000
          'min_child_weight': 1,
          'max_depth': 7,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
          'n_jobs': -1,   

         }

#直接调用xgboost内嵌的交叉验证（cv），可对连续的n_estimators参数进行快速交叉验证，GridSearchCV只能对有限个参数进行交叉验证
def get_n_estimators(params, X_train , y_train , early_stopping_rounds=10):
    xgb_params = params.copy()
    
    #直接调用xgboost，而非sklarn的wrapper类
    xgb_train = xgb.DMatrix(X_train, label = y_train)
        
    cv_result = xgb.cv(xgb_params, xgb_train, num_boost_round=MAX_ROUNDS, nfold=3,
             metrics='mae', early_stopping_rounds=early_stopping_rounds,seed=3)
  
    cv_result.to_csv('./xgb_n_estimators_v1.csv', index_label = 'n_estimators')
    
    #最佳参数n_estimators
    n_estimators = cv_result.shape[0]
    print('best n_estimators:' , n_estimators)
    print('best cv score:' , cv_result['test-mae-mean'][n_estimators-1])
     
    return n_estimators

n_estimators_1 = get_n_estimators(params , train_X , train_y)

### step2：调整树的参数：max_depth & min_child_weight

这两个参数尽可能一起调，因为max_depth和min_child_weight都直接影响树模型的复杂度。
如果计算资源有限，也可类似坐标轴下降，先调其中一个，然后调另一个。
如果是分类任务，且不同类的样本数目不均衡，最好先调min_child_weight，以免max_depth对少数类样本过拟合。

In [None]:
#max_depth 建议3-10， min_child_weight=1／sqrt(ratio_rare_event) =5.5
max_depth = range(5,10,2)
min_child_weight = range(1,6,2)
tuned_params = dict(max_depth=max_depth, min_child_weight=min_child_weight)

params = {
          'objective': 'reg:squarederror',
          'learning_rate': 0.1,
          'n_estimators': n_estimators_1,   #第一轮参数调整得到的n_estimators最优值
#           'min_child_weight': 3,
#           'max_depth': 5,
          'subsample': 0.8,
          'colsample_bytree': 0.9,
          'nthread': 8
         }

xgb_g = XGBRegressor(silent=False,  **params)

grid_search = GridSearchCV(xgb_g, param_grid = tuned_params, scoring=make_scorer((mean_absolute_error)),
                           n_jobs=-1, cv=kfold,verbose=5, refit = False)
grid_search.fit(train_X , train_y)

结果可视化

In [None]:
# summarize results
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
test_means = grid_search.cv_results_['mean_test_score']
test_stds = grid_search.cv_results_['std_test_score']
train_means = grid_search.cv_results_['mean_train_score']
train_stds = grid_search.cv_results_['std_train_score']

pd.DataFrame(grid_search.cv_results_).to_csv('max_depth_min_child_weights_1.csv')

# plot results
test_scores = np.array(test_means).reshape(len(max_depth), len(min_child_weight))
train_scores = np.array(train_means).reshape(len(max_depth), len(min_child_weight))

for i, value in enumerate(max_depth):
    plt.plot(min_child_weight, -test_scores[i], label= 'test_max_depth:'   + str(value))

plt.legend()
plt.xlabel('min_child_weight' )                                                                                                      
plt.ylabel('Loss' )
plt.savefig('max_depth_and_min_child_weght_1.png' )

### step3：行采样比例

In [None]:
subsample_s = [i/10.0 for i in range(5,10)]
tuned_params = dict(subsample=subsample_s)

params = {'objective': 'reg:squarederror',
        #   'learning_rate': 0.1,
          'n_estimators': 1750,   #第一轮参数调整得到的n_estimators最优值
          'min_child_weight': 3,
          'max_depth': 10,
          'subsample': 0.9,
          'colsample_bytree': 0.8,
          'nthread': 8
         }

xgb_g = xgb.XGBRegressor(silent=False,  **params)

grid_search = GridSearchCV(xgb_g, param_grid = tuned_params, scoring=make_scorer(mean_absolute_error),n_jobs=-1, \
                      cv=kfold,verbose=5, refit = False)
grid_search.fit(train_X , train_y)

In [None]:
# summarize results
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
test_means = grid_search.cv_results_[ 'mean_test_score' ]

pd.DataFrame(grid_search.cv_results_).to_csv('subsample.csv')

plt.plot(subsample_s, -test_means)

plt.legend()
plt.xlabel('subsample' )                                                                                                      
plt.ylabel('Loss' )
plt.savefig('subsample.png' )

### step4：列采样比例

In [None]:
colsample_bytree_s = [i/10.0 for i in range(5,10)]
tuned_params = dict(colsample_bytree=colsample_bytree_s)

params = {'objective': 'reg:squarederror',
          'learning_rate': 0.1,
          'n_estimators': 1750,   #第一轮参数调整得到的n_estimators最优值
          'min_child_weight': 3,
          'max_depth': 10,
          'subsample': 0.9,
#           'colsample_bytree': 0.8,
          'nthread': 8
         }

xgb_g = xgb.XGBRegressor(silent=False,  **params)

grid_search = GridSearchCV(xgb_g, param_grid = tuned_params, scoring=make_scorer(mean_absolute_error),
                           n_jobs=-1, cv=kfold, verbose=5, refit = False)
grid_search.fit(X_train , y_train)

In [None]:
# summarize results
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
test_means = grid_search.cv_results_['mean_test_score']

pd.DataFrame(grid_search.cv_results_).to_csv('colsample_bytree.csv')

plt.plot(colsample_bytree_s, test_means)

plt.legend()
plt.xlabel('colsample_bytree')                                                                                                      
plt.ylabel('Loss' )
plt.savefig('colsample_bytree.png' )

In [None]:
test_means

### step5. 用树的最佳参数，再次调整学习率和基学习器的数目

In [None]:
# 用所有训练好的参数，降低学习率，调整基学习器数目。这里设置了0.05
params = {
          'learning_rate': 0.05,
        #   'n_estimators': 1750, 
          'min_child_weight': 3,
          'max_depth': 10,
          'subsample': 0.9,
          'colsample_bytree': 0.7,
          'reg_lambda':5,
          'reg_alpha':3,
          'nthread': 8
}

n_estimators_2 = get_n_estimators(params, X_train , y_train)

### step6. 用所有训练数据，采用最佳参数重新训练模型
由于样本数目增多，模型复杂度稍微扩大一点？
max_depth增多1
min_child_weight按样本比例,从3增加到4

In [None]:
params = {
          'learning_rate': 0.01,
          'n_estimators': 7000, 
          'min_child_weight': 3,
          'max_depth': 10,
          'subsample': 0.9,
          'colsample_bytree': 0.7,
          'reg_lambda':5,
          'reg_alpha':3,
          'nthread': 8
         }
        
xgb_r = xgb.XGBRegressor(silent=False, **params)
xgb_r.fit(train_X, train_y)

### step8. 保存模型

In [None]:
import pickle
pickle.dump(xgb_r, open("./used_car/model/used_car_XGBoost_v1.pkl", 'wb'))

df = pd.DataFrame({"columns":list(feat_name), "importance":list(xgb_r.feature_importances_.T)})
df = df.sort_values(by=['importance'],ascending=False)

# 模型四：LightGBM

### LightGBM超参数调优

主要超参包括：
1. 树的数目n_estimators 和 学习率 learning_rate
2. 树的最大深度max_depth 和 树的最大叶子节点数目num_leaves（注意：XGBoost只有max_depth，LightGBM采用叶子优先的方式生成树，num_leaves很重要，设置成比 2^max_depth 小）
3. 叶子结点的最小样本数:min_data_in_leaf(min_data, min_child_samples)
4. 每棵树的列采样比例：feature_fraction/colsample_bytree
5. 每棵树的行采样比例：bagging_fraction （需同时设置bagging_freq=1）/subsample
6. 正则化参数lambda_l1(reg_alpha), lambda_l2(reg_lambda)

7. 两个非模型复杂度参数，但会影响模型速度和精度。可根据特征取值范围和样本数目修改这两个参数
1）特征的最大bin数目max_bin：默认255；
2）用来建立直方图的样本数目subsample_for_bin：默认200000。

对n_estimators，用LightGBM内嵌的cv函数调优，因为同XGBoost一样，LightGBM学习的过程内嵌了cv，速度极快。
其他参数用GridSearchCV

### step1. 先调试estimators

In [None]:
import lightgbm as lgbm
from lightgbm.sklearn import LGBMClassifier

MAX_ROUNDS = 1000

#调用lightgbm内嵌的交叉验证(cv)，可对连续的n_estimators参数进行快速交叉验证， GridSearchCV只能对有限个参数进行交叉验证速度相对较慢
def get_n_estimators(params , X_train , y_train , early_stopping_rounds=10):
    lgbm_params = params.copy()
    lgbmtrain = lgbm.Dataset(X_train , y_train )
     
    cv_result = lgbm.cv(lgbm_params , lgbmtrain , num_boost_round=MAX_ROUNDS , nfold=3,  metrics='mse' , early_stopping_rounds=early_stopping_rounds,seed=3 )
     
    print('best n_estimators:' , len(cv_result['mse-mean']))
    print('best cv score:' , cv_result['mse-mean'][-1])
     
    return len(cv_result['mse-mean'])

params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          'learning_rate': 0.1,
          'num_leaves': 60,
          'max_depth': 6,
          'max_bin': 255,
          'feature_fraction': 0.8,
          'bagging_fraction': 0.9,
          'bagging_freq': 1,  
          'lambda_l1': 1,
          'lambda_l2': 2,
          'n_jobs': -1,
         }

n_estimators_1 = get_n_estimators(params , train_X , train_y)

### step2. num_leaves & max_depth=7
num_leaves建议70-80，搜索区间50-80,值越大模型越复杂，越容易过拟合
相应的扩大max_depth=7

In [None]:
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          'learning_rate': 0.1,
          'n_estimators':n_estimators_1,
          'max_depth': 7,
          'max_bin': 255,     
          'feature_fraction': 0.8,
          'bagging_fraction': 0.9,
          'bagging_freq': 1,  
          'lambda_l1': 1,
          'lambda_l2': 2,  
          'n_jobs': -1,
         }
lg = LGBMRegressor(silent=False,  **params)

num_leaves_s = range(50,90,10)
tuned_parameters = dict( num_leaves = num_leaves_s)

grid_search = GridSearchCV(lg, param_grid=tuned_parameters, cv = kfold, scoring=make_scorer(mean_absolute_error)), 
                                  verbose=5, refit = False)
grid_search.fit(X_train , y_train)

In [None]:
# examine the best model
print(-grid_search.best_score_)
print(grid_search.best_params_)

In [None]:
# plot CV误差曲线
test_means = grid_search.cv_results_[ 'mean_test_score' ]
test_stds = grid_search.cv_results_[ 'std_test_score' ]
train_means = grid_search.cv_results_[ 'mean_train_score' ]
train_stds = grid_search.cv_results_[ 'std_train_score' ]

n_leafs = len(num_leaves_s)

x_axis = num_leaves_s
plt.plot(x_axis, -test_means)
plt.errorbar(x_axis, -test_means, yerr=test_stds,label =' Test')
plt.errorbar(x_axis, -train_means, yerr=train_stds,label = ' Train')
plt.xlabel('num_leaves' )
plt.ylabel('Loss' )
plt.show()

### step3. min_data_in_leaf
叶子节点的最小样本数目

搜索范围：10-50

In [None]:
params = {'boosting_type': 'gbdt',
          'objective':'regression',
          'learning_rate': 0.1,
          'n_estimators': n_estimators_1,
          'max_depth': 7,
          'max_bin': 255,
          'num_leaves': 70,
          'min_data_in_leaf': 50,
          'feature_fraction': 0.8,
          'bagging_fraction': 0.9,
          'bagging_freq': 1,  
          'lambda_l1': 1,
          'lambda_l2': 2,
          'n_jobs': -1
}

lg = LGBMRegressor(silent=False, **params)

min_data_leaf_s = range(10, 100, 10)
turned_parameters = dict(min_data_in_leaf = min_data_in_leaf_s)

grid_search = GridSearchCV(lg, param_grid = params, cv = kfold, scoring = make_scorer(mean_absolute_error),
                           verbose=5, refit=False, n_jobs=-1)
grid_search.fit(train_X, train_y)

print(grid_search.best_score_)
print(grid_search.best_params_)

In [None]:
# plot CV误差曲线
test_means = grid_search.cv_results_['mean_test_score']
test_stds = grid_search.cv_results_['std_test_score']
train_means = grid_search.cv_results_['mean_train_score']
train_stds = grid_search.cv_results_['std_train_score']

n_leafs = len(num_leaves_s)

x_axis = num_leaves_s
plt.plot(x_axis, -test_means)
#plt.errorbar(x_axis, -test_means, yerr=test_stds,label = ' Test')
#plt.errorbar(x_axis, -train_means, yerr=train_stds,label = ' Train')
plt.xlabel('num_leaves' )
plt.ylabel('Loss' )
plt.show()

### step4: 行采样参数 sub_samples/bagging_fraction

In [None]:
params = {'boosting_type': 'gbdt',
          'objective':'regression',
          'learning_rate': 0.1,
          'n_estimators': n_estimators_1,
          'max_depth': 7,
          'max_bin': 255,
          'num_leaves': 70,
          'min_data_in_leaf': 50,
          'feature_fraction': 0.8,
          #'subsample': 0.9,
          'bagging_freq': 1,  
          'lambda_l1': 1,
          'lambda_l2': 2,
          'n_jobs': -1
}

lg = LGBMRegressor(silent=False,  **params)

subsample_s = [i/10.0 for i in range(5,10)]
tuned_parameters = dict( subsample = subsample_s)

grid_search = GridSearchCV(lg, param_grid=tuned_parameters, cv = kfold, scoring=make_scorer(mean_absolute_error), 
                           n_jobs=-1,  verbose=5, refit = False)
grid_search.fit(train_X , train_y)
print(grid_search.best_score_)
print(grid_search.best_params_)

In [None]:
# plot CV误差曲线
test_means = grid_search.cv_results_['mean_test_score']
test_stds = grid_search.cv_results_['std_test_score']
train_means = grid_search.cv_results_['mean_train_score']
train_stds = grid_search.cv_results_['std_train_score']

x_axis = subsample_s

plt.plot(x_axis, -test_means)
#plt.errorbar(x_axis, -test_scores[:,i], yerr=test_stds[:,i] ,label = str(max_depths[i]) +' Test')
#plt.errorbar(x_axis, -train_scores[:,i], yerr=train_stds[:,i] ,label = str(max_depths[i]) +' Train')

plt.show()

### step5. 列采样参数 feature_fraction/colsample_bytree/sub_feature

In [None]:
params = {'boosting_type': 'gbdt',
          'objective':'regression',
          'learning_rate': 0.1,
          'n_estimators': n_estimators_1,
          'max_depth': 7,
          'max_bin': 255,
          'num_leaves': 70,
          'min_data_in_leaf': 50,
          #'feature_fraction': 0.8,
          'bagging_fraction': 0.9,
          'bagging_freq': 1,  
          'lambda_l1': 1,
          'lambda_l2': 2,
          'n_jobs': -1
}
lg = LGBMRegressor(silent=False,  **params)

colsample_bytree_s = [i/10.0 for i in range(5,10)]
tuned_parameters = dict(colsample_bytree = colsample_bytree_s)

grid_search = GridSearchCV(lg, param_grid=tuned_parameters, cv = kfold, scoring=make_scorer(mean_absolute_error), 
                            n_jobs=-1, verbose=5, refit = False)
grid_search.fit(train_X , train_y)

print(grid_search.best_score_)
print(grid_search.best_params_)

In [None]:
# plot CV误差曲线
test_means = grid_search.cv_results_['mean_test_score']
test_stds = grid_search.cv_results_['std_test_score']
train_means = grid_search.cv_results_['mean_train_score']
train_stds = grid_search.cv_results_['std_train_score']

x_axis = colsample_bytree_s

plt.plot(x_axis, -test_means)
#plt.errorbar(x_axis, -test_scores[:,i], yerr=test_stds[:,i] ,label = str(max_depths[i]) +' Test')
#plt.errorbar(x_axis, -train_scores[:,i], yerr=train_stds[:,i] ,label = str(max_depths[i]) +' Train')

plt.show()

### step6. 减小学习率，调整n_estimators

In [None]:
params = {'boosting_type': 'gbdt',
          'objective':'regression',
          'learning_rate': 0.05,
#           'n_estimators': n_estimators_1,
          'max_depth': 7,
          'max_bin': 255,
          'num_leaves': 70,
          'min_data_in_leaf': 50,
          'feature_fraction': 0.8,
          'bagging_fraction': 0.9,
          'bagging_freq': 1,  
          'lambda_l1': 1,
          'lambda_l2': 2,
          'n_jobs': -1
}
n_estimators_2 = get_n_estimators(params , train_X , train_y)

### step7. 用所有训练数据，采用最佳参数重新训练模型
由于样本数目增多，模型复杂度稍微扩大一点？
num_leaves增多5
min_child_samples按样本比例增加到40

In [None]:
params = {'boosting_type': 'gbdt',
          'objective':'regression',
          'learning_rate': 0.1,
          'n_estimators': n_estimators_2,
          'max_depth': 7,
          'max_bin': 255,
          'num_leaves': 70,
          'min_data_in_leaf': 50,
          'feature_fraction': 0.8,
          'bagging_fraction': 0.9,
          'bagging_freq': 1,  
          'lambda_l1': 1,
          'lambda_l2': 2,
          'n_jobs': -1
}

lg = LGBMRegressor(silent=False,  **params)
lg.fit(train_X, train_y)

### step8. 保存模型

In [None]:
import pickle

pickle.dump(lg, open("Otto_LightGBM_org_tfidf.pkl", 'wb'))

df = pd.DataFrame({"columns":list(feat_names), "importance":list(lg.feature_importances_.T)})
df = df.sort_values(by=['importance'],ascending=False)

plt.bar(range(len(lg.feature_importance_)), lg.feature_importance_)
plt.show()

### step9. 预测测试集

In [None]:
# 加载训练好的模型, 预测测试集结果，并保存
import pickle as cPickle
gbm = cPickle.load(open("../model/used_car_LightGBM_v2.pkl", 'rb'))

test_pred = gbm.predict(test_X)
sub = pd.DataFrame()
sub['SaleID'] = test_X.SaleID
sub['price'] = test_pred
sub.to_csv('../submission/submission_lgb_02.csv',index=False)
print('Done.')