In [37]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

plt.style.use({'figure.figsize':(15,10)})

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False

In [3]:
df = pd.read_csv('data/bilibili_rank100_data.csv')

In [4]:
df.shape

(7302, 14)

In [5]:
#按标题去重
df = df.drop_duplicates(subset=['title'],keep='first',inplace=False)

In [6]:
#缺失值处理
df = df.drop(df[df['time']>1000].index)
df[df.isnull().values==True]

Unnamed: 0,author,title,tag,partition,funs,like,coins,collect,share,danmu,reply,time,like_rate,views
158,星有野,【我怎么才能让你相信国创动画】,,guochuang,2061928,253632,112476,42338,8076,6495,5382,109.300000,0.105747,2398481
5503,是阿胜呀-,熊出没四个穿帮镜头,,guochuang,325,994,71,629,9,10,10,161.476873,0.001262,787500
5504,次元小主,圆满了！时隔16年，官方终于出又大电影了！,,guochuang,107330,19904,3049,13581,122,172,181,166.302921,0.012275,1621472
5507,-_艾达王_-,《尸兄59》白小飞蜕变为白龙守护者！尸王了结心路历程！人类集结所有战力和尸王决战！！最终大战...,,guochuang,74638,11862,9127,2028,67,404,466,1.323056,0.095506,124202
5518,龙珠哥哥呀,智者的对决，使用脑发达药水后，智商提升100倍，舌战相当激烈！,,guochuang,9541,14408,31,675,8,91,160,5.175903,0.063780,225902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7253,热点库,11岁男孩掏出攒的钱，妈妈瞬间惊呆，网友：成年人都没这么多,,rookie,1035,32260,37,1386,145,96,1318,5.711146,0.045203,713670
7269,生活纪实录,女子为证清白主动要求做DNA，报告出来后，自己都不知谁是孩子爹,,rookie,1242,14821,722,2385,7891,2832,2998,5.794375,0.017965,825002
7270,大富翁酒吧老板-阿杰,我是不是最惨的酒吧老板。我在杭州滨江花了300w开了这么一家酒吧,,rookie,3716,37489,504,3536,1831,828,2566,4.975764,0.034894,1074373
7291,bili_64364371448,江西某211某食堂木桶饭现状,,rookie,26,7859,46,530,1027,130,1086,4.596424,0.018683,420649


In [7]:
#重置索引
df=df.reset_index(drop=True, inplace=False)

In [8]:
#分区按序号编码
def LabelEncoding(df):
    x, dfc = 'partition', df
    key = dfc[x].unique()  # 将唯一值作为关键字
    value = [i for i in range(len(key))]  # 键值
    Dict = dict(zip(key, value))  # 字典，即键值对
    for i in range(len(key)):
        for j in range(dfc.shape[0]):
            if key[i] == dfc[x][j]:
                dfc[x][j] = Dict[key[i]]
    dfc[x] = dfc[x].astype(np.int64)
    return dfc

df = LabelEncoding(df)

In [9]:
df.shape

(5905, 14)

In [10]:
df = df.drop(["author","title","tag"],axis = 1)
X = df.drop(["views"],axis = 1)
y = df["views"]
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=99)

In [11]:
X_train.shape, X_test.shape

((4428, 10), (1477, 10))

## 调整GBDT的参数

`n_estimators`:代表弱学习器的个数,过多容易过拟合,在实际的调参过程中,常常将它和参数learning_rate一起来考虑

`learning_rate`:各个基础模型，在梯度下降训练时的步长

`max_depth`:每棵子树的深度，默认为3。 如果数据量和特征都不多，可以忽略。但较大时，建议限制深度在10-100之间

`min_samples_split`:子树继续划分的条件，默认为2。 如果大于这个值就需要继续划分子树

`min_samples_leaf`:叶子节点最少的样本数，默认1。 如果大于这个值就会和兄弟节点一起被剪枝

`subsample`:基础模型在训练的时候使用的样本数量，默认为1，全样本。如果样本量比较大，可以考虑部分样本

`max_features`：基础模型训练时候使用的特征数量

In [12]:
n_estimators = 100
learning_rate = 0.1
max_depth = 8
min_samples_split = 300
min_samples_leaf = 20
subsample = 0.8
max_features = 'sqrt'

parameters = {}
parameters['n_estimators'] = n_estimators
parameters['learning_rate'] = learning_rate
parameters['max_depth'] = max_depth
parameters['min_samples_split'] = min_samples_split
parameters['min_samples_leaf'] = min_samples_leaf
parameters['subsample'] = subsample
parameters['max_features'] = max_features

scores = []

cv_params = {'n_estimators':range(50,500,50), 
             'learning_rate':[i/10.0 for i in range(1,4)]
            }

gbm = GridSearchCV(GradientBoostingRegressor(
                                                min_samples_split=min_samples_split,
                                                min_samples_leaf=min_samples_leaf,
                                                max_depth=max_depth,
                                                max_features=max_features,
                                                subsample=subsample,
                                                random_state=99,
                                            ),
                    
                    param_grid = cv_params,
                    cv = 5,
)

gbm.fit(X_train,y_train)
print(gbm.cv_results_)
print("Best parameters %s" %gbm.best_params_)
print("Best score %s" %gbm.best_score_)

{'mean_fit_time': array([0.23472452, 0.45977111, 0.69133511, 0.89780993, 1.11781178,
       1.34998012, 1.57320447, 1.79839754, 2.0271749 , 0.21981454,
       0.44501023, 0.67220168, 0.90158935, 1.12658777, 1.35417948,
       1.57977581, 1.80716095, 2.03496971, 0.22100592, 0.44640055,
       0.67660217, 0.90197859, 1.12299328, 1.35497723, 1.57958674,
       1.81333838, 2.02001028]), 'std_fit_time': array([0.042714  , 0.00863365, 0.0129019 , 0.00407771, 0.00634293,
       0.0092125 , 0.01040269, 0.01271377, 0.00824037, 0.00221927,
       0.00236863, 0.00316763, 0.00620095, 0.0070044 , 0.00962345,
       0.00724719, 0.00823306, 0.01603053, 0.00241206, 0.00291049,
       0.00348799, 0.00467382, 0.00518624, 0.00753265, 0.00994953,
       0.03310004, 0.01303578]), 'mean_score_time': array([0.00316167, 0.00459576, 0.0051939 , 0.00597363, 0.00678768,
       0.008178  , 0.00837736, 0.00957894, 0.01056123, 0.00279293,
       0.00398922, 0.00478811, 0.00519609, 0.0065721 , 0.00718069,
       0.0

In [13]:
learning_rate = gbm.best_params_['learning_rate']
n_estimators = gbm.best_params_['n_estimators']
parameters['learning_rate'] = learning_rate
parameters['n_estimators'] = n_estimators
scores.append(gbm.best_score_)

cv_params = {'n_estimators':[410,420,430,440,450,460,470,480,490,500]}

gbm = GridSearchCV(GradientBoostingRegressor(
                                                learning_rate = learning_rate,
                                                min_samples_split = min_samples_split,
                                                min_samples_leaf = min_samples_leaf,
                                                max_depth = max_depth,
                                                max_features = max_features,
                                                subsample = subsample,
                                                random_state = 99,
                                            ),
                    
                    param_grid = cv_params,
                    cv = 5,
)

gbm.fit(X_train,y_train)
print(gbm.cv_results_)
print("Best parameters %s" %gbm.best_params_)
print("Best score %s" %gbm.best_score_)

{'mean_fit_time': array([1.87498388, 1.91947837, 1.96716018, 2.01164122, 2.06147242,
       2.09899979, 2.1546433 , 2.19911232, 2.24142919, 2.35152626]), 'std_fit_time': array([0.00708771, 0.00532664, 0.00708324, 0.01149777, 0.01002564,
       0.01142189, 0.01362409, 0.01104025, 0.01046069, 0.04177567]), 'mean_score_time': array([0.00917916, 0.0095705 , 0.00996752, 0.01016088, 0.01038575,
       0.01075377, 0.01076751, 0.01117783, 0.01095858, 0.0117557 ]), 'std_score_time': array([3.96986004e-04, 4.92851390e-04, 1.13326849e-05, 3.79114732e-04,
       4.86120761e-04, 3.91550198e-04, 7.50213894e-04, 3.95746500e-04,
       2.60645489e-05, 4.24981154e-04]), 'param_n_estimators': masked_array(data=[410, 420, 430, 440, 450, 460, 470, 480, 490, 500],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 410}, {'n_estimators': 420}, {'n_estimators': 430}, {'n_est

In [14]:
n_estimators = gbm.best_params_['n_estimators']
parameters['n_estimators'] = n_estimators
scores.append(gbm.best_score_)

cv_params = {'max_depth':range(3,14,2), 
             'min_samples_split':range(100,801,200)
            }

gbm = GridSearchCV(GradientBoostingRegressor(
                                                learning_rate = learning_rate,
                                                n_estimators = n_estimators,
                                                min_samples_leaf = min_samples_leaf,
                                                max_features = max_features,
                                                subsample = subsample,
                                                random_state = 99,
                                            ),
                    
                    param_grid = cv_params,
                    cv = 5,
)

gbm.fit(X_train,y_train)
print(gbm.cv_results_)
print("Best parameters %s" %gbm.best_params_)
print("Best score %s" %gbm.best_score_)

{'mean_fit_time': array([0.94108429, 0.91854391, 0.94367709, 0.90178976, 1.35297627,
       1.33961768, 1.33124042, 1.3196722 , 1.79300566, 1.75830665,
       1.73136539, 1.71482205, 2.20551419, 2.13288102, 2.09559712,
       2.04793563, 2.57131963, 2.48297305, 2.42350969, 2.36628394,
       2.93334942, 2.80709476, 2.71793408, 2.62718396]), 'std_fit_time': array([0.01799153, 0.02589973, 0.00277841, 0.0045383 , 0.00100799,
       0.01205942, 0.00602431, 0.00872541, 0.00763104, 0.01060216,
       0.01525844, 0.00898483, 0.01894987, 0.01886982, 0.00956593,
       0.01509138, 0.0087699 , 0.02517162, 0.01592386, 0.0189671 ,
       0.01394255, 0.02833192, 0.02894222, 0.04148396]), 'mean_score_time': array([0.00498662, 0.00498672, 0.00458784, 0.00499258, 0.00658417,
       0.00658298, 0.00638351, 0.00638289, 0.00897598, 0.00897598,
       0.00837746, 0.00856938, 0.01156898, 0.01117601, 0.01077104,
       0.01056771, 0.01475992, 0.01396275, 0.0131649 , 0.01276689,
       0.0169446 , 0.0159574 

In [15]:
max_depth = gbm.best_params_['max_depth']
min_samples_split = gbm.best_params_['min_samples_split']
parameters['max_depth'] = max_depth
parameters['min_samples_split'] = min_samples_split
scores.append(gbm.best_score_)

cv_params = {'min_samples_split':range(450,550,10), 
             'min_samples_leaf':range(10,30,5)
            }

gbm = GridSearchCV(GradientBoostingRegressor(
                                                learning_rate = learning_rate,
                                                n_estimators = n_estimators,
                                                max_depth = max_depth,
                                                max_features = max_features,
                                                subsample = subsample,
                                                random_state = 99,
                                            ),
                    
                    param_grid = cv_params,
                    cv = 5,
)

gbm.fit(X_train,y_train)
print(gbm.cv_results_)
print("Best parameters %s" %gbm.best_params_)
print("Best score %s" %gbm.best_score_)

{'mean_fit_time': array([1.33503065, 1.33324385, 1.33403263, 1.32206583, 1.3272438 ,
       1.32764997, 1.32246418, 1.32665391, 1.34240842, 1.32745562,
       1.33563266, 1.32964182, 1.32824831, 1.33363862, 1.33801656,
       1.33304262, 1.33503537, 1.32864714, 1.32646089, 1.32924204,
       1.33303661, 1.3288486 , 1.32964611, 1.33283606, 1.32586303,
       1.32824512, 1.33382907, 1.32805533, 1.33224487, 1.32844934,
       1.32364693, 1.33304009, 1.32805176, 1.3190639 , 1.32466464,
       1.33503065, 1.31886353, 1.32626061, 1.32326279, 1.32385011]), 'std_fit_time': array([0.01184703, 0.00923927, 0.01087177, 0.0051143 , 0.00600749,
       0.01154637, 0.00752378, 0.00619806, 0.03108704, 0.00790503,
       0.00637741, 0.00891296, 0.00668137, 0.00586228, 0.014583  ,
       0.00421485, 0.00772647, 0.00913588, 0.00579756, 0.0106571 ,
       0.01037227, 0.00632003, 0.00649096, 0.00603678, 0.00463515,
       0.00826427, 0.00903908, 0.00786703, 0.00909598, 0.00768026,
       0.00494865, 0.01194

In [16]:
min_samples_leaf = gbm.best_params_['min_samples_leaf']
min_samples_split = gbm.best_params_['min_samples_split']
parameters['min_samples_leaf'] = min_samples_leaf
parameters['min_samples_split'] = min_samples_split
scores.append(gbm.best_score_)

cv_params = {'max_features':['auto', 'sqrt', 'log2', 'None'],
             'subsample':[i/10.0 for i in range(1,10,2)]
            }

gbm = GridSearchCV(GradientBoostingRegressor(
                                                learning_rate = learning_rate,
                                                n_estimators = n_estimators,
                                                max_depth = max_depth,
                                                min_samples_split = min_samples_split,
                                                min_samples_leaf = min_samples_leaf,
                                                random_state = 99,
                                            ),
                    
                    param_grid = cv_params,
                    cv = 5,
)

gbm.fit(X_train,y_train)
print(gbm.cv_results_)
print("Best parameters %s" %gbm.best_params_)
print("Best score %s" %gbm.best_score_)

{'mean_fit_time': array([1.72545958e-01, 1.48941088e+00, 2.45542855e+00, 3.42166276e+00,
       4.36950479e+00, 1.76734591e-01, 6.07569551e-01, 9.15951443e-01,
       1.19500446e+00, 1.45631714e+00, 1.71144056e-01, 6.11364079e-01,
       9.12150764e-01, 1.19581847e+00, 1.46568079e+00, 2.19244957e-03,
       1.79514885e-03, 1.99456215e-03, 1.79514885e-03, 1.79519653e-03]), 'std_fit_time': array([4.84344578e-03, 3.21390575e-03, 1.03357031e-02, 2.10077465e-02,
       1.56134996e-02, 2.62530310e-03, 7.00111295e-03, 6.03636570e-03,
       1.33977333e-02, 4.95135720e-03, 1.36191565e-03, 4.13338371e-03,
       5.30522101e-03, 1.11780096e-02, 2.20374106e-02, 9.74664686e-04,
       3.99041187e-04, 1.78416128e-07, 3.98802768e-04, 3.98945819e-04]), 'mean_score_time': array([0.0019938 , 0.00618439, 0.0059907 , 0.00618358, 0.0061832 ,
       0.00199423, 0.00618253, 0.0063827 , 0.00618396, 0.00638375,
       0.00179372, 0.00618882, 0.00618315, 0.00618401, 0.00658259,
       0.        , 0.        , 0

In [17]:
max_features = gbm.best_params_['max_features']
subsample = gbm.best_params_['subsample']
parameters['max_features'] = max_features
parameters['subsample'] = subsample
scores.append(gbm.best_score_)

In [18]:
print(parameters)
print(scores)

{'n_estimators': 450, 'learning_rate': 0.2, 'max_depth': 5, 'min_samples_split': 480, 'min_samples_leaf': 10, 'subsample': 0.9, 'max_features': 'sqrt'}
[0.920013274473866, 0.920013274473866, 0.9281511771935216, 0.9365846597153178, 0.9408208288655938]


In [19]:
# 获取最佳模型
gbm.best_estimator_

GradientBoostingRegressor(learning_rate=0.2, max_depth=5, max_features='sqrt',
                          min_samples_leaf=10, min_samples_split=480,
                          n_estimators=450, random_state=99, subsample=0.9)

In [32]:
# 确定最终模型
gbdtFinal = gbm.best_estimator_
gbdtFinal.fit(X_train,y_train)

GradientBoostingRegressor(learning_rate=0.2, max_depth=5, max_features='sqrt',
                          min_samples_leaf=10, min_samples_split=480,
                          n_estimators=450, random_state=99, subsample=0.9)

In [33]:
# 进行预测
train_preds = gbdtFinal.predict(X_train)
test_preds = gbdtFinal.predict(X_test)

In [34]:
print(train_preds.shape)
print(test_preds.shape)

(4428,)
(1477,)


In [35]:
print("\nModel Report")
print("MSE Train : %f" % mean_squared_error(y_train, train_preds))
print("MSE Test: %f" % mean_squared_error(y_test, test_preds))
print("RMSE Train: %f" % mean_squared_error(y_train, train_preds)**0.5)
print("RMSE Test: %f" % mean_squared_error(y_test, test_preds)**0.5)


Model Report
MSE Train : 2915091730.258037
MSE Test: 45265795732.859093
RMSE Train: 53991.589440
RMSE Test: 212757.598531


In [40]:
pickle.dump(gbdtFinal, open("gbdtFinal.pkl", "wb"))