In [1]:
import numpy as np
import pandas as pd
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
import xgboost as xgb

### 1. 读取特征数据

In [2]:
df = pd.read_csv("./data/cleaned_df.csv", index_col=0) 

In [3]:
X = df.values.tolist()

### 2. 读取label数据

In [4]:
d = pd.read_csv("./data/filter_time_data.txt_new", sep="\t\t", header=None)

  """Entry point for launching an IPython kernel.


In [5]:
d.columns = ['电影ID', '时间', '当前热度值', '新增热度值', '当前点击量', '新增点击量', '当前评论数', '新增评论数', '当前点赞数', '新增点赞数',
              '当前踩数', '新增踩数', '当前评分数', '推广位', '影片推广位等级', '当前小时弹幕数', '当前小时新增弹幕数', '当前小时评分人数',
              '当前小时新增评分人数', '当前小时五星评分人数', '当前小时四星评分人数', '当前小时三星评分人数', '当前小时二星评分人数',
              '当前小时一星评分人数', '当日爱奇艺播放指数', '影片上线时间', '导演', '编剧', '制片', '美术', '主演']

In [6]:
y_new_add_amount = d['新增点击量'].values.tolist()

### 3. 调参

In [2]:
# 最重要的参数是:

# 树的数量(n_estimators)
# 学习速率-后树的影响较小(learning_rate)
# 树深度(max_depth)
# gamma-过拟合参数。
# colsample_bytree-减少过度拟合

In [7]:
def objective(params):
    params = {
#         'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }

    clf = xgb.XGBRegressor(
        n_estimators=250,
        learning_rate=0.05,
        n_jobs=4,
        **params
    )

    score = cross_val_score(clf, X, y_new_add_amount, scoring="r2", cv=StratifiedKFold()).mean()
    print(score, params)
    return score

space = {
#     'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)



-2.2466393839463366 {'gamma': '0.487', 'colsample_bytree': '0.522'}




-2.775193229007071 {'gamma': '0.036', 'colsample_bytree': '0.957'}




-2.7406750197209937 {'gamma': '0.475', 'colsample_bytree': '0.920'}




-1.9637292827145176 {'gamma': '0.095', 'colsample_bytree': '0.654'}




-2.2821398697036397 {'gamma': '0.004', 'colsample_bytree': '0.424'}




-3.2187072169978634 {'gamma': '0.394', 'colsample_bytree': '0.373'}




-2.1044221557731566 {'gamma': '0.422', 'colsample_bytree': '0.440'}




-2.7406750197209937 {'gamma': '0.236', 'colsample_bytree': '0.921'}




-1.9156269751683135 {'gamma': '0.129', 'colsample_bytree': '0.745'}




-3.1919517752372535 {'gamma': '0.027', 'colsample_bytree': '0.318'}


In [8]:
print("Hyperopt estimated optimum {}".format(best))

Hyperopt estimated optimum {'colsample_bytree': 0.37336968350219246, 'gamma': 0.39376598418601827}
