In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Plot defaults: 15x10 figures, the SimHei typeface for sans-serif text, and
# axes.unicode_minus disabled so minus signs are drawn as an ASCII hyphen.
plt.rcParams.update({
    'figure.figsize': (15, 10),
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [None]:
# Read the raw dataset (presumably Bilibili top-100 ranking records, judging
# by the file name) from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/bilibili_rank100_data.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# De-duplicate by title: among rows sharing the same '标题', keep only the
# first occurrence.
df = df[~df.duplicated(subset=['标题'], keep='first')]

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Row filtering and missing-value inspection
# Discard rows whose '时间' value exceeds 1000. Rows where '时间' is NaN are
# kept, because NaN > 1000 evaluates to False.
# NOTE(review): the 1000 threshold is undocumented - confirm its unit/meaning.
df = df.drop(df[df['时间']>1000].index)
# Show every row that has at least one missing value, exactly once. Indexing
# with the 2-D array `df.isnull().values == True` repeats a row once per
# missing cell, which misrepresents how many rows are affected.
# NOTE(review): these rows are only displayed, not removed - confirm none
# remain before modelling.
df[df.isnull().any(axis=1)]

In [None]:
# Renumber the rows 0..n-1 after the row removals above; the old index is
# discarded rather than kept as a column.
df = df.reset_index(drop=True)

In [None]:
# Encode the '分区' column as integer codes
def LabelEncoding(df):
    """Replace the '分区' column of ``df`` with integer category codes.

    Codes follow order of first appearance: the first distinct value becomes
    0, the next new value 1, and so on. The frame is modified in place and
    also returned, so ``df = LabelEncoding(df)`` keeps working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a '分区' column. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same object, with '分区' stored as ``np.int64`` codes.

    Notes
    -----
    Missing values receive the code -1 (the ``pd.factorize`` sentinel).
    """
    x = '分区'
    # factorize computes every code from the original values in one pass, so
    # integer-valued categories cannot collide with codes already written, and
    # no chained per-cell assignment (`df[x][j] = ...`) is needed.
    codes = pd.factorize(df[x])[0]
    # Assigning an ndarray is positional, so the frame's index labels are irrelevant.
    df[x] = codes.astype(np.int64)
    return df

# Encode the '分区' column of the working frame and rebind `df` to the result.
df = LabelEncoding(df)

In [None]:
# Preview the first five rows to check that '分区' now holds integer codes.
df.head()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Print per-column dtypes and non-null counts.
df.info()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Remove the columns at positions 0, 2, 10, 11 and 12 of the current frame.
# NOTE(review): the selection is positional, so re-running this cell on the
# already reduced frame would remove different columns (or raise IndexError).
df = df.drop(columns=df.columns[[0, 2, 10, 11, 12]])
df

In [None]:
# Target is the '播放' column; every remaining column is a feature.
y = df["播放"]
X = df.drop(columns=["播放"])
# Fixed seed so the split (default proportions) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)

In [None]:
# (rows, columns) of the training and test feature matrices.
X_train.shape, X_test.shape

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Tune a gradient-boosted regressor over tree count and tree depth.
# random_state pins the estimator's internal randomness so a fresh
# "Restart & Run All" reproduces the same fitted model; 99 matches the seed
# already used for the train/test split.
gbdt = GradientBoostingRegressor(random_state=99)

param_grid = {
    'n_estimators': [10, 50, 100, 200, 500, 1000],
    'max_depth': [3, 5, 7, 9],
}

# 6 x 4 = 24 candidates, each evaluated with 10-fold cross-validation.
grid = GridSearchCV(gbdt, param_grid=param_grid, cv=10)

grid.fit(X_train, y_train)

In [None]:
# Inspect the best score and the best parameters (score shown here).
grid.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

In [None]:
# Retrieve the best model found by the grid search.
grid.best_estimator_

In [None]:
# Use the best model to predict on the held-out test set.
# `gbdt` is rebound from the untuned estimator to the tuned one, so any later
# use of `gbdt` refers to the tuned model.
gbdt=grid.best_estimator_
pred = gbdt.predict(X_test)

In [None]:
# Put the true targets and the predictions side by side; the frame takes its
# index from y_test.
result = pd.DataFrame({"labels": y_test, "prediction": pred})
result.head()

In [None]:
# True values as black dots; keep the axes so everything else is drawn on it.
ax = result['labels'].plot(style='k.', figsize=(15, 5))
# Predictions as red dots on the same axes.
result['prediction'].plot(style='r.', ax=ax)
# Legend text size and marker scale.
ax.legend(fontsize=15, markerscale=3)
# Tick label size.
ax.tick_params(labelsize=25)
# Draw the grid lines.
ax.grid()

In [None]:
from sklearn import metrics

# Mean squared error on the test set, and its square root (RMSE).
MSE = metrics.mean_squared_error(y_true=y_test, y_pred=pred)
RMSE = np.sqrt(MSE)
print('(MSE,RMSE)=', (MSE, RMSE))

In [None]:
# Score of the tuned model on the held-out test set, printed to two decimals.
test_score = gbdt.score(X_test, y_test)
print("模型评分: {:.2f}".format(test_score))