In [1]:
#特征工程&模型构建训练
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")

In [2]:
# 读入data
df = pd.read_csv('/Users/jsheng/Downloads/Bike-Sharing-Dataset/hour.csv') # based on hour data

In [3]:
# 特征工程
# 对dropFeatrues的四个维度进行one-hot

df_mnth = pd.get_dummies(df['mnth'], prefix='mnth')
df_hr = pd.get_dummies(df['hr'], prefix='hr')
df_weathersit = pd.get_dummies(df['weathersit'], prefix='weathersit')
df_season = pd.get_dummies(df['season'], prefix='season')
df_weekday = pd.get_dummies(df['weekday'], prefix='weekday')

df = df.join(df_mnth)
df = df.join(df_hr)                    
df = df.join(df_weathersit)
df = df.join(df_season)
df = df.join(df_weekday)

dropFeatures = ["instant", "dteday", "registered", 'casual', 'season', 'mnth', 'hr', 'weathersit', 'weekday']
df = df.drop(dropFeatures, axis=1)

df.head()

Unnamed: 0,yr,holiday,workingday,temp,atemp,hum,windspeed,cnt,mnth_1,mnth_2,...,season_2,season_3,season_4,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,0,0,0,0.24,0.2879,0.81,0.0,16,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0.22,0.2727,0.8,0.0,40,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0.22,0.2727,0.8,0.0,32,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0.24,0.2879,0.75,0.0,13,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0.24,0.2879,0.75,0.0,1,1,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
df.columns # 打印特征

Index(['yr', 'holiday', 'workingday', 'temp', 'atemp', 'hum', 'windspeed',
       'cnt', 'mnth_1', 'mnth_2', 'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6',
       'mnth_7', 'mnth_8', 'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'hr_0',
       'hr_1', 'hr_2', 'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9',
       'hr_10', 'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17',
       'hr_18', 'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23', 'weathersit_1',
       'weathersit_2', 'weathersit_3', 'weathersit_4', 'season_1', 'season_2',
       'season_3', 'season_4', 'weekday_0', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6'],
      dtype='object')

In [5]:
# 填充一下windspeed缺失的问题,2180条数据windspeed==0,代码df[df['windspeed']==0] 查看
# 填充方法，RF模型的预测值填充windspeed==0的数据
from sklearn.ensemble import RandomForestRegressor
data_wind_zero = df[df['windspeed']==0]
data_wind_notzero = df[df['windspeed']!=0]
rf_wind = RandomForestRegressor()
wind_features = ['yr', 'holiday', 'workingday', 'temp', 'atemp', 'hum', 'windspeed',
       'cnt', 'mnth_1', 'mnth_2', 'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6',
       'mnth_7', 'mnth_8', 'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'hr_0',
       'hr_1', 'hr_2', 'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9',
       'hr_10', 'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17',
       'hr_18', 'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23', 'weathersit_1',
       'weathersit_2', 'weathersit_3', 'weathersit_4', 'season_1', 'season_2',
       'season_3', 'season_4', 'weekday_0', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6']
rf_wind.fit(data_wind_notzero[wind_features], data_wind_notzero['windspeed'])
wind_values = rf_wind.predict(X=data_wind_zero[wind_features])
data_wind_zero['windspeed'] = wind_values
df = data_wind_zero.append(data_wind_notzero)
df.reset_index(inplace=True)
df.drop('index', inplace=True, axis=1)

In [6]:
df['windspeed'].describe() # 已经没有为0的windspeed了

count    17379.000000
mean         0.201337
std          0.107585
min          0.089600
25%          0.104500
50%          0.194000
75%          0.253700
max          0.850700
Name: windspeed, dtype: float64

In [7]:
# 构建 训练数据&测试数据
from sklearn.model_selection import train_test_split
x, y = df.ix[:,df.columns!='cnt'], df.ix[:,df.columns=='cnt']
x_train, y_train, x_test, y_test = train_test_split(x, y, test_size=0.2) # 8:2 的比例划分数据集

In [8]:
# 评估指标 RMSLE
def rmsle(y, y_):
    log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y]))
    log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_]))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))

In [9]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import mean_squared_log_error
pd.options.mode.chained_assignment = None
# 线性回归模型
# linear regression
lr = LinearRegression()
lr.fit(x_train, x_test)
preds = lr.predict(y_train)
print(preds)
print("rmsle Value For Ridge Regression: ",rmsle(preds, y_test.values))

[[220.375]
 [ 18.   ]
 [230.375]
 ...
 [295.875]
 [173.   ]
 [-15.375]]
rmsle Value For Ridge Regression:  1.1042911495627106


In [10]:
# ridge 岭回归
ridge_m_ = Ridge()
ridge_params_ = {'max_iter':[3000],'alpha':[0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000]}
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)
# 网格搜索，超参数自动搜索
grid_ridge_m = GridSearchCV(ridge_m_,
                          ridge_params_,
                          scoring = rmsle_scorer,
                          cv=5)
grid_ridge_m.fit(x_train.values, x_test.values)
preds = grid_ridge_m.predict(X=y_train)
print(preds)
print(grid_ridge_m.best_params_)
print("rmsle Value For Ridge Regression: ",rmsle(preds, y_test.values))

[[204.80461941]
 [ 35.78859939]
 [233.33232898]
 ...
 [285.12305858]
 [175.84531307]
 [ 11.46155777]]
{'alpha': 100, 'max_iter': 3000}
rmsle Value For Ridge Regression:  1.0714880917741527


In [11]:
# Lasso 
lasso_m_ = Lasso()

alpha  = 1/np.array([0.1, 1, 2, 3, 4, 10, 30,100,200,300])
lasso_params_ = {'max_iter':[3000],'alpha':alpha}

grid_lasso_m = GridSearchCV(lasso_m_,lasso_params_,scoring = rmsle_scorer,cv=5)
grid_lasso_m.fit(x_train.values, x_test.values)
preds = grid_lasso_m.predict(X=y_train)

print(preds)
print (grid_lasso_m.best_params_)
print("rmsle Value For Lasso Regression: ",rmsle(preds, y_test.values))

[133.69409609 178.56110859 239.22125659 ... 226.42391155 162.52466744
 159.88953023]
{'alpha': 10.0, 'max_iter': 3000}
rmsle Value For Lasso Regression:  1.543475667153089


In [16]:
# ensemble model
# random forest 随机森林
no_of_test=[500]
params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':["auto",'sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring=rmsle_scorer)
clf_rf.fit(x_train.values,x_test.values)
preds = clf_rf.predict(y_train)
print(preds)
print("rmsle Value For RF Regression: ",rmsle(preds, y_test.values))

[119.504  48.14  185.4   ... 293.864 175.19   17.064]
rmsle Value For RF Regression:  1.697711134316645


In [17]:
# ensemble model
# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(n_estimators=4000,alpha=0.01)
gb.fit(x_train, x_test)
preds = gb.predict(y_train)
print(preds)
print("rmsle Value For gbr Regression: ",rmsle(preds, y_test.values))

[108.37231991  18.93273    122.75864776 ... 428.17558622 131.45109955
   9.50301055]
rmsle Value For gbr Regression:  2.129158604145119


In [18]:
# xgboost
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(x_train, x_test)
preds = xgb.predict(y_train)
print(preds)
print("rmsle Value For xgb Regression: ",rmsle(preds, y_test.values))

[176.78632   28.549183 206.26033  ... 285.96466  137.93895   15.7596  ]
rmsle Value For xgb Regression:  1.856336990888132


In [19]:
print('done!')

done!
