In [1]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.kernel_ridge import KernelRidge
import xgboost
from xgboost.sklearn import XGBRegressor
import numpy as np
import os
import sklearn.metrics as metrics
from sklearn import preprocessing

base_dir = "/home/tanyx/dataDemo/campus2018/new_result_data"
csvs = os.listdir(base_dir)
datas = []
for csv in csvs:
    with open(base_dir+"/"+csv, mode="r", encoding="utf-8") as f:
        f.readline()
        for line in f.readlines():
            data = line.strip().split(",")
            data[0] = data[0][-2:]
            datas.append(data)

In [2]:
datas = np.array(datas, dtype=int)
X = preprocessing.scale(datas[:,:-1])
y = datas[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
xgb_train = xgboost.DMatrix(X_train, label=y_train)
xgb_test = xgboost.DMatrix(X_test, label=y_test)



In [3]:
print(X_train.shape, X_test.shape)

(206223, 7) (22914, 7)


In [6]:
xlf = XGBRegressor(max_depth=6, 
                    learning_rate=0.1, 
                    silent=True, 
                    objective='reg:linear', 
                    nthread=4, 
                    gamma=0.1,
                    min_child_weight=6,
                    subsample=0.8, 
                    colsample_bytree=0.8,
                    scale_pos_weight=1, 
                    seed=27)
xlf.fit(X_train, y_train, eval_metric='rmse', verbose = True, eval_set = [(X_test, y_test)],early_stopping_rounds=100)

[0]	validation_0-rmse:599.335
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:559.441
[2]	validation_0-rmse:523.769
[3]	validation_0-rmse:493.198
[4]	validation_0-rmse:466.56
[5]	validation_0-rmse:444.861
[6]	validation_0-rmse:428.521
[7]	validation_0-rmse:408.998
[8]	validation_0-rmse:396.702
[9]	validation_0-rmse:383.909
[10]	validation_0-rmse:372.772
[11]	validation_0-rmse:360.367
[12]	validation_0-rmse:352.125
[13]	validation_0-rmse:343.705
[14]	validation_0-rmse:336.797
[15]	validation_0-rmse:331.997
[16]	validation_0-rmse:326.222
[17]	validation_0-rmse:323.947
[18]	validation_0-rmse:319.414
[19]	validation_0-rmse:315.14
[20]	validation_0-rmse:311.575
[21]	validation_0-rmse:309.203
[22]	validation_0-rmse:306.136
[23]	validation_0-rmse:299.4
[24]	validation_0-rmse:297.963
[25]	validation_0-rmse:297.068
[26]	validation_0-rmse:294.493
[27]	validation_0-rmse:286.557
[28]	validation_0-rmse:279.422
[29]	validation_0-rmse:273.475
[30]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.1, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=6, missing=None,
       n_estimators=100, n_jobs=1, nthread=4, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.8)

In [7]:
predicts = xlf.predict(X_test)
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predicts)))

RMSE: 210.70963062586387


In [None]:
# validate
import json
with open("vals.json", mode="r", encoding="utf-8") as f:
    vals = json.loads(f.read())
    for i in range(len(vals)):
        vals[i][0] = vals[i][0][-2:]

vals = np.array(vals, dtype=int)
predicts = xlf.predict(vals)

In [None]:
predicts[0]

In [None]:
import datetime
year = 2017
month = 11
results = []
i = 0
for day in range(1, 31):
    for hour in range(0, 24):
        for loc_id in range(1, 34):
            results.append([loc_id, datetime.datetime(year,month,day,hour,0,0).strftime("%Y-%m-%d %H"), int(predicts[i])])
            i+=1
with open("result.csv", mode="w", encoding="utf-8") as f:
    for result in results:
        f.write("%d,%s,%d\n" % (result[0], result[1], result[2]))

In [None]:
# kernal ridge
kr = GridSearchCV(KernelRidge(kernel='chi2', gamma=0.1), cv=10, pre_dispatch=1,
                    param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                                "gamma": np.logspace(-2, 2, 5)})
print(X.shape)
print(y.shape)
kr.fit(X[:50000,:], y[:50000])
predicted = kr.predict(X)


print("MSE:",metrics.mean_squared_error(y, predicted))
print("RMSE:",np.sqrt(metrics.mean_squared_error(y, predicted)))
# fig, ax = plt.subplots()
# ax.scatter(y, predicted)
# ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
# ax.set_xlabel('KRMeasured')
# ax.set_ylabel('KRPredicted')
# plt.show()