In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

# **Part 1: Evaluating model accuracy on a hold-out split**

In [None]:
# Read the raw training table; pandas parses the `date` column while loading.
all_data = pd.read_csv(
    "train.csv",
    parse_dates=['date'],
)

def preprocess(all_data):
  """Add calendar and public-holiday features to a frame with a ``date`` column.

  The frame is modified in place and the same object is returned.

  Columns written (in this order when they do not exist yet):
    year, month  -- calendar fields of ``date``
    day          -- ``date`` formatted as a 'YYYYMMDD' string; this is the
                    holiday lookup key, not the day of the month
    DayOfWeek    -- 0 = Monday ... 6 = Sunday
    DayOfYear, WeekOfYear (ISO week number), Hour
    PeriodOfDay  -- bucket 0-7 looked up from ``Hour``
    Weekend      -- True when DayOfWeek is 5 or 6
    vacation     -- True when ``day`` appears in the hard-coded holiday list

  Args:
    all_data: DataFrame whose ``date`` column holds datetimes (or values that
      ``pd.to_datetime`` can parse).

  Returns:
    ``all_data`` itself, with the columns above added and ``date`` stored as
    pandas timestamps.
  """
  # Convert from the raw values (positional, independent of the frame's index)
  # and keep a single timestamp Series so every feature below is a vectorized
  # `.dt` lookup rather than a Python lambda applied row by row.
  dates = pd.Series(pd.to_datetime(all_data['date'].values), index=all_data.index)
  all_data['date'] = dates

  all_data['year'] = dates.dt.year.astype('int64')
  all_data['month'] = dates.dt.month.astype('int64')
  # Written exactly once, as the string key used for the holiday lookup.
  all_data['day'] = dates.dt.strftime('%Y%m%d')
  all_data['DayOfWeek'] = dates.dt.dayofweek.astype('int64')
  all_data['DayOfYear'] = dates.dt.dayofyear.astype('int64')
  # isocalendar() returns a nullable UInt32 column; cast to a plain integer
  # dtype so downstream estimators receive ordinary numeric data.
  all_data['WeekOfYear'] = dates.dt.isocalendar().week.astype('int64')
  all_data['Hour'] = dates.dt.hour.astype('int64')

  # Hour of day -> coarse period bucket.
  period_dict = {
    23: 7, 0: 7, 1: 7,
    2: 0, 3: 0, 4: 0,
    5: 1, 6: 1, 7: 1,
    8: 2, 9: 2, 10: 2, 11: 2,
    12: 3, 13: 3,
    14: 4, 15: 4, 16: 4, 17: 4,
    18: 5,
    19: 6, 20: 6, 21: 6, 22: 6,
  }
  all_data['PeriodOfDay'] = all_data['Hour'].map(period_dict)
  all_data['Weekend'] = dates.dt.dayofweek.isin([5, 6])

  # Dates flagged as public holidays, as 'YYYYMMDD' strings (2017-2018 only).
  public_vacation_list = [
    '20170102', '20170128', '20170129', '20170130', '20170131',
    '20170405', '20170414', '20170415', '20170417', '20170501',
    '20170503', '20170530', '20170701', '20171002', '20171005',
    '20171028', '20171225', '20171226', '20180101', '20180216',
    '20180217', '20180218', '20180219', '20180330', '20180331',
    '20180402', '20180405', '20180501', '20180522', '20180618',
    '20180702', '20180925', '20181001', '20181017', '20181225',
    '20181226'
  ]
  all_data['vacation'] = all_data['day'].isin(public_vacation_list)

  return all_data


# `preprocess` adds its columns in place and returns the same frame, so
# `data` and `all_data` refer to one object from here on.
data = preprocess(all_data)

# Model inputs: everything except the row id, the raw timestamp, its
# 'YYYYMMDD' string key and the label itself.  Named `features` rather than
# `input` so the Python builtin `input()` is not shadowed.
features = data.drop(columns=['id', 'date', 'day', 'speed'])
target = data['speed']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
trainInput, testInput, trainTarget, testTarget = train_test_split(
    features, target, test_size=0.2, random_state=1)


In [None]:
# Move `date` into the index so the frame can be resampled by calendar day.
# The guard makes the cell safe to re-run: once `date` is the index it is no
# longer a column, and an unguarded second call raises KeyError.
if 'date' in all_data.columns:
    all_data.set_index('date', inplace=True)

In [None]:
# Daily averages of the numeric and boolean columns.  The frame also holds the
# string column `day`; pandas >= 2.0 raises TypeError when asked for the mean
# of a non-numeric column instead of dropping it silently, so restrict the
# frame to averageable dtypes before resampling.
all_data.select_dtypes(include=['number', 'bool']).resample('D').mean()

Unnamed: 0_level_0,id,speed,year,month,DayOfWeek,DayOfYear,WeekOfYear,Hour,PeriodOfDay,Weekend,vacation
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-01-01,11.5,44.606355,2017.0,1.0,6.0,1.0,52.0,11.500000,3.458333,True,False
2017-01-02,746.5,35.863858,2017.0,1.0,0.0,2.0,1.0,11.500000,3.458333,False,True
2017-01-03,1418.5,31.895967,2017.0,1.0,1.0,3.0,1.0,11.500000,3.458333,False,False
2017-01-04,2162.5,33.049537,2017.0,1.0,2.0,4.0,1.0,11.500000,3.458333,False,False
2017-01-05,2882.5,37.776822,2017.0,1.0,3.0,5.0,1.0,11.500000,3.458333,False,False
...,...,...,...,...,...,...,...,...,...,...,...
2018-12-27,13947.5,29.479029,2018.0,12.0,3.0,361.0,52.0,13.250000,4.166667,False,False
2018-12-28,13960.5,32.430848,2018.0,12.0,4.0,362.0,52.0,12.857143,3.857143,False,False
2018-12-29,13975.0,37.689331,2018.0,12.0,5.0,363.0,52.0,10.133333,3.400000,True,False
2018-12-30,13987.5,42.996602,2018.0,12.0,6.0,364.0,52.0,13.400000,3.400000,True,False


In [None]:
import lightgbm as lgb

# LightGBM settings, spelled with their canonical parameter names.
# `feature_fraction` is given once: supplying its alias `sub_feature` next to
# it makes LightGBM warn and ignore the alias, so a second, conflicting value
# would only mislead the reader about what was actually trained.
params = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 300,
    'min_data_in_leaf': 2,
    'min_sum_hessian_in_leaf': 1,
    'verbose': 1,
    'feature_fraction': 0.9,
    'lambda_l1': 1e-4,
}

lgb_train = lgb.Dataset(trainInput, trainTarget)
# Hold-out data wrapped as a Dataset; it is not passed to `lgb.train` below.
lgb_test = lgb.Dataset(testInput, testTarget)

# Train for 900 boosting rounds, then score on the hold-out split.
gbm = lgb.train(params, lgb_train, num_boost_round=900)
prediction1 = gbm.predict(testInput)
# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"均方误差(MSE)：{mean_squared_error(testTarget, prediction1)}")

均方误差(MSE)：13.951039396662884


In [None]:
# AdaBoost with a 300-tree random forest as its base learner; both levels are
# seeded so the fit is repeatable.
regressor = RandomForestRegressor(n_estimators=300, max_features='log2', random_state=2000)
ABmodel = AdaBoostRegressor(regressor, random_state=1)
ABmodel.fit(trainInput, trainTarget)

prediction2 = ABmodel.predict(testInput)
# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"均方误差(MSE)：{mean_squared_error(testTarget, prediction2)}")

均方误差(MSE)：15.057107907490797


In [None]:
# Stand-alone random forest baseline (300 trees, log2 feature subsampling,
# fixed seed), scored on the hold-out split.
RFmodel = RandomForestRegressor(n_estimators=300, max_features='log2', random_state=2000)
RFmodel.fit(trainInput, trainTarget)

prediction3 = RFmodel.predict(testInput)
# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"均方误差(MSE)：{mean_squared_error(testTarget, prediction3)}")

均方误差(MSE)：19.493976239628807


In [None]:
# Grid-search the forest size only.  The seed is pinned (2000, the value the
# other random forests in this notebook use) instead of being searched:
# a random seed is not a hyper-parameter, picking the "best" one only selects
# on noise, and searching 19 seeds multiplies the number of fits by 19.
pipe = Pipeline([('rf', RandomForestRegressor(max_features='log2', random_state=2000))])
tuned_parameters = {
    'rf__n_estimators': list(range(100, 1000, 100)),
}
# n_jobs=-1 runs the independent cross-validation fits in parallel; it does
# not change the result.
grid = GridSearchCV(pipe, tuned_parameters, cv=10, n_jobs=-1)
grid.fit(trainInput, trainTarget)
prediction = grid.predict(testInput)

# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"均方误差(MSE)：{mean_squared_error(testTarget, prediction)}")
# No `scoring` is passed, so best_score_ is the estimator's default score
# (R^2 for regressors), not an MSE.
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)

In [None]:
# Extra-trees baseline.  Seeded so the reported hold-out error can be
# reproduced; an unseeded ExtraTreesRegressor gives a different model on
# every run.
ETmodel = ExtraTreesRegressor(random_state=0)
ETmodel.fit(trainInput, trainTarget)

prediction = ETmodel.predict(testInput)
# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"均方误差(MSE)：{mean_squared_error(testTarget, prediction)}")

均方误差(MSE)：22.982645499357098


In [None]:
# Bagging baseline with scikit-learn's default base estimator, seeded for
# repeatability and scored on the hold-out split.
Bmodel = BaggingRegressor(random_state=0)
Bmodel.fit(trainInput, trainTarget)

prediction = Bmodel.predict(testInput)
# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"均方误差(MSE)：{mean_squared_error(testTarget, prediction)}")

均方误差(MSE)：22.356993269923542


In [None]:
# Weighted average of the four models; weights follow the estimator order
# below (rf=2, et=1, ab=4, b=1).
weight = [2, 1, 4, 1]
voting = VotingRegressor(
    estimators=[('rf', RFmodel), ('et', ETmodel), ('ab', ABmodel), ('b', Bmodel)],
    weights=weight,
)
voting.fit(trainInput, trainTarget)

# 10-fold scores on the training split; the scorer is *negated* MSE, so
# values closer to zero are better.
print(cross_val_score(voting, trainInput, trainTarget, cv=10, scoring='neg_mean_squared_error'))

prediction = voting.predict(testInput)
# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"均方误差(MSE)：{mean_squared_error(testTarget, prediction)}")

# **Part 2: Generating the submission file**

In [None]:
# Reload the full training table and read the competition test table; both
# get their `date` column parsed while loading.
all_data = pd.read_csv(
    "train.csv",
    parse_dates=['date'],
)
test_data = pd.read_csv(
    "test.csv",
    parse_dates=['date'],
)

def preprocess(all_data):
  """Add calendar and public-holiday features to a frame with a ``date`` column.

  This definition replaces the Part 1 ``preprocess`` and differs from it in
  two ways: ``day`` keeps the numeric day of the month, and ``date`` itself is
  overwritten with its 'YYYYMMDD' string form (the holiday lookup key).

  The frame is modified in place and the same object is returned.

  Columns written (in this order when they do not exist yet):
    year, month, day (day of month)
    DayOfWeek    -- 0 = Monday ... 6 = Sunday
    DayOfYear, WeekOfYear (ISO week number), Hour
    PeriodOfDay  -- bucket 0-7 looked up from ``Hour``
    Weekend      -- True when DayOfWeek is 5 or 6
    vacation     -- True when the date appears in the hard-coded holiday list

  Args:
    all_data: DataFrame whose ``date`` column holds datetimes (or values that
      ``pd.to_datetime`` can parse).

  Returns:
    ``all_data`` itself, with the columns above added and ``date`` replaced by
    'YYYYMMDD' strings.
  """
  # Convert from the raw values (positional, independent of the frame's index)
  # and keep a single timestamp Series so every feature below is a vectorized
  # `.dt` lookup rather than a Python lambda applied row by row.
  dates = pd.Series(pd.to_datetime(all_data['date'].values), index=all_data.index)

  all_data['year'] = dates.dt.year.astype('int64')
  all_data['month'] = dates.dt.month.astype('int64')
  all_data['day'] = dates.dt.day.astype('int64')
  all_data['DayOfWeek'] = dates.dt.dayofweek.astype('int64')
  all_data['DayOfYear'] = dates.dt.dayofyear.astype('int64')
  # isocalendar() returns a nullable UInt32 column; cast to a plain integer
  # dtype so downstream estimators receive ordinary numeric data.
  all_data['WeekOfYear'] = dates.dt.isocalendar().week.astype('int64')
  all_data['Hour'] = dates.dt.hour.astype('int64')

  # Hour of day -> coarse period bucket.
  period_dict = {
    23: 7, 0: 7, 1: 7,
    2: 0, 3: 0, 4: 0,
    5: 1, 6: 1, 7: 1,
    8: 2, 9: 2, 10: 2, 11: 2,
    12: 3, 13: 3,
    14: 4, 15: 4, 16: 4, 17: 4,
    18: 5,
    19: 6, 20: 6, 21: 6, 22: 6,
  }
  all_data['PeriodOfDay'] = all_data['Hour'].map(period_dict)
  all_data['Weekend'] = dates.dt.dayofweek.isin([5, 6])

  # Dates flagged as public holidays, as 'YYYYMMDD' strings (2017-2018 only).
  public_vacation_list = [
    '20170102', '20170128', '20170129', '20170130', '20170131',
    '20170405', '20170414', '20170415', '20170417', '20170501',
    '20170503', '20170530', '20170701', '20171002', '20171005',
    '20171028', '20171225', '20171226', '20180101', '20180216',
    '20180217', '20180218', '20180219', '20180330', '20180331',
    '20180402', '20180405', '20180501', '20180522', '20180618',
    '20180702', '20180925', '20181001', '20181017', '20181225',
    '20181226'
  ]
  # From here on `date` holds the string key, not a timestamp.
  all_data['date'] = dates.dt.strftime('%Y%m%d')
  all_data['vacation'] = all_data['date'].isin(public_vacation_list)

  return all_data

# Engineer features on the full training table; every column except the row
# id, the date key and the label becomes a model input.
data = preprocess(all_data)
trainTarget = data['speed']
trainInput = data.drop(columns=['id', 'date', 'speed'])

# Same feature engineering for the competition test table, which has no label
# column to remove.
testset = preprocess(test_data)
testInput = testset.drop(columns=['id', 'date'])


In [None]:
import lightgbm as lgb

# LightGBM settings, spelled with their canonical parameter names.
# `feature_fraction` is given once: supplying its alias `sub_feature` next to
# it makes LightGBM warn and ignore the alias, so a second, conflicting value
# would only mislead the reader about what was actually trained.
params = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 300,
    'min_data_in_leaf': 2,
    'min_sum_hessian_in_leaf': 1,
    'verbose': 1,
    'feature_fraction': 0.9,
    'lambda_l1': 1e-4,
}

lgb_train = lgb.Dataset(trainInput, trainTarget)
# Test features wrapped as a Dataset; it is not passed to `lgb.train` below.
lgb_test = lgb.Dataset(testInput)

# Fit on the whole training table for 900 boosting rounds, then predict the
# competition test rows.
gbm = lgb.train(params, lgb_train, num_boost_round=900)
prediction = gbm.predict(testInput)

In [None]:
# Pair every prediction with the id that test.csv supplies for that row.
# Taking the id from the file (rather than from the positional row index)
# stays correct even if the ids do not run 0..n-1.  `.to_numpy()` keeps the
# pairing positional, matching the order of `prediction`.
df = pd.DataFrame({'id': testset['id'].to_numpy(), 'speed': prediction})

In [None]:
# The frame already carries an explicit `id` column, so leave the DataFrame
# index out; otherwise the file gets an extra unnamed leading column.
df.to_csv('submission.csv', index=False)