# Install required libraries

In [1]:
# # uncomment and run this cell to install packages
# !pip3 install pandas scikit-learn xgboost workalendar==14.0.0
# # after installing packages restart kernel

# Import required libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from workalendar.asia import HongKong

# Read test and train data sets from csv

In [3]:
testData = pd.read_csv('test.csv')
trainData = pd.read_csv('train.csv')
trainData.head()

Unnamed: 0,id,date,speed
0,0,1/1/2017 0:00,43.00293
1,1,1/1/2017 1:00,46.118696
2,2,1/1/2017 2:00,44.294158
3,3,1/1/2017 3:00,41.067468
4,4,1/1/2017 4:00,46.448653


# Feature engineering

In [4]:
# date time formating
date_format_string ='%d/%m/%Y %H:%M'
testData.date = pd.to_datetime(testData.date, format=date_format_string)
trainData.date = pd.to_datetime(trainData.date, format=date_format_string)
# feature creation 
def FeatureCreation(Data):    
    Data['year']=Data['date'].dt.year 
    Data['month']=Data['date'].dt.month 
    Data['day']=Data['date'].dt.day
    Data['dayofweek_num']=Data['date'].dt.dayofweek
    Data['Hour'] = Data['date'].dt.hour
    Data['weekofyear'] = Data['date'].dt.weekofyear
    Data['date_only'] = Data['date'].dt.date
    return Data

# Getting holiday dates for years 2017 and 2018

In [5]:
cal = HongKong()
holidays_2017 = pd.DataFrame(cal.holidays(2017),columns =['date_only','holiday'])
holidays_2018 = pd.DataFrame(cal.holidays(2018),columns =['date_only','holiday'])
holidays = pd.concat([holidays_2017,holidays_2018],ignore_index=True, sort=False)
holidays = holidays.drop('holiday', axis=1)
holidays['is_holiday'] = 1
holidays.head()

Unnamed: 0,date_only,is_holiday
0,2017-01-01,1
1,2017-01-02,1
2,2017-01-28,1
3,2017-01-29,1
4,2017-01-30,1


# Train, validation and test sets

In [6]:
# feature creation 
trainData = FeatureCreation(trainData)
# adding is holiday feature, it is 1 if the day is holiday else 0
trainData =  pd.merge(trainData,holidays, how='left', on=['date_only'])
trainData['is_holiday'] = trainData['is_holiday'].fillna(0)
testData = FeatureCreation(testData)
testData =  pd.merge(testData,holidays, how='left', on=['date_only'])
testData['is_holiday'] = testData['is_holiday'].fillna(0)
# features
features = ['year','month','day','dayofweek_num','Hour','weekofyear','is_holiday']
X = trainData[features]
y = trainData.speed
X_test = testData[features]
# train test split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [7]:
print(train_X)

       year  month  day  dayofweek_num  Hour  weekofyear  is_holiday
8894   2018      1    9              1    11           2         0.0
3701   2017      6    3              5    14          22         0.0
2499   2017      4   14              4    12          15         1.0
13829  2018     12   17              0     1          51         0.0
11467  2018      7    2              0    14          27         1.0
...     ...    ...  ...            ...   ...         ...         ...
13123  2018     10   26              4     6          43         0.0
3264   2017      5   16              1     9          20         0.0
9845   2018      3   14              2    18          11         0.0
10799  2018      5   18              4    12          20         0.0
2732   2017      4   24              0     5          17         0.0

[10522 rows x 7 columns]


# Model XGBoost Regression 

In [8]:
# XGBRegressor model
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# fit model with train features and label. early stopping prevents overfitting
my_model.fit(train_X, train_y, 
             early_stopping_rounds=5, 
             eval_set=[(val_X, val_y)], 
             verbose=False)
# validation set MSE 
mse = mean_squared_error(my_model.predict(val_X), val_y)
print(mse)

11.002992890633738


# Model fitting and prediction

In [9]:
# fitting model with full data set
my_model.fit(X, y,
             early_stopping_rounds=5, 
             eval_set=[(val_X, val_y)], 
             verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
# prediction for test data set 
preds = my_model.predict(X_test)
# storing predictions into csv file
df = pd.DataFrame(preds)
pred = pd.DataFrame()
pred['id'] = df.index
pred['speed']=df[0]
pred.head()
pred.to_csv('test_predictions.csv', index = False)