### Training model

- Decide which feature to use(just try first)
- Split training data to training data set and validation data set
- Select serval models to train
- Generate submission file (change column name 'date' back to 'day' before submission)


### Feature explanation

- year, year of data
- month, month of year 
- day, day of month
- hour, hour of day from 0 ~ 23
- weekday, week of day from 0 to 6, 0 is Mon, 1 is Tue, 2 is Wed etc.
- grid_id, grid id defined in grid info
- temperture, temperture at given time
- visibility, 0 visibility is bad, 1 visiblity is good
- wind, 0 means no wind or small wind, 1 means wind is strong
- rainy, 0 means no rain, 1 means is rainy
- holiday, 0 means working day, 1 means holiday
- car_number, number of cars in that grid at given time

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import timeit
%matplotlib inline

training_features_filepath = 'data/training.csv'
test_features_filepath = 'data/test.csv'
submission_filepath = 'data/submit_samples.csv'
new_submission_filepath = 'data/submission.csv'

#drop_columns = ['car_number','year','visibility','wind','rainy']
drop_columns = ['car_number','year']

train = pd.read_csv(training_features_filepath)
train.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,visibility,wind,rainy,holiday,car_number
0,2017,1,2,0,0,2,11,1,0,0,1,1
1,2017,1,2,0,0,4,11,1,0,0,1,5
2,2017,1,2,0,0,5,11,1,0,0,1,3
3,2017,1,2,0,0,6,11,1,0,0,1,3
4,2017,1,2,0,0,7,11,1,0,0,1,1


In [61]:
test = pd.read_csv(test_features_filepath)
test.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,visibility,wind,rainy,holiday,car_number
0,2017,3,13,9,0,1,9,1,1,1,0,0
1,2017,3,13,9,0,2,9,1,1,1,0,0
2,2017,3,13,9,0,3,9,1,1,1,0,0
3,2017,3,13,9,0,4,9,1,1,1,0,0
4,2017,3,13,9,0,5,9,1,1,1,0,0


In [62]:
def split_train_val_data(index):
    # split train and val by index
    train_ = train.iloc[:index]
    val_ = train.iloc[index:-1]
    # extract x and y
    train_x = train_.drop(columns=drop_columns)
    train_y = train_['car_number']
    val_x = val_.drop(columns=drop_columns)
    val_y = val_['car_number']
    
    return train_x, train_y, val_x, val_y

# split with index 69692, use last week's data as validation data
train_x, train_y, val_x, val_y = split_train_val_data(69692)

In [63]:
train_x.head()

Unnamed: 0,month,day,hour,weekday,grid_id,temperture,visibility,wind,rainy,holiday
0,1,2,0,0,2,11,1,0,0,1
1,1,2,0,0,4,11,1,0,0,1
2,1,2,0,0,5,11,1,0,0,1
3,1,2,0,0,6,11,1,0,0,1
4,1,2,0,0,7,11,1,0,0,1


### Model selecting

- GLB, XGBoost, RandomForest

In [35]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

regr = RandomForestRegressor(bootstrap=True, 
                             criterion='mse', 
                             max_depth=5, 
                             max_features='auto', 
                             max_leaf_nodes=None,
                             min_impurity_decrease=0.0, 
                             min_impurity_split=None, 
                             min_samples_leaf=5, 
                             min_samples_split=5,
                             min_weight_fraction_leaf=0.0, 
                             n_estimators=300,
                             n_jobs=8, 
                             oob_score=False, 
                             random_state=0, 
                             verbose=0, 
                             warm_start=False)

regr.fit(train_x, train_y)

print(regr.feature_importances_)

predictions = regr.predict(val_x)

score = mean_squared_error(val_y, predictions)

print("score is ", score)

[1.15168920e-04 2.58518649e-02 3.84003924e-01 1.36957137e-03
 5.39281762e-01 3.57199444e-04 4.90205089e-02]
score is  38.92687235281718


In [36]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

regr2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=300, random_state=0)

regr2.fit(train_x, train_y)

print(regr2.feature_importances_)

predictions = regr2.predict(val_x)

score = mean_squared_error(val_y, predictions)

print("score is ", score)

[0.0331794  0.14388488 0.25797507 0.03858287 0.42947211 0.04834633
 0.04855934]
score is  32.18423006162821


In [64]:
import xgboost as xgb
regressor = xgb.XGBRegressor(n_estimators=3000, 
                             nthread=-1, 
                             max_depth=12,
                             learning_rate=0.02, 
                             silent=True, 
                             subsample=0.9, 
                             colsample_bytree=0.7)

regressor.fit(train_x.as_matrix(), train_y)

pred = regressor.predict(val_x.as_matrix())

print("score is ", mean_squared_error(val_y, pred))

test = test.drop(columns=drop_columns)

pred = regressor.predict(test.as_matrix())

score is  13.24203034773223


In [65]:
def generate_submission_file(pred):
    sample_df = pd.read_csv(submission_filepath)
    sample_df['car_number'] = pred
    sample_df['car_number'] = sample_df.car_number.round()
    sample_df['car_number'] = sample_df['car_number'].astype(int)
    sample_df.to_csv(new_submission_filepath, index=False)

In [66]:
generate_submission_file(pred)