### Training model

- Decide which features to use (just try first)
- Split training data to training data set and validation data set
- Select several models to train
- Generate submission file (change column name 'date' back to 'day' before submission)


### Feature explanation

- year, year of data
- month, month of year 
- day, day of month
- hour, hour of day from 0 ~ 23
- weekday, day of week from 0 to 6, 0 is Mon, 1 is Tue, 2 is Wed etc.
- grid_id, grid id defined in grid info
- temperture, temperature at given time (the column name is spelled `temperture` in the data)
- rainy, 0 means no rain, 1 means rainy
- holiday, 0 means working day, 1 means holiday
- car_number, number of cars in that grid at given time

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import timeit
%matplotlib inline

# Input and output file locations, relative to the notebook's working directory.
training_features_filepath = 'data/training.csv'
test_features_filepath = 'data/test.csv'
# Sample submission file; generate_submission_file reads it as a template.
submission_filepath = 'data/submit_samples.csv'
# Where generate_submission_file writes the finished submission.
new_submission_filepath = 'data/submission.csv'

# Columns removed from a frame before it is handed to the model:
# the target ('car_number') and 'year'.
drop_columns = ['car_number','year']

# loading data
# NOTE(review): the CSV carries a saved index column that pandas names
# 'Unnamed: 0' (visible in the head() output). It is not in drop_columns, so
# it is passed to the model as a feature -- confirm this is intended.
train = pd.read_csv(training_features_filepath)
train.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,speed,car_number
0,2017,1,2,9,0,1,13,0,1,21,20.0
1,2017,1,2,9,0,2,13,0,1,16,7.0
2,2017,1,2,9,0,3,13,0,1,22,4.0
3,2017,1,2,9,0,4,13,0,1,72,3.0
4,2017,1,2,9,0,6,13,0,1,41,9.0


In [6]:
# Load the test set. `test` is read as a module-level global by
# val_or_generate_submission_file when it produces a submission.
test = pd.read_csv(test_features_filepath)
test.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,speed,car_number
0,2017,3,13,9,0,1,9,1,0,21,0
1,2017,3,13,9,0,2,9,1,0,16,0
2,2017,3,13,9,0,3,9,1,0,22,0
3,2017,3,13,9,0,4,9,1,0,72,0
4,2017,3,13,9,0,5,9,1,0,72,0


In [3]:
def retrieve_index_by_month_day_hour(df, month, day, hour):
    """Return the index label of the first row matching month, day and hour.

    Args:
        df: DataFrame with 'month', 'day' and 'hour' columns.
        month: Value to match in the 'month' column.
        day: Value to match in the 'day' column.
        hour: Value to match in the 'hour' column.

    Returns:
        The index label of the first matching row, in the row order of `df`.
        This is a label, not a position; the two coincide only when `df`
        has the default 0..n-1 index, which matters if the result is later
        used with `iloc`.

    Raises:
        IndexError: If no row matches the requested month/day/hour.
    """
    # One combined mask avoids building three intermediate filtered frames
    # and keeps the `month`/`day`/`hour` arguments intact.
    mask = (df['month'] == month) & (df['day'] == day) & (df['hour'] == hour)
    matches = df.index[mask.values]
    if matches.empty:
        raise IndexError(
            "no row with month={}, day={}, hour={}".format(month, day, hour)
        )
    return matches[0]

def generate_submission_file(pred):
    """Write predictions into a copy of the sample submission file.

    Reads the sample submission from `submission_filepath`, stores `pred`
    in its 'car_number' column rounded up to whole cars, renames the columns
    to the names expected by the submission format and writes the result to
    `new_submission_filepath` without the index.

    Args:
        pred: Predicted car numbers, one per row of the sample submission
            file and in the same row order.
    """
    sample_df = pd.read_csv(submission_filepath)
    sample_df['car_number'] = pred
    # Vectorised ceil: same values as applying np.ceil row by row, without a
    # Python-level call per element.
    sample_df['car_number'] = np.ceil(sample_df['car_number']).astype(int)
    # Positional rename; assumes the sample file has exactly these four
    # columns in this order (notably its date column becomes 'day').
    sample_df.columns = ['grid_id', 'day', 'hour', 'car_number']
    sample_df.to_csv(new_submission_filepath, index=False)

def split_train_val_data(index, data=None, columns_to_drop=None):
    """Split a frame by row position into training and validation parts.

    Rows before position `index` form the training part; rows from `index`
    through the last row (inclusive) form the validation part.

    Args:
        index: Positional row offset at which the validation part starts.
        data: DataFrame to split. Defaults to the module-level `train`.
        columns_to_drop: Columns removed to build the feature frames.
            Defaults to the module-level `drop_columns`.

    Returns:
        Tuple `(train_x, train_y, val_x, val_y)` where the `*_x` items are
        feature DataFrames and the `*_y` items are the 'car_number' Series.
    """
    if data is None:
        data = train
    if columns_to_drop is None:
        columns_to_drop = drop_columns

    train_part = data.iloc[:index]
    # Slice to the end: an end bound of -1 would silently drop the final row
    # from the validation set, leaving it in neither split.
    val_part = data.iloc[index:]

    train_x = train_part.drop(columns=columns_to_drop)
    train_y = train_part['car_number']
    val_x = val_part.drop(columns=columns_to_drop)
    val_y = val_part['car_number']
    return train_x, train_y, val_x, val_y


def val_or_generate_submission_file(param_dict, train, index, t='train'):
    """Train an XGBoost regressor, then either validate it or write a submission.

    Args:
        param_dict: Keyword arguments forwarded to `xgb.XGBRegressor`.
        train: Training DataFrame including the 'car_number' target. It is
            only used directly in submission mode; in validation mode the
            split is delegated to `split_train_val_data(index)`, which is
            not given this argument.
        index: Row offset passed to `split_train_val_data` in validation
            mode; unused in submission mode.
        t: Mode switch. 'train' fits on the training split and prints the
            mean squared error on the validation split. Any other value
            fits on all of `train`, predicts on the module-level `test`
            frame and writes the submission via `generate_submission_file`.
    """
    regressor = xgb.XGBRegressor(**param_dict)
    # only train and validate
    if t == 'train':
        train_x, train_y, val_x, val_y = split_train_val_data(index)
        # train with part of data
        # `.values` replaces DataFrame.as_matrix(), which newer pandas
        # releases no longer provide; both yield the same ndarray.
        regressor.fit(train_x.values, train_y)
        pred = regressor.predict(val_x.values)
        print("score is ", mean_squared_error(val_y, pred))
    # using the whole data set to train and do prediction
    else:
        print("Train with all the data and generate submission file")
        # train with all the data
        train_x = train.drop(columns=drop_columns)
        train_y = train['car_number']
        regressor.fit(train_x.values, train_y)
        test_ = test.drop(columns=drop_columns)
        pred = regressor.predict(test_.values)
        generate_submission_file(pred)

### XGB Model

#### TODOs

- Try using ceil instead of round. Done, test score: 8.0536
- Try removing wind and visibility and make a submission. Done, test score: 8.0266
- Add average speed as a grid feature and make a submission (2018-09-10)
- Adding the missing data with car number set to 0 gives a higher loss; maybe try using the average instead of 0 (round, ceil) (2018-09-11)
- Try K-means or another clustering algorithm, use the cluster as a new feature and make a submission (2018-09-12)
- Plot 50 grids on map and see if can find more features for grid (2018-09-xx)
- Try LGB and make a submission (2018-09-xx)
- Try NN or LSTM and make a submission (2018-09-xx)
- Try ensemble learning and stacking models (2018-09-xx)

In [9]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Keyword arguments forwarded to xgb.XGBRegressor by
# val_or_generate_submission_file.
# NOTE(review): 'silent' and the 'reg:linear' objective are reported as
# deprecated in newer xgboost releases (in favour of 'verbosity' and
# 'reg:squarederror') -- confirm against the installed version before upgrading.
param_dict = {
    'max_depth' : 9,
    'learning_rate' : 0.01,
    'n_estimators' : 1500,
    'silent' : True,
    'objective' : 'reg:linear',
    'booster' : 'gbtree',
    'n_jobs' : 8,
    'gamma' : 0,
    'min_child_weight' : 1,
    'max_delta_step' : 0,
    'subsample' : 0.75,
    'colsample_bytree' : 0.9,
    'reg_alpha' : 0.7,
    'reg_lambda' : 1
}

# Index of the first training row with month 3, day 6, hour 9; it is only
# consulted when the mode argument below is 'train'.
index = retrieve_index_by_month_day_hour(train, 3, 6, 9)
# The mode 't' is not 'train', so this call fits on all of `train` and writes
# the submission file (see the printed message in the cell output).
val_or_generate_submission_file(param_dict, train, index, 't')

Train with all the data and generate submission file


In [None]:
#import lightgbm as lgb
#from sklearn.metrics import mean_squared_error

# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': 'mean_squared_error',
#     'max_depth': 6,
#     'learning_rate': 0.01,
#     'verbose': 0}
#     #'early_stopping_round': 3000}

# n_estimators = 1

# index = retrieve_index_by_month_day_hour(train, 3, 6, 0)
# train_x, train_y, val_x, val_y = split_train_val_data(index)

# d_train = lgb.Dataset(train_x, label=train_y)
# d_valid = lgb.Dataset(val_x, label=val_y)
# watchlist = [d_valid]

# model = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)