### Training model

- Decide which feature to use(just try first)
- Split training data to training data set and validation data set
- Select serval models to train
- Generate submission file (change column name 'date' back to 'day' before submission)


### Feature explanation

- year, year of data
- month, month of year 
- day, day of month
- hour, hour of day from 0 ~ 23
- weekday, week of day from 0 to 6, 0 is Mon, 1 is Tue, 2 is Wed etc.
- grid_id, grid id defined in grid info
- temperture, temperture at given time
- visibility, 0 visibility is bad, 1 visiblity is good
- wind, 0 means no wind or small wind, 1 means wind is strong
- rainy, 0 means no rain, 1 means is rainy
- holiday, 0 means working day, 1 means holiday
- car_number, number of cars in that grid at given time

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import timeit
%matplotlib inline

training_features_filepath = 'data/training.csv'
filtered_training_features_filepath = 'data/filtered_train.csv'
test_features_filepath = 'data/test.csv'
submission_filepath = 'data/submit_samples.csv'
new_submission_filepath = 'data/submission.csv'

drop_columns = ['car_number','year']
#drop_columns = ['car_number','year']

# loading data
train = pd.read_csv(training_features_filepath)
train.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,car_number
0,2017,1,2,0,0,1,11,0,1,0.0
1,2017,1,2,0,0,2,11,0,1,1.0
2,2017,1,2,0,0,3,11,0,1,0.0
3,2017,1,2,0,0,4,11,0,1,5.0
4,2017,1,2,0,0,5,11,0,1,3.0


In [68]:
test = pd.read_csv(test_features_filepath)
test.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,car_number
0,2017,3,13,9,0,1,9,1,0,0
1,2017,3,13,9,0,2,9,1,0,0
2,2017,3,13,9,0,3,9,1,0,0
3,2017,3,13,9,0,4,9,1,0,0
4,2017,3,13,9,0,5,9,1,0,0


In [73]:
def remove_chinese_new_year_data(df):
    df = df[(df.month != 1) | (df.day != 27)]
    df = df[(df.month != 1) | (df.day != 28)]
    df = df[(df.month != 1) | (df.day != 29)]
    df = df[(df.month != 1) | (df.day != 30)]
    df = df[(df.month != 1) | (df.day != 31)]
    df = df[(df.month != 2) | (df.day != 1)]
    df = df[(df.month != 2) | (df.day != 2)]
    return df

def remove_other_hours_data(df):
    df = df[(df.hour != 0)]
    df = df[(df.hour != 1)]
    df = df[(df.hour != 2)]
    df = df[(df.hour != 3)]
    df = df[(df.hour != 4)]
    df = df[(df.hour != 5)]
    df = df[(df.hour != 6)]
    df = df[(df.hour != 7)]
    df = df[(df.hour != 8)]
    df = df[(df.hour != 23)]
    return df

def retrieve_index_by_month_day_hour(df, month, day, hour):
    month = df[df['month'] == month]
    day = month[month['day'] == day]
    hour = day[day['hour'] == hour]
    return hour.index[0]

def generate_submission_file(pred):
    sample_df = pd.read_csv(submission_filepath)
    sample_df['car_number'] = pred
    sample_df['car_number'] = sample_df.car_number.round()
    sample_df['car_number'] = sample_df['car_number'].astype(int)
    sample_df.columns = ['grid_id','day','hour','car_number']
    sample_df.to_csv(new_submission_filepath, index=False)

def split_train_val_data(index):
    # split train and val by index
    train_ = train.iloc[:index]
    val_ = train.iloc[index:-1]
    # extract x and y
    train_x = train_.drop(columns=drop_columns)
    train_y = train_['car_number']
    val_x = val_.drop(columns=drop_columns)
    val_y = val_['car_number']
    return train_x, train_y, val_x, val_y


def val_or_generate_submission_file(param_dict, train, index, t='train'):
    regressor = xgb.XGBRegressor(**param_dict)        
    # only train and validate
    if t == 'train':
        train_x, train_y, val_x, val_y = split_train_val_data(index)
        # train with part of data
        regressor.fit(train_x.as_matrix(), train_y)
        pred = regressor.predict(val_x.as_matrix())
        print("score is ", mean_squared_error(val_y, pred))
    # using the whole data set to train and do prediction
    else:
        print("Train with all the data and generate submission file")
        # train with all the data
        train_x = train.drop(columns=drop_columns)
        train_y = train['car_number']
        regressor.fit(train_x.as_matrix(), train_y)
        test_ = test.drop(columns=drop_columns)
        pred = regressor.predict(test_.as_matrix())
        generate_submission_file(pred)

### Model selecting

- XGBoost

### parameter tuning


- 'subsample' : 0.70 -> score : 11.347555193983732
- 'subsample' : 0.75 -> score : 11.32022463662512
- 'subsample' : 0.80 -> score : 11.362855348692083
- 'subsample' : 0.85 -> score : 11.357121493573116
- 'subsample' : 0.90 -> score : 11.342456547514821


- 'colsample_bytree' : 0.70 -> score : 11.32022463662512
- 'colsample_bytree' : 0.80 -> score : 11.289090541110422
- 'colsample_bytree' : 0.85 -> score : 11.289090541110422
- 'colsample_bytree' : 0.90 -> score : 11.193780988596988
- 'colsample_bytree' : 0.95 -> score : 11.193780988596988
- 'colsample_bytree' : 1.00 -> score : 11.27700235393081


- 'n_estimators' : 1400 -> score : 11.195739124743117
- 'n_estimators' : 1500 -> score : 11.193780988596988
- 'n_estimators' : 1600 -> score : 11.195144507584441


- 'max_depth' : 7 -> score : 11.41866426958629
- 'max_depth' : 8 -> score : 11.193780988596988
- 'max_depth' : 9 -> score : 11.173645757975832
- 'max_depth' : 10 -> score : 11.31215321717593


- 'reg_alpha' : 0 -> score : 11.173645757975832
- 'reg_alpha' : 0.5 -> score : 11.158566613863234
- 'reg_alpha' : 0.6 -> score : 11.152764218716774
- 'reg_alpha' : 0.7 -> score : 11.1442919339056
- 'reg_alpha' : 0.8 -> score : 11.163985128246354


- 'reg_lambda' : 0.9 -> score : 11.155791261430858
- 'reg_lambda' : 1 -> score : 11.1442919339056


- drop columns ['car_number','year'] -> score : 11.1442919339056
- drop columns ['car_number','year','visibility','wind'] -> score : 11.118486445690271
- drop columns ['car_number','year','visibility','wind','rainy'] -> score : 11.23252868306267
- drop columns ['car_number','year','visibility','wind','temperture'] -> score : 11.436237226808007
- drop columns ['car_number','year','visibility','wind','month'] -> score : 11.717619152663552
- drop columns ['car_number','year','visibility','wind','month','day] -> score : 12.833065146438912

- adding missing data -> score 10.855065173746608


- param_dict = {
    'max_depth' : 9,
    'learning_rate' : 0.01,
    'n_estimators' : 1500,
    'silent' : True,
    'objective' : 'reg:linear',
    'booster' : 'gbtree',
    'n_jobs' : 8,
    'gamma' : 0,
    'min_child_weight' : 1,
    'max_delta_step' : 0,
    'subsample' : 0.75,
    'colsample_bytree' : 0.9,
    'reg_alpha' : 0.7,
    'reg_lambda' : 1 
}

### Train 1, keeping all the trainning data

In [70]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

param_dict = {
    'max_depth' : 9,
    'learning_rate' : 0.01,
    'n_estimators' : 1500,
    'silent' : True,
    'objective' : 'reg:linear',
    'booster' : 'gbtree',
    'n_jobs' : 8,
    'gamma' : 0,
    'min_child_weight' : 1,
    'max_delta_step' : 0,
    'subsample' : 0.75,
    'colsample_bytree' : 0.9,
    'reg_alpha' : 0.7,
    'reg_lambda' : 1
}

index = retrieve_index_by_month_day_hour(train, 3, 6, 0)
val_or_generate_submission_file(param_dict, train, index)

score is  10.855065173746608


### Train 2, remove all the Chinese New Year's data

In [71]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

param_dict = {
    'max_depth' : 9,
    'learning_rate' : 0.01,
    'n_estimators' : 1500,
    'silent' : True,
    'objective' : 'reg:linear',
    'booster' : 'gbtree',
    'n_jobs' : 8,
    'gamma' : 0,
    'min_child_weight' : 1,
    'max_delta_step' : 0,
    'subsample' : 0.75,
    'colsample_bytree' : 0.9,
    'reg_alpha' : 0.7,
    'reg_lambda' : 1
}

train_rm_cny = remove_chinese_new_year_data(train)
train_rm_cny.reset_index()
train_rm_cny.to_csv(filtered_training_features_filepath, index=False)
train_ = pd.read_csv(filtered_training_features_filepath)
index = retrieve_index_by_month_day_hour(train_, 3, 6, 0)
val_or_generate_submission_file(param_dict, train_, index)

score is  17.207985039471247


### Train 3, remove CNY and 23:00 ~ 8:00 data

In [74]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

param_dict = {
    'max_depth' : 8,
    'learning_rate' : 0.01,
    'n_estimators' : 1500,
    'silent' : True,
    'objective' : 'reg:linear',
    'booster' : 'gbtree',
    'n_jobs' : 8,
    'gamma' : 0,
    'min_child_weight' : 1,
    'max_delta_step' : 0,
    'subsample' : 0.75,
    'colsample_bytree' : 0.9,
    'reg_alpha' : 0.7,
    'reg_lambda' : 1
}

# generate filtered train file
train1 = remove_other_hours_data(train)
train2 = remove_chinese_new_year_data(train1)
train2.reset_index()
train2.to_csv(filtered_training_features_filepath, index=False)
train_ = pd.read_csv(filtered_training_features_filepath)
index = retrieve_index_by_month_day_hour(train_, 3, 6, 9)
val_or_generate_submission_file(param_dict, train_, index, 't')

Train with all the data and generate submission file


### Train 4, adding missing data

In [None]:
#import lightgbm as lgb
#from sklearn.metrics import mean_squared_error

# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': 'mean_squared_error',
#     'max_depth': 6,
#     'learning_rate': 0.01,
#     'verbose': 0}
#     #'early_stopping_round': 3000}

# n_estimators = 1

# index = retrieve_index_by_month_day_hour(train, 3, 6, 0)
# train_x, train_y, val_x, val_y = split_train_val_data(index)

# d_train = lgb.Dataset(train_x, label=train_y)
# d_valid = lgb.Dataset(val_x, label=val_y)
# watchlist = [d_valid]

# model = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)