### Training model

- Decide which features to use (just try first)
- Split the training data into a training set and a validation set
- Select several models to train
- Generate submission file (change column name 'date' back to 'day' before submission)


### Feature explanation

- year, year of data
- month, month of year
- day, day of month
- hour, hour of day from 0 to 23
- weekday, day of week from 0 to 6, 0 is Mon, 1 is Tue, 2 is Wed etc.
- grid_id, grid id defined in grid info
- temperture, temperature at the given time (the column name keeps the `temperture` spelling used in the data)
- rainy, 0 means no rain, 1 means rain
- holiday, 0 means working day, 1 means holiday
- car_number, number of cars in that grid at the given time

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import timeit
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error
%matplotlib inline


# Input feature files and the sample submission file; the sample file is
# re-read by generate_submission_file() and used as the output template.
training_features_filepath = 'data/training.csv'
test_features_filepath = 'data/test.csv'
submission_filepath = 'data/submit_samples.csv'
# Per-model submission files and the averaged final submission.
lgb_submission_filepath = 'data/submission_lgb.csv'
xgb_submission_filepath = 'data/submission_xgb.csv'
final_submission_filepath = 'data/submission_final.csv'

# Train/test frames written by the XGBoost 'extract' step (with its
# prediction added as an extra column).
training_features_xgb_filepath = 'data/training_xgb.csv'
test_features_xgb_filepath = 'data/test_xgb.csv'

# Train/test frames written by the LightGBM 'extract' step (with its
# prediction added as an extra column).
training_features_lgb_filepath = 'data/training_lgb.csv'
test_features_lgb_filepath = 'data/test_lgb.csv'

# Columns removed from a frame before it is passed to a model:
# 'car_number' is the target, 'year' is not used as a feature.
drop_columns = ['car_number','year']

# Columns expanded into indicator columns with pd.get_dummies.
one_hot_encoding_columns = ['holiday','rainy']

# read the training features and one-hot encode the flag columns in one step
train = pd.get_dummies(
    pd.read_csv(training_features_filepath),
    columns=one_hot_encoding_columns,
)
train.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,car_number,holiday_0,holiday_1,rainy_0,rainy_1
0,2017,1,2,9,0,1,13,20.0,0,1,1,0
1,2017,1,2,9,0,2,13,7.0,0,1,1,0
2,2017,1,2,9,0,3,13,4.0,0,1,1,0
3,2017,1,2,9,0,4,13,3.0,0,1,1,0
4,2017,1,2,9,0,6,13,9.0,0,1,1,0


In [45]:
# read the test features and apply the same one-hot encoding as for train
test = pd.get_dummies(
    pd.read_csv(test_features_filepath),
    columns=one_hot_encoding_columns,
)
test.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,car_number,holiday_0,holiday_1,rainy_0,rainy_1
0,2017,3,13,9,0,1,9,0,1,0,0,1
1,2017,3,13,9,0,2,9,0,1,0,0,1
2,2017,3,13,9,0,3,9,0,1,0,0,1
3,2017,3,13,9,0,4,9,0,1,0,0,1
4,2017,3,13,9,0,5,9,0,1,0,0,1


In [52]:
def retrieve_index_by_month_day_hour(df, month, day, hour):
    """Return the index label of the first row matching month, day and hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'month', 'day' and 'hour' columns.
    month, day, hour : int
        Timestamp components to look up.

    Returns
    -------
    The index label of the first matching row. Callers in this notebook use
    it as a positional split point with ``iloc``, which is only equivalent
    when ``df`` has a default 0..n-1 index.

    Raises
    ------
    IndexError
        If no row matches the requested month/day/hour.
    """
    # one combined mask instead of three chained filters; this also avoids
    # rebinding the month/day/hour arguments to intermediate frames
    mask = (df['month'] == month) & (df['day'] == day) & (df['hour'] == hour)
    matches = df.index[mask]
    if len(matches) == 0:
        raise IndexError(
            "no row found for month=%s, day=%s, hour=%s" % (month, day, hour))
    return matches[0]

def generate_submission_file(pred, output_filepath):
    """Write predictions to a submission CSV based on the sample submission.

    The sample file at ``submission_filepath`` supplies the rows; its
    'car_number' column is replaced with ``pred`` rounded up to whole
    integers, the four columns are renamed to the submission header, and the
    result is saved to ``output_filepath`` without the index.
    """
    submission = pd.read_csv(submission_filepath)
    submission['car_number'] = pred
    # round every prediction up, then store as integers
    rounded_up = np.ceil(submission['car_number'])
    submission['car_number'] = rounded_up.astype(int)
    # the submission format expects exactly these four column names
    submission.columns = ['grid_id','day','hour','car_number']
    submission.to_csv(output_filepath, index=False)

def split_train_val_data(index, train):
    """Split ``train`` chronologically into training and validation parts.

    Parameters
    ----------
    index : int
        Positional split point: rows ``[0, index)`` are used for training,
        rows ``[index, end]`` for validation.
    train : pandas.DataFrame
        Frame containing the feature columns plus 'car_number'.

    Returns
    -------
    (train_x, train_y, val_x, val_y)
        Feature frames have the columns in ``drop_columns`` removed; the
        targets are the 'car_number' column of each part.
    """
    # split train and val by index
    train_ = train.iloc[:index]
    # take everything from the split point to the end; the previous
    # ``iloc[index:-1]`` silently left the last row out of validation
    val_ = train.iloc[index:]
    # extract x and y
    train_x = train_.drop(columns=drop_columns)
    train_y = train_['car_number']
    val_x = val_.drop(columns=drop_columns)
    val_y = val_['car_number']
    return train_x, train_y, val_x, val_y

def val_or_generate_submission_file_xgb(param_dict, train, test, index, t='train'):
    """Train an XGBoost regressor and validate, extract a feature, or submit.

    Parameters
    ----------
    param_dict : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.
    train, test : pandas.DataFrame
        Feature frames; both must contain the columns in ``drop_columns``.
        In 'extract' mode both frames are modified in place (an 'xgb' column
        is added).
    index : int
        Split point passed to ``split_train_val_data``; only used when
        ``t == 'train'``.
    t : str
        'train'   - fit on the rows before ``index`` and print the RMSE on
                    the remaining rows.
        'extract' - fit on all of ``train``, add the rounded-up predictions
                    as column 'xgb' to both frames and write them to
                    ``training_features_xgb_filepath`` /
                    ``test_features_xgb_filepath``.
        anything else - fit on all of ``train`` and write a submission file
                    to ``xgb_submission_filepath``.
    """
    regressor = xgb.XGBRegressor(**param_dict)
    # only train and validate
    if t == 'train':
        # train with part of data
        train_x, train_y, val_x, val_y = split_train_val_data(index, train)
        # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
        regressor.fit(train_x.values, train_y)
        pred = regressor.predict(val_x.values)
        print("score is ", np.sqrt(mean_squared_error(val_y, pred)))
    # using prediction as an extra feature
    elif t == 'extract':
        train_x = train.drop(columns=drop_columns)
        train_y = train['car_number']
        regressor.fit(train_x.values, train_y)
        test_ = test.drop(columns=drop_columns)
        pred = regressor.predict(test_.values)
        # extract new feature to test
        test['xgb'] = np.ceil(pred)
        test.to_csv(test_features_xgb_filepath, index=False)
        # extract new feature to train; the column must carry the same name
        # as in test (it was previously misspelled 'xbg'), otherwise the two
        # frames end up with different feature columns.
        # NOTE: these are in-sample predictions of the model fitted above.
        pred = regressor.predict(train_x.values)
        train['xgb'] = np.ceil(pred)
        train.to_csv(training_features_xgb_filepath, index=False)
    # using the whole data set to train and do prediction
    else:
        print("Train with all the data and generate submission file")
        # train with all the data
        train_x = train.drop(columns=drop_columns)
        train_y = train['car_number']
        regressor.fit(train_x.values, train_y)
        test_ = test.drop(columns=drop_columns)
        pred = regressor.predict(test_.values)
        generate_submission_file(pred, xgb_submission_filepath)
        
        
def val_or_generate_submission_file_lgbm(param_dict, n_estimators, train, test, index, t='train'):
    """Train a LightGBM model and validate, extract a feature, or submit.

    ``t == 'train'`` fits on the rows before ``index`` and prints the RMSE on
    the held-out rows. ``t == 'extract'`` fits on all of ``train``, stores the
    rounded-up predictions as column 'lgb' in both ``train`` and ``test``
    (modifying them in place) and writes both frames to the LightGBM feature
    files. Any other value fits on all of ``train`` and writes a submission
    file to ``lgb_submission_filepath``.
    """
    # validation-only mode: fit on the head of the data, score on the tail
    if t == 'train':
        fit_x, fit_y, holdout_x, holdout_y = split_train_val_data(index, train)
        fit_set = lgb.Dataset(fit_x, label=fit_y)
        holdout_set = lgb.Dataset(holdout_x, label=holdout_y)
        booster = lgb.train(param_dict, fit_set, n_estimators, [holdout_set], verbose_eval=0)
        holdout_pred = booster.predict(holdout_x)
        print("score is ", np.sqrt(mean_squared_error(holdout_y, holdout_pred)))
        return

    making_submission = t != 'extract'
    if making_submission:
        print("Train with all the data and generate submission file")

    # both remaining modes fit on the complete training frame
    features = train.drop(columns=drop_columns)
    target = train['car_number']
    full_set = lgb.Dataset(features, label=target)
    booster = lgb.train(param_dict, full_set, n_estimators, verbose_eval=0)
    test_features = test.drop(columns=drop_columns)
    test_pred = booster.predict(test_features)

    if making_submission:
        generate_submission_file(test_pred, lgb_submission_filepath)
        return

    # 'extract': persist the prediction as an extra column, test frame first
    test['lgb'] = np.ceil(test_pred)
    test.to_csv(test_features_lgb_filepath, index=False)
    train_pred = booster.predict(features)
    train['lgb'] = np.ceil(train_pred)
    train.to_csv(training_features_lgb_filepath, index=False)

### TODOs

- Try using ceil instead of round. **Done, test score: 8.0536**
- Try removing wind and visibility and make a submission. **Done, test score: 8.0266**
- Add average speed as a grid feature and make a submission. **Done, test score: 8.058**
- Add missing data and set car number to the average via ceil. **Done, test score: 8.0373**
- Add missing data and set car number to the average via round. **Done, test score: 8.0484**
- Try LGB and make a submission. **Done, test score: 8.0091**
- Try filtering invalid dates in each csv and make a submission via LGB. **Done, test score: 8.0012**
- Add avg, std, median car number to features and make a submission via LGB. **Done, test score: 8.1684**
- Remove outliers and make a submission via LGB. **Done, test score: 8.0288**
- Try NN or LSTM and make a submission (2018-09-xx)
- Try ensemble learning and stacking models (2018-09-xx)
- Plot 50 grids on a map and see if more grid features can be found (2018-09-xx)


### XGB Model 1 - Extract feature

In [47]:
# Depth-5 XGBoost configuration used to produce the stacked feature.
param_dict = dict(
    max_depth=5,
    learning_rate=0.01,
    n_estimators=1500,
    silent=True,
    objective='reg:linear',
    booster='gbtree',
    n_jobs=8,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.75,
    colsample_bytree=0.9,
    reg_alpha=0.7,
    reg_lambda=1,
)
# first row of month 3, day 6, hour 9: the train/validation split point
# (only consulted by the validation call kept below)
index = retrieve_index_by_month_day_hour(train, 3, 6, 9)
# 'extract' fits on all training rows and writes the xgb feature files
val_or_generate_submission_file_xgb(param_dict, train, test, index, 'extract')
# validation-only alternative:
#val_or_generate_submission_file_xgb(param_dict, train, test, index)

### Light GBM Model 1 - Extract feature

In [48]:
# Depth-5 LightGBM configuration used to produce the stacked feature.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    reg_alpha=0.1,
    reg_lambda=0.1,
    n_jobs=8,
    metric='rmse',
    max_depth=5,
    learning_rate=0.01,
    verbose=0,
)

n_estimators = 4750
# reload the frames written by the XGBoost 'extract' step
train = pd.read_csv(training_features_xgb_filepath)
test = pd.read_csv(test_features_xgb_filepath)
# first row of month 3, day 6, hour 9: the train/validation split point
index = retrieve_index_by_month_day_hour(train, 3, 6, 9)
# 'extract' fits on all training rows and writes the lgb feature files
val_or_generate_submission_file_lgbm(params, n_estimators, train, test, index, 'extract')
# validation-only alternative:
#val_or_generate_submission_file_lgbm(params, n_estimators, train, test, index)

### XGB Model 2 - Prediction

In [53]:
# Deeper (depth-9) XGBoost configuration for the final prediction.
param_dict = dict(
    max_depth=9,
    learning_rate=0.01,
    n_estimators=1500,
    silent=True,
    objective='reg:linear',
    booster='gbtree',
    n_jobs=8,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.75,
    colsample_bytree=0.9,
    reg_alpha=0.7,
    reg_lambda=1,
)
# frames written by the LightGBM 'extract' step (they carry both stacked features)
train = pd.read_csv(training_features_lgb_filepath)
test = pd.read_csv(test_features_lgb_filepath)
# split point for month 3, day 6, hour 9
index = retrieve_index_by_month_day_hour(train, 3, 6, 9)
# a mode other than 'train'/'extract' fits on everything and writes the submission
val_or_generate_submission_file_xgb(param_dict, train, test, index, '') # 3.369559474786608 3.362073701927338

Train with all the data and generate submission file


### Light GBM Model 2 - Prediction

In [54]:
# Deeper (depth-7) LightGBM configuration for the final prediction.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    reg_alpha=0.1,
    reg_lambda=0.1,
    n_jobs=8,
    metric='rmse',
    max_depth=7,
    learning_rate=0.01,
    verbose=0,
)

n_estimators = 4750
# frames written by the LightGBM 'extract' step (they carry both stacked features)
train = pd.read_csv(training_features_lgb_filepath)
test = pd.read_csv(test_features_lgb_filepath)
# split point for month 3, day 6, hour 9
index = retrieve_index_by_month_day_hour(train, 3, 6, 9)
# a mode other than 'train'/'extract' fits on everything and writes the submission
val_or_generate_submission_file_lgbm(params, n_estimators, train, test, index, '') # 3.3073688776246732 3.289658603021144

Train with all the data and generate submission file


### NN Model

In [None]:
# TBC

### Averaging prediction

In [63]:
# loading prediction file
xgb_pred = pd.read_csv(xgb_submission_filepath)
lgb_pred = pd.read_csv(lgb_submission_filepath)
# rename car_number
xgb_pred = xgb_pred.rename(columns={'car_number': 'car_number_xgb'})
lgb_pred = lgb_pred.rename(columns={'car_number': 'car_number_lgb'})
# merge 2 dataframes
avg_sub = pd.merge(xgb_pred, lgb_pred, on=['grid_id','day','hour'], how='left')
avg_sub['car_number'] = 0
avg_sub['car_number'] = np.ceil((avg_sub.car_number_xgb + avg_sub.car_number_lgb)/2)
avg_sub = avg_sub.drop(columns=['car_number_xgb', 'car_number_lgb'])
avg_sub.to_csv(final_submission_filepath, index=False)

### LSTM Model

In [56]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

# Work on a single grid so the rows form one time series.
train_grid_id_1 = train[train['grid_id']==1]
# reset_index() makes index labels equal to row positions, which the
# label-returning lookup below relies on.
# NOTE(review): it also keeps the old index as an 'index' column that ends up
# in the features - confirm whether that is intended.
train_grid_id_1 = train_grid_id_1.reset_index()
index = retrieve_index_by_month_day_hour(train_grid_id_1, 3, 6, 9)
train_x, train_y, val_x, val_y = split_train_val_data(index, train_grid_id_1)

# grid_id is constant within this subset, so it carries no information
train_x = train_x.drop(columns=['grid_id'])
val_x = val_x.drop(columns=['grid_id'])

# Fit the scaler on the training rows only and reuse it for validation;
# re-fitting on the validation rows would scale the two sets differently.
min_max_scaler = MinMaxScaler()
train_x = min_max_scaler.fit_transform(train_x)
val_x = min_max_scaler.transform(val_x)

# Derive the sequence shapes from the data instead of hard-coding them, so
# the cell keeps working when the number of rows or feature columns changes.
n_timesteps, n_features = train_x.shape
train_x = np.reshape(train_x, (1, n_timesteps, n_features))
val_x = np.reshape(val_x, (1, val_x.shape[0], n_features))

lstm = Sequential()
lstm.add(LSTM(20, dropout=0.1, recurrent_dropout=0.1, input_shape=(n_timesteps, n_features), return_sequences=True))
lstm.add(LSTM(10, dropout=0.1, recurrent_dropout=0.1))
lstm.add(Dense(1))
lstm.compile(loss='mean_squared_error', optimizer='adam')

lstm.summary()

# Training and evaluation are not enabled yet:
#lstm.fit(train_x, train_y, epochs=100, batch_size=1, validation_data=(val_x, val_y), verbose=1)
#score, mse = lstm.evaluate(val_x, val_y, batch_size=1)
#print(score + ", " + mse)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_21 (LSTM)               (None, 784, 20)           2320      
_________________________________________________________________
lstm_22 (LSTM)               (None, 10)                1240      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 3,571
Trainable params: 3,571
Non-trainable params: 0
_________________________________________________________________
