In [1]:
import pandas as pd
from sklearn.cross_validation import train_test_split
import math
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, roc_curve
import xgboost as xgb
from sklearn.grid_search import GridSearchCV



In [2]:
# Read the tab-separated training and test files into DataFrames.
df_train, df_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../data/raw/train.tsv', '../data/raw/test.tsv')
)

In [3]:
# Feature engineering from datetime
# https://adtech.cyberagent.io/techblog/archives/259
# https://note.nkmk.me/python-pandas-datetime-timestamp/
def create_date_future(df):
    """Add calendar feature columns derived from ``df['datetime']``, in place.

    The ``datetime`` column is parsed with ``pd.to_datetime`` and written
    back, then ``year``, ``month``, ``day`` and ``dayofweek`` columns are
    filled from its ``.dt`` accessor. ``df`` itself is modified and nothing
    is returned.
    """
    timestamps = pd.to_datetime(df['datetime'])  # convert the dtype to datetime64
    df['datetime'] = timestamps
    for part in ('year', 'month', 'day', 'dayofweek'):
        df[part] = getattr(timestamps.dt, part)

In [4]:
# Add the calendar columns to both frames (each one is modified in place),
# then preview the first rows of the training data.
for frame in (df_train, df_test):
    create_date_future(frame)
df_train.head()

Unnamed: 0,datetime,park,visitors,year,month,day,dayofweek
0,2015-01-01,阿寒摩周国立公園,11028,2015,1,1,3
1,2015-01-01,十和田八幡平国立公園,34757,2015,1,1,3
2,2015-01-01,日光国立公園,29714,2015,1,1,3
3,2015-01-01,伊勢志摩国立公園,42652,2015,1,1,3
4,2015-01-01,大山隠岐国立公園,3637,2015,1,1,3


In [5]:
# label encoder
from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer

def onehot_encdr_park(df, categories=None):
    """Append one 0/1 indicator column per park name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``park`` column. It is not modified.
    categories : sequence, optional
        Park names to create columns for, in output column order. Defaults
        to the sorted distinct non-null values of ``df['park']``. Pass the
        same list for every frame that must end up with identical columns
        (for example the training and the test frame).

    Returns
    -------
    pandas.DataFrame
        New frame: the columns of ``df`` followed by one integer indicator
        column per category. A row whose park is not in ``categories`` gets
        0 in every indicator column.
    """
    if categories is None:
        categories = sorted(df['park'].dropna().unique())
    categories = list(categories)

    # Build the indicators as plain arrays and attach df's own index, so the
    # concat below lines up row by row even when df has a non-default index.
    indicators = {
        name: (df['park'] == name).values.astype(int)
        for name in categories
    }
    df_park = pd.DataFrame(indicators, columns=categories, index=df.index)
    return pd.concat([df, df_park], axis=1)

In [6]:
# Append the park indicator columns to both frames, then display the training frame.
df_train, df_test = map(onehot_encdr_park, (df_train, df_test))
df_train

Unnamed: 0,datetime,park,visitors,year,month,day,dayofweek,伊勢志摩国立公園,十和田八幡平国立公園,大山隠岐国立公園,慶良間諸島国立公園,日光国立公園,阿寒摩周国立公園,阿蘇くじゅう国立公園,霧島錦江湾国立公園
0,2015-01-01,阿寒摩周国立公園,11028,2015,1,1,3,0,0,0,0,0,1,0,0
1,2015-01-01,十和田八幡平国立公園,34757,2015,1,1,3,0,1,0,0,0,0,0,0
2,2015-01-01,日光国立公園,29714,2015,1,1,3,0,0,0,0,1,0,0,0
3,2015-01-01,伊勢志摩国立公園,42652,2015,1,1,3,1,0,0,0,0,0,0,0
4,2015-01-01,大山隠岐国立公園,3637,2015,1,1,3,0,0,1,0,0,0,0,0
5,2015-01-01,阿蘇くじゅう国立公園,1369,2015,1,1,3,0,0,0,0,0,0,1,0
6,2015-01-01,霧島錦江湾国立公園,35352,2015,1,1,3,0,0,0,0,0,0,0,1
7,2015-01-01,慶良間諸島国立公園,151,2015,1,1,3,0,0,0,1,0,0,0,0
8,2015-01-02,阿寒摩周国立公園,11153,2015,1,2,4,0,0,0,0,0,1,0,0
9,2015-01-02,十和田八幡平国立公園,33795,2015,1,2,4,0,1,0,0,0,0,0,0


In [7]:
# Distinct weekday codes that occur in the training data.
df_train.loc[:, 'dayofweek'].unique()

array([3, 4, 5, 6, 0, 1, 2])

In [8]:
def onehot_encdr_dayofweek(df):
    """Append 0/1 indicator columns ``Mon`` .. ``Sun`` for ``dayofweek``.

    Code 0 maps to ``Mon`` and code 6 to ``Sun``. All seven columns are
    always produced, whether or not every weekday occurs in ``df``, so two
    frames encoded separately end up with the same columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``dayofweek`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame: the columns of ``df`` followed by the seven integer
        indicator columns. A row whose code is outside 0..6 gets 0 in all
        of them.
    """
    weekday_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    # Plain arrays plus df's own index keep the concat aligned row by row
    # even when df has a non-default index.
    indicators = {
        name: (df['dayofweek'] == code).values.astype(int)
        for code, name in enumerate(weekday_names)
    }
    df_dayofweek = pd.DataFrame(indicators, columns=weekday_names, index=df.index)
    return pd.concat([df, df_dayofweek], axis=1)

In [9]:
# Append the weekday indicator columns to both frames, then display the training frame.
df_train, df_test = map(onehot_encdr_dayofweek, (df_train, df_test))
df_train

Unnamed: 0,datetime,park,visitors,year,month,day,dayofweek,伊勢志摩国立公園,十和田八幡平国立公園,大山隠岐国立公園,...,阿寒摩周国立公園,阿蘇くじゅう国立公園,霧島錦江湾国立公園,Mon,Tue,Wed,Thu,Fri,Sat,Sun
0,2015-01-01,阿寒摩周国立公園,11028,2015,1,1,3,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,2015-01-01,十和田八幡平国立公園,34757,2015,1,1,3,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,2015-01-01,日光国立公園,29714,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2015-01-01,伊勢志摩国立公園,42652,2015,1,1,3,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2015-01-01,大山隠岐国立公園,3637,2015,1,1,3,0,0,1,...,0,0,0,0,0,0,1,0,0,0
5,2015-01-01,阿蘇くじゅう国立公園,1369,2015,1,1,3,0,0,0,...,0,1,0,0,0,0,1,0,0,0
6,2015-01-01,霧島錦江湾国立公園,35352,2015,1,1,3,0,0,0,...,0,0,1,0,0,0,1,0,0,0
7,2015-01-01,慶良間諸島国立公園,151,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,2015-01-02,阿寒摩周国立公園,11153,2015,1,2,4,0,0,0,...,1,0,0,0,0,0,0,1,0,0
9,2015-01-02,十和田八幡平国立公園,33795,2015,1,2,4,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [34]:
def onehot_encdr_month(df):
    """Append 0/1 indicator columns ``Jan`` .. ``Dec`` for ``month``.

    Month number 1 maps to ``Jan`` and 12 to ``Dec``. All twelve columns are
    always produced, whether or not every month occurs in ``df``, so two
    frames encoded separately end up with the same columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``month`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame: the columns of ``df`` followed by the twelve integer
        indicator columns. A row whose month is outside 1..12 gets 0 in all
        of them.
    """
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # Plain arrays plus df's own index keep the concat aligned row by row
    # even when df has a non-default index.
    indicators = {
        name: (df['month'] == number).values.astype(int)
        for number, name in enumerate(month_names, 1)
    }
    df_month = pd.DataFrame(indicators, columns=month_names, index=df.index)
    return pd.concat([df, df_month], axis=1)

In [35]:
# Append the month indicator columns to both frames, then display the training frame.
df_train, df_test = map(onehot_encdr_month, (df_train, df_test))
df_train

Unnamed: 0,datetime,park,visitors,year,month,day,dayofweek,伊勢志摩国立公園,十和田八幡平国立公園,大山隠岐国立公園,...,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,2015-01-01,阿寒摩周国立公園,11028,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2015-01-01,十和田八幡平国立公園,34757,2015,1,1,3,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2015-01-01,日光国立公園,29714,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2015-01-01,伊勢志摩国立公園,42652,2015,1,1,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2015-01-01,大山隠岐国立公園,3637,2015,1,1,3,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,2015-01-01,阿蘇くじゅう国立公園,1369,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2015-01-01,霧島錦江湾国立公園,35352,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2015-01-01,慶良間諸島国立公園,151,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2015-01-02,阿寒摩周国立公園,11153,2015,1,2,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2015-01-02,十和田八幡平国立公園,33795,2015,1,2,4,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
def onehot_encdr_day(df):
    """Append 0/1 indicator columns for the day of the month.

    The new columns are labelled with the integers 1..31. All 31 columns are
    always produced, whether or not every day number occurs in ``df``, so
    two frames encoded separately end up with the same columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``day`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame: the columns of ``df`` followed by the 31 integer
        indicator columns. A row whose day is outside 1..31 gets 0 in all
        of them.
    """
    day_numbers = list(range(1, 32))
    # Plain arrays plus df's own index keep the concat aligned row by row
    # even when df has a non-default index.
    indicators = {
        number: (df['day'] == number).values.astype(int)
        for number in day_numbers
    }
    df_day = pd.DataFrame(indicators, columns=day_numbers, index=df.index)
    return pd.concat([df, df_day], axis=1)

In [57]:
# Append the day-of-month indicator columns to both frames, then show the last training rows.
df_train, df_test = map(onehot_encdr_day, (df_train, df_test))
df_train.tail()

Unnamed: 0,datetime,park,visitors,year,month,day,dayofweek,伊勢志摩国立公園,十和田八幡平国立公園,大山隠岐国立公園,...,22,23,24,25,26,27,28,29,30,31
5843,2016-12-31,伊勢志摩国立公園,54754,2016,12,31,5,1,0,0,...,0,0,0,0,0,0,0,0,0,1
5844,2016-12-31,大山隠岐国立公園,2862,2016,12,31,5,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5845,2016-12-31,阿蘇くじゅう国立公園,1386,2016,12,31,5,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5846,2016-12-31,霧島錦江湾国立公園,32600,2016,12,31,5,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5847,2016-12-31,慶良間諸島国立公園,852,2016,12,31,5,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [60]:
# Load the holiday table and parse its date column so it can be joined on 'datetime'.
df_holiday = pd.read_csv('../data/raw/holiday.csv')
df_holiday = df_holiday.assign(datetime=pd.to_datetime(df_holiday['datetime']))
df_holiday.columns

Index(['datetime', 'holiday'], dtype='object')

In [61]:
# Join the holiday data onto the training data.
# A left join keeps every training row; the checks below fail loudly instead
# of silently losing rows (a date missing from the holiday table) or
# duplicating them (a date listed more than once in the holiday table).
df_merged = pd.merge(df_train, df_holiday, on='datetime', how='left')
assert len(df_merged) == len(df_train), 'holiday merge changed the number of training rows'
assert df_merged['holiday'].notnull().all(), 'some training dates have no holiday entry'
df_merged.head()

Unnamed: 0,datetime,park,visitors,year,month,day,dayofweek,伊勢志摩国立公園,十和田八幡平国立公園,大山隠岐国立公園,...,23,24,25,26,27,28,29,30,31,holiday
0,2015-01-01,阿寒摩周国立公園,11028,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2015-01-01,十和田八幡平国立公園,34757,2015,1,1,3,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,2015-01-01,日光国立公園,29714,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2015-01-01,伊勢志摩国立公園,42652,2015,1,1,3,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2015-01-01,大山隠岐国立公園,3637,2015,1,1,3,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5,2015-01-01,阿蘇くじゅう国立公園,1369,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,2015-01-01,霧島錦江湾国立公園,35352,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,2015-01-01,慶良間諸島国立公園,151,2015,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,2015-01-02,阿寒摩周国立公園,11153,2015,1,2,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2015-01-02,十和田八幡平国立公園,33795,2015,1,2,4,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
from sklearn.cross_validation import train_test_split
import math
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, roc_curve
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

In [63]:
# Split the merged training data into a feature matrix and a target vector.
# 'park' and 'datetime' are dropped because they are already represented by
# the indicator and calendar columns; 'visitors' is the target.
df_except_y = df_merged.drop(['park', 'visitors', 'datetime'], axis=1)
# .values replaces as_matrix(), which was deprecated and later removed from pandas.
X_train = df_except_y.values
y_train = df_merged['visitors'].values
df_except_y.head()

Unnamed: 0,year,month,day,dayofweek,伊勢志摩国立公園,十和田八幡平国立公園,大山隠岐国立公園,慶良間諸島国立公園,日光国立公園,阿寒摩周国立公園,...,23,24,25,26,27,28,29,30,31,holiday
0,2015,1,1,3,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,2015,1,1,3,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2015,1,1,3,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,2015,1,1,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2015,1,1,3,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [64]:
# Join the holiday data onto the test data and build the test feature matrix.
# A left join keeps every test row in its original order, so the predictions
# stay one-to-one with df_test['index'] when the submission is written.
df_test_merged = pd.merge(df_test, df_holiday, on='datetime', how='left')
assert len(df_test_merged) == len(df_test), 'holiday merge changed the number of test rows'
assert df_test_merged['holiday'].notnull().all(), 'some test dates have no holiday entry'
df_test_merged = df_test_merged.drop(['park', 'datetime', 'index'], axis=1)
# The model is fitted on a bare array, so the test columns must be in exactly
# the same order as the training columns.
assert list(df_test_merged.columns) == list(df_except_y.columns), 'train/test feature columns differ'
# .values replaces as_matrix(), which was deprecated and later removed from pandas.
X_test = df_test_merged.values
df_test_merged.head()

Unnamed: 0,year,month,day,dayofweek,伊勢志摩国立公園,十和田八幡平国立公園,大山隠岐国立公園,慶良間諸島国立公園,日光国立公園,阿寒摩周国立公園,...,23,24,25,26,27,28,29,30,31,holiday
0,2017,1,1,6,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,2017,1,1,6,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2017,1,1,6,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,2017,1,1,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2017,1,1,6,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [65]:
# Hyper-parameter grid for the search (3 * 4 * 4 * 2 = 96 combinations).
params = dict(
    learning_rate=[0.1, 0.3, 0.5],
    max_depth=[2, 3, 5, 10],
    subsample=[0.5, 0.8, 0.9, 1],
    colsample_bytree=[0.5, 1.0],
)

# XGBoost regressor with default settings, wrapped in a 10-fold grid search.
model = xgb.XGBRegressor()
cv = GridSearchCV(estimator=model, param_grid=params, cv=10, n_jobs=-1)

In [67]:
# Hold-out split of the training matrix with a fixed seed.
# NOTE(review): none of the cells visible in this notebook use these four
# arrays; the grid search is fitted on the full X_train / y_train.
from sklearn.model_selection import train_test_split
(X_train_cv, X_test_cv,
 y_train_cv, y_test_cv) = train_test_split(X_train, y_train, random_state=0)

In [66]:
# Run the grid search on the full training matrix.
cv.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.3, 0.5], 'max_depth': [2, 3, 5, 10], 'subsample': [0.5, 0.8, 0.9, 1], 'colsample_bytree': [0.5, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [18]:
# Predict with the fitted grid search on both the training and the test matrix.
y_train_pred, y_test_pred = map(cv.predict, (X_train, X_test))

In [19]:
# Create the submission file.
# The two columns need distinct keys: with the same key ('') used twice, the
# dict kept only the predictions and the test ids never reached the file.
submit = pd.DataFrame({
    'index': df_test['index'].values,
    'visitors': y_test_pred,
}, columns=['index', 'visitors'])
# Write only the two data columns, without the DataFrame row index or a header row.
# NOTE(review): assumes the expected format is headerless "<index>\t<prediction>"
# lines - confirm against the competition's submission spec.
submit.to_csv('../submit/submit_4.tsv', sep='\t', header=False, index=False)