In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib.gridspec import GridSpec
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the training and test CSV files of the bike-sharing-demand dataset
# into two DataFrames.
train, test = map(pd.read_csv, (
    "/kaggle/input/bike-sharing-demand/train.csv",
    "/kaggle/input/bike-sharing-demand/test.csv",
))

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# Preview the first five rows of the test frame.
test.head(5)

In [None]:
# Transform each target from x to log(x + 1) and store it in a new
# '<target>_log' column. np.log1p computes log(1 + x) directly, which is the
# purpose-built (and more accurate for small x) form of np.log(x + 1).
for col in ['casual', 'registered', 'count']:
    train['%s_log' % col] = np.log1p(train[col])

In [None]:
# Compare the distribution of each user-type target before and after the
# log(x + 1) transformation on a 2x2 grid: one row per target, raw values on
# the left and transformed values on the right.
#
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal;
# histplot with kde=True and stat="density" is its documented replacement.
# kde_kws={"cut": 3} keeps the KDE tails extending past the data range the way
# distplot drew them.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for row_axes, col in zip(axes, ['casual', 'registered']):
    panels = [
        (train[col], "%s (before transformation)" % col),
        (np.log1p(train[col]), "%s (after transformation)" % col),
    ]
    for ax, (values, label) in zip(row_axes, panels):
        sns.histplot(values, kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)
        ax.set_xlabel(label)
plt.show()

In [None]:
# Derive calendar features (year, month, hour, day of week) from the
# 'datetime' column of both frames. The DatetimeIndex objects are kept in
# train_date / test_date because later cells read them.
train_date = pd.DatetimeIndex(train['datetime'])
test_date = pd.DatetimeIndex(test['datetime'])
for frame, stamps in ((train, train_date), (test, test_date)):
    for part in ('year', 'month', 'hour', 'dayofweek'):
        frame[part] = getattr(stamps, part)

In [None]:
# Exploratory plot motivating the hour-based features: rentals per hour,
# split by 'workingday', for the total count (top, centred) and for casual
# and registered users (bottom row).
# Author's observations from this plot:
#   - non-registered (casual) users: more rentals during daytime
#   - registered users: more rentals when going to / coming back from work
fig = plt.figure(figsize=(15, 10))
gs1 = GridSpec(nrows=4, ncols=4, figure=fig, wspace=0.5, hspace=0.5)
hourly_panels = [
    (gs1[:2, 1:3], 'count'),
    (gs1[2:, :2], 'casual'),
    (gs1[2:, 2:], 'registered'),
]
for cell, target in hourly_panels:
    sns.boxplot(x='hour', y=target, hue='workingday', data=train,
                ax=fig.add_subplot(cell))
plt.show()

In [None]:
# Exploratory plot for a combined year/season key: year + season / 10
# (e.g. year 2011 with season 1 gives 2011.1). Shows rentals per key for
# the total count (top, centred) and for casual and registered users
# (bottom row).
train['year_season'] = train_date.year + train['season'] / 10
fig = plt.figure(figsize=(12, 10))
gs1 = GridSpec(nrows=4, ncols=4, figure=fig, wspace=0.5, hspace=0.5)
season_panels = [
    (gs1[:2, 1:3], 'count'),
    (gs1[2:, :2], 'casual'),
    (gs1[2:, 2:], 'registered'),
]
for cell, target in season_panels:
    sns.boxplot(x='year_season', y=target, data=train,
                ax=fig.add_subplot(cell))
plt.show()

In [None]:
# Engineered features, added to both train and test:
#   year_season                 year + season / 10 (combined year/season key)
#   hour_workingday_casual      1 when 10 <= hour <= 19, else 0
#                               (despite the name it depends only on the hour)
#   hour_workingday_registered  1 on working days at hour 8 or 17-18, and on
#                               non-working days at hours 10-19; else 0
# Implemented with vectorized boolean masks instead of a row-wise
# DataFrame.apply(axis=1), which calls a Python lambda once per row.
for df in [train, test]:
    df['year_season'] = df['year'] + df['season'] / 10
    midday = df['hour'].between(10, 19)  # inclusive on both ends
    commute = (df['hour'] == 8) | df['hour'].between(17, 18)
    df['hour_workingday_casual'] = midday.astype(int)
    df['hour_workingday_registered'] = (
        ((df['workingday'] == 1) & commute)
        | ((df['workingday'] == 0) & midday)
    ).astype(int)

# Median total count per year_season key, computed on the training data and
# attached to both frames as 'count_season'.
by_season = train.groupby('year_season')[['count']].median()
by_season.columns = ['count_season']
# Drop any 'count_season' left over from a previous run of this cell first:
# DataFrame.join raises "columns overlap but no suffix specified" when the
# column already exists, which made the cell fail on re-execution.
train = train.drop(columns='count_season', errors='ignore').join(by_season, on='year_season')
test = test.drop(columns='count_season', errors='ignore').join(by_season, on='year_season')

In [None]:
# GradientBoostingRegressor
# Casual and registered rentals are modelled separately on their log(x + 1)
# targets; the two back-transformed predictions are summed into pred1.
# 'month' is deliberately left out of the features: the author notes that
# removing it improves the performance.


def fit_predict_log_target(train_df, test_df, feature_cols, target_log_col):
    """Fit a gradient boosting model on a log(x + 1) target and predict counts.

    Parameters
    ----------
    train_df, test_df : DataFrames that contain every column in feature_cols;
        train_df must also contain target_log_col.
    feature_cols : list of column names used as model inputs.
    target_log_col : name of the log(x + 1)-transformed target column.

    Returns
    -------
    (model, predictions) : the fitted regressor and the test predictions
        mapped back to the original scale, with negative values set to 0.
    """
    model = GradientBoostingRegressor(n_estimators=1000, min_samples_leaf=6, random_state=0)
    model.fit(train_df[feature_cols], train_df[target_log_col])
    # np.expm1 is the inverse of log(x + 1): exp(y) - 1.
    predictions = np.expm1(model.predict(test_df[feature_cols]))
    # A negative log-scale prediction maps to a negative count; floor at zero.
    return model, np.clip(predictions, 0, None)


# Features shared by both models; each model inserts its own hour flag
# before 'count_season', keeping the original column order.
shared_features = ['season', 'holiday', 'workingday', 'weather',
                   'temp', 'atemp', 'humidity', 'windspeed',
                   'year', 'hour', 'dayofweek']

features = shared_features + ['hour_workingday_casual', 'count_season']
reg, pred_casual = fit_predict_log_target(train, test, features, 'casual_log')

# 'features' and 'reg' end up bound to the registered-user model, as before.
features = shared_features + ['hour_workingday_registered', 'count_season']
reg, pred_registered = fit_predict_log_target(train, test, features, 'registered_log')

pred1 = pred_casual + pred_registered

In [None]:
# GradientBoostingRegressor
# rank 47/3251, public score 0.36904
# Build the submission table (test timestamps plus the summed predictions in
# a 'count' column) and write it to submission.csv without the index.
submission = test[['datetime']].assign(count=pred1)
submission.to_csv("submission.csv", index=False)