In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

In [3]:
path = '/home/vzzzz/Downloads/kaggle_bike/'

In [4]:
from datetime import datetime

In [5]:
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')

In [6]:
train.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [7]:
dropfeats = ['count', 'casual', 'registered']

train['hour'] = train['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').hour)
train['day'] = train['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').day)
train['weekday'] = train['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').weekday())
train['month'] = train['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').month)
train['year'] = train['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').year)
train['count'] = np.log1p(train['count'])

train = train.drop('datetime', axis=1)

test['hour'] = test['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').hour)
test['day'] = test['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').day)
test['weekday'] = test['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').weekday())
test['month'] = test['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').month)
test['year'] = test['datetime'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').year)

timecolumn = test['datetime']

test = test.drop('datetime', axis=1)

In [8]:
train.head(2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,day,weekday,month,year
0,1,0,0,1,9.84,14.395,81,0.0,3,13,2.833213,0,1,5,1,2011
1,1,0,0,1,9.02,13.635,80,0.0,8,32,3.713572,1,1,5,1,2011


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

In [10]:
def loss_func(truth, prediction):
    y = np.expm1(truth)
    y_ = np.expm1(prediction)
    log1 = np.array([np.log(x + 1) for x in truth])
    log2 = np.array([np.log(x + 1) for x in prediction])
    return np.sqrt(np.mean((log1 - log2)**2))

In [11]:
X = train.drop(dropfeats, axis=1).values
Y = train['count'].values

In [12]:
param_grid = {
    'n_estimators': [50, 80, 100, 120],
    'max_depth': [None, 1, 2, 5],
    'max_features': ['sqrt', 'log2', 'auto']
}

scorer = make_scorer(loss_func, greater_is_better=False)

model = RandomForestRegressor(random_state=42)

result = GridSearchCV(model, param_grid, cv=4, scoring=scorer, n_jobs=3).fit(X, Y)
print('\tParams:', result.best_params_)
print('\tScore:', result.best_score_)

('\tParams:', {'max_features': 'auto', 'n_estimators': 120, 'max_depth': None})
('\tScore:', -0.11836225952688276)


In [15]:
testX = test.values

pred = result.predict(testX)
pred = np.expm1(pred)

submission = pd.DataFrame({
        "datetime": timecolumn,
        "count": pred
    })
submission.to_csv('RandomForest.csv', index=False)

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

In [20]:
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001, 0.0001],
    'n_estimators': [100, 1000, 1500, 2000, 4000],
    'max_depth': [1, 2, 3, 4, 5, 8, 10]
}

scorer = make_scorer(loss_func, greater_is_better=False)

model = GradientBoostingRegressor(random_state=42)

result = GridSearchCV(model, param_grid, cv=4, scoring=scorer, n_jobs=3).fit(X, Y)
print('\tParams:', result.best_params_)
print('\tScore:', result.best_score_)

('\tParams:', {'n_estimators': 2000, 'learning_rate': 0.01, 'max_depth': 4})
('\tScore:', -0.09649149584358846)


In [21]:
pred = result.predict(testX)
pred = np.expm1(pred)

submission = pd.DataFrame({
        "datetime": timecolumn,
        "count": pred
    })
submission.to_csv('GBR.csv', index=False)

0.42