In [1]:
from datetime import date

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from sklearn.cross_validation import ShuffleSplit
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
)
from sklearn.linear_model import LinearRegression

from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.core import (
    Dense,
    Dropout,
    Activation,
    Merge, 
    Reshape
)
from keras.callbacks import EarlyStopping

from metric import rmsle
%matplotlib inline

Using Theano backend.


Create basic datetime features

In [2]:
def calculate_period(timestamp, initial_date=date(2011, 1, 1)):
    """Return the number of calendar months from ``initial_date`` to ``timestamp``.

    Only the year and month are compared; the day of month and the time of
    day are ignored. Timestamps in the same month as ``initial_date`` give 0,
    the following month gives 1, and months before it give negative values.

    Parameters
    ----------
    timestamp : object with a ``.date()`` method
        For example ``datetime.datetime`` or ``pandas.Timestamp``.
    initial_date : datetime.date, optional
        The month counted as period 0. Defaults to 2011-01-01.

    Returns
    -------
    int
        Signed month offset between the two dates.
    """
    current_date = timestamp.date()
    years_elapsed = current_date.year - initial_date.year
    months_elapsed = current_date.month - initial_date.month
    return years_elapsed * 12 + months_elapsed

In [3]:
# Load the training set; the 'datetime' column is parsed into timestamps so
# the calendar features can be derived from it.
df = pd.read_csv('train.csv', parse_dates=['datetime'])

def create_datetime_features(df):
    """Add calendar columns derived from ``df.datetime`` and return ``df``.

    The frame is modified in place. Columns added, in this order:
    ``month``, ``week_day`` (ISO numbering, Monday=1 .. Sunday=7),
    ``week_number`` (ISO week of the year), ``hour`` and ``year``.
    """
    # (column name, per-timestamp extractor), kept in the order the columns
    # must appear in the frame.
    extractors = (
        ('month', lambda ts: ts.date().month),
        ('week_day', lambda ts: ts.date().isoweekday()),
        ('week_number', lambda ts: ts.date().isocalendar()[1]),
        ('hour', lambda ts: ts.hour),
        ('year', lambda ts: ts.date().year),
    )
    for column, extract in extractors:
        df[column] = df.datetime.map(extract)
    return df

# Add the calendar columns to the training frame (it is modified in place and returned).
df = create_datetime_features(df)

Try Gradient Boosting with ShuffleSplit cross-validation

In [4]:
# Candidate model inputs: columns read from the CSV followed by the calendar
# columns created by create_datetime_features.
possible_features = (
    ['season', 'holiday', 'workingday', 'weather']
    + ['temp', 'atemp', 'windspeed']
    + ['month', 'hour', 'year', 'week_day']
)
# Column holding the value to predict.
target = 'count'

In [5]:
model = GradientBoostingRegressor()
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx][target])
    y_pred = model.predict(df.iloc[test_idx][possible_features])
    y_true =  df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.619339109963 +/- 0.0203670517485


Add Log Scaled Targets

In [6]:
# Store log1p of each target column as 'log_<column>'.
for raw_column in ('count', 'registered', 'casual'):
    df['log_' + raw_column] = np.log1p(df[raw_column])

In [7]:
model = GradientBoostingRegressor()
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_count'])
    y_pred = model.predict(df.iloc[test_idx][possible_features])
    y_true =  df.iloc[test_idx][target]
    results.append(rmsle(np.expm1(y_pred), y_true))
print np.mean(results), "+/-", np.std(results)

0.389355593927 +/- 0.00936657387187


Much better ;)

Add the period feature (week day is already included)

In [8]:
# Month offset of every row from the reference date used by calculate_period.
df['period'] = df.datetime.map(calculate_period)

In [9]:
possible_features = [
    'season', 'holiday', 'workingday', 'weather', 
    'temp', 'atemp', 'windspeed', 'month', 'hour', 'year',
    'period', 'week_day']

model = GradientBoostingRegressor(n_estimators=200)
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_count'])
    y_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
    y_true =  df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.322041547746 +/- 0.0033472935359


Check splitting into registered prediction and casual prediction + change to Random Forest

In [12]:
possible_features = [
    'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 
    'windspeed', 'month', 'hour', 'year', 'period', 'week_day'
]

model = RandomForestRegressor(n_estimators=200, n_jobs=4)
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_registered'])
    reg_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
    
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_casual'])
    cas_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
  
    y_pred = reg_pred + cas_pred
    y_true =  df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.306949120888 +/- 0.0077058747834


# Try new representation -- Embedding

In [13]:
# Two identically configured, independent models: an Embedding layer
# (Embedding(10, 1, input_length=6)) whose output is reshaped to a flat
# vector of 6 values. The 'w' model is used for the weather table and the
# 'wd' model for the week-day table.
# NOTE(review): no fit() call on either model appears in this notebook, so
# predict() would be using the initial embedding weights -- confirm intended.
w_embedding_model = Sequential()
w_embedding_model.add(Embedding(10, 1, input_length=6))
w_embedding_model.add(Reshape(target_shape=(6,)))
w_embedding_model.compile('rmsprop', 'msle')

wd_embedding_model = Sequential()
wd_embedding_model.add(Embedding(10, 1, input_length=6))
wd_embedding_model.add(Reshape(target_shape=(6,)))
wd_embedding_model.compile('rmsprop', 'msle')

In [14]:
# Per-category summary of the log targets: the mean and the median of each
# target for every weather code / week day, placed side by side (mean columns
# first, then median columns) and indexed by the category value.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas and rejected by newer releases.
log_target_columns = ['log_count', 'log_registered', 'log_casual']

weather_gp = df.groupby(['weather'])[log_target_columns]
weather_agg_df = weather_gp.mean().merge(
    weather_gp.median(), suffixes=('_mean', '_median'), left_index=True, right_index=True)

week_day_gp = df.groupby(['week_day'])[log_target_columns]
week_day_agg_df = week_day_gp.mean().merge(
    week_day_gp.median(), suffixes=('_mean', '_median'), left_index=True, right_index=True)

In [15]:
# Run the aggregate tables (one row per category, six columns each) through
# the embedding models.
# NOTE(review): the inputs are float means/medians, whereas an Embedding layer
# looks rows up by integer index -- presumably the values are truncated to
# ints; confirm this is intended.
weather_features = w_embedding_model.predict(weather_agg_df.values)
week_day_features = wd_embedding_model.predict(week_day_agg_df.values)

In [16]:
# Inspect the embedding output for the weather categories (before PCA).
weather_features

array([[ 0.04401053,  0.04401053,  0.03068844, -0.03055367,  0.04401053,
         0.03695909],
       [ 0.04401053,  0.04401053,  0.03068844,  0.04401053,  0.04401053,
         0.03068844],
       [ 0.04401053,  0.03695909,  0.02493496,  0.04401053,  0.04401053,
         0.02493496],
       [-0.03055367, -0.03055367,  0.02493496, -0.03055367, -0.03055367,
         0.02493496]])

In [17]:
from sklearn.decomposition import PCA
# Reduce the 6-value weather embedding output to 3 PCA components; the
# variable is overwritten with the reduced array.
clf = PCA(3)
weather_features = clf.fit_transform(weather_features)

In [18]:
# Inspect the embedding output for the week days (before PCA).
week_day_features

array([[ 0.04429511,  0.04429511, -0.03744267,  0.04429511,  0.04429511,
        -0.03744267],
       [ 0.04429511,  0.04429511, -0.03744267,  0.04429511,  0.04429511,
        -0.03744267],
       [ 0.04429511,  0.04429511, -0.03744267,  0.04429511,  0.04429511,
        -0.03744267],
       [ 0.04429511,  0.04429511, -0.03744267, -0.04366054,  0.04429511,
        -0.03744267],
       [ 0.04429511,  0.04429511, -0.03744267, -0.04366054,  0.04429511,
        -0.03744267],
       [ 0.04429511,  0.04429511, -0.03815959,  0.04429511,  0.04429511,
        -0.03815959],
       [ 0.04429511,  0.04429511, -0.03815959,  0.04429511,  0.04429511,
        -0.03815959]])

In [19]:
# Reduce the 6-value week-day embedding output to 3 PCA components; `clf` is
# rebound to a new PCA instance and the variable is overwritten with the
# reduced array.
clf = PCA(3)
week_day_features = clf.fit_transform(week_day_features)

In [20]:
# Name the reduced components weather_0, weather_1, ... and put them next to
# their weather codes so the table can be merged on 'weather'.
weather_columns = ['weather_%d' % i for i in range(weather_features.shape[1])]
weather_keys = pd.DataFrame(index=weather_agg_df.index).reset_index()
weather_components = pd.DataFrame(weather_features, columns=weather_columns)
df_weather = weather_keys.join(weather_components)

In [21]:
# Name the reduced components week_day_0, week_day_1, ... and put them next to
# their week-day values so the table can be merged on 'week_day'.
week_day_columns = ['week_day_%d' % i for i in range(week_day_features.shape[1])]
week_day_keys = pd.DataFrame(index=week_day_agg_df.index).reset_index()
week_day_components = pd.DataFrame(week_day_features, columns=week_day_columns)
df_week_day = week_day_keys.join(week_day_components)

In [22]:
# Attach the reduced embedding columns to every row through its weather code
# and its week day.
merged_df = (
    df
    .merge(df_weather, on=['weather'])
    .merge(df_week_day, on=['week_day'])
)

In [23]:
possible_features = [
    'season', 'holiday', 'workingday', 'temp', 'atemp', 
    'windspeed', 'month', 'hour', 'year', 'period',
] + weather_columns + weather_columns

model = RandomForestRegressor(n_estimators=200, n_jobs=4)
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(merged_df.iloc[train_idx][possible_features], merged_df.iloc[train_idx]['log_registered'])
    reg_pred = np.expm1(model.predict(merged_df.iloc[test_idx][possible_features]))
    
    model.fit(merged_df.iloc[train_idx][possible_features], merged_df.iloc[train_idx]['log_casual'])
    cas_pred = np.expm1(model.predict(merged_df.iloc[test_idx][possible_features]))
  
    y_pred = reg_pred + cas_pred
    y_true =  merged_df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.326291054463 +/- 0.00667505717805


In [26]:
possible_features = [
    'season', 'holiday', 'workingday', 'temp', 'atemp', 
    'windspeed', 'month', 'hour', 'year', 'period', 'week_day'
] + weather_columns 

model = RandomForestRegressor(n_estimators=200, n_jobs=4)
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(merged_df.iloc[train_idx][possible_features], merged_df.iloc[train_idx]['log_registered'])
    reg_pred = np.expm1(model.predict(merged_df.iloc[test_idx][possible_features]))
    
    model.fit(merged_df.iloc[train_idx][possible_features], merged_df.iloc[train_idx]['log_casual'])
    cas_pred = np.expm1(model.predict(merged_df.iloc[test_idx][possible_features]))
  
    y_pred = reg_pred + cas_pred
    y_true =  merged_df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.310750455874 +/- 0.00657877016079


In [27]:
possible_features = [
    'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 
    'windspeed', 'month', 'hour', 'year', 'period',
] +  week_day_columns

model = RandomForestRegressor(n_estimators=200, n_jobs=4)
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(merged_df.iloc[train_idx][possible_features], merged_df.iloc[train_idx]['log_registered'])
    reg_pred = np.expm1(model.predict(merged_df.iloc[test_idx][possible_features]))
    
    model.fit(merged_df.iloc[train_idx][possible_features], merged_df.iloc[train_idx]['log_casual'])
    cas_pred = np.expm1(model.predict(merged_df.iloc[test_idx][possible_features]))
  
    y_pred = reg_pred + cas_pred
    y_true =  merged_df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.321277397046 +/- 0.00438124311109


It does not look very helpful :(

## Try simple deep learning techniques

In [28]:
# Inputs for the neural network.
possible_features = [
    'holiday', 'weather', 'temp', 'atemp', 'windspeed',
    'month', 'hour', 'year', 'period', 'week_day',
]

# One random permutation of the row positions: the first 30% become the test
# rows, the remainder the training rows.
indexes = np.random.permutation(len(merged_df))
n_test_rows = int(len(merged_df) * 0.3)
test_indexes, train_indexes = indexes[:n_test_rows], indexes[n_test_rows:]

X = merged_df[possible_features].values

In [29]:
def create_model(input_dim):
    """Build and compile a small feed-forward regression network.

    The stack is: batch normalisation of the ``input_dim`` inputs, then two
    ReLU hidden layers (8 and 5 units), each preceded by dropout and followed
    by batch normalisation, a final dropout and a single linear output unit.
    The model is compiled with the 'mae' loss and the 'sgd' optimizer.
    """
    layers = [
        BatchNormalization(input_shape=(input_dim, )),
        Dropout(0.2),
        Dense(8, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(5, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(1),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='mae', optimizer='sgd')
    return model

In [30]:
# Fit the network on the log target and report RMSLE on the held-out rows
# after mapping the predictions back with expm1.
# NOTE(review): the held-out rows are also passed as validation_data for early
# stopping, so the reported score is not measured on data unseen during
# training control -- confirm this is acceptable.
model = create_model(len(possible_features))
y_train_log = merged_df.iloc[train_indexes][['log_count']].values
y_test_log = merged_df.iloc[test_indexes][['log_count']].values
model.fit(
    X[train_indexes],
    y_train_log,
    batch_size=100, nb_epoch=1000,
    verbose=0,
    validation_data=(X[test_indexes], y_test_log),
    callbacks=[EarlyStopping(verbose=0, patience=50)],
)
test_count_pred = np.expm1(model.predict(X[test_indexes]).ravel())
rmsle(test_count_pred, merged_df.iloc[test_indexes][target].ravel())

0.74341005558447182

it needs more exploration...

## Try removing some features

In [31]:
possible_features = [
    'season', 'holiday', 'weather', 'temp', 'atemp', 
    'windspeed', 'month', 'hour', 'year', 'period', 'week_day'
]

model = RandomForestRegressor(n_estimators=200, n_jobs=4)
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_registered'])
    reg_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
    
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_casual'])
    cas_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
  
    y_pred = reg_pred + cas_pred
    y_true =  df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.309681241524 +/- 0.00889556768212


In [32]:
possible_features = [
    'holiday', 'workingday', 'weather', 'temp', 'atemp', 
    'windspeed', 'month', 'hour', 'year', 'period', 'week_day'
]

model = RandomForestRegressor(n_estimators=200, n_jobs=4)
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_registered'])
    reg_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
    
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_casual'])
    cas_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
  
    y_pred = reg_pred + cas_pred
    y_true =  df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.304904836797 +/- 0.00634309303924


In [33]:
possible_features = [
    'holiday', 'workingday', 'weather', 'atemp', 
    'windspeed', 'month', 'hour', 'year', 'period', 'week_day'
]

model = RandomForestRegressor(n_estimators=200, n_jobs=4)
results = []
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3):
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_registered'])
    reg_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
    
    model.fit(df.iloc[train_idx][possible_features], df.iloc[train_idx]['log_casual'])
    cas_pred = np.expm1(model.predict(df.iloc[test_idx][possible_features]))
  
    y_pred = reg_pred + cas_pred
    y_true =  df.iloc[test_idx][target]
    results.append(rmsle(y_pred, y_true))
print np.mean(results), "+/-", np.std(results)

0.308603065611 +/- 0.00768300734267


# Generate current best solution

In [37]:
# Load the test set and give it the same derived columns as the training
# frame: the calendar features and the month-offset 'period'.
test_df = pd.read_csv('test.csv', parse_dates=['datetime'])
test_df = create_datetime_features(test_df)
test_df['period'] = test_df.datetime.map(calculate_period)

In [38]:
# Feature set used for the submission.
possible_features = [
    'holiday', 'workingday', 'weather', 'temp', 'atemp', 'windspeed',
    'month', 'hour', 'year', 'period', 'week_day',
]

model = RandomForestRegressor(n_estimators=200, n_jobs=4)

# shuffle dataset
df = df.iloc[np.random.permutation(len(df))].reset_index(drop=True)

full_train_inputs = df[possible_features]
submission_inputs = test_df[possible_features]

# Fit on the whole training set, once per rental type, and back-transform
# each prediction from the log scale.
model.fit(full_train_inputs, df['log_registered'])
reg_pred = np.expm1(model.predict(submission_inputs))

model.fit(full_train_inputs, df['log_casual'])
cas_pred = np.expm1(model.predict(submission_inputs))

# Total count = registered + casual; written with 'datetime' first.
y_pred = reg_pred + cas_pred
submission = pd.DataFrame(
    {'datetime': test_df.datetime, 'count': y_pred},
    columns=['datetime', 'count'],
)
submission.to_csv('my_submission.csv', index=False)

Parameter tuning is implemented in a separate script.