In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

In [46]:
# Load the labelled data; the first CSV column (index 0) is parsed as datetimes.
train = pd.read_csv("train.csv", parse_dates = [0])
# Preview the first rows to sanity-check the parsed columns.
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [47]:
# Load the rows to be predicted; the first CSV column (index 0) is parsed as datetimes.
test = pd.read_csv("test.csv", parse_dates = [0])
# Preview the first rows to sanity-check the parsed columns.
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [48]:
# Tag the labelled rows so they can be told apart after stacking; rows coming
# from `test` receive no value for `istrain` (it is missing there).
train['istrain'] = 1

# Stack both frames and key the result by timestamp so that feature
# engineering is applied to all rows in one pass.
df = pd.concat([train, test]).set_index('datetime')

In [49]:
# Add a log(x + 1) version of every rental-count column.
log_targets = {'casual': 'log-casual', 'registered': 'log-registered', 'count': 'log-count'}
for raw_col, log_col in log_targets.items():
    df[log_col] = np.log(df[raw_col] + 1)

In [50]:
# Calendar parts read off each timestamp in the datetime index.
ccols = ['day','month','year','hour','dow','woy']
df[ccols] = pd.DataFrame(
    [[ts.day, ts.month, ts.year, ts.hour, ts.dayofweek, ts.weekofyear] for ts in df.index],
    columns=ccols, index=df.index)

# Binary flag columns, built from vectorised boolean masks instead of a
# Python-level lambda per row; each mask is cast to 0/1 integers.
workday = df['workingday'] == 1
day_off = df['workingday'] == 0
hr = df['hour']

# peak: working day at hour 8 or hours 17-18, or non-working day at hours 11-18.
df['peak'] = ((workday & ((hr == 8) | hr.between(17, 18)))
              | (day_off & hr.between(11, 18))).astype('int64')
# night: working day at hours 3-4, or non-working day at hour 4.
df['night'] = ((workday & hr.between(3, 4))
               | (day_off & (hr == 4))).astype('int64')
# ideal: temp above 27 together with windspeed below 30.
df['ideal'] = ((df['temp'] > 27) & (df['windspeed'] < 30)).astype('int64')
# sticky: working day with humidity of at least 60.
df['sticky'] = (workday & (df['humidity'] >= 60)).astype('int64')

# NOTE(review): this cell used to end by assigning 0 to `windspeed` wherever it
# was already 0, which changes nothing. If zero readings were meant to be
# imputed with another value, that step still has to be written.

  
  
  if __name__ == "__main__":
  # Remove the CWD from sys.path while we load stuff.


In [64]:
# One-hot encode the categorical `season` and `weather` codes.
season_dummy = pd.get_dummies(df['season'], prefix='season')
weather_dummy = pd.get_dummies(df['weather'], prefix='weather')

# Remove indicator columns left behind by an earlier run of this cell before
# appending the fresh ones; otherwise re-running it would give `df` duplicate
# `season_*` / `weather_*` columns.
dummy_cols = list(season_dummy.columns) + list(weather_dummy.columns)
df = pd.concat(
    [df.drop(columns=dummy_cols, errors='ignore'), season_dummy, weather_dummy],
    axis=1,
)

In [73]:
def get_rmsle(y_pred, y_actual):
    """Return the root mean squared logarithmic error of two sets of values.

    Computes sqrt(mean((log(1 + y_pred) - log(1 + y_actual)) ** 2)).

    Parameters
    ----------
    y_pred, y_actual : array-like
        Predicted and observed values (NumPy arrays, pandas objects or plain
        sequences of numbers).

    Returns
    -------
    float
        The RMSLE; 0.0 when both inputs are identical.
    """
    # log1p stays accurate for values close to zero and, unlike `x + 1`,
    # also accepts plain Python lists.
    diff = np.log1p(y_pred) - np.log1p(y_actual)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

def custom_train_test_split(data, cutoff_day=15):
    """Split `data` in two by its `day` column.

    Returns `(early, late)`: rows whose `day` is at most `cutoff_day`, and
    rows whose `day` is greater than `cutoff_day`.
    """
    day_of_month = data['day']
    early_rows = data.loc[day_of_month <= cutoff_day]
    late_rows = data.loc[day_of_month > cutoff_day]
    return early_rows, late_rows

def get_data(df):
    """Return an independent copy of the rows of `df` whose `istrain` equals 1."""
    is_labelled = df['istrain'] == 1
    return df.loc[is_labelled].copy()

def prep_data(data, input_cols):
    """Pick the model inputs and the two log-scale targets out of `data`.

    Returns `(X, y_registered, y_casual)`: the `input_cols` columns, the
    `log-registered` column and the `log-casual` column, in that order.
    """
    return data[input_cols], data['log-registered'], data['log-casual']

def predict_on_validation_set(df, model, input_cols):
    """Score `model` on a hold-out slice of the labelled rows.

    The labelled rows (`get_data`) are divided by `custom_train_test_split`
    using its default cutoff. `model` is fitted on the log-registered target
    and then refitted on the log-casual target; each prediction is mapped back
    with exp(x) - 1, the two are added and negative totals are set to 0.

    Returns a tuple `(predicted_total, actual_total, rmsle_score)`.
    """
    fit_rows, holdout_rows = custom_train_test_split(get_data(df))

    X_fit, reg_fit, cas_fit = prep_data(fit_rows, input_cols)
    X_holdout, reg_holdout, cas_holdout = prep_data(holdout_rows, input_cols)

    # A single estimator object serves both targets, so each prediction has to
    # be taken before the next fit() call replaces the learned state.
    back_transformed = []
    for log_target in (reg_fit, cas_fit):
        fitted = model.fit(X_fit, log_target)
        back_transformed.append(np.exp(fitted.predict(X_holdout)) - 1)
    registered_hat, casual_hat = back_transformed

    total_hat = registered_hat + casual_hat
    total_hat[total_hat < 0] = 0

    # Map both observed log targets back to counts (one "-1" each, hence -2).
    total_actual = np.exp(reg_holdout) + np.exp(cas_holdout) - 2

    return (total_hat, total_actual, get_rmsle(total_hat, total_actual))

def predict_on_test_set(df, model, x_cols):
    """Fit `model` on every labelled row and predict the remaining rows.

    Rows with `istrain == 1` are used for fitting; all other rows are
    predicted. `model` is fitted on the log-casual target first and then
    refitted on the log-registered target. Each prediction is mapped back with
    exp(x) - 1, the two are added, and negative totals are set to 0.

    Returns the array of combined predictions.
    """
    X_train, y_train_reg, y_train_cas = prep_data(get_data(df), x_cols)
    X_test = df.loc[df['istrain'] != 1, x_cols]

    def fit_and_predict(log_target):
        # Fit on one log-scale target and return predictions on the count scale.
        return np.exp(model.fit(X_train, log_target).predict(X_test)) - 1

    # Casual first, registered second: the estimator is reused, so it ends up
    # fitted on the registered target.
    casual_hat = fit_and_predict(y_train_cas)
    registered_hat = fit_and_predict(y_train_reg)

    # Add casual and registered predictions together.
    total_hat = casual_hat + registered_hat
    total_hat[total_hat < 0] = 0
    return total_hat

In [74]:
# Random forest model, scored on the validation split.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

params = dict(
    n_estimators=1000,
    max_depth=15,
    random_state=0,
    min_samples_split=5,
    n_jobs=-1,
)
rf_model = RandomForestRegressor(**params)

# Inputs: one-hot weather/season indicators (levels 1-3 only) plus the
# numeric, calendar and flag columns. An earlier variant of this list used the
# raw 'weather' and 'season' codes and had no 'night' flag.
rf_cols = (
    ['weather_1', 'weather_2', 'weather_3', 'temp', 'atemp', 'windspeed']
    + ['workingday', 'season_1', 'season_2', 'season_3', 'holiday', 'sticky']
    + ['hour', 'dow', 'woy', 'peak', 'night']
)

rf_p, rf_t, rf_score = predict_on_validation_set(df, rf_model, rf_cols)
print(rf_score)

0.4407426525813912


In [75]:
# Gradient boosting on the raw `weather` / `season` columns, scored on the
# validation split.
# `loss` is intentionally left at its default (least squares in every
# scikit-learn release): the explicit spelling 'ls' was deprecated in 1.0 and
# removed in 1.2, where passing it raises an error.
params = {'n_estimators': 150, 'max_depth': 5, 'random_state': 0, 'min_samples_leaf' : 10, 'learning_rate': 0.1, 'subsample': 0.7}
gbm_model = GradientBoostingRegressor(**params)
gbm_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday', 'workingday', 'season',
    'hour', 'dow', 'year', 'ideal'
    ]
(gbm_p, gbm_t, gbm_score) = predict_on_validation_set(df, gbm_model, gbm_cols)
print(gbm_score)



0.3190430279132304


In [76]:
# Gradient boosting on the one-hot weather/season indicators, scored on the
# validation split. This rebinds `gbm_model` and `gbm_cols`.
# `loss` is intentionally left at its default (least squares in every
# scikit-learn release): the explicit spelling 'ls' was deprecated in 1.0 and
# removed in 1.2, where passing it raises an error.
params = {'n_estimators': 150, 'max_depth': 5, 'random_state': 0, 'min_samples_leaf' : 10, 'learning_rate': 0.1, 'subsample': 0.7}
gbm_model = GradientBoostingRegressor(**params)
gbm_cols = [
    'weather_1', 'weather_2', 'weather_3', 'weather_4', 'temp', 'atemp', 'windspeed',
    'workingday', 'season_1', 'season_2', 'season_3', 'season_4', 'holiday', 'sticky',
    'hour', 'dow', 'woy', 'peak','night'
]

(gbm_p, gbm_t, gbm_score) = predict_on_validation_set(df, gbm_model, gbm_cols)
print(gbm_score)



0.43232977582497223


In [70]:
# List every column of the combined frame (raw, log-target, calendar, flag and one-hot columns).
df.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'casual', 'registered', 'count', 'istrain',
       'log-casual', 'log-registered', 'log-count', 'day', 'month', 'year',
       'hour', 'dow', 'woy', 'peak', 'night', 'ideal', 'sticky', 'season_1',
       'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2',
       'weather_3', 'weather_4'],
      dtype='object')

In [78]:
# Refit each model on all labelled rows and predict the unlabelled rows.
# NOTE(review): `gbm_model` / `gbm_cols` are whatever the most recently executed
# GBM cell bound them to; two cells define them with different feature lists,
# so confirm which configuration is meant to feed the final blend.
rf_pred = predict_on_test_set(df, rf_model, rf_cols)
gbm_pred = predict_on_test_set(df, gbm_model, gbm_cols)



In [79]:
# Weighted blend of the two models' predictions (weights sum to 1).
RF_WEIGHT, GBM_WEIGHT = .20, .80
y_pred = RF_WEIGHT * rf_pred + GBM_WEIGHT * gbm_pred

In [80]:
# Attach the blended predictions to the test frame (this adds a `count`
# column to `test` itself), then write the two submission columns without
# the row index.
test['count'] = y_pred
final_df = test.loc[:, ['datetime', 'count']].copy()
final_df.to_csv('output1.csv', index=False)

In [81]:
# Dump the prepared feature columns, together with both log targets, for the
# labelled and the unlabelled rows.
# NOTE(review): this rebinds `gbm_cols` to a list that contains the two target
# columns; re-running an earlier modelling cell afterwards would feed the
# targets in as features.
gbm_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday', 'workingday', 'season',
    'hour', 'dow', 'year', 'ideal', 'log-registered', 'log-casual'
    ]
labelled_rows = get_data(df)
unlabelled_rows = df.loc[df['istrain'] != 1]
labelled_rows[gbm_cols].to_csv('train_p.csv', index=False)
unlabelled_rows[gbm_cols].to_csv('test_p.csv', index=False)