In [1]:
% matplotlib inline

# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sci-kit learn libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# set seed
np.random.seed(0)

##Data Import

In [2]:
dfs = {}

# import training data set
train_df = pd.read_csv('../data/train.csv')
train_df['_data'] = 'train'
dfs['train'] = train_df

# import test data set
test_df = pd.read_csv('../data/test.csv')
test_df['_data'] = 'test'
dfs['test'] = test_df

In [4]:
# combine train and test data
combined_df = dfs['train'].append(dfs['test'])

# lowercase column names
combined_df.columns = map(str.lower, combined_df.columns)

## Transform Data

In [38]:
# parse datetime column & add new time related columns
dt = pd.DatetimeIndex(combined_df['datetime'])
combined_df.set_index(dt, inplace=True)

# create new columns for day, month, year, hour
combined_df['date'] = dt.date
combined_df['day'] = dt.day
combined_df['month'] = dt.month
combined_df['year'] = dt.year
combined_df['hour'] = dt.hour
combined_df['dayofweek'] = dt.dayofweek

In [6]:
# creating new columns transforming bike ridership to log
for column in ['casual', 'registered', 'count']:
    combined_df['%s_log' % column] = np.log(combined_df[column] + 1)

In [10]:
# mark peak hours
#combined_df['peak'] = combined_df[['hour', 'workingday']].apply(lambda x: (0, 1)[(x['workingday'] == 1 and  ( x['hour'] == 8 or 17 <= x['hour'] <= 18 or 12 <= x['hour'] <= 12)) or (x['workingday'] == 0 and  10 <= x['hour'] <= 19)], axis = 1)

# mark peak hours
# sat/sun - 10am to 7pm
# mon-fri - 6am to 10am | 4pm to 7pm
combined_df['peak'] = 0
combined_df['peak'][(
        ( (combined_df['workingday'] == 0 ) & ( (combined_df['hour'] >= 10) & (combined_df['hour'] <= 19) ) ) |
        ( 
            (combined_df['workingday'] == 1 ) & 
            ( 
               ( (combined_df['hour'] >= 6) & (combined_df['hour'] <= 10) ) | 
               ( (combined_df['hour'] >= 16) & (combined_df['hour'] <= 19) )
            )
        )
    )] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
# defined perfect weather and humid weather variables
combined_df['perfect'] = combined_df[['temp', 'windspeed']].apply(lambda x: (0, 1)[x['temp'] > 27 and x['windspeed'] < 30], axis = 1)
combined_df['humid'] = combined_df[['humidity', 'workingday']].apply(lambda x: (0, 1)[x['workingday'] == 1 and x['humidity'] >= 60], axis = 1)

In [39]:
combined_df.head()

Unnamed: 0,_data,atemp,casual,count,datetime,holiday,humidity,registered,season,temp,...,year,hour,dow,casual_log,registered_log,count_log,peak,perfect,humid,dayofweek
2011-01-01 00:00:00,train,14.395,3,16,2011-01-01 00:00:00,0,81,13,1,9.84,...,2011,0,5,1.386294,2.639057,2.833213,0,0,0,5
2011-01-01 01:00:00,train,13.635,8,40,2011-01-01 01:00:00,0,80,32,1,9.02,...,2011,1,5,2.197225,3.496508,3.713572,0,0,0,5
2011-01-01 02:00:00,train,13.635,5,32,2011-01-01 02:00:00,0,80,27,1,9.02,...,2011,2,5,1.791759,3.332205,3.496508,0,0,0,5
2011-01-01 03:00:00,train,14.395,3,13,2011-01-01 03:00:00,0,75,10,1,9.84,...,2011,3,5,1.386294,2.397895,2.639057,0,0,0,5
2011-01-01 04:00:00,train,14.395,0,1,2011-01-01 04:00:00,0,75,1,1,9.84,...,2011,4,5,0.0,0.693147,0.693147,0,0,0,5


## Utility Functions

In [13]:
# RMSLE score function for testing
def RMSLE_score(Y_pred, Y_act ):
    diff = (np.log(Y_pred + 1) - np.log(Y_act + 1))
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

In [33]:
# get training data
def get_train_data():
    train_data = combined_df[combined_df['_data'] == 'train'].copy()
    return train_data

# get test data
def get_test_data():
    test_data = combined_df[combined_df['_data'] == 'test'].copy()
    return test_data

# split train and test data
def split_train_test(df, cutoff_day = 15):
    train_data = df[df['day'] <= cutoff_day]
    test_data = df[df['day'] > cutoff_day]

    return train_data, test_data

# prepare data for training the model
def prepare_data(df, features):
    X = df[features].as_matrix()
    Y_reg = df['registered_log'].as_matrix()
    Y_cas = df['casual_log'].as_matrix()

    return X, Y_reg, Y_cas

In [15]:
def make_kaggle_submission(predictions, file_name):
    print "-" * 80

    # check shape of the test and prediction sets
    print "Generating file for Kaggle Submission File: %s" % (file_name)
    print "Shape of Kaggle Test Data: ", FINAL_TEST_DF.shape
    print "Shape of Kaggle Test Predictions: ", predictions.shape  

    # formatting predictions to integers and removing negative values
    predictions = np.rint(predictions)
    predictions[ predictions < 0] = 0
    print predictions
    
    print "Shape of Final Predictions: ", predictions.shape

    # create submission file
    #sbmt_file_name = [os.getcwd(),'../submissions/',file_name]
    sbmt_file_name = file_name
    np.savetxt(sbmt_file_name, zip(FINAL_TEST_DF['datetime'], predictions), delimiter=',', fmt="%s", header=','.join(['datetime','count']), comments='')
    print "kaggle submission file generated"

## Predictions

In [35]:
# prediction on validation data
def predict_validation_data(model, features):
    df = get_train_data()

    train, test = split_train_test(df)

    X_train, Y_train_reg, Y_train_cas = prepare_data(train, features)
    X_test, Y_test_reg, Y_test_cas = prepare_data(test, features)

    # predict registered users count
    model_reg = model.fit(X_train, Y_train_reg)
    Y_prd_reg = np.exp(model_reg.predict(X_test)) - 1

    # predict casual users count
    model_cas = model.fit(X_train, Y_train_cas)
    Y_prd_cas = np.exp(model_cas.predict(X_test)) - 1

    # combine registered and casual user predictions
    Y_prd = np.round(Y_prd_reg + Y_prd_cas)
    Y_prd[Y_prd < 0] = 0

    # transform predictions back from log
    Y_test = np.exp(Y_test_reg) + np.exp(Y_test_cas) - 2

    score = RMSLE_score(Y_prd, Y_test)
    return (Y_prd, Y_test, score)

# predict Kaggle test data & transform output
def predict_kaggle_data(model, features):
    # get train and test data
    train_df = get_train_data()
    test_df = get_test_data()

    # prepare training data
    X_train, Y_train_reg, Y_train_cas = prepare_data(train_df, features)

    # prepare test data
    X_test = test_df[features].as_matrix()

    # predict casual users count
    model_cas = model.fit(X_train, Y_train_cas)
    Y_prd_cas = np.exp(model_cas.predict(X_test)) - 1
    
    # predict registered users count
    model_reg = model.fit(X_train, Y_train_reg)
    Y_prd_reg = np.exp(model_reg.predict(X_test)) - 1

    # combine casual & registered predictions together
    Y_prd = np.round(Y_prd_reg + Y_prd_cas)
    Y_prd[Y_prd < 0] = 0
    
    return Y_prd

## Random Forest Regression

In [24]:
params = {
    'n_estimators': 1000, 
    'max_depth': 15, 
    'random_state': 0, 
    'min_samples_split' : 5, 
    'n_jobs': -1}

rf_model = RandomForestRegressor(**params)
rf_features = [
    'weather', 'temp', 'atemp', 'windspeed',
    'workingday', 'season', 'holiday', 'humid',
    'hour', 'dayofweek', 'peak'
    ]

(rf_prd, rf_test, rf_score) = predict_validation_data(rf_model, rf_features)
print rf_score

0.45145158864


## Gradient Boost

In [25]:
params = {
    'n_estimators': 150, 
    'max_depth': 5, 
    'random_state': 0, 
    'min_samples_leaf' : 10, 
    'learning_rate': 0.1, 
    'subsample': 0.7, 
    'loss': 'ls'}

gbm_model = GradientBoostingRegressor(**params)
gbm_features = [
    'weather', 'temp', 'atemp', 'windspeed',
    'workingday', 'season', 'holiday', 'humidity',
    'hour', 'dayofweek', 'year', 'perfect'
]

(gbm_prd, gbm_test, gbm_score) = predict_validation_data(gbm_model, gbm_features)
print gbm_score

0.319933809759


In [26]:
# combine predictions from both the models
# random forest and gradient boost
y_prd = np.round(.2 * rf_prd + .8 * gbm_prd)
print RMSLE_score(y_prd, rf_test)

0.324200152693


## Predictions on Kaggle Test Data

In [36]:
# predict on Kaggle data using random forest
rf_prd = predict_kaggle_data(rf_model, rf_features)

# predict on Kaggle data using gradient boost
gbm_prd = predict_kaggle_data(gbm_model, gbm_features)

# combine predictions from both the models
# random forest and gradient boost
output = np.round(.2 * rf_prd + .8 * gbm_prd)

## Kaggle Submission

In [37]:
FINAL_TEST_DF = get_test_data()
make_kaggle_submission(output, 'combine_random_forest_grad_boost.csv')

--------------------------------------------------------------------------------
Generating file for Kaggle Submission File: combine_random_forest_grad_boost.csv
Shape of Kaggle Test Data:  (6493, 25)
Shape of Kaggle Test Predictions:  (6493,)
[ 12.   5.   3. ...,  99.  78.  42.]
Shape of Final Predictions:  (6493,)
kaggle submission file generated
