In [11]:
# Importing libraries
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [13]:
# read data, combine train and test data,separate time information
dfs = {}
for name in ['train', 'test']:
#     df = pd.read_csv('C:\Users/Administrator/Desktop/Kaggle/data/%s.csv' % name)
    df = pd.read_csv('Data/%s.csv' %name)
    df['_data'] = name
    dfs[name] = df

df = dfs['train'].append(dfs['test'])

dt = pd.DatetimeIndex(df['dteday'])
df['day'] = dt.day
df['dow'] = dt.dayofweek
df['woy'] = dt.weekofyear

In [14]:
# logarithmic transformation to do error calculation
df['cnt_log'] = np.log(df['cnt'] + 1)

# add a by_season_all column which represents all renting bikes by season
by_season = df[df['_data'] == 'train'].groupby('season')[['cnt']].agg(sum)
by_season.columns = ['by_season_all']
df = df.join(by_season, on='season')

In [15]:
# add another feature discussed before, it shows peaks of bike-renting number
df['peak'] = df[['hr', 'workingday']].apply(lambda x: (0, 1)[(x['workingday'] == 1 and  ( x['hr'] == 8 or 17 <= x['hr'] <= 18 or 12 <= x['hr'] <= 12)) or (x['workingday'] == 0 and  10 <= x['hr'] <= 19)], axis = 1)

# add features to show special climate, this idea is from a blog 
df['ideal'] = df[['temp', 'windspeed']].apply(lambda x: (0, 1)[x['temp'] > 0.6 and x['windspeed'] < 0.44778], axis = 1)
df['wet'] = df[['hum', 'workingday']].apply(lambda x: (0, 1)[x['workingday'] == 1 and x['hum'] >= 0.6], axis = 1)

## OKAY WEATHER DOES NOT WORK
# test for mutual exclusiveness for weathersit
df['w1'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 1],axis = 1)
df['w2'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 2],axis = 1)
df['w3'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 3],axis = 1)
df['w4'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 4],axis = 1)

## OKAY SEASONS DO NOT WORK EITHER
# test for mutual exclusiveness for seasons
df['s1'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 1],axis = 1)
df['s2'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 2],axis = 1)
df['s3'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 3],axis = 1)
df['s4'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 4],axis = 1)

In [16]:
def get_data():
    data = df[df['_data'] == 'train'].copy()
    return data

def get_error(y_pred, y_actual):
    difference = np.log(y_pred + 1) - np.log(y_actual + 1)
    mean_error = np.square(difference).mean()
    return np.sqrt(mean_error)

def get_feature_and_result(data, input_cols):
    X = data[input_cols].as_matrix()
    y = data['cnt_log'].as_matrix()
    return X, y
def train_test_split(data):
    train = data[data['day'] <= 14]
    test = data[data['day'] > 14 ]
    return train, test

# similar functions have been given during the lab
def predict_on_validation_set(model, input_cols):
    data = get_data()
    train, test = train_test_split(data)
    X_train, y_train = get_feature_and_result(train, input_cols)
    X_test, y_test = get_feature_and_result(test, input_cols)
    my_model = model.fit(X_train, y_train)
    y_pred = np.exp(my_model.predict(X_test))-1
    y_pred_last = np.round(y_pred)
    y_pred_last[y_pred_last < 0] = 0
    y_test_last = np.exp(y_test)-1
    score = get_error(y_pred_last, y_test_last)
    return (y_pred_last, y_test_last, score)

df_test = df[df['_data'] == 'test'].copy()

def predict_on_test_set(model, input_cols):
    df_train = df[df['_data'] == 'train'].copy()
    X_train = df_train[input_cols].as_matrix()
    y_train = df_train['cnt_log'].as_matrix()
    X_test = df_test[input_cols].as_matrix()
    my_model = model.fit(X_train, y_train)
    y_pred = my_model.predict(X_test)
    y_pred = np.exp(y_pred)-1
    return y_pred


In [24]:
# do a random forest regression
parameters = {'n_estimators': 1000, 'max_depth': 15, 'random_state': 0, 'min_samples_split' : 5, 'n_jobs': -1}
rf_model = RandomForestRegressor(**parameters)

rf_cols = [
    'weathersit', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 'season', 
    'hr', 'dow', 'yr','mnth','woy'
]

rf_cols1 = [
    'weathersit', 'temp', 'atemp', 'windspeed',
    'workingday', 'season', 'holiday', 'wet',
    'hr', 'dow', 'woy', 'peak'
]


# Split weather into 4
rf_cols2 = [
    'w1', 'w1','w3','w4', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 'season', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split seasons into 3
rf_cols3 = [
    'weathersit', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 's1', 's2', 's3', 's4', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split both
rf_cols4= [
    'w1', 'w1','w3','w4', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday','s1', 's2', 's3', 's4' , 
    'hr', 'dow', 'yr','mnth','woy'
]


(rf_p, rf_t, rf_score) = predict_on_validation_set(rf_model, rf_cols)
(rf_p, rf_t, rf_score1) = predict_on_validation_set(rf_model, rf_cols1)
(rf_p, rf_t, rf_score2) = predict_on_validation_set(rf_model, rf_cols2)
(rf_p, rf_t, rf_score3) = predict_on_validation_set(rf_model, rf_cols3)
(rf_p, rf_t, rf_score4) = predict_on_validation_set(rf_model, rf_cols4)
print rf_score
print rf_score1
print rf_score2
print rf_score3
print rf_score4

0.328202097292
0.434502579521
0.327785342162
0.328934936078
0.328824642471


In [26]:
# do a gradient boosting regression
parameters = {'n_estimators': 150, 'max_depth': 5, 'random_state': 0, 'min_samples_leaf' : 10, 'learning_rate': 0.1, 'subsample': 0.7, 'loss': 'ls'}
gb_model = GradientBoostingRegressor(**parameters)
gb_cols = [
    'weathersit', 'temp', 'windspeed', 'wet',
    'holiday', 'workingday', 
    'hr', 'dow', 'yr', 'ideal', 'by_season_all','mnth','woy'
    ]

gb_cols1 = [
    'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
    'holiday', 'workingday', 'season',
    'hr', 'dow', 'yr', 'ideal', 'by_season_all',
]

# Split weather into 4
gb_cols2 = [
    'w1', 'w1','w3','w4', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 'season', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split seasons into 3
gb_cols3 = [
    'weathersit', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 's1', 's2', 's3', 's4', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split both
gb_cols4= [
    'w1', 'w1','w3','w4', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday','s1', 's2', 's3', 's4' , 
    'hr', 'dow', 'yr','mnth','woy'
]


(gb_p, gb_t, gb_score) = predict_on_validation_set(gb_model, gb_cols)
(gb_p, gb_t, gb_score1) = predict_on_validation_set(gb_model, gb_cols1)
(gb_p, gb_t, gb_score2) = predict_on_validation_set(gb_model, gb_cols2)
(gb_p, gb_t, gb_score3) = predict_on_validation_set(gb_model, gb_cols3)
(gb_p, gb_t, gb_score4) = predict_on_validation_set(gb_model, gb_cols4)
print gb_score
print gb_score1
print gb_score2
print gb_score3
print gb_score4

0.320212406439
0.317385773638
0.325393674571
0.321603052518
0.322327209435


In [28]:
rf_pred = predict_on_test_set(rf_model, rf_cols)
gb_pred = predict_on_test_set(gb_model, gb_cols)

a = [0.2*rf_pred+0.8*gb_pred,0.8*rf_pred+0.2*gb_pred] 
y_pred = np.round(rf_pred)

df_test['Prediction'] = y_pred
result = df_test[['dteday', 'Prediction']].copy()
result.to_csv('C:\Users/Administrator/Desktop/Kaggle/data/output/submit_new.csv', index=False)