In [3]:
import numpy as np
import pandas as pd
from sklearn import grid_search
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor

# Discovering the training and testing data set.

In [9]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')
sub = pd.read_csv('sample_solutions.csv')
print train.columns.values, test.columns.values, sub.columns.values

['date' 'year' 'month' 'hour' 'holiday' 'weekday' 'working' 'weather_type'
 'temp' 'feels_like' 'humidity' 'windspeed' 'count'] ['id' 'date' 'year' 'month' 'hour' 'holiday' 'weekday' 'working'
 'weather_type' 'temp' 'feels_like' 'humidity' 'windspeed'] ['id' 'count']


In [38]:
train.head()

Unnamed: 0,date,year,month,hour,holiday,weekday,working,weather_type,temp,feels_like,humidity,windspeed,count
0,1/1/11,0,1,0,0,6,0,1,9.02,14.6829,0.81,0.0,16
1,1/1/11,0,1,1,0,6,0,1,9.02,13.9077,0.8,0.0,40
2,1/1/11,0,1,2,0,6,0,1,9.02,13.9077,0.8,0.0,32
3,1/1/11,0,1,3,0,6,0,1,9.84,14.6829,0.75,0.0,13
4,1/1/11,0,1,4,0,6,0,1,9.84,14.6829,0.75,0.0,1


In [46]:
test.head()

Unnamed: 0,id,date,year,month,hour,holiday,weekday,working,weather_type,temp,feels_like,humidity,windspeed
0,1,8/1/12,1,8,0,0,3,1,1,27.88,32.4564,0.79,11.0014
1,2,8/1/12,1,8,1,0,3,1,1,27.06,30.9111,0.83,6.0032
2,3,8/1/12,1,8,2,0,3,1,1,26.24,29.3658,0.83,7.0015
3,4,8/1/12,1,8,3,0,3,1,1,26.24,29.3658,0.83,7.0015
4,5,8/1/12,1,8,4,0,3,1,2,26.24,30.1359,0.78,8.9981


In [4]:
print train.isnull().sum(), test.isnull().sum(), sub.isnull().sum()

date            0
year            0
month           0
hour            0
holiday         0
weekday         0
working         0
weather_type    0
temp            0
feels_like      0
humidity        0
windspeed       0
count           0
dtype: int64 id              0
date            0
year            0
month           0
hour            0
holiday         0
weekday         0
working         0
weather_type    0
temp            0
feels_like      0
humidity        0
windspeed       0
dtype: int64 id       0
count    0
dtype: int64


# Creating the final training and testing data sets

Transforming Pandas' Dataframe into numpy.ndarray usable by scikit-learn's implementation.

In [10]:
cols = ['year', 'month', 'hour', 'holiday', 'weekday', 'working',
       'weather_type', 'temp', 'feels_like', 'humidity', 'windspeed']

X_train = train[cols].values
Y_train = train['count'].values
X_test = test[cols].values

# Building the Model : Random Forest Regressor

#### Why Random Forest?
1. Usually RF algorithm does not overfit;
2. It provides excellent accuracy among current classification and regression algorithms;
3. It can be applied efficiently to large-scale datasets;
4. It handles high dimensional dataset;
5. It determines the most important features of a dataset;
6. It is easily interpretable.

#### Using a k-fold cross-validation to determine the best number of trees the random forest classifier should have.

In [11]:
parameters = {'n_estimators':np.arange(10,140,10)}

In [12]:
Model = RandomForestRegressor()                               # Create the Model
cross_validation = grid_search.GridSearchCV(Model, parameters) # Define the Cross-Validation
cross_validation.fit(X_train, Y_train)                         # Run the Cross-Validation

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [13]:
cross_validation.best_params_

{'n_estimators': 110}

In [14]:
cross_validation.best_score_

0.79848239434779433

In [39]:
model = cross_validation.best_estimator_ # Get the best model (number of trees = 110) from the Cross-Validation

#### Performance of the Model

In [40]:
model.score(X_train, Y_train)

0.99201422009424645

In [41]:
model.score(X_train[:7946], Y_train[:7946])

0.99097946805901649

## Predictions

In [42]:
Y_test = model.predict(X_test)

In [43]:
Y_test

array([  44.20909091,   21.59090909,    7.26363636, ...,  172.32727273,
        116.90909091,   55.91818182])

## Determining the attributes with the largest impact on the likelihood to rent a bike

In [44]:
model.feature_importances_

array([ 0.08204439,  0.03188874,  0.61126316,  0.00318906,  0.01271043,
        0.05608759,  0.01883886,  0.07252325,  0.07321593,  0.02718039,
        0.0110582 ])

Hence, the most impactful feature, is the Month on which the bike is rent.

## Output File for Submission

In [45]:
sub['count'] = Y_test
sub.to_csv('my_random_forest_submission.csv', header=True, index=False)