In [16]:
import numpy as np
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn import grid_search
from sklearn.externals import joblib
import csv
import matplotlib.pyplot as plt
% matplotlib inline

# Discovering the training and testing data set.

In [11]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')
print train.columns.values
print test.columns.values

['date' 'year' 'month' 'hour' 'holiday' 'weekday' 'working' 'weather_type'
 'temp' 'feels_like' 'humidity' 'windspeed' 'count']
['id' 'date' 'year' 'month' 'hour' 'holiday' 'weekday' 'working'
 'weather_type' 'temp' 'feels_like' 'humidity' 'windspeed']


In [3]:
train.head()

Unnamed: 0,date,year,month,hour,holiday,weekday,working,weather_type,temp,feels_like,humidity,windspeed,count
0,1/1/11,0,1,0,0,6,0,1,9.02,14.6829,0.81,0.0,16
1,1/1/11,0,1,1,0,6,0,1,9.02,13.9077,0.8,0.0,40
2,1/1/11,0,1,2,0,6,0,1,9.02,13.9077,0.8,0.0,32
3,1/1/11,0,1,3,0,6,0,1,9.84,14.6829,0.75,0.0,13
4,1/1/11,0,1,4,0,6,0,1,9.84,14.6829,0.75,0.0,1


In [7]:
train = train.dropna(subset=['date']) # Dates are already separated into year, month, hour, ... no need to keep them.
test= test.dropna(subset=['date'])

In [8]:
print train.shape
print train.dtypes

(15893, 13)
date             object
year              int64
month             int64
hour              int64
holiday           int64
weekday           int64
working           int64
weather_type      int64
temp            float64
feels_like      float64
humidity        float64
windspeed       float64
count             int64
dtype: object


In [9]:
train.isnull().sum()

date            0
year            0
month           0
hour            0
holiday         0
weekday         0
working         0
weather_type    0
temp            0
feels_like      0
humidity        0
windspeed       0
count           0
dtype: int64

# Creating the final training and testing data sets
Transforming Pandas' Dataframe into numpy.ndarray usable by scikit-learn's implementation.

In [12]:
features = ['year', 'month', 'hour', 'holiday' ,'weekday' ,'working', 'weather_type',
            'temp', 'feels_like', 'humidity', 'windspeed']

X_train = train[features].values
Y_train = train['count'].values
X_test = test[features].values

In [13]:
print X_train.shape # Checking if labels and data have the same number of rows.
print Y_train.shape

(15893L, 11L)
(15893L,)


# Building the Model : Random Forest Regressor
###### Why Random Forest?
1. Usually RF algorithm does not overfit;
2. It provides excellent accuracy among current classification and regression algorithms;
3. It can be applied efficiently to large-scale datasets;
4. It handles high dimensional dataset;
5. It determines the most important features of a dataset;
6. It is easily interpretable.

Using a grid search cross-validation to determine the best number of trees the random forest classifier should have.

In [14]:
parameters = {'n_estimators':np.arange(100,130,5)}

In [17]:
classifier = RandomForestRegressor()                               # Create the Model
cross_validation = grid_search.GridSearchCV(classifier, parameters) # Define the Cross-Validation
cross_validation.fit(X_train, Y_train)                         # Run the Cross-Validation

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([100, 105, 110, 115, 120, 125])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [18]:
print cross_validation.best_params_
print cross_validation.best_score_

{'n_estimators': 120}
0.798468722348


In [20]:
Model = cross_validation.best_estimator_ # Get the best model (number of trees = 120) from the Cross-Validation

# Performance of the model

In [22]:
Model.score(X_train, Y_train)

0.99194974976614458

In [24]:
Model.score(X_train[10000:11790], Y_train[10000:11790])

0.99143382306550543

# Predictions

In [25]:
Y_test = Model.predict(X_test)

# Output File for Submission

In [30]:
t = open('Predicitve_count.csv', 'w')
open_file_object = csv.writer(t)

open_file_object.writerow(['id','count'])

for i in range(X_test.shape[0]):
    open_file_object.writerow([i+1,Y_test[i]])

t.close()

# Determining the attributes with the largest impact on the likelihood to rent a bike

In [21]:
Model.feature_importances_

array([ 0.08232441,  0.03270805,  0.6135834 ,  0.00315526,  0.01251957,
        0.05395327,  0.01877249,  0.07140749,  0.07306305,  0.0275501 ,
        0.01096289])

Hence, the most impactful feature, is the hour on which the bike is rent.