In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from scipy.stats import ttest_ind
#from sklearn.cross_validation import train_test_split

In [2]:
with open('datapath.txt') as f:
    datapath=f.readlines()[0].rstrip()

If we want to save memory, we can import the values as unsigned 8-bit integers, which range from 0 to 255 - exactly the range we want. This only uses about 1/8 of the memory of the default 64-bit float, right off the bat.

At least on my computer, this helps a lot. I can't really manage to import the whole thing, and do training on it without this.

If we wanted to save more memory, we could convert to a "black and white" version, and replace the integers with 1-bit booleans.

Also, for testing, we can just import some of the rows.

In [4]:
train=pd.read_csv(datapath+'/train.csv', nrows=None, dtype=np.uint8)

In [5]:
predictors=train.columns.drop('label')

Set up our grid search:

In [6]:
rfc=RandomForestClassifier()

In [7]:

rfc.fit(train[predictors],train['label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
parameters={}

In [9]:
parameters['n_estimators']=np.linspace(5,30,num=4,dtype=np.int)
parameters['max_features']=np.linspace(0.1,1.0,num=4)
parameters['min_samples_split']=np.linspace(2,10,num=4,dtype=np.int)
parameters['min_weight_fraction_leaf']=np.linspace(0.0,0.4,num=3)

In [10]:
parameters

{'max_features': array([ 0.1,  0.4,  0.7,  1. ]),
 'min_samples_split': array([ 2,  4,  7, 10]),
 'min_weight_fraction_leaf': array([ 0. ,  0.2,  0.4]),
 'n_estimators': array([ 5, 13, 21, 30])}

In [11]:
modelrfc=GridSearchCV(rfc,parameters)

The `%%time` "cell magic" command will output the amount of time it takes to run a cell.

This only times it once, so it's not necessarily representative of what it would take in the future. If you have some code you're testing that you'll loop over repeatedly, you should use the `%%timeit` (*time it*eratively) command, which will run it a bunch of times and give you statistics.

In [None]:
%%time
modelrfc.fit(train[predictors],train['label'])

In [None]:
modelrfc.best_params_

We have hit our highest value for the number of estimators. This gets at a problem we may have: more estimators may be better(generally the only risk in increasing it is overfitting), but may take more time to train. Let's try a second round of searching, looking only at the number of estimators.

In [42]:
def param_dict(best):
    #outputs a parameter dictionary that can be taken in by GridSearchCV
    iterator=zip(modelrfc.best_params_.keys(),modelrfc.best_params_.values())
    return {i[0]:[i[1]] for i in iterator}


In [43]:
newparams=param_dict(modelrfc.best_params_)
newparams['n_estimators']=np.linspace(30, 200, num=4, dtype=int)

In [18]:
modelrfc=GridSearchCV(rfc,newparams)

In [19]:
%%time
modelrfc.fit(train[predictors],train['label'])

CPU times: user 9.95 s, sys: 8.04 ms, total: 9.96 s
Wall time: 9.95 s


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [0.10000000000000001], 'min_samples_split': [4], 'min_weight_fraction_leaf': [0.0], 'n_estimators': array([ 30,  86, 143, 200])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [20]:
modelrfc.best_params_

{'max_features': 0.10000000000000001,
 'min_samples_split': 4,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 86}

In [55]:
%%time
rfcRMSE=cross_val_score(modelrfc.best_estimator_, train[predictors],train['label'],cv=10)

CPU times: user 8.19 s, sys: 7.99 ms, total: 8.2 s
Wall time: 8.2 s


In [39]:
rfcRMSE.mean()

0.87463743810952921

#### Gradient Boosting

In [26]:
gbc=GradientBoostingClassifier()

In [31]:
parameters['n_estimators']=[5,10]

In [32]:
modelgbc=GridSearchCV(gbc,parameters)

In [33]:
%%time
modelgbc.fit(train[predictors],train['label'])

CPU times: user 7min 17s, sys: 624 ms, total: 7min 18s
Wall time: 7min 18s


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10], 'max_features': array([ 0.1,  0.4,  0.7,  1. ]), 'min_samples_split': array([ 2,  4,  7, 10]), 'min_weight_fraction_leaf': array([ 0. ,  0.2,  0.4])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [36]:
modelgbc.best_params_

{'max_features': 0.10000000000000001,
 'min_samples_split': 7,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10}

In [58]:
newparamsgbc=param_dict(modelgbc.best_params_)
newparamsgbc['n_estimators']=np.linspace(55,100,num=3,dtype=int)
newparamsgbc['max_features']=[0.05]

In [59]:
modelgbc=GridSearchCV(gbc,newparamsgbc)

In [60]:
%%time
modelgbc.fit(train[predictors],train['label'])

CPU times: user 2min 52s, sys: 12 ms, total: 2min 52s
Wall time: 2min 52s


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [0.05], 'min_samples_split': [4], 'min_weight_fraction_leaf': [0.0], 'n_estimators': array([ 55,  77, 100])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [52]:
modelgbc.best_params_

{'max_features': 0.05,
 'min_samples_split': 4,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 55}

In [54]:
%%time
gbcRMSE=cross_val_score(modelgbc.best_estimator_, train[predictors],train['label'],cv=10)

CPU times: user 2min 30s, sys: 20 ms, total: 2min 30s
Wall time: 2min 30s


In [57]:
print(gbcRMSE.mean()-rfcRMSE.mean())
print(ttest_ind(gbcRMSE,rfcRMSE))

-0.0161461812225
Ttest_indResult(statistic=-1.4139795264335777, pvalue=0.17443118485828529)


Random forest seems to do better, but the difference is not statistically significant.