In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

In [2]:
# Load the abalone dataset from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='abalone.csv')

In [3]:
# Peek at the first five raw rows (Sex is still a letter code here).
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [5]:
# Encode the categorical Sex column numerically in a single pass:
# 'M' -> 1, 'F' -> -1, 'I' -> 0. Values outside the mapping are left untouched.
sex_codes = {'M': 1, 'F': -1, 'I': 0}
df.Sex = df.Sex.replace(sex_codes)

In [6]:
# Re-inspect the first five rows to confirm Sex now holds the numeric codes.
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,-1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [19]:
# Rings is the regression target; every remaining column is used as a feature.
y = df['Rings']
x = df.drop(labels=['Rings'], axis=1)

In [20]:
# Candidate forest sizes for the grid search: 1 through 50 trees inclusive.
grid_t = dict(n_estimators=np.arange(1, 51))

In [21]:
# 5-fold cross-validation over all samples; shuffling with a fixed seed keeps
# the fold assignment reproducible between runs.
cv = KFold(n=len(y), n_folds=5, shuffle=True, random_state=1)

In [22]:
# Base estimator for the search; the fixed seed makes the forests reproducible.
# n_estimators is left at its default here and is overridden by the parameter grid.
forest = RandomForestRegressor(random_state=1)

In [23]:
# Grid search over the number of trees, scored by R^2 on the shared CV folds.
gs = GridSearchCV(estimator=forest, param_grid=grid_t, cv=cv, scoring='r2')

In [24]:
# Display the configured search object (it has not been fitted at this point).
gs

GridSearchCV(cv=sklearn.cross_validation.KFold(n=4177, n_folds=5, shuffle=True, random_state=1),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=1,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [25]:
# Run the search: features are passed as a plain NumPy array, the target as the Series.
gs.fit(X=x.values, y=y)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=4177, n_folds=5, shuffle=True, random_state=1),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=1,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [26]:
# Report the mean cross-validated R^2 for every candidate n_estimators value.
# The parenthesised single-string form of print produces the same text under
# Python 2 as the original `print a, b` statement (both use str() on each item)
# and is also valid syntax on Python 3.
for grid_score in gs.grid_scores_:
    print("%s %s" % (grid_score.mean_validation_score, grid_score.parameters))

0.102163125849 {'n_estimators': 1}
0.338412959313 {'n_estimators': 2}
0.403584837629 {'n_estimators': 3}
0.442722234326 {'n_estimators': 4}
0.464020401671 {'n_estimators': 5}
0.470580138647 {'n_estimators': 6}
0.47582658808 {'n_estimators': 7}
0.481738243326 {'n_estimators': 8}
0.488342750783 {'n_estimators': 9}
0.494458152768 {'n_estimators': 10}
0.493391074362 {'n_estimators': 11}
0.497961126662 {'n_estimators': 12}
0.50213210699 {'n_estimators': 13}
0.506424775497 {'n_estimators': 14}
0.508328085298 {'n_estimators': 15}
0.510509452008 {'n_estimators': 16}
0.513845502985 {'n_estimators': 17}
0.516324176384 {'n_estimators': 18}
0.519031046713 {'n_estimators': 19}
0.518670729546 {'n_estimators': 20}
0.51983247513 {'n_estimators': 21}
0.520155890139 {'n_estimators': 22}
0.521015079498 {'n_estimators': 23}
0.522401328334 {'n_estimators': 24}
0.522615326396 {'n_estimators': 25}
0.523804761492 {'n_estimators': 26}
0.524120059055 {'n_estimators': 27}
0.525050806268 {'n_estimators': 28}
0.52