In [1]:
# coding=utf8

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# Widen pandas' text display so wide frames are not wrapped across lines.
pd.options.display.width = 256

In [2]:
# Read the dataset from the working directory and preview the first rows.
csv_path = "abalone.csv"
data = pd.read_csv(csv_path)
data.head(5)

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
# Encode the categorical Sex column as integers:
#   'M' -> 1, 'F' -> -1, anything else (e.g. 'I') -> 0.
SEX_CODES = {'M': 1, 'F': -1}


def encode_sex(value):
    """Return the integer code for a single Sex value.

    Raw labels are translated through SEX_CODES; any label that is not
    listed there maps to 0. Values that are already one of the codes
    (1, -1, 0) are returned unchanged, so re-running this cell on the
    already-encoded column is a no-op instead of collapsing every row
    to 0.
    """
    if value in (1, -1, 0):
        # Already encoded by a previous run of this cell.
        return value
    return SEX_CODES.get(value, 0)


# The column is overwritten in place because later cells read `data` directly.
data['Sex'] = data['Sex'].map(encode_sex)
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,-1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# Rings is the value to predict; every remaining column is a predictor.
target = data['Rings'].values
features = data.drop('Rings', axis=1)

# Plain ndarray view of the predictors; shown as the cell output.
X = features.values
X

array([[ 1.    ,  0.455 ,  0.365 , ...,  0.2245,  0.101 ,  0.15  ],
       [ 1.    ,  0.35  ,  0.265 , ...,  0.0995,  0.0485,  0.07  ],
       [-1.    ,  0.53  ,  0.42  , ...,  0.2565,  0.1415,  0.21  ],
       ..., 
       [ 1.    ,  0.6   ,  0.475 , ...,  0.5255,  0.2875,  0.308 ],
       [-1.    ,  0.625 ,  0.485 , ...,  0.531 ,  0.261 ,  0.296 ],
       [ 1.    ,  0.71  ,  0.555 , ...,  0.9455,  0.3765,  0.495 ]])

In [5]:
# Row and column counts of the feature matrix; n_samples is reused when
# building the cross-validation splitter.
X_shape = X.shape
n_samples, n_features = X_shape
print(n_samples, n_features)

(4177, 8)


In [6]:
# Shuffled 5-fold splitter over all samples, seeded for repeatable folds.
# NOTE(review): this is the constructor signature of the KFold imported from
# sklearn.cross_validation at the top of the notebook; newer scikit-learn
# releases appear to expose KFold elsewhere with different arguments --
# confirm the installed version before upgrading.
kf = KFold(n_samples, random_state=1, shuffle=True, n_folds=5)

In [9]:
%%time
from sklearn.metrics import r2_score, make_scorer

for i in range(1, 51, 1):
    rfr = RandomForestRegressor(n_estimators=i, random_state=1)
    scores = cross_val_score(rfr, features, target, scoring='r2', cv=kf, n_jobs=2)
    print("Accuracy (random forest): %0.3f (+/- %0.3f) by n_estimators %d" % (scores.mean(), scores.std() * 2, i))


Accuracy (random forest): 0.120 (+/- 0.224) by n_estimators 1
Accuracy (random forest): 0.334 (+/- 0.081) by n_estimators 2
Accuracy (random forest): 0.399 (+/- 0.052) by n_estimators 3
Accuracy (random forest): 0.441 (+/- 0.050) by n_estimators 4
Accuracy (random forest): 0.463 (+/- 0.044) by n_estimators 5
Accuracy (random forest): 0.470 (+/- 0.038) by n_estimators 6
Accuracy (random forest): 0.475 (+/- 0.038) by n_estimators 7
Accuracy (random forest): 0.481 (+/- 0.043) by n_estimators 8
Accuracy (random forest): 0.486 (+/- 0.043) by n_estimators 9
Accuracy (random forest): 0.492 (+/- 0.049) by n_estimators 10
Accuracy (random forest): 0.492 (+/- 0.045) by n_estimators 11
Accuracy (random forest): 0.496 (+/- 0.039) by n_estimators 12
Accuracy (random forest): 0.501 (+/- 0.039) by n_estimators 13
Accuracy (random forest): 0.505 (+/- 0.037) by n_estimators 14
Accuracy (random forest): 0.509 (+/- 0.038) by n_estimators 15
Accuracy (random forest): 0.511 (+/- 0.037) by n_estimators 16
A