In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('../00_Data/train.tsv', sep='\t')

In [3]:
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,M,0.53,0.435,0.155,0.699,0.288,0.1595,0.205,10
1,3,F,0.71,0.565,0.195,1.817,0.785,0.492,0.49,11
2,7,F,0.625,0.52,0.18,1.354,0.4845,0.351,0.375,11
3,12,M,0.375,0.28,0.095,0.2225,0.0875,0.043,0.08,10
4,16,I,0.415,0.33,0.09,0.3595,0.17,0.081,0.09,6


In [4]:
train.describe()

Unnamed: 0,id,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0
mean,2118.050766,0.521873,0.406557,0.139413,0.819116,0.35541,0.178227,0.237196,9.930077
std,1189.026152,0.121148,0.100285,0.04488,0.484888,0.218666,0.108274,0.139393,3.283578
min,0.0,0.11,0.09,0.0,0.008,0.0025,0.002,0.003,2.0
25%,1119.0,0.45,0.35,0.115,0.438875,0.184375,0.092,0.13,8.0
50%,2142.5,0.54,0.42,0.14,0.79125,0.334,0.16875,0.23025,9.0
75%,3128.25,0.61,0.48,0.165,1.140125,0.4975,0.2475,0.325,11.0
max,4175.0,0.78,0.625,1.13,2.555,1.2455,0.5745,1.005,29.0


## 前処理

In [5]:
del train['id']

In [7]:
train = pd.get_dummies(train)

In [8]:
train.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.53,0.435,0.155,0.699,0.288,0.1595,0.205,10,0,0,1
1,0.71,0.565,0.195,1.817,0.785,0.492,0.49,11,1,0,0
2,0.625,0.52,0.18,1.354,0.4845,0.351,0.375,11,1,0,0
3,0.375,0.28,0.095,0.2225,0.0875,0.043,0.08,10,0,0,1
4,0.415,0.33,0.09,0.3595,0.17,0.081,0.09,6,0,1,0


In [12]:
x_train = train.drop('Rings', axis=1)

In [13]:
x_train.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_F,Sex_I,Sex_M
0,0.53,0.435,0.155,0.699,0.288,0.1595,0.205,0,0,1
1,0.71,0.565,0.195,1.817,0.785,0.492,0.49,1,0,0
2,0.625,0.52,0.18,1.354,0.4845,0.351,0.375,1,0,0
3,0.375,0.28,0.095,0.2225,0.0875,0.043,0.08,0,0,1
4,0.415,0.33,0.09,0.3595,0.17,0.081,0.09,0,1,0


In [17]:
y_train = train.iloc[:, 7:8]

In [18]:
y_train.head()

Unnamed: 0,Rings
0,10
1,11
2,11
3,10
4,6


## 予測

In [19]:
from sklearn.svm import SVR

In [20]:
model = SVR()

In [21]:
from sklearn.grid_search import GridSearchCV

In [22]:
tuned_param = [
                {
                    'kernel': ['rbf'], 
                    'gamma' : [1e-3, 1e-4],
                     'C'    : [1, 10, 100, 1000]
                },
                {
                    'kernel': ['linear'], 
                    'C'     : [1, 10, 100, 1000]
                }
]

In [23]:
gs = GridSearchCV(
                    model,
                    tuned_param,
                    # k分割数                    
                    cv = 5, 
                    scoring = 'mean_squared_error'
)

In [25]:
# 警告を非表示
import warnings
warnings.filterwarnings('ignore')
gs.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_squared_error',
       verbose=0)

In [26]:
print(gs.best_score_)

-5.450841360626785


In [27]:
print(gs.best_params_)

{'C': 10, 'kernel': 'linear'}


In [28]:
# 最良スコアの出るモデルで構築
gs.best_estimator_.fit(x_train, y_train)

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [29]:
test = pd.read_csv('../00_Data/test.tsv', sep='\t')

In [30]:
test.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,1,F,0.71,0.57,0.195,1.9805,0.9925,0.4925,0.48
1,2,F,0.485,0.395,0.16,0.66,0.2475,0.128,0.235
2,4,M,0.7,0.58,0.205,2.13,0.7415,0.49,0.58
3,5,F,0.67,0.525,0.19,1.527,0.5755,0.353,0.44
4,6,I,0.435,0.3,0.12,0.5965,0.259,0.139,0.1645


In [31]:
del test['id']

In [33]:
test = pd.get_dummies(test)

In [34]:
test.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_F,Sex_I,Sex_M
0,0.71,0.57,0.195,1.9805,0.9925,0.4925,0.48,1,0,0
1,0.485,0.395,0.16,0.66,0.2475,0.128,0.235,1,0,0
2,0.7,0.58,0.205,2.13,0.7415,0.49,0.58,0,0,1
3,0.67,0.525,0.19,1.527,0.5755,0.353,0.44,1,0,0
4,0.435,0.3,0.12,0.5965,0.259,0.139,0.1645,0,1,0


In [35]:
pred = gs.best_estimator_.predict(test)

In [36]:
pred

array([ 9.21015229, 10.83859756, 14.72687239, ..., 12.55420941,
       11.5834457 , 11.40448831])

In [37]:
pred = np.round(pred).astype(int)

In [38]:
pred

array([ 9, 11, 15, ..., 13, 12, 11])

In [42]:
sample = pd.read_csv('../00_Data/sample_submit.csv', header=None)

In [43]:
sample.head()

Unnamed: 0,0,1
0,1,9
1,2,17
2,4,17
3,5,17
4,6,14


In [44]:
sample[1] = pred

In [45]:
sample.head()

Unnamed: 0,0,1
0,1,9
1,2,11
2,4,15
3,5,13
4,6,7


In [46]:
sample.to_csv('../80_submit/submit.csv', header=False, index=False)