# Comparing SVR with linear and RBF kernel on Abalone data

In [None]:
import pandas as pd
import numpy as np

In [54]:
# Load the abalone data. The file is read with header=None (no header row),
# so the nine column names are assigned explicitly afterwards.
data2=pd.read_csv('abalone.data.txt',header=None)
# 'Diameter' was previously misspelled as 'Diamter'; later cells select the
# feature columns by position (iloc), so the corrected label is safe.
data2.columns = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight'
                    ,'Viscera weight','Shell weight','Rings']

Encode the 'Sex' variable as integers: replace M, F, and I with 0, 1, and 2, respectively.

In [66]:
def transform(s):
    """Encode an abalone 'Sex' label as an integer code.

    'M' -> 0, 'F' -> 1, 'I' -> 2.  A value that is already one of those
    integer codes is returned unchanged, so re-running the encoding cell on
    an already-encoded column no longer fails with a KeyError.

    Args:
        s: a sex label ('M', 'F' or 'I') or an existing code (0, 1 or 2).

    Returns:
        The integer code for the label, or ``s`` itself if it is already a code.

    Raises:
        KeyError: if ``s`` is neither a known label nor a known code.
    """
    d={'M':0,'F':1,'I':2}
    if s in d:
        return d[s]
    # Already encoded (e.g. the cell was executed twice): pass through.
    if s in d.values():
        return s
    # Keep the original failure mode for anything unrecognised.
    raise KeyError(s)
# Overwrite the 'Sex' column in place with the integer code that transform()
# returns for each value.
data2['Sex']=data2['Sex'].map(transform)

Choose 'Rings' as the dependent variable and add 1.5 to get 'Age'.

In [68]:
# Target: the ring count. `.values` already yields a NumPy array, so no
# extra np.array() conversion is needed.
y = data2['Rings'].values
# Features: the first eight columns, i.e. every column except 'Rings'.
X = data2.iloc[:, 0:8].values
# Age = rings + 1.5, computed as one vectorised NumPy addition instead of a
# Python-level loop; y1 is now a float ndarray rather than a list.
y1 = y + 1.5

Train linear SVR on the dataset

In [72]:
from sklearn.svm import LinearSVR
# Linear support vector regressor with default hyperparameters; the seed is
# fixed (random_state=42) for reproducibility.
lin_svr = LinearSVR(random_state=42)
# NOTE(review): X is passed in unscaled; SVMs are usually sensitive to
# feature scale, so standardising first may be worth trying.
# fit() is the last expression, so the fitted estimator's repr is the cell output.
lin_svr.fit(X, y1)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=42, tol=0.0001, verbose=0)

In [78]:
from sklearn.metrics import mean_squared_error

# Predictions are made on X, the same array lin_svr was fitted on, so the
# score below is a training-set error, not a held-out estimate.
y_pred = lin_svr.predict(X)
mse = mean_squared_error(y1, y_pred)
# Square root of the MSE gives the RMSE, shown as the cell output.
np.sqrt(mse)

2.3059905329001786

Train SVR with RBF Kernel. Use randomized search with cross validation to find the appropriate hyperparameter values for C and gamma.

In [74]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# Search space sampled by the randomized search:
#   gamma ~ reciprocal(0.001, 0.1): log-uniform between 0.001 and 0.1
#   C     ~ uniform(1, 10): scipy's uniform takes (loc, scale), so this is
#           uniform on [1, 11], not [1, 10]
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
# 10 sampled candidates for a default SVR(); cv and scoring are left at their
# defaults; verbose=2 prints one line per fit; random_state fixes the sampling.
rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42)
# The search is cross-validated on the full X / y1 (no separate test set is held out).
rnd_search_cv.fit(X, y1)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=4.74540118847, gamma=0.0796945481864 ..........................
[CV] ........... C=4.74540118847, gamma=0.0796945481864, total=   0.4s
[CV] C=4.74540118847, gamma=0.0796945481864 ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ........... C=4.74540118847, gamma=0.0796945481864, total=   0.4s
[CV] C=4.74540118847, gamma=0.0796945481864 ..........................
[CV] ........... C=4.74540118847, gamma=0.0796945481864, total=   0.4s
[CV] C=8.31993941811, gamma=0.0157513204998 ..........................
[CV] ........... C=8.31993941811, gamma=0.0157513204998, total=   0.4s
[CV] C=8.31993941811, gamma=0.0157513204998 ..........................
[CV] ........... C=8.31993941811, gamma=0.0157513204998, total=   0.4s
[CV] C=8.31993941811, gamma=0.0157513204998 ..........................
[CV] ........... C=8.31993941811, gamma=0.0157513204998, total=   0.4s
[CV] C=2.56018640442, gamma=0.00205111041884 .........................
[CV] .......... C=2.56018640442, gamma=0.00205111041884, total=   0.4s
[CV] C=2.56018640442, gamma=0.00205111041884 .........................
[CV] .......... C=2.56018640442, gamma=0.00205111041884, total=   0.4s
[CV] C=2.56018640442, gamma=0.00205111041884 .........................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   17.7s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10ec82588>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10ec827b8>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [75]:
# Display the best SVR found by the randomized search (its repr, including
# the selected C and gamma, is the cell output).
rnd_search_cv.best_estimator_

SVR(C=4.7454011884736254, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.079694548186439285, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [77]:
# Same RMSE computation as for the linear model, using the best estimator from
# the search. X is the data the search itself was fitted on, so this is again
# a training-set error.
y_pred = rnd_search_cv.best_estimator_.predict(X)
mse = mean_squared_error(y1, y_pred)
# Square root of the MSE gives the RMSE, shown as the cell output.
np.sqrt(mse)

2.2679419683229716

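The RBF-kernel SVR's RMSE (≈2.27) is slightly better than the linear SVR's (≈2.31). Note that both values are computed on the training data, so they do not measure how well either model generalizes.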