# 加载数据集

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

In [2]:
boston=load_boston()

In [3]:
X=boston.data
y=boston.target

In [4]:
X[:5]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00],
       [3.2370e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        6.9980e+00, 4.5800e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9463e+02, 2.9400e+00],
       [6.9050e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        7.1470e+00, 5.4200e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9690e+02, 5.3300e+00]])

# 分割数据

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [6]:
X_train.shape,y_train.shape

((404, 13), (404,))

# 标准化处理

In [7]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [8]:
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

# 线性核
## 模型训练

In [9]:
from sklearn.svm import LinearSVR

In [10]:
linsvm_reg=LinearSVR()
linsvm_reg.fit(X_train_scaled,y_train)

LinearSVR()

In [11]:
from sklearn.metrics import mean_squared_error
y_pred=linsvm_reg.predict(X_test_scaled)
mean_squared_error(y_test,y_pred)

30.027872087668495

我们再来看看再训练集上的均方误差

In [12]:
y_pred=linsvm_reg.predict(X_train_scaled)
mean_squared_error(y_train,y_pred)

25.176052394013336

很严重的欠拟合。用网格搜索交叉验证来寻找一些合适的超参数来调整模型。
## 交叉验证

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
param_grid={'epsilon':[2.91,2.92,2.93,2.94,2.95,2.96,2.97,2.98]}
linsvm_reg_grid_search=GridSearchCV(linsvm_reg,param_grid,cv=3,verbose=2,n_jobs=-1)

In [15]:
linsvm_reg_grid_search.fit(X_train_scaled,y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


GridSearchCV(cv=3, estimator=LinearSVR(), n_jobs=-1,
             param_grid={'epsilon': [2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97,
                                     2.98]},
             verbose=2)

In [16]:
linsvm_reg_grid_search.best_estimator_

LinearSVR(epsilon=2.91)

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform
param_distributions = {"epsilon": reciprocal(2.9,2.92), "C": uniform(1, 10)}

In [18]:
linsvm_reg_rnd_search=RandomizedSearchCV(LinearSVR(),param_distributions,n_jobs=30,cv=3,verbose=2)
linsvm_reg_rnd_search.fit(X_train_scaled,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=LinearSVR(), n_jobs=30,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002CBC5799A08>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002CBC57998C8>},
                   verbose=2)

In [19]:
linsvm_reg_rnd_search.best_estimator_

LinearSVR(C=4.627035457192266, epsilon=2.914177343394307)

In [20]:
linsvm_reg_rnd_search.best_estimator_.fit(X_train_scaled,y_train)

LinearSVR(C=4.627035457192266, epsilon=2.914177343394307)

## 最优模型

In [21]:
linsvm_reg_rnd_search.best_estimator_.fit(X_train_scaled,y_train)

LinearSVR(C=4.627035457192266, epsilon=2.914177343394307)

In [22]:
y_prd=linsvm_reg_rnd_search.best_estimator_.predict(X_test_scaled)

In [23]:
mean_squared_error(y_test,y_prd)

27.614661077604573

In [24]:
y_pred=linsvm_reg_rnd_search.best_estimator_.predict(X_train_scaled)

In [25]:
mean_squared_error(y_train,y_pred)

23.000565414732307

相较之前以及有所下降

# 高斯核
## 模型训练

In [26]:
from sklearn.svm import SVR

In [27]:
svm_reg=SVR(kernel='rbf')
svm_reg.fit(X_train_scaled,y_train)

SVR()

In [28]:
y_pred=svm_reg.predict(X_test_scaled)

In [29]:
mean_squared_error(y_test,y_pred)

25.66853967839608

## 交叉验证

In [30]:
param_grid={
    'C':[99,98,100],
    'epsilon':[0.24,0.25,0.3,0.35]
}
svm_reg_grid_search=GridSearchCV(svm_reg,param_grid,cv=3,verbose=2,n_jobs=-1)

In [31]:
svm_reg_grid_search.fit(X_train_scaled,y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


GridSearchCV(cv=3, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [99, 98, 100],
                         'epsilon': [0.24, 0.25, 0.3, 0.35]},
             verbose=2)

In [32]:
svm_reg_grid_search.best_estimator_

SVR(C=100, epsilon=0.25)

In [33]:
param_distributions={
    "epsilon": reciprocal(2.4,2.6), "C": uniform(100,1000)
}
svm_reg_rnd_search=RandomizedSearchCV(SVR(kernel='rbf'),param_distributions,cv=3,verbose=2,n_jobs=-1)

In [34]:
svm_reg_rnd_search.fit(X_train_scaled,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=SVR(), n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002CBC57C16C8>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002CBC57C1D48>},
                   verbose=2)

In [35]:
svm_reg_rnd_search.best_estimator_

SVR(C=202.96827791113992, epsilon=2.5215614893199567)

In [36]:
svm_reg_rnd_search.best_estimator_.fit(X_train_scaled,y_train)

SVR(C=202.96827791113992, epsilon=2.5215614893199567)

In [37]:
y_pred=svm_reg_rnd_search.best_estimator_.predict(X_train_scaled)
mean_squared_error(y_pred,y_train)

3.577070744460457

In [38]:
y_pred=svm_reg_rnd_search.best_estimator_.predict(X_test_scaled)
mean_squared_error(y_test,y_pred)

13.025037176622204