## 数据

In [1]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [2]:
prostate = pd.read_csv("../../../datasets/prostate/prostate.data", 
                       sep="\t", index_col=0)
prostate.head()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa,train
1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783,T
2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519,T
3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519,T
4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519,T
5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564,T


In [3]:
train_data = prostate.drop("train", axis=1)[prostate["train"] == "T"]
test_data = prostate.drop("train", axis=1)[prostate["train"] == "F"]

scaler = StandardScaler()
scaler.fit(train_data)
train_data_std = scaler.fit_transform(train_data)
test_data_std = scaler.transform(test_data)
X_train = train_data_std[:, :-1]
y_train = train_data_std[:, -1]
X_test = test_data_std[:, :-1]
y_test = test_data_std[:, -1]

## 正交匹配追踪

In [4]:
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.metrics import mean_squared_error
import numpy as np

In [5]:
omp = OrthogonalMatchingPursuit()
omp.fit(X_train, y_train)

OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=None,
                          normalize=True, precompute='auto', tol=None)

In [6]:
y_train_pred = omp.predict(X_train)
train_err = mean_squared_error(y_train, y_train_pred)
print("训练样本误差: {:.3f}".format(train_err)) # 均方差

训练样本误差: 0.462


In [7]:
y_test_pred = omp.predict(X_test)
test_err = mean_squared_error(y_test, y_test_pred)
print("测试样本误差: {:.3f}".format(test_err))

测试样本误差: 0.334


In [8]:
omp.coef_

array([0.73315515, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

### 使用最优参数模型

In [22]:
from sklearn.linear_model import OrthogonalMatchingPursuitCV

In [36]:
omp_cv = OrthogonalMatchingPursuitCV(cv=10)
omp_cv.fit(X_train, y_train)

OrthogonalMatchingPursuitCV(copy=True, cv=10, fit_intercept=True, max_iter=None,
                            n_jobs=None, normalize=True, verbose=False)

In [37]:
y_train_pred = omp_cv.predict(X_train)
train_err = mean_squared_error(y_train, y_train_pred)
print("训练样本误差: {:.3f}".format(train_err)) # 均方差

训练样本误差: 0.385


In [38]:
y_test_pred = omp_cv.predict(X_test)
test_err = mean_squared_error(y_test, y_test_pred)
print("测试样本误差: {:.3f}".format(test_err))

测试样本误差: 0.343


In [44]:
print("有{}个非零系数".format(omp_cv.n_nonzero_coefs_))

有2个非零系数


In [42]:
pd.options.display.float_format = '{:.3f}'.format
variables = ["Intercept"] + list(train_data.iloc[:, :-1].columns)
pd.DataFrame({"系数": variables, 
              "值": np.r_[train_data.iloc[:, -1].mean(), 
                                   omp_cv.coef_]})

Unnamed: 0,系数,值
0,Intercept,2.452
1,lcavol,0.646
2,lweight,0.291
3,age,0.0
4,lbph,0.0
5,svi,0.0
6,lcp,0.0
7,gleason,0.0
8,pgg45,0.0
