In [2]:
import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

In [3]:
"""
准备数据集
"""
data_df1 = pd.read_csv(r"E:\Data\20230424-单人双次iFEMG肌力等级测试\normed_sEMG_fea_bicps_br.csv", index_col = 0, header = [0, 1])
data_df2 = pd.read_csv(r"E:\Data\20230310-iFEMG肌力测试\normed_sEMG_fea_bicps_br.csv", index_col = 0, header = [0, 1])

data_df = pd.concat([data_df1, data_df2], axis = 0, ignore_index = True)
print(data_df.shape)
data_df.head()

(590, 45)


Unnamed: 0_level_0,agonist_ch1,agonist_ch1,agonist_ch1,agonist_ch1,agonist_ch1,agonist_ch1,agonist_ch1,agonist_ch1,antagonist_ch1,antagonist_ch1,antagonist_ch1,antagonist_ch1,antagonist_ch1,antagonist_ch1,antagonist_ch2,antagonist_ch2,antagonist_ch2,antagonist_ch2,antagonist_ch2,antagonist_ch2,antagonist_ch2
Unnamed: 0_level_1,subject_name,height(cm),weight(kg),gender,age,sensor_channel,label(kg),FMG_increase,subject_name,height(cm),...,slope_sign_change,mean_freq,mean_power_freq,mav,rms,wave_length,zero_crossing,slope_sign_change,mean_freq,mean_power_freq
0,Li Peiyang,182,82,1,21,bicps_br,0.0,0.562248,Li Peiyang,182,...,0.088028,0.0,0.0,0.605877,0.447505,0.522791,0.043523,0.07388,0.01716,0.0
1,Li Peiyang,182,82,1,21,bicps_br,0.0,0.662764,Li Peiyang,182,...,0.161384,0.393232,0.41787,0.372326,0.303616,0.435363,0.135639,0.070732,0.062069,0.036479
2,Li Peiyang,182,82,1,21,bicps_br,0.0,0.723141,Li Peiyang,182,...,0.125369,0.771976,0.56579,0.142527,0.07763,0.460236,0.415688,0.485084,0.215211,0.319404
3,Li Peiyang,182,82,1,21,bicps_br,0.0,0.452452,Li Peiyang,182,...,0.253225,0.68727,0.593396,0.300342,0.250268,0.346974,0.173791,0.31274,0.078377,0.101901
4,Li Peiyang,182,82,1,21,bicps_br,0.0,0.576496,Li Peiyang,182,...,1.0,1.0,0.820658,0.0,0.001957,0.0,1.0,1.0,0.158286,0.205744


In [4]:
"""
数据清洗 remove the data bellow:
    1. NaN value
    2. label is MVC
"""
# replace 'MVC' with NaN
data_df = data_df.replace('MVC', np.nan)
# delete NaN value
data_df = data_df.dropna(how = 'any')

print(data_df.shape)
data_df.dtypes

(590, 45)


agonist_ch1     subject_name          object
                height(cm)             int64
                weight(kg)             int64
                gender                 int64
                age                    int64
                sensor_channel        object
                label(kg)            float64
                FMG_increase         float64
antagonist_ch1  subject_name          object
                height(cm)             int64
                weight(kg)             int64
                gender                 int64
                age                    int64
                sensor_channel        object
                label(kg)            float64
                FMG_increase         float64
antagonist_ch2  subject_name          object
                height(cm)             int64
                weight(kg)             int64
                gender                 int64
                age                    int64
                sensor_channel        object
          

In [5]:
"""划分训练集和测试集"""

x1_data = data_df["agonist_ch1"][['mav', 'rms', 'wave_length', 'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']].values
x2_data = data_df["antagonist_ch1"][['mav', 'rms', 'wave_length', 'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']].values
x3_data = data_df["antagonist_ch2"][['mav', 'rms', 'wave_length', 'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']].values

y_data = data_df["agonist_ch1"]['label(kg)'].values
x_data = np.concatenate((x1_data, x2_data, x3_data), axis=1)
print("data shape: ", x_data.shape)

train_data_r, test_data_r, train_label, test_label = train_test_split(x_data, y_data, test_size = 0.2, random_state = 0)
# train_data = preprocessing.normalize(train_data_r, norm = 'max')
# test_data = preprocessing.normalize(test_data_r, norm = 'max')
scaler = preprocessing.StandardScaler().fit(train_data_r)
train_data = scaler.transform(train_data_r)
test_data = scaler.transform(test_data_r)

print("train data shape: ", train_data.shape)
print("train label length: ", train_label.shape)
print("test data shape: ", test_data.shape)
print("test label length: ", test_label.shape)

data shape:  (590, 21)
train data shape:  (472, 21)
train label length:  (472,)
test data shape:  (118, 21)
test label length:  (118,)


In [6]:
"""
Grid searching best parameters of SVR model
"""
# 记录开始训练时间
start_time = time.perf_counter()

# 自动选择合适的参数
svr = GridSearchCV(SVR(), param_grid={"kernel": ("linear", "rbf"), "C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
svr.fit(train_data, train_label)

print(svr.best_params_)

end_time = time.perf_counter()
print("training time(min): ", (end_time - start_time)/60)

score_test = svr.score(test_data, test_label)
score_train = svr.score(train_data, train_label)
print(str(svr) + "train score: " + str(score_train))
print(str(svr) + "test score: " + str(score_test))
test_pre = svr.predict(test_data)
train_pre = svr.predict(train_data)
print("train mean squared error: ", mean_squared_error(train_label, train_pre))
print("test mean squared error: ", mean_squared_error(test_label, test_pre))

{'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'}
training time(min):  1.7840596149999328
GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ('linear', 'rbf')})train score: 0.9925562065779138
GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ('linear', 'rbf')})test score: 0.5148547913192211
train mean squared error:  0.009081804534984852
test mean squared error:  0.5904388614122723


In [19]:
"""
训练模型
获得特征贡献值
"""

svr_model = SVR(kernel="rbf", C=10, gamma=0.1).fit(train_data, train_label)


feature_contributions = svr_model.coef_
feature_names = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]

# 输出每个特征对模型的贡献度
for feature, contribution in zip(feature_names, feature_contributions):
    print(f"{feature}: {contribution}")

1: [-0.03211321 -0.05668682  0.08075054  0.12628053 -0.14945059 -0.05914749
 -0.07416854  0.04178432 -0.01661576 -0.01154517 -0.0384632  -0.07434973
 -0.03029566 -0.11521784  0.04832179 -0.02540578  0.04117705  0.0254164
  0.01950836 -0.06474162  0.03170682]


In [22]:
"""
estimator:训练模型
K:返回的特征数
data:特征数组
label:数据标签
filter_data:返回特征数组
"""
# 用rbf核进行筛选是不可以的，SVR中不提供rbf特征选择的逻辑，对于支持向量回归可以用linear进行筛选
svc = SVR(kernel='linear')
filter_data = RFE(estimator=svc, n_features_to_select=20, step=1).fit(x_data, y_data)