In [4]:
"""
Prepare the dataset.
The data have already been standardized separately for each subject, so no further normalization is needed.
For each subject, the first 80% of the samples are used for training and the last 20% for testing.
"""

import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

from utils_logger import logger_init
import logging

from utils_model import form_params_grid
from utils_iFEMG_feature import df_save_csv

In [5]:
# Load the "healthy" and "affected" feature tables from CSV.
# The first row of each file is used as the header and no column is used as the index.
healthy_data_df = pd.read_csv(
    r"E:\Data\paper2\积水潭患者数据集\iFEMG_curl_abs_normed_healthy.csv",
    index_col=None,
    header=0,
)
affected_data_df = pd.read_csv(
    r"E:\Data\paper2\积水潭患者数据集\iFEMG_curl_abs_normed_affected.csv",
    index_col=None,
    header=0,
)

# Report (rows, columns) of each table, healthy first.
for loaded_df in (healthy_data_df, affected_data_df):
    print(loaded_df.shape)

(305, 33)
(334, 33)


In [6]:
# Standardize the subject-level attributes (height, weight, age) with a fresh
# StandardScaler fitted on each table separately.
# The columns are overwritten in place; the original author's note asks that
# this cell not be run more than once.
subject_feature_columes = ['subject_info_height', 'subject_info_weight', 'subject_info_age']
for cohort_df in (healthy_data_df, affected_data_df):
    subject_block = cohort_df[subject_feature_columes]
    cohort_df[subject_feature_columes] = preprocessing.StandardScaler().fit_transform(subject_block)

# Display the affected table after standardization.
affected_data_df

Unnamed: 0,subject_info_subject_name,subject_info_height,subject_info_weight,subject_info_age,subject_info_gender,subject_info_label,bicps_br_initial_pressure_ave,bicps_br_FMG,bicps_br_mav,bicps_br_rms,...,tricps_br_medial_mean_power_freq,tricps_br_lateral_initial_pressure_ave,tricps_br_lateral_FMG,tricps_br_lateral_mav,tricps_br_lateral_rms,tricps_br_lateral_wave_length,tricps_br_lateral_zero_crossing,tricps_br_lateral_slope_sign_change,tricps_br_lateral_mean_freq,tricps_br_lateral_mean_power_freq
0,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.793679,-0.752914,-0.810929,...,-0.647576,-1.238539,-1.503964,-1.147427,-1.070321,-1.232530,0.376447,1.705454,0.105379,0.254580
1,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.201078,-0.370036,-0.319629,...,-0.990404,-1.238539,-1.412707,-0.549752,-0.484550,-0.723404,-0.114673,1.168864,-0.132658,-0.354457
2,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.344490,-0.843829,-1.071729,...,0.120976,-1.238539,-1.366298,-1.250154,-1.111153,-1.285875,0.881968,1.395438,0.438751,1.042885
3,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.263340,-0.944688,-1.112037,...,0.831664,-1.238539,-1.181899,-1.283414,-1.195652,-1.074565,1.741708,1.764677,0.729488,1.210865
4,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.226833,-0.730232,-0.711954,...,0.772762,-1.238539,-1.250283,-0.959925,-0.674516,-1.136803,0.198398,2.245472,-0.142526,0.420992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,1107s1,-1.309426,-0.508562,-0.571817,0,2.0,1.411665,1.325607,1.140434,0.765285,...,-0.631949,-1.163073,-0.803933,-0.554339,-0.641616,-0.444217,-0.200727,0.070101,-0.184029,0.381508
330,1107s1,-1.309426,-0.508562,-0.571817,0,2.0,1.411665,1.467121,0.695256,0.159426,...,0.580124,-1.163073,-0.250457,0.227316,-0.104751,-0.105162,-0.778112,-0.937609,-0.430678,-0.758771
331,1107s1,-1.309426,-0.508562,-0.571817,0,2.0,1.411665,1.637476,2.064690,1.034738,...,-1.690339,-1.163073,-0.144783,0.129719,-0.063850,0.057894,-0.267731,-0.662817,-0.575182,-0.777306
332,1107s1,-1.309426,-0.508562,-0.571817,0,2.0,1.411665,1.588006,1.346218,0.760663,...,-1.090736,-1.163073,-0.615497,0.187120,-0.164690,0.181178,-0.702865,-0.347841,-0.423124,-0.520345


In [12]:
# Select which table to model.
data_df = affected_data_df

# Split every (subject, label) group in row order: the leading 80% of its rows
# go to the training set and the remaining rows to the test set.
train_list = []
test_list = []

for _group_key, group in data_df.groupby(['subject_info_subject_name', 'subject_info_label']):
    # int() floors the cut point, so each group's test share is at least 20%
    # (a single-row group ends up entirely in the test set).
    split_index = int(len(group) * 0.8)

    train_list.append(group.iloc[:split_index])
    test_list.append(group.iloc[split_index:])

# Stitch the per-group pieces back together with a clean 0..n-1 index.
train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

# groupby drops rows whose key is NaN by default, which would silently remove
# samples from both splits; fail loudly if any row went missing.
assert len(train_df) + len(test_df) == len(data_df), (
    f"split lost rows: {len(train_df)} train + {len(test_df)} test != {len(data_df)} total "
    "(check for missing subject names or labels)"
)

print(train_df.shape)
print(test_df.shape)

(250, 33)
(84, 33)


In [13]:
'''
Conventional baseline: an RBF-kernel SVR whose C and gamma are picked by a
cross-validated grid search on the training rows.
The original note said no dataset split is needed for the CV search; the code
below nevertheless fits on train_df and reports metrics on both train_df and
test_df.
'''
# Model inputs: three subject attributes followed by nine features for each of
# the three sensor sites. Despite the variable name, nothing is rescaled here.
columns_to_scale = [
    'subject_info_height', 'subject_info_weight', 'subject_info_age',
    # bicps_br
    'bicps_br_initial_pressure_ave', 'bicps_br_FMG', 'bicps_br_mav',
    'bicps_br_rms', 'bicps_br_wave_length', 'bicps_br_zero_crossing',
    'bicps_br_slope_sign_change', 'bicps_br_mean_freq', 'bicps_br_mean_power_freq',
    # tricps_br_medial
    'tricps_br_medial_initial_pressure_ave', 'tricps_br_medial_FMG', 'tricps_br_medial_mav',
    'tricps_br_medial_rms', 'tricps_br_medial_wave_length', 'tricps_br_medial_zero_crossing',
    'tricps_br_medial_slope_sign_change', 'tricps_br_medial_mean_freq', 'tricps_br_medial_mean_power_freq',
    # tricps_br_lateral
    'tricps_br_lateral_initial_pressure_ave', 'tricps_br_lateral_FMG', 'tricps_br_lateral_mav',
    'tricps_br_lateral_rms', 'tricps_br_lateral_wave_length', 'tricps_br_lateral_zero_crossing',
    'tricps_br_lateral_slope_sign_change', 'tricps_br_lateral_mean_freq', 'tricps_br_lateral_mean_power_freq',
]

# Feature matrices and regression targets as plain NumPy arrays.
train_data = train_df[columns_to_scale].to_numpy()
train_label = train_df['subject_info_label'].to_numpy()
test_data = test_df[columns_to_scale].to_numpy()
test_label = test_df['subject_info_label'].to_numpy()

"""
Grid searching best parameters of SVR model
"""
# 7 x 7 logarithmic grid, 1e-3 ... 1e3, for both hyperparameters.
search_space = {
    "C": np.logspace(-3, 3, 7),
    "gamma": np.logspace(-3, 3, 7),
}

# Start the wall-clock timer for the search.
start_time = time.perf_counter()

# Exhaustive search with every available core; cv and scoring are left at the
# scikit-learn defaults.
svr = GridSearchCV(SVR(kernel='rbf'), param_grid=search_space, n_jobs=-1)
svr.fit(train_data, train_label)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

# Evaluate the refit best estimator on both splits: score() first, then the
# predictions that feed the mean squared error.
score_train = svr.score(train_data, train_label)
score_test = svr.score(test_data, test_label)
train_pre = svr.predict(train_data)
test_pre = svr.predict(test_data)

print(f"train score: {str(score_train)}")
print(f"test score: {str(score_test)}")
print(f"train mean squared error: {mean_squared_error(train_label, train_pre)}")
print(f"test mean squared error: {mean_squared_error(test_label, test_pre)}")


Best params: {'C': 1.0, 'gamma': 0.01}
training time(min): 0.033961791666661155
train score: 0.7946686250575871
test score: 0.7127138450450508
train mean squared error: 0.10849791984507075
test mean squared error: 0.14861032675532382


In [14]:
# Put each test label next to the model's prediction for it, one row per test sample.
test_err_df = pd.DataFrame(
    {
        'y_true': test_label,
        'y_predict': test_pre,
    }
)

In [15]:
# Write the true-vs-predicted table to CSV through the project helper df_save_csv
# (imported from utils_iFEMG_feature; its implementation is not visible in this notebook).
df_save_csv(test_err_df, r"E:\Data\paper2\clinical肌力预测模型\affected_8020_err.csv")

File E:\Data\paper2\clinical肌力预测模型\affected_8020_err.csv saved!
