In [13]:
"""
准备数据集
每个人单独标准化的数据，不需要再进行normalization
clinical数据，使用健侧训练模型，使用患侧测试
"""

import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

from utils_logger import logger_init
import logging

from utils_model import form_params_grid
from utils_iFEMG_feature import df_save_csv

In [10]:
# 读数据
healthy_data_df = pd.read_csv(r"E:\Data\paper2\积水潭患者数据集\iFEMG_curl_abs_normed_healthy.csv", index_col = None, header = 0)
affected_data_df = pd.read_csv(r"E:\Data\paper2\积水潭患者数据集\iFEMG_curl_abs_normed_affected.csv", index_col = None, header = 0)

print(healthy_data_df.shape)
print(affected_data_df.shape)


(305, 33)
(334, 33)


In [3]:
# 对数据中的个人信息进行标准化
# 注意请勿多次运行
subject_feature_columes = ['subject_info_height', 'subject_info_weight', 'subject_info_age']
healthy_data_df[subject_feature_columes] = preprocessing.StandardScaler().fit_transform(healthy_data_df[subject_feature_columes])
affected_data_df[subject_feature_columes] = preprocessing.StandardScaler().fit_transform(affected_data_df[subject_feature_columes])
affected_data_df

Unnamed: 0,subject_info_subject_name,subject_info_height,subject_info_weight,subject_info_age,subject_info_gender,subject_info_label,bicps_br_initial_pressure_ave,bicps_br_FMG,bicps_br_mav,bicps_br_rms,...,tricps_br_medial_mean_power_freq,tricps_br_lateral_initial_pressure_ave,tricps_br_lateral_FMG,tricps_br_lateral_mav,tricps_br_lateral_rms,tricps_br_lateral_wave_length,tricps_br_lateral_zero_crossing,tricps_br_lateral_slope_sign_change,tricps_br_lateral_mean_freq,tricps_br_lateral_mean_power_freq
0,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.793679,-0.752914,-0.810929,...,-0.647576,-1.238539,-1.503964,-1.147427,-1.070321,-1.232530,0.376447,1.705454,0.105379,0.254580
1,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.201078,-0.370036,-0.319629,...,-0.990404,-1.238539,-1.412707,-0.549752,-0.484550,-0.723404,-0.114673,1.168864,-0.132658,-0.354457
2,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.344490,-0.843829,-1.071729,...,0.120976,-1.238539,-1.366298,-1.250154,-1.111153,-1.285875,0.881968,1.395438,0.438751,1.042885
3,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.263340,-0.944688,-1.112037,...,0.831664,-1.238539,-1.181899,-1.283414,-1.195652,-1.074565,1.741708,1.764677,0.729488,1.210865
4,w8s1,0.093715,1.200781,0.875053,0,0.0,-1.262618,-1.226833,-0.730232,-0.711954,...,0.772762,-1.238539,-1.250283,-0.959925,-0.674516,-1.136803,0.198398,2.245472,-0.142526,0.420992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,1107s1,-1.309426,-0.508562,-0.571817,0,2.0,1.411665,1.325607,1.140434,0.765285,...,-0.631949,-1.163073,-0.803933,-0.554339,-0.641616,-0.444217,-0.200727,0.070101,-0.184029,0.381508
330,1107s1,-1.309426,-0.508562,-0.571817,0,2.0,1.411665,1.467121,0.695256,0.159426,...,0.580124,-1.163073,-0.250457,0.227316,-0.104751,-0.105162,-0.778112,-0.937609,-0.430678,-0.758771
331,1107s1,-1.309426,-0.508562,-0.571817,0,2.0,1.411665,1.637476,2.064690,1.034738,...,-1.690339,-1.163073,-0.144783,0.129719,-0.063850,0.057894,-0.267731,-0.662817,-0.575182,-0.777306
332,1107s1,-1.309426,-0.508562,-0.571817,0,2.0,1.411665,1.588006,1.346218,0.760663,...,-1.090736,-1.163073,-0.615497,0.187120,-0.164690,0.181178,-0.702865,-0.347841,-0.423124,-0.520345


In [4]:
'''
最传统做法
随机划分训练集和测试集，进行cv参数搜索
查看affected和healthy两个数据集各自的test准确率
'''
# 更改数据集切换健侧模型和患侧模型
data_df = healthy_data_df

columns_to_scale = ['subject_info_height',
                    'subject_info_weight',
                    'subject_info_age',
                    'bicps_br_initial_pressure_ave',
                    'bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_initial_pressure_ave',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_medial_zero_crossing',
                    'tricps_br_medial_slope_sign_change',
                    'tricps_br_medial_mean_freq',
                    'tricps_br_medial_mean_power_freq',
                    'tricps_br_lateral_initial_pressure_ave',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length',
                    'tricps_br_lateral_zero_crossing',
                    'tricps_br_lateral_slope_sign_change',
                    'tricps_br_lateral_mean_freq',
                    'tricps_br_lateral_mean_power_freq']

y_data = data_df.loc[:, 'subject_info_label'].values
x_data = data_df.loc[:, columns_to_scale].values

print(x_data.shape)
print(y_data.shape)

# 划分训练集和测试集
train_data, test_data, train_label, test_label = train_test_split(x_data, y_data, test_size = 0.2, random_state = 5, stratify = y_data)

"""
Grid searching best parameters of SVR model
"""
# 记录开始训练时间
start_time = time.perf_counter()

# 自动选择合适的参数
svr = GridSearchCV(SVR(kernel='rbf'), param_grid={"C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
svr.fit(train_data, train_label)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

score_test = svr.score(test_data, test_label)
score_train = svr.score(train_data, train_label)
print(f"train score: {str(score_train)}")
print(f"test score: {str(score_test)}")
test_pre = svr.predict(test_data)
train_pre = svr.predict(train_data)
print(f"train mean squared error: {mean_squared_error(train_label, train_pre)}")
print(f"test mean squared error: {mean_squared_error(test_label, test_pre)}")


(305, 30)
(305,)
Best params: {'C': 100.0, 'gamma': 0.01}
training time(min): 0.04326419666666652
train score: 0.9860816028607048
test score: 0.8912900755789626
train mean squared error: 0.007222202412393271
test mean squared error: 0.055275242409191835


In [19]:
'''
使用健侧全部数据作为训练集
使用患侧全部数据作为测试集
'''
columns_to_scale = ['subject_info_height',
                    'subject_info_weight',
                    'subject_info_age',
                    'bicps_br_initial_pressure_ave',
                    'bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_initial_pressure_ave',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_medial_zero_crossing',
                    'tricps_br_medial_slope_sign_change',
                    'tricps_br_medial_mean_freq',
                    'tricps_br_medial_mean_power_freq',
                    'tricps_br_lateral_initial_pressure_ave',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length',
                    'tricps_br_lateral_zero_crossing',
                    'tricps_br_lateral_slope_sign_change',
                    'tricps_br_lateral_mean_freq',
                    'tricps_br_lateral_mean_power_freq']

x_train = healthy_data_df.loc[:, columns_to_scale].values
y_train = healthy_data_df.loc[:, 'subject_info_label'].values
x_test = affected_data_df.loc[:, columns_to_scale].values
y_test = affected_data_df.loc[:, 'subject_info_label'].values

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

"""
Grid searching best parameters of SVR model
"""
# 记录开始训练时间
start_time = time.perf_counter()

# 自动选择合适的参数
svr = GridSearchCV(SVR(kernel='rbf'), param_grid={"C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
svr.fit(x_train, y_train)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

score_test = svr.score(x_test, y_test)
score_train = svr.score(x_train, y_train)
print(f"train score: {str(score_train)}")
print(f"test score: {str(score_test)}")
test_pre = svr.predict(x_test)
train_pre = svr.predict(x_train)
print(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
print(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")


(305, 30)
(305,)
(334, 30)
(334,)
Best params: {'C': 1.0, 'gamma': 0.01}
training time(min): 0.00746091000000888
train score: 0.8616215260118028
test score: 0.45532599989077716
train mean squared error: 0.07151872573653904
test mean squared error: 0.28628757005364397


In [7]:
'''
使用健侧全部数据作为训练集
每次选一名被试的患侧作为测试集
'''
# 特征名称
columns_to_scale = ['subject_info_height',
                    'subject_info_weight',
                    'subject_info_age',
                    'bicps_br_initial_pressure_ave',
                    'bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_initial_pressure_ave',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_medial_zero_crossing',
                    'tricps_br_medial_slope_sign_change',
                    'tricps_br_medial_mean_freq',
                    'tricps_br_medial_mean_power_freq',
                    'tricps_br_lateral_initial_pressure_ave',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length',
                    'tricps_br_lateral_zero_crossing',
                    'tricps_br_lateral_slope_sign_change',
                    'tricps_br_lateral_mean_freq',
                    'tricps_br_lateral_mean_power_freq']
# 患侧数据被试名称
subject_name_list = ['w8s1', '0912s1', '0912s2', '0919s2', '0926s1', '0926s2', '1010s1', '1017s1', '1017s2', '1024s1', '1107s1']

x_train = healthy_data_df.loc[:, columns_to_scale].values
y_train = healthy_data_df.loc[:, 'subject_info_label'].values

print(x_train.shape)
print(y_train.shape)

"""
Grid searching best parameters of SVR model
"""
# 记录开始训练时间
start_time = time.perf_counter()

# cv参数搜索
svr = GridSearchCV(SVR(kernel='rbf'), param_grid={"C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
svr.fit(x_train, y_train)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

score_train = svr.score(x_train, y_train)
print(f"train score: {str(score_train)}")

train_pre = svr.predict(x_train)
print(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")

# 用于存储每个被试作为test时的模型score
test_name_list = []
test_score_list = []
test_mse_list = []
test_err_df_list = []   # 用于存储多个df，每个df是当天test subject的true label 和 predicted label数据

# 使用模型循环输出患侧结果
for subject in subject_name_list:
    x_test = affected_data_df[affected_data_df['subject_info_subject_name'] == subject].loc[:, columns_to_scale].values
    y_test = affected_data_df[affected_data_df['subject_info_subject_name'] == subject].loc[:, 'subject_info_label'].values

    # 计算预测值和真实值误差
    test_pre = svr.predict(x_test)
    temp_predict_label_df = pd.DataFrame({'y_true': y_test,
                                          'y_predicted': test_pre,
                                          'diff': y_test - test_pre})
    temp_predict_label_df['test_subject'] = subject
    # 结果保存到list
    test_name_list.append(subject)
    test_score_list.append(svr.score(x_test, y_test))
    test_mse_list.append(mean_squared_error(y_test, test_pre))
    test_err_df_list.append(temp_predict_label_df)
    pass



(305, 30)
(305,)
Best params: {'C': 1.0, 'gamma': 0.01}
training time(min): 0.03459331499999886
train score: 0.8616215260118028
train mean squared error: 0.07151872573653904


In [8]:
# 通用代码模块，用于合并表征模型结果的df并保存
model_performance_df = pd.DataFrame({'test_subject': test_name_list,
                                     'test_score': test_score_list,
                                     'test_mse': test_mse_list})
test_err_df = pd.concat(test_err_df_list, axis=0, ignore_index=True)
df_save_csv(model_performance_df, r"E:\Data\paper2\clinical肌力预测模型\healthy_train_affected_test_score_mse.csv")
df_save_csv(test_err_df, r"E:\Data\paper2\clinical肌力预测模型\healthy_train_affected_test_err.csv")

File E:\Data\paper2\肌力预测模型结果\healthy_train_affected_test_score_mse.csv saved!
File E:\Data\paper2\肌力预测模型结果\healthy_train_affected_test_err.csv saved!


In [20]:
'''
* 每次取一个受试者作为测试集，其余人的数据作为训练集，使用cv参数搜索
* 加入受试者个人特征
* 分别测试健侧和患侧
'''
data_df = healthy_data_df

# subject_name_list = ['w8s1', '0912s1', '0912s2', '0919s2', '0926s1', '0926s2', '1010s1', '1017s1', '1017s2', '1024s1', '1107s1']    # affected
subject_name_list = ['w8s1', '0912s1', '0912s2', '0919s2', '0926s1', '1010s1', '1017s1', '1017s2', '1024s1', '1107s1']    # healthy

# 选取数据集名称
columns_to_scale = ['subject_info_height',
                    'subject_info_weight',
                    'subject_info_age',
                    'bicps_br_initial_pressure_ave',
                    'bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_initial_pressure_ave',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_medial_zero_crossing',
                    'tricps_br_medial_slope_sign_change',
                    'tricps_br_medial_mean_freq',
                    'tricps_br_medial_mean_power_freq',
                    'tricps_br_lateral_initial_pressure_ave',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length',
                    'tricps_br_lateral_zero_crossing',
                    'tricps_br_lateral_slope_sign_change',
                    'tricps_br_lateral_mean_freq',
                    'tricps_br_lateral_mean_power_freq']

# 用于存储每个被试作为test时的模型score
test_name_list = []
train_score_list = []
test_score_list = []
train_mse_list = []
test_mse_list = []
test_err_df_list = []   # 用于存储多个df，每个df是test subject的true label 和 predicted label数据

for subject in subject_name_list:
    logging.info(f"=======================================================")
    logging.info(f"test subject: {subject}")
    test_df = data_df[data_df["subject_info_subject_name"] == subject]  # 测试集
    train_df = data_df[data_df["subject_info_subject_name"] != subject] # 训练集
    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.loc[:, 'subject_info_label'].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.loc[:, 'subject_info_label'].values
    logging.info(f"x_test: {x_test.shape}")
    logging.info(f"y_test: {y_test.shape}")
    logging.info(f"x_train: {x_train.shape}")
    logging.info(f"y_train: {y_train.shape}")

    """
    Grid searching best parameters of SVR model
    """
    # 记录开始训练时间
    start_time = time.perf_counter()

    # 自动选择合适的参数
    svr = GridSearchCV(SVR(kernel='rbf'), param_grid={"C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
    svr.fit(x_train, y_train)
    logging.info(f"Best params: {svr.best_params_}")

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    score_test = svr.score(x_test, y_test)
    score_train = svr.score(x_train, y_train)
    logging.info(f"train score: {str(score_train)}")
    logging.info(f"test score: {str(score_test)}")
    test_pre = svr.predict(x_test)
    train_pre = svr.predict(x_train)
    logging.info(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
    logging.info(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")
    temp_predict_label_df = pd.DataFrame({'y_true': y_test,
                                          'y_predicted': test_pre,
                                          'diff': y_test - test_pre})
    temp_predict_label_df['test_subject'] = subject
    logging.info(f"predicted label: \n{temp_predict_label_df}")

    test_name_list.append(subject)
    train_score_list.append(score_train)
    test_score_list.append(score_test)
    train_mse_list.append(mean_squared_error(y_train, train_pre))
    test_mse_list.append(mean_squared_error(y_test, test_pre))
    test_err_df_list.append(temp_predict_label_df)
    pass

In [21]:
# 通用代码模块，用于合并表征模型结果的df并保存
model_performance_df = pd.DataFrame({'test_subject': test_name_list,
                                     'train_score': train_score_list,
                                     'test_score': test_score_list,
                                     'train_mse': train_mse_list,
                                     'test_mse': test_mse_list})
test_err_df = pd.concat(test_err_df_list, axis=0, ignore_index=True)

In [22]:
df_save_csv(model_performance_df, r"E:\Data\paper2\clinical肌力预测模型\healthy_newtest_score_mse.csv")

File E:\Data\paper2\clinical肌力预测模型\healthy_newtest_score_mse.csv saved!


In [23]:
df_save_csv(test_err_df, r"E:\Data\paper2\clinical肌力预测模型\healthy_newtest_err.csv")

File E:\Data\paper2\clinical肌力预测模型\healthy_newtest_err.csv saved!


In [16]:
logging.shutdown()  # 关闭当前的日志处理器，解除文件占用、关闭后日志文件可删除