In [3]:
"""
准备数据集
每个人单独标准化的数据，不需要再进行normalization
clinical数据，使用健侧训练模型，使用患侧测试
"""

import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

from utils_logger import logger_init
import logging

from utils_model import form_params_grid
from utils_iFEMG_feature import df_save_csv

In [8]:
# Load the feature table. The file name indicates the normalised iFEMG "curl" data set.
dataset_csv = r"E:\Data\paper2\积水潭患者数据集\iFEMG_curl_abs_normed_all.csv"
# Alternative input (file name indicates the "extension" data set):
# dataset_csv = r"E:\Data\paper2\iFEMG\iFEMG_extension_abs_normed_all.csv"
data_df = pd.read_csv(dataset_csv, index_col=None, header=0)

print(data_df.shape)
data_df.head()

(645, 34)


Unnamed: 0,subject_info_subject_name,subject_info_height,subject_info_weight,subject_info_age,subject_info_gender,subject_info_label,subject_info_side,bicps_br_initial_pressure_ave,bicps_br_FMG,bicps_br_mav,...,tricps_br_medial_mean_power_freq,tricps_br_lateral_initial_pressure_ave,tricps_br_lateral_FMG,tricps_br_lateral_mav,tricps_br_lateral_rms,tricps_br_lateral_wave_length,tricps_br_lateral_zero_crossing,tricps_br_lateral_slope_sign_change,tricps_br_lateral_mean_freq,tricps_br_lateral_mean_power_freq
0,w8s1,163,77.5,73,0,0.0,healthy,0.264332,-0.834709,1.33379,...,-0.920293,0.58045,0.376447,-0.302707,-0.257013,-0.174564,0.597931,0.935593,-0.404117,-0.021018
1,w8s1,163,77.5,73,0,0.0,healthy,0.264332,-1.060881,0.589367,...,0.967061,0.58045,0.401577,-0.906481,-0.666368,-0.88807,0.558823,1.10832,0.510007,0.712348
2,w8s1,163,77.5,73,0,0.0,healthy,0.264332,-0.87337,0.60765,...,1.132278,0.58045,0.442973,-0.768296,-0.565227,-0.874164,0.188775,0.862873,-0.153251,0.382974
3,w8s1,163,77.5,73,0,0.0,healthy,0.264332,-0.960192,0.330866,...,1.015542,0.58045,0.456464,-1.054028,-0.77076,-1.156951,0.427841,1.028323,-0.259272,0.145059
4,w8s1,163,77.5,73,0,0.0,healthy,0.264332,-0.961252,0.330415,...,1.150688,0.58045,0.484242,-1.120775,-0.831344,-1.210279,0.629919,1.226824,0.031978,0.488862


In [9]:
"""
数据清洗 remove the rows below:
    1. NaN value
    2. label is MVC
"""
# Turn every 'MVC' entry into NaN first, so that a single dropna() call removes
# both the rows that held 'MVC' and the rows that already had a missing value.
data_df = data_df.replace('MVC', np.nan).dropna(how='any')

print(data_df.shape)
data_df.dtypes

(645, 34)


subject_info_subject_name                  object
subject_info_height                         int64
subject_info_weight                       float64
subject_info_age                            int64
subject_info_gender                         int64
subject_info_label                        float64
subject_info_side                          object
bicps_br_initial_pressure_ave             float64
bicps_br_FMG                              float64
bicps_br_mav                              float64
bicps_br_rms                              float64
bicps_br_wave_length                      float64
bicps_br_zero_crossing                    float64
bicps_br_slope_sign_change                float64
bicps_br_mean_freq                        float64
bicps_br_mean_power_freq                  float64
tricps_br_medial_initial_pressure_ave     float64
tricps_br_medial_FMG                      float64
tricps_br_medial_mav                      float64
tricps_br_medial_rms                      float64


In [10]:
# Standardise the subject-level columns (height, weight, age) with a StandardScaler
# fitted on all rows of data_df.
# Author's warning: do not run this cell more than once.
subject_feature_columes = ['subject_info_height', 'subject_info_weight', 'subject_info_age']
subject_scaler = preprocessing.StandardScaler()
data_df[subject_feature_columes] = subject_scaler.fit_transform(data_df[subject_feature_columes])
data_df

Unnamed: 0,subject_info_subject_name,subject_info_height,subject_info_weight,subject_info_age,subject_info_gender,subject_info_label,subject_info_side,bicps_br_initial_pressure_ave,bicps_br_FMG,bicps_br_mav,...,tricps_br_medial_mean_power_freq,tricps_br_lateral_initial_pressure_ave,tricps_br_lateral_FMG,tricps_br_lateral_mav,tricps_br_lateral_rms,tricps_br_lateral_wave_length,tricps_br_lateral_zero_crossing,tricps_br_lateral_slope_sign_change,tricps_br_lateral_mean_freq,tricps_br_lateral_mean_power_freq
0,w8s1,0.032657,1.164034,0.857301,0,0.0,healthy,0.264332,-0.834709,1.333790,...,-0.920293,0.580450,0.376447,-0.302707,-0.257013,-0.174564,0.597931,0.935593,-0.404117,-0.021018
1,w8s1,0.032657,1.164034,0.857301,0,0.0,healthy,0.264332,-1.060881,0.589367,...,0.967061,0.580450,0.401577,-0.906481,-0.666368,-0.888070,0.558823,1.108320,0.510007,0.712348
2,w8s1,0.032657,1.164034,0.857301,0,0.0,healthy,0.264332,-0.873370,0.607650,...,1.132278,0.580450,0.442973,-0.768296,-0.565227,-0.874164,0.188775,0.862873,-0.153251,0.382974
3,w8s1,0.032657,1.164034,0.857301,0,0.0,healthy,0.264332,-0.960192,0.330866,...,1.015542,0.580450,0.456464,-1.054028,-0.770760,-1.156951,0.427841,1.028323,-0.259272,0.145059
4,w8s1,0.032657,1.164034,0.857301,0,0.0,healthy,0.264332,-0.961252,0.330415,...,1.150688,0.580450,0.484242,-1.120775,-0.831344,-1.210279,0.629919,1.226824,0.031978,0.488862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,1107s1,-1.447506,-0.593344,-0.565233,0,2.0,affected,2.071787,1.432518,1.158777,...,0.963333,-0.160464,-0.101044,0.033244,-0.086020,0.248078,-1.023727,-0.976712,-1.027272,-0.925162
641,1107s1,-1.447506,-0.593344,-0.565233,0,2.0,affected,2.071787,1.482564,0.907694,...,1.081050,-0.160464,0.388158,0.336959,0.141355,0.343284,-1.128817,-1.212494,-1.083809,-1.122362
642,1107s1,-1.447506,-0.593344,-0.565233,0,2.0,affected,2.071787,1.542811,1.680064,...,0.860542,-0.160464,0.481561,0.299037,0.158678,0.389070,-1.035922,-1.148199,-1.116932,-1.125568
643,1107s1,-1.447506,-0.593344,-0.565233,0,2.0,affected,2.071787,1.525316,1.274841,...,0.918776,-0.160464,0.065509,0.321341,0.115970,0.423688,-1.115121,-1.074502,-1.082077,-1.081129


In [19]:
'''
Conventional baseline.
Randomly split the data into a training and a test set, then grid-search the SVR
parameters with cross-validation.
Used to look at the test accuracy of the affected and the healthy subsets separately.

Author's note: each subset on its own reaches a fairly high accuracy.
'''
# Model inputs: three subject-level columns followed by the same nine features
# for each of the three muscle sites (30 columns in total).
columns_to_scale = ['subject_info_height', 'subject_info_weight', 'subject_info_age'] + [
    f"{muscle}_{feature}"
    for muscle in ('bicps_br', 'tricps_br_medial', 'tricps_br_lateral')
    for feature in ('initial_pressure_ave', 'FMG', 'mav', 'rms', 'wave_length',
                    'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq')
]

# Only the affected-side rows are used in this cell.
affected_df = data_df[data_df['subject_info_side'] == 'affected']
x_data = affected_df.loc[:, columns_to_scale].values
y_data = affected_df.loc[:, 'subject_info_label'].values

print(x_data.shape)
print(y_data.shape)

# Hold out 20 % of the rows for testing, stratified on the label values.
train_data, test_data, train_label, test_label = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=3,
    stratify=y_data,
)

"""
Grid searching best parameters of SVR model
"""
# Start of the timed training section.
start_time = time.perf_counter()

# Search C and gamma over 1e-3 ... 1e3 (seven log-spaced values each).
search_grid = {"C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}
svr = GridSearchCV(SVR(kernel='rbf'), param_grid=search_grid, n_jobs=-1)
svr.fit(train_data, train_label)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

# Score of the fitted search object on both splits.
score_test = svr.score(test_data, test_label)
score_train = svr.score(train_data, train_label)
print(f"train score: {str(score_train)}")
print(f"test score: {str(score_test)}")

# Predictions, kept as globals, and the resulting mean squared errors.
test_pre = svr.predict(test_data)
train_pre = svr.predict(train_data)
print(f"train mean squared error: {mean_squared_error(train_label, train_pre)}")
print(f"test mean squared error: {mean_squared_error(test_label, test_pre)}")


(334, 30)
(334,)
Best params: {'C': 100.0, 'gamma': 0.01}
training time(min): 0.0059914350000326525
train score: 0.9852233924916307
test score: 0.8999385389299535
train mean squared error: 0.007803791553852559
test mean squared error: 0.05156876591346686


In [11]:
'''
Train on the healthy-side data.
Test on the affected-side data.
'''
# Model inputs: three subject-level columns followed by the same nine features
# for each of the three muscle sites (30 columns in total).
columns_to_scale = ['subject_info_height', 'subject_info_weight', 'subject_info_age'] + [
    f"{muscle}_{feature}"
    for muscle in ('bicps_br', 'tricps_br_medial', 'tricps_br_lateral')
    for feature in ('initial_pressure_ave', 'FMG', 'mav', 'rms', 'wave_length',
                    'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq')
]

# Split the rows by limb side: healthy rows train the model, affected rows test it.
healthy_df = data_df[data_df['subject_info_side'] == 'healthy']
affected_df = data_df[data_df['subject_info_side'] == 'affected']

x_train = healthy_df.loc[:, columns_to_scale].values
y_train = healthy_df.loc[:, 'subject_info_label'].values
x_test = affected_df.loc[:, columns_to_scale].values
y_test = affected_df.loc[:, 'subject_info_label'].values

for split_array in (x_train, y_train, x_test, y_test):
    print(split_array.shape)

"""
Grid searching best parameters of SVR model
"""
# Start of the timed training section.
start_time = time.perf_counter()

# Search C and gamma over 1e-3 ... 1e3 (seven log-spaced values each).
search_grid = {"C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}
svr = GridSearchCV(SVR(kernel='rbf'), param_grid=search_grid, n_jobs=-1)
svr.fit(x_train, y_train)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

# Score of the fitted search object on both sides.
score_test = svr.score(x_test, y_test)
score_train = svr.score(x_train, y_train)
print(f"train score: {str(score_train)}")
print(f"test score: {str(score_test)}")

# Predictions, kept as globals, and the resulting mean squared errors.
test_pre = svr.predict(x_test)
train_pre = svr.predict(x_train)
print(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
print(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")


(311, 30)
(311,)
(334, 30)
(334,)
Best params: {'C': 1.0, 'gamma': 0.01}
training time(min): 0.035260970000066054
train score: 0.8213324200520464
test score: -0.07225422343869026
train mean squared error: 0.09519141030477317
test mean squared error: 0.5635904339962302


In [13]:
'''
Leave-one-subject-out evaluation:
* each subject in turn is the test set and all remaining subjects form the training
  set; the SVR parameters are chosen by cross-validated grid search
* the subject-level features (height, weight, age) are part of the model input
* the two sessions recorded for one subject are merged into one subject, n=10
'''
# Merge the two sessions of a subject by removing the "-1" / "-2" marker from the name.
# The frame loaded at the top of this notebook names the column
# 'subject_info_subject_name', whereas this cell was written for 'subject_info_name';
# read from whichever exists and always write the merged name to 'subject_info_name'.
name_source_col = ('subject_info_name' if 'subject_info_name' in data_df.columns
                   else 'subject_info_subject_name')
# regex=False keeps the literal-substring behaviour independent of the pandas version.
data_df['subject_info_name'] = (data_df[name_source_col]
                                .str.replace('-1', '', regex=False)
                                .str.replace('-2', '', regex=False))
present_subjects = set(data_df['subject_info_name'])
print(data_df.shape)
print(present_subjects)

# Preferred evaluation order. Names that are absent from the data are skipped, because
# an empty test set makes scikit-learn raise; if none of them is present (a different
# data set is loaded) every subject found in the data is evaluated instead.
preferred_subjects = ['wcx', 'lpy', 'hpy', 'zjz', 'zk', 'lmt', 'zx', 'lmh', 'zpk', 'pym']
subject_name_list = [name for name in preferred_subjects if name in present_subjects]
missing_subjects = [name for name in preferred_subjects if name not in present_subjects]
if not subject_name_list:
    subject_name_list = sorted(present_subjects)
elif missing_subjects:
    logging.warning(f"subjects not found in the data and skipped: {missing_subjects}")

# Model inputs: three subject-level columns followed by the same nine features
# for each of the three muscle sites (30 columns in total).
columns_to_scale = ['subject_info_height', 'subject_info_weight', 'subject_info_age'] + [
    f"{muscle}_{feature}"
    for muscle in ('bicps_br', 'tricps_br_medial', 'tricps_br_lateral')
    for feature in ('initial_pressure_ave', 'FMG', 'mav', 'rms', 'wave_length',
                    'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq')
]

# Per-subject results, one entry per held-out subject.
test_name_list = []
train_score_list = []
test_score_list = []
train_mse_list = []
test_mse_list = []
test_err_df_list = []   # one DataFrame per test subject: true label, predicted label, difference

# NOTE(review): logger_init is imported but not called in the visible cells. With an
# unconfigured root logger the INFO records below are dropped - confirm that logging
# is initialised before this cell runs.
for subject in subject_name_list:
    logging.info(f"=======================================================")
    logging.info(f"test subject: {subject}")
    test_df = data_df[data_df["subject_info_name"] == subject]  # held-out subject
    train_df = data_df[data_df["subject_info_name"] != subject] # everyone else
    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.loc[:, 'subject_info_label'].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.loc[:, 'subject_info_label'].values
    logging.info(f"x_test: {x_test.shape}")
    logging.info(f"y_test: {y_test.shape}")
    logging.info(f"x_train: {x_train.shape}")
    logging.info(f"y_train: {y_train.shape}")

    """
    Grid searching best parameters of SVR model
    """
    # Start of the timed training section.
    start_time = time.perf_counter()

    # Search C and gamma over 1e-3 ... 1e3 (seven log-spaced values each).
    svr = GridSearchCV(SVR(kernel='rbf'), param_grid={"C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
    svr.fit(x_train, y_train)
    logging.info(f"Best params: {svr.best_params_}")

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    score_test = svr.score(x_test, y_test)
    score_train = svr.score(x_train, y_train)
    logging.info(f"train score: {str(score_train)}")
    logging.info(f"test score: {str(score_test)}")
    test_pre = svr.predict(x_test)
    train_pre = svr.predict(x_train)
    # Compute each error once and reuse it for both the log and the result lists.
    train_mse = mean_squared_error(y_train, train_pre)
    test_mse = mean_squared_error(y_test, test_pre)
    logging.info(f"train mean squared error: {train_mse}")
    logging.info(f"test mean squared error: {test_mse}")
    temp_predict_label_df = pd.DataFrame({'y_true': y_test,
                                          'y_predicted': test_pre,
                                          'diff': y_test - test_pre})
    temp_predict_label_df['test_subject'] = subject
    logging.info(f"predicted label: \n{temp_predict_label_df}")

    test_name_list.append(subject)
    train_score_list.append(score_train)
    test_score_list.append(score_test)
    train_mse_list.append(train_mse)
    test_mse_list.append(test_mse)
    test_err_df_list.append(temp_predict_label_df)

(363, 33)
{'lmh', 'lpy', 'pym', 'hpy', 'wcx', 'zx', 'zk', 'zpk', 'lmt', 'zjz'}


In [14]:
# Shared snippet: gather the per-subject results into DataFrames ready to be saved.
# One row per held-out subject with its train/test score and mean squared error.
performance_columns = {
    'test_subject': test_name_list,
    'train_score': train_score_list,
    'test_score': test_score_list,
    'train_mse': train_mse_list,
    'test_mse': test_mse_list,
}
model_performance_df = pd.DataFrame(performance_columns)

# Stack the per-subject prediction tables into one frame with a fresh 0..n-1 index.
test_err_df = pd.concat(test_err_df_list, axis=0, ignore_index=True)

In [144]:
# Save the per-subject model performance table through the project helper df_save_csv.
# NOTE(review): the output file name looks like a throwaway placeholder - consider a descriptive name.
df_save_csv(model_performance_df, r"E:\Data\paper2\肌力预测模型结果\fuck.csv")

File E:\Data\paper2\肌力预测模型结果\fuck.csv saved!


In [15]:
# Save the true / predicted labels of every held-out subject through the project helper df_save_csv.
# NOTE(review): the file name says "extension" while the CSV loaded at the top of the
# notebook is the "curl" data set - confirm the intended output name.
df_save_csv(test_err_df, r"E:\Data\paper2\肌力预测模型结果\test_err_extension.csv")

File E:\Data\paper2\肌力预测模型结果\test_err_extension.csv saved!


In [16]:
logging.shutdown()  # close the active log handlers so the log file is released and can be deleted afterwards