In [8]:
import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

from utils_logger import logger_init
import logging

from utils_model import form_params_grid

In [9]:
# Prepare the dataset: read the iFEMG curl feature table from CSV.
# The first CSV row is used as the header and no column is used as the index.
# NOTE(review): the path is absolute and machine-specific; consider moving it
# to a configurable data directory.
data_df = pd.read_csv(
    r"E:\Data\paper2\iFEMG\iFEMG_curl_abs_all.csv",
    header=0,
    index_col=None,
)

# Print the table size, then let the notebook render the first rows.
print(data_df.shape)
data_df.head()


(505, 33)


Unnamed: 0,subject_info_height,subject_info_weight,subject_info_age,subject_info_gender,subject_info_name,subject_info_label,bicps_br_initial_pressure_ave,bicps_br_FMG,bicps_br_mav,bicps_br_rms,...,tricps_br_medial_mean_power_freq,tricps_br_lateral_initial_pressure_ave,tricps_br_lateral_FMG,tricps_br_lateral_mav,tricps_br_lateral_rms,tricps_br_lateral_wave_length,tricps_br_lateral_zero_crossing,tricps_br_lateral_slope_sign_change,tricps_br_lateral_mean_freq,tricps_br_lateral_mean_power_freq
0,172,80,23,1,hpy,0.0,1491.491151,2231.203862,177.53421,317.775103,...,40.303728,735.225231,1021.484658,117.910162,159.307572,27.044696,0.077683,0.245173,38.216989,46.319924
1,172,80,23,1,hpy,0.0,1491.491151,2315.715081,164.336743,250.237324,...,30.268375,735.225231,1046.557646,109.735186,183.371572,28.321039,0.089103,0.256171,28.779108,43.589793
2,172,80,23,1,hpy,0.0,1491.491151,2338.333283,215.922389,420.899864,...,22.517291,735.225231,1059.129322,65.996599,110.951026,17.359273,0.096116,0.27281,51.094086,51.946719
3,172,80,23,1,hpy,0.0,1491.491151,2397.877389,148.334215,260.118736,...,20.779308,735.225231,1057.169393,3.349566,4.222418,2.207407,0.212811,0.373337,108.361863,145.005578
4,172,80,23,1,hpy,0.0,1491.491151,2396.461597,145.041963,229.192017,...,23.865866,735.225231,1063.527714,3.386197,4.242233,2.224423,0.209498,0.377768,105.939848,143.717028


In [10]:
# Data cleaning: remove every row that
#   1. contains a NaN value, or
#   2. contains the literal marker 'MVC'.
# 'MVC' cells (anywhere in the frame) are first turned into NaN so that a
# single dropna call removes both kinds of rows.
data_df = data_df.replace('MVC', np.nan).dropna(how='any')

# Print the remaining size, then display the column dtypes.
print(data_df.shape)
data_df.dtypes

(505, 33)


subject_info_height                         int64
subject_info_weight                         int64
subject_info_age                            int64
subject_info_gender                         int64
subject_info_name                          object
subject_info_label                        float64
bicps_br_initial_pressure_ave             float64
bicps_br_FMG                              float64
bicps_br_mav                              float64
bicps_br_rms                              float64
bicps_br_wave_length                      float64
bicps_br_zero_crossing                    float64
bicps_br_slope_sign_change                float64
bicps_br_mean_freq                        float64
bicps_br_mean_power_freq                  float64
tricps_br_medial_initial_pressure_ave     float64
tricps_br_medial_FMG                      float64
tricps_br_medial_mav                      float64
tricps_br_medial_rms                      float64
tricps_br_medial_wave_length              float64


In [16]:
# Initialise logging through the project helper, passing the log file name.
# NOTE(review): logger_init is imported from utils_logger, which is not visible
# here; presumably it configures the logger that the logging.info(...) calls in
# the later cells write to. Confirm, and run this cell before those cells.
logger_init(log_file_name="model_iFEMG_absfeatures_1test_1validate")

In [13]:
# Random train/test split followed by a cross-validated SVR parameter search.

# Feature columns: three subject attributes plus the same nine features for
# each of the three muscle channels (30 columns in total, in this order).
subject_columns = ['subject_info_height', 'subject_info_weight', 'subject_info_age']
muscle_channels = ['bicps_br', 'tricps_br_medial', 'tricps_br_lateral']
channel_features = ['initial_pressure_ave', 'FMG', 'mav', 'rms', 'wave_length',
                    'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']
columns_to_scale = subject_columns + [f"{channel}_{feature}"
                                      for channel in muscle_channels
                                      for feature in channel_features]

y_data = data_df.loc[:, 'subject_info_label'].values
x_data = data_df.loc[:, columns_to_scale].values

print(x_data.shape)
print(y_data.shape)

# Hold out 20% of the rows; random_state is fixed so the split is repeatable.
train_data_r, test_data_r, train_label, test_label = train_test_split(
    x_data, y_data, test_size=0.2, random_state=0)

# Standardise with statistics learned from the training rows only, then apply
# the same transform to the test rows.
scaler = preprocessing.StandardScaler().fit(train_data_r)
train_data = scaler.transform(train_data_r)
test_data = scaler.transform(test_data_r)

# Grid search for the best SVR parameters.
# Start of the timed section
start_time = time.perf_counter()

# gamma has no effect on the linear kernel, so the search space is given as two
# grids: crossing gamma with "linear" would refit each linear/C candidate once
# per gamma value without changing its score.
c_values = np.logspace(-3, 3, 7)
gamma_values = np.logspace(-3, 3, 7)
svr_param_grid = [
    {"kernel": ["linear"], "C": c_values},
    {"kernel": ["rbf"], "C": c_values, "gamma": gamma_values},
]
svr = GridSearchCV(SVR(), param_grid=svr_param_grid, n_jobs=-1)
svr.fit(train_data, train_label)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

# Score (estimator default metric) and mean squared error on both splits.
score_test = svr.score(test_data, test_label)
score_train = svr.score(train_data, train_label)
print(f"train score: {score_train}")
print(f"test score: {score_test}")
test_pre = svr.predict(test_data)
train_pre = svr.predict(train_data)
print(f"train mean squared error: {mean_squared_error(train_label, train_pre)}")
print(f"test mean squared error: {mean_squared_error(test_label, test_pre)}")

(505, 30)
(505,)
Best params: {'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'}
training time(min): 2.0166654766665792
train score: 0.9821538035511311
test score: 0.8995804983164567
train mean squared error: 0.010101875932294821
test mean squared error: 0.05412769630496646


In [15]:
# Leave-one-subject-out evaluation: each subject in turn is the test set and
# all other subjects form the training set.

# Feature columns: three subject attributes plus the same nine features for
# each of the three muscle channels (30 columns in total, in this order).
subject_columns = ['subject_info_height', 'subject_info_weight', 'subject_info_age']
muscle_channels = ['bicps_br', 'tricps_br_medial', 'tricps_br_lateral']
channel_features = ['initial_pressure_ave', 'FMG', 'mav', 'rms', 'wave_length',
                    'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']
columns_to_scale = subject_columns + [f"{channel}_{feature}"
                                      for channel in muscle_channels
                                      for feature in channel_features]

# gamma has no effect on the linear kernel, so the search space is given as two
# grids: crossing gamma with "linear" would refit each linear/C candidate once
# per gamma value without changing its score.
c_values = np.logspace(-3, 3, 7)
gamma_values = np.logspace(-3, 3, 7)
svr_param_grid = [
    {"kernel": ["linear"], "C": c_values},
    {"kernel": ["rbf"], "C": c_values, "gamma": gamma_values},
]

# Sorted unique names give a reproducible subject order; iterating a set of
# strings can produce a different order in every kernel session.
for subject in sorted(data_df["subject_info_name"].unique()):
    logging.info("=======================================================")
    logging.info(f"test subject: {subject}")
    test_df = data_df[data_df["subject_info_name"] == subject]  # test set
    train_df = data_df[data_df["subject_info_name"] != subject] # training set
    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.loc[:, 'subject_info_label'].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.loc[:, 'subject_info_label'].values
    logging.info(f"x_test: {x_test.shape}")
    logging.info(f"y_test: {y_test.shape}")
    logging.info(f"x_train: {x_train.shape}")
    logging.info(f"y_train: {y_train.shape}")

    # Standardise with statistics learned from the training subjects only.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    # Grid search for the best SVR parameters.
    # Start of the timed section
    start_time = time.perf_counter()

    # NOTE(review): no `cv`/`groups` argument is passed, so the inner
    # cross-validation folds are not split by subject — confirm this is intended.
    svr = GridSearchCV(SVR(), param_grid=svr_param_grid, n_jobs=-1)
    svr.fit(x_train_scaled, y_train)
    logging.info(f"Best params: {svr.best_params_}")

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    # Score (estimator default metric) and mean squared error on both splits.
    score_test = svr.score(x_test_scaled, y_test)
    score_train = svr.score(x_train_scaled, y_train)
    logging.info(f"train score: {score_train}")
    logging.info(f"test score: {score_test}")
    test_pre = svr.predict(x_test_scaled)
    train_pre = svr.predict(x_train_scaled)
    logging.info(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
    logging.info(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")

    # Per-sample comparison of true and predicted labels for the held-out subject.
    temp_predict_label_df = pd.DataFrame({'y_true': y_test,
                                          'y_predicted': test_pre,
                                          'diff': y_test - test_pre})
    logging.info(f"predicted label: \n{temp_predict_label_df}")

In [18]:
# Custom SVR parameter search with nested leave-one-subject-out loops:
#   outer loop - each subject in turn is the held-out test set;
#   inner loop - for every (C, gamma) candidate, each remaining subject in turn
#                is the validation set; the candidate with the highest mean
#                validation score is refitted on all training subjects.

# Kernel used by every SVR fitted in this cell
svr_kernel = "rbf"
# Candidate parameter values
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# NOTE(review): form_params_grid is a project helper (utils_model) not visible
# here; it is used below as a DataFrame with 'C' and 'gamma' columns, presumably
# one row per parameter combination — confirm.
custom_grid_df = form_params_grid(param_grid)
print(f"参数网格shape: {custom_grid_df.shape}")

# Feature columns: three subject attributes plus the same nine features for
# each of the three muscle channels (30 columns in total, in this order).
subject_columns = ['subject_info_height', 'subject_info_weight', 'subject_info_age']
muscle_channels = ['bicps_br', 'tricps_br_medial', 'tricps_br_lateral']
channel_features = ['initial_pressure_ave', 'FMG', 'mav', 'rms', 'wave_length',
                    'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']
columns_to_scale = subject_columns + [f"{channel}_{feature}"
                                      for channel in muscle_channels
                                      for feature in channel_features]
# Scaled values are floats, so the feature columns are cast to float64 before
# they are overwritten (some of them are stored as integers in data_df).
float_feature_dtypes = {column: "float64" for column in columns_to_scale}

# Sorted unique names give a reproducible subject order; iterating a set of
# strings can produce a different order in every kernel session.
for subject in sorted(data_df["subject_info_name"].unique()):
    # Start of the timed section
    start_time = time.perf_counter()
    logging.info("=======================================================")
    logging.info(f"test subject: {subject}")
    # astype returns new frames, so the scaled values below are written into
    # independent copies rather than into slices of data_df.
    test_df = data_df[data_df["subject_info_name"] == subject].astype(float_feature_dtypes)   # test set
    train_df = data_df[data_df["subject_info_name"] != subject].astype(float_feature_dtypes)  # training set

    # Training and test sets share one scaler, fitted on the training subjects only.
    # NOTE(review): the scaler statistics include each inner validation subject;
    # refit the scaler inside the inner folds if that leakage matters.
    scaler = preprocessing.StandardScaler().fit(train_df[columns_to_scale])
    train_df.loc[:, columns_to_scale] = scaler.transform(train_df[columns_to_scale])
    test_df.loc[:, columns_to_scale] = scaler.transform(test_df[columns_to_scale])

    # Scaled arrays for the final fit and evaluation
    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.loc[:, 'subject_info_label'].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.loc[:, 'subject_info_label'].values

    # Build the inner leave-one-subject-out splits once per outer fold; they do
    # not depend on the candidate parameters.
    train_subject_names = train_df["subject_info_name"]
    validation_folds = []
    for validation_subject in sorted(train_subject_names.unique()):
        is_validation = (train_subject_names == validation_subject).values
        validation_folds.append((x_train[~is_validation], y_train[~is_validation],
                                 x_train[is_validation], y_train[is_validation]))

    # Walk the parameter grid and record the mean validation metrics per candidate.
    average_mse = []    # mean validation MSE of each candidate
    average_score = []  # mean validation score (SVR.score) of each candidate
    for _, row in custom_grid_df.iterrows():
        mse_list = []
        score_list = []
        for x_cv, y_cv, x_validate, y_validate in validation_folds:
            svr_model = SVR(kernel=svr_kernel, C=row['C'], gamma=row["gamma"]).fit(x_cv, y_cv)
            y_validate_pred = svr_model.predict(x_validate)
            mse_list.append(mean_squared_error(y_validate, y_validate_pred))
            score_list.append(svr_model.score(x_validate, y_validate))
        average_mse.append(sum(mse_list)/len(mse_list))
        average_score.append(sum(score_list)/len(score_list))

    # Attach the metrics to the grid positionally (one value per grid row), so
    # the result does not depend on how custom_grid_df is indexed.
    params_with_score_df = custom_grid_df.assign(mse=average_mse, score=average_score)

    # Best candidate = highest mean validation score (the mse column is kept for inspection).
    best_row_index = params_with_score_df['score'].idxmax()
    best_C = float(params_with_score_df.loc[best_row_index]['C'])
    best_gamma = float(params_with_score_df.loc[best_row_index]['gamma'])
    logging.info(f"best params: \n{params_with_score_df.loc[best_row_index]}")

    # Refit the best candidate on all training subjects.
    best_svr = SVR(kernel=svr_kernel, C=best_C, gamma=best_gamma).fit(x_train, y_train)

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    # Score and mean squared error on the training subjects and the held-out subject.
    score_test = best_svr.score(x_test, y_test)
    score_train = best_svr.score(x_train, y_train)
    logging.info(f"train score: {score_train}")
    logging.info(f"test score: {score_test}")
    test_pre = best_svr.predict(x_test)
    train_pre = best_svr.predict(x_train)
    logging.info(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
    logging.info(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")

    # Per-sample comparison of true and predicted labels for the held-out subject.
    temp_predict_label_df = pd.DataFrame({'y_true': y_test,
                                          'y_predicted': test_pre,
                                          'diff': y_test - test_pre})
    logging.info(f"predicted label: \n{temp_predict_label_df}")

参数网格shape: (49, 2)
