In [1]:
import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

from utils_logger import logger_init
import logging

from utils_model import form_params_grid

In [3]:
"""
准备数据集
"""
sEMG_data_df = pd.read_csv(r"E:\Code_R\health_iFEMG_US\Data\iFEMG\iFEMG_curl_cleaned.csv", index_col = None, header = 0)
FMG_data_df = pd.read_csv(r"E:\Code_R\health_iFEMG_US\Data\FMG\FMG_bicps_240410.csv", index_col = None, header = 0)


In [4]:
set(sEMG_data_df['bicps_br_subject_name'])

{'chw',
 'hpy',
 'lmh',
 'lmt',
 'lpy',
 'pym',
 'wby',
 'wcx',
 'zjz',
 'zk',
 'zpk',
 'zx'}

In [5]:
set(FMG_data_df['subject_info_subject_name'])

{'chw-1',
 'chw-2',
 'hpy',
 'lmh',
 'lmt',
 'lpy-1',
 'lpy-2',
 'lry',
 'pym-1',
 'pym-2',
 'wby',
 'wcx',
 'zjz',
 'zk-1',
 'zk-2',
 'zpk-1',
 'zpk-2',
 'zx'}

In [3]:
"""
数据清洗 remove the data bellow:
    1. NaN value
    2. label is MVC
"""
# replace 'MVC' with NaN
data_df = data_df.replace('MVC', np.nan)
# delete NaN value
data_df = data_df.dropna(how = 'any')

print(data_df.shape)
data_df.dtypes

(506, 30)


bicps_br_subject_name                   object
bicps_br_height.cm.                      int64
bicps_br_weight.kg.                      int64
bicps_br_gender                          int64
bicps_br_age                             int64
bicps_br_label                         float64
bicps_br_FMG                           float64
bicps_br_mav                           float64
bicps_br_rms                           float64
bicps_br_wave_length                   float64
bicps_br_zero_crossing                 float64
bicps_br_slope_sign_change             float64
bicps_br_mean_freq                     float64
bicps_br_mean_power_freq               float64
tricps_br_medial_FMG                   float64
tricps_br_medial_mav                   float64
tricps_br_medial_rms                   float64
tricps_br_medial_wave_length           float64
tricps_br_medial_zero_crossing         float64
tricps_br_medial_slope_sign_change     float64
tricps_br_medial_mean_freq             float64
tricps_br_med

In [18]:
logger_init(log_file_name="rbf_linear_gridsearchcv")

In [None]:
# 自定义参数搜索SVR模型，每次挑出一个人的数据用于交叉验证，搜索最优参数
# 每次取一个受试者作为测试集，其余人的数据作为训练集
# 删减了一些不需要的特征值

# 建立参数网格
C_array = np.logspace(-3, 3, 7)
gamma_array = np.logspace(-3, 3, 7)
params_list = []
for i in C_array:
    for j in gamma_array:
        params_list.append([i, j])
        pass
    pass
custom_grid_df = pd.DataFrame(params_list, columns=['C', 'gamma'])
print(f"参数网格shape: {custom_grid_df.shape}")

# 需要进行标准化的列名
columns_to_scale = ['bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_wave_length',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_wave_length',
]

for subject in set(data_df["bicps_br_subject_name"]):
    # 记录开始训练时间
    start_time = time.perf_counter()
    logging.info(f"=======================================================")
    logging.info(f"test subject: {subject}")
    test_df = data_df[data_df["bicps_br_subject_name"] == subject]  # 测试集
    train_df = data_df[data_df["bicps_br_subject_name"] != subject] # 训练集

    # 训练集和测试集分别进行标准化，模拟模型应用到新数据集中
    # test_df.loc[:, columns_to_scale] = preprocessing.StandardScaler().fit_transform(test_df[columns_to_scale])
    # train_df.loc[:, columns_to_scale] = preprocessing.StandardScaler().fit_transform(train_df[columns_to_scale])
    # 训练集和测试集使用同一个scaler进行标准化
    scaler = preprocessing.StandardScaler().fit(train_df[columns_to_scale])
    train_df.loc[:, columns_to_scale] = scaler.transform(train_df[columns_to_scale])
    test_df.loc[:, columns_to_scale] = scaler.transform(test_df[columns_to_scale])
    
    # 遍历参数网格，开始参数搜索
    average_mse = []    # 每个参数交叉验证后得到的平均mse
    average_score = []
    for index, row in custom_grid_df.iterrows():
        # 划分交叉验证数据集，每次单独划分一个人做交叉验证
        mse_list = []   # 每个参数组合对应一个mse列表
        score_list = []
        for validation_subject in set(train_df["bicps_br_subject_name"]):
            # 训练集和验证集数据
            x_validate = train_df[train_df["bicps_br_subject_name"] == validation_subject].loc[:, columns_to_scale].values
            y_validate = train_df[train_df["bicps_br_subject_name"] == validation_subject].iloc[:, 5].values
            x_cv = train_df[train_df["bicps_br_subject_name"] != validation_subject].loc[:, columns_to_scale].values
            y_cv = train_df[train_df["bicps_br_subject_name"] != validation_subject].iloc[:, 5].values
            # 训练模型
            svr_model = SVR(kernel="sigmoid", C=row['C'], gamma=row["gamma"]).fit(x_cv, y_cv)
            y_validate_pred = svr_model.predict(x_validate)
            mse_list.append(mean_squared_error(y_validate, y_validate_pred))
            score_list.append(svr_model.score(x_validate, y_validate))
            pass
        # 获得每个参数对应的mse得分
        average_mse.append(sum(mse_list)/len(mse_list))
        average_score.append(sum(score_list)/len(score_list))
        logging.info(f"C: {row['C']}, gamma: {row['gamma']}, average mse: {sum(mse_list)/len(mse_list)}, average score: {sum(score_list)/len(score_list)}")
        pass
    
    # 将参数网格与对应的mse对应起来
    params_with_score_df = pd.concat([custom_grid_df,
                                      pd.DataFrame(average_mse, columns=["mse"]),
                                      pd.DataFrame(average_score, columns=["score"])], axis=1)

    # 获得最佳参数
    # min_row_index = params_with_score_df['score'].idxmin()
    best_row_index = params_with_score_df['score'].idxmax()
    best_C = float(params_with_score_df.loc[best_row_index]['C'])
    best_gamma = float(params_with_score_df.loc[best_row_index]['gamma'])
    logging.info(f"best C: {best_C}, best gamma: {best_gamma}")

    # 测试集数据
    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.iloc[:, 5].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.iloc[:, 5].values

    # 最佳模型
    best_svr = SVR(kernel="rbf", C=best_C, gamma=best_gamma).fit(x_train, y_train)

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    score_test = best_svr.score(x_test, y_test)
    score_train = best_svr.score(x_train, y_train)
    logging.info(f"train score: {str(score_train)}")
    logging.info(f"test score: {str(score_test)}")
    test_pre = best_svr.predict(x_test)
    train_pre = best_svr.predict(x_train)
    logging.info(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
    logging.info(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")
    logging.info(f"true label: {y_test}")
    logging.info(f"predicted label: {test_pre}")
    logging.info(f"true - predict: {y_test - test_pre}")
    pass

参数网格shape: (49, 2)


In [17]:
# 自定义参数搜索SVR模型，每次挑出一个人的数据用于交叉验证，搜索最优参数
# 每次取一个受试者作为测试集，其余人的数据作为训练集

# 模型核函数名称
svr_kernel = "sigmoid"
# 建立参数网格
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'epsilon': [0.1, 0.2, 0.5],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'coef0': [0, 1, 2]
}
custom_grid_df = form_params_grid(param_grid)
print(f"参数网格shape: {custom_grid_df.shape}")

# 需要进行标准化的列名
'''
columns_to_scale = ['bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_medial_zero_crossing',
                    'tricps_br_medial_slope_sign_change',
                    'tricps_br_medial_mean_freq',
                    'tricps_br_medial_mean_power_freq',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length',
                    'tricps_br_lateral_zero_crossing',
                    'tricps_br_lateral_slope_sign_change',
                    'tricps_br_lateral_mean_freq',
                    'tricps_br_lateral_mean_power_freq']
'''
columns_to_scale = ['bicps_br_height.cm.',
                    'bicps_br_weight.kg.',
                    'bicps_br_age',
                    'bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length']

for subject in set(data_df["bicps_br_subject_name"]):
    # 记录开始训练时间
    start_time = time.perf_counter()
    logging.info(f"=======================================================")
    logging.info(f"test subject: {subject}")
    test_df = data_df[data_df["bicps_br_subject_name"] == subject]  # 测试集
    train_df = data_df[data_df["bicps_br_subject_name"] != subject] # 训练集

    # 训练集和测试集分别进行标准化，模拟模型应用到新数据集中
    # test_df.loc[:, columns_to_scale] = preprocessing.StandardScaler().fit_transform(test_df[columns_to_scale])
    # train_df.loc[:, columns_to_scale] = preprocessing.StandardScaler().fit_transform(train_df[columns_to_scale])
    # 训练集和测试集使用同一个scaler进行标准化
    scaler = preprocessing.StandardScaler().fit(train_df[columns_to_scale])
    train_df.loc[:, columns_to_scale] = scaler.transform(train_df[columns_to_scale])
    test_df.loc[:, columns_to_scale] = scaler.transform(test_df[columns_to_scale])
    
    # 遍历参数网格，开始参数搜索
    average_mse = []    # 每个参数交叉验证后得到的平均mse
    average_score = []
    for index, row in custom_grid_df.iterrows():
        # 划分交叉验证数据集，每次单独划分一个人做交叉验证
        mse_list = []   # 每个参数组合对应一个mse列表
        score_list = []
        for validation_subject in set(train_df["bicps_br_subject_name"]):
            # 训练集和验证集数据
            x_validate = train_df[train_df["bicps_br_subject_name"] == validation_subject].loc[:, columns_to_scale].values
            y_validate = train_df[train_df["bicps_br_subject_name"] == validation_subject].iloc[:, 5].values
            x_cv = train_df[train_df["bicps_br_subject_name"] != validation_subject].loc[:, columns_to_scale].values
            y_cv = train_df[train_df["bicps_br_subject_name"] != validation_subject].iloc[:, 5].values
            # 训练模型
            svr_model = SVR(kernel=svr_kernel, C=row['C'], gamma=row["gamma"], epsilon=row["epsilon"], coef0=row["coef0"]).fit(x_cv, y_cv)
            y_validate_pred = svr_model.predict(x_validate)
            mse_list.append(mean_squared_error(y_validate, y_validate_pred))
            score_list.append(svr_model.score(x_validate, y_validate))
            pass
        # 获得每个参数对应的mse得分
        average_mse.append(sum(mse_list)/len(mse_list))
        average_score.append(sum(score_list)/len(score_list))
        # logging.info(f"C: {row['C']}, gamma: {row['gamma']}, average mse: {sum(mse_list)/len(mse_list)}")
        pass
    
    # 将参数网格与对应的mse对应起来
    params_with_score_df = pd.concat([custom_grid_df,
                                      pd.DataFrame(average_mse, columns=["mse"]),
                                      pd.DataFrame(average_score, columns=["score"])], axis=1)

    # 获得最佳参数
    # best_row_index = params_with_score_df['mse'].idxmin()
    best_row_index = params_with_score_df['score'].idxmax()
    best_C = float(params_with_score_df.loc[best_row_index]['C'])
    best_gamma = float(params_with_score_df.loc[best_row_index]['gamma'])
    best_coef0 = float(params_with_score_df.loc[best_row_index]['coef0'])
    best_epsilon = float(params_with_score_df.loc[best_row_index]['epsilon'])
    logging.info(f"best params: {params_with_score_df.loc[best_row_index]}")

    # 测试集数据
    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.iloc[:, 5].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.iloc[:, 5].values

    # 最佳模型
    best_svr = SVR(kernel=svr_kernel, C=best_C, gamma=best_gamma, coef0=best_coef0, epsilon=best_epsilon).fit(x_train, y_train)

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    score_test = best_svr.score(x_test, y_test)
    score_train = best_svr.score(x_train, y_train)
    logging.info(f"train score: {str(score_train)}")
    logging.info(f"test score: {str(score_test)}")
    test_pre = best_svr.predict(x_test)
    train_pre = best_svr.predict(x_train)
    logging.info(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
    logging.info(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")
    logging.info(f"true label: {y_test}")
    logging.info(f"predicted label: {test_pre}")
    pass

参数网格shape: (441, 4)


In [19]:
# 每次取一个受试者作为测试集，其余人的数据作为训练集

# 选取数据集名称
columns_to_scale = ['bicps_br_height.cm.',
                    'bicps_br_weight.kg.',
                    'bicps_br_age',
                    'bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length']

for subject in set(data_df["bicps_br_subject_name"]):
    logging.info(f"=======================================================")
    logging.info(f"test subject: {subject}")
    test_df = data_df[data_df["bicps_br_subject_name"] == subject]  # 测试集
    train_df = data_df[data_df["bicps_br_subject_name"] != subject] # 训练集
    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.iloc[:, 5].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.iloc[:, 5].values
    logging.info(f"x_test: {x_test.shape}")
    logging.info(f"y_test: {y_test.shape}")
    logging.info(f"x_train: {x_train.shape}")
    logging.info(f"y_train: {y_train.shape}")
    
    # 数据标准化
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    """
    Grid searching best parameters of SVR model
    """
    # 记录开始训练时间
    start_time = time.perf_counter()

    # 自动选择合适的参数
    svr = GridSearchCV(SVR(), param_grid={"kernel": ("linear", "rbf"), "C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
    svr.fit(x_train_scaled, y_train)
    logging.info(f"Best params: {svr.best_params_}")

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    score_test = svr.score(x_test_scaled, y_test)
    score_train = svr.score(x_train_scaled, y_train)
    logging.info(f"{str(svr)} train score: {str(score_train)}")
    logging.info(f"{str(svr)} test score: {str(score_test)}")
    test_pre = svr.predict(x_test_scaled)
    train_pre = svr.predict(x_train_scaled)
    logging.info(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
    logging.info(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")
    logging.info(f"true label: {y_test}")
    logging.info(f"predicted label: {test_pre}")
    logging.info(f"true - predicted: {y_test - test_pre}")
    pass

In [8]:
"""划分训练集和测试集"""
# 带FMG
fea_name_list = ['FMG', 'mav', 'rms', 'wave_length', 'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']

x1_data_df = data_df["bicps_br"][fea_name_list]
x1_data_df.columns = [["bicps_br" for i in range(len(fea_name_list))], fea_name_list]
x2_data_df = data_df["tricps_br_medial"][fea_name_list]
x2_data_df.columns = [["tricps_br_medial" for i in range(len(fea_name_list))], fea_name_list]
x3_data_df = data_df["tricps_br_lateral"][fea_name_list]
x3_data_df.columns = [["tricps_br_lateral" for i in range(len(fea_name_list))], fea_name_list]
# 不带FMG
# x1_data = data_df["bicps_br"][['mav', 'rms', 'wave_length', 'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']].values
# x2_data = data_df["tricps_br_medial"][['mav', 'rms', 'wave_length', 'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']].values
# x3_data = data_df["tricps_br_lateral"][['mav', 'rms', 'wave_length', 'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']].values

y_data = data_df["bicps_br"]['label(kg)'].values
x_data_df = pd.concat([x1_data_df, x2_data_df, x3_data_df], axis = 1)
# x_data_df = np.concatenate((x1_data, x2_data, x3_data), axis=1)
print(f"dataset shape: {x_data_df.shape}")
print(f"label shape: {y_data.shape}")

dataset shape: (452, 24)
label shape: (452,)


In [9]:
"""
模型训练
"""
x_data = x_data_df.values
train_data_r, test_data_r, train_label, test_label = train_test_split(x_data, y_data, test_size = 0.2, random_state = 0)
# train_data = preprocessing.normalize(train_data_r, norm = 'max')
# test_data = preprocessing.normalize(test_data_r, norm = 'max')
scaler = preprocessing.StandardScaler().fit(train_data_r)
train_data = scaler.transform(train_data_r)
test_data = scaler.transform(test_data_r)


"""
Grid searching best parameters of SVR model
"""
# 记录开始训练时间
start_time = time.perf_counter()

# 自动选择合适的参数
svr = GridSearchCV(SVR(), param_grid={"kernel": ("linear", "rbf"), "C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
svr.fit(train_data, train_label)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

score_test = svr.score(test_data, test_label)
score_train = svr.score(train_data, train_label)
print(f"{str(svr)} train score: {str(score_train)}")
print(f"{str(svr)} test score: {str(score_test)}")
test_pre = svr.predict(test_data)
train_pre = svr.predict(train_data)
print(f"train mean squared error: {mean_squared_error(train_label, train_pre)}")
print(f"test mean squared error: {mean_squared_error(test_label, test_pre)}")
    


Best params: {'C': 10.0, 'gamma': 0.01, 'kernel': 'rbf'}
training time(min): 3.2712933366666523
GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ('linear', 'rbf')}) train score: 0.9466061520871283
GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ('linear', 'rbf')}) test score: 0.8292950987036305
train mean squared error: 0.029765448783159534
test mean squared error: 0.0894546454746523
