In [5]:
import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

from utils_logger import logger_init
import logging

from utils_model import form_params_grid

In [6]:
"""
Prepare the dataset.
The data were standardized separately for each subject beforehand, so no further normalization is needed here.
"""
# Read the feature table from disk. header=0 takes the column names from the
# first CSV row; index_col=None lets pandas build a default integer index.
data_df = pd.read_csv(
    r"E:\Data\paper2\iFEMG\iFEMG_curl_abs_normed_all.csv",
    header=0,
    index_col=None,
)

# Quick sanity check: table dimensions, then the first few rows.
print(data_df.shape)
data_df.head()

(505, 33)


Unnamed: 0,subject_info_height,subject_info_weight,subject_info_age,subject_info_gender,subject_info_name,subject_info_label,bicps_br_initial_pressure_ave,bicps_br_FMG,bicps_br_mav,bicps_br_rms,...,tricps_br_medial_mean_power_freq,tricps_br_lateral_initial_pressure_ave,tricps_br_lateral_FMG,tricps_br_lateral_mav,tricps_br_lateral_rms,tricps_br_lateral_wave_length,tricps_br_lateral_zero_crossing,tricps_br_lateral_slope_sign_change,tricps_br_lateral_mean_freq,tricps_br_lateral_mean_power_freq
0,172,80,23,1,hpy,0.0,-0.686982,-1.795435,-0.890042,-0.851026,...,0.199888,1.140748,-1.671084,0.357637,0.370343,0.268826,-0.884355,-0.280475,-0.852562,-0.92611
1,172,80,23,1,hpy,0.0,-0.686982,-1.542652,-1.03242,-1.30107,...,-0.566797,1.140748,-1.31456,0.252567,0.603241,0.341495,-0.706836,-0.155414,-1.1409,-0.987113
2,172,80,23,1,hpy,0.0,-0.686982,-1.474998,-0.475898,-0.163846,...,-1.158968,1.140748,-1.135798,-0.309588,-0.097664,-0.282617,-0.597825,0.033795,-0.459153,-0.800383
3,172,80,23,1,hpy,0.0,-0.686982,-1.296894,-1.20506,-1.235224,...,-1.291747,1.140748,-1.163667,-1.114766,-1.130611,-1.145293,1.216162,1.176904,1.29044,1.278954
4,172,80,23,1,hpy,0.0,-0.686982,-1.301129,-1.240578,-1.441307,...,-1.055939,1.140748,-1.073255,-1.114295,-1.130419,-1.144324,1.16465,1.227281,1.216445,1.250162


In [8]:
"""
Data cleaning: remove the following rows:
    1. rows that contain a NaN value
    2. rows whose label is MVC
"""
# Turn every 'MVC' entry into NaN first, then drop each row that holds at
# least one NaN, so both kinds of unwanted rows disappear in a single chain.
data_df = data_df.replace('MVC', np.nan).dropna(how='any')

# Show what is left: table dimensions, then the per-column dtypes.
print(data_df.shape)
data_df.dtypes

(505, 33)


subject_info_height                         int64
subject_info_weight                         int64
subject_info_age                            int64
subject_info_gender                         int64
subject_info_name                          object
subject_info_label                        float64
bicps_br_initial_pressure_ave             float64
bicps_br_FMG                              float64
bicps_br_mav                              float64
bicps_br_rms                              float64
bicps_br_wave_length                      float64
bicps_br_zero_crossing                    float64
bicps_br_slope_sign_change                float64
bicps_br_mean_freq                        float64
bicps_br_mean_power_freq                  float64
tricps_br_medial_initial_pressure_ave     float64
tricps_br_medial_FMG                      float64
tricps_br_medial_mav                      float64
tricps_br_medial_rms                      float64
tricps_br_medial_wave_length              float64


In [10]:
# Set up logging through the project helper imported from utils_logger.
# NOTE(review): presumably this configures the root logger so that the
# logging.info(...) calls in later cells are recorded under the name
# "model_iFEMG_absnormed" — confirm against utils_logger.logger_init.
logger_init(log_file_name="model_iFEMG_absnormed")

In [9]:
# Random train/test split, then a cross-validated grid search over the SVR
# hyper-parameters, followed by train/test scores and mean squared errors.

# Feature columns: every (muscle, feature) combination, muscle-major, which
# yields the 27 column names used as model inputs.
muscle_prefixes = ('bicps_br', 'tricps_br_medial', 'tricps_br_lateral')
feature_suffixes = ('initial_pressure_ave',
                    'FMG',
                    'mav',
                    'rms',
                    'wave_length',
                    'zero_crossing',
                    'slope_sign_change',
                    'mean_freq',
                    'mean_power_freq')
columns_to_scale = [f"{muscle}_{feature}"
                    for muscle in muscle_prefixes
                    for feature in feature_suffixes]

y_data = data_df.loc[:, 'subject_info_label'].values
x_data = data_df.loc[:, columns_to_scale].values

print(x_data.shape)
print(y_data.shape)

# Hold out 20 % of the rows for testing; the fixed seed keeps the split
# reproducible.
train_data, test_data, train_label, test_label = train_test_split(
    x_data, y_data, test_size=0.2, random_state=0)

# Record the start of training.
start_time = time.perf_counter()

# Hyper-parameter search space. gamma has no effect on the linear kernel, so
# it is only searched for rbf; crossing it with "linear" would refit the very
# same linear model once per gamma value.
c_values = np.logspace(-3, 3, 7)
gamma_values = np.logspace(-3, 3, 7)
param_grid = [
    {"kernel": ["linear"], "C": c_values},
    {"kernel": ["rbf"], "C": c_values, "gamma": gamma_values},
]

svr = GridSearchCV(SVR(), param_grid=param_grid, n_jobs=-1)
svr.fit(train_data, train_label)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

# Scores of the refitted best estimator on both splits.
score_test = svr.score(test_data, test_label)
score_train = svr.score(train_data, train_label)
print(f"train score: {score_train}")
print(f"test score: {score_test}")

# Mean squared errors on both splits.
test_pre = svr.predict(test_data)
train_pre = svr.predict(train_data)
print(f"train mean squared error: {mean_squared_error(train_label, train_pre)}")
print(f"test mean squared error: {mean_squared_error(test_label, test_pre)}")

(505, 27)
(505,)
Best params: {'C': 10.0, 'gamma': 0.01, 'kernel': 'rbf'}
training time(min): 4.276325856666639
train score: 0.9425507557662818
test score: 0.8088633137231324
train mean squared error: 0.03251926197920508
test mean squared error: 0.1030256905689008


In [11]:
# Leave-one-subject-out evaluation: each subject in turn is the test set and
# the data of all remaining subjects form the training set.

# Feature columns: every (muscle, feature) combination, muscle-major, which
# yields the 27 column names used as model inputs.
muscle_prefixes = ('bicps_br', 'tricps_br_medial', 'tricps_br_lateral')
feature_suffixes = ('initial_pressure_ave',
                    'FMG',
                    'mav',
                    'rms',
                    'wave_length',
                    'zero_crossing',
                    'slope_sign_change',
                    'mean_freq',
                    'mean_power_freq')
columns_to_scale = [f"{muscle}_{feature}"
                    for muscle in muscle_prefixes
                    for feature in feature_suffixes]

# Hyper-parameter search space, built once because it does not depend on the
# held-out subject. gamma has no effect on the linear kernel, so it is only
# searched for rbf; crossing it with "linear" would refit the very same
# linear model once per gamma value.
c_values = np.logspace(-3, 3, 7)
gamma_values = np.logspace(-3, 3, 7)
param_grid = [
    {"kernel": ["linear"], "C": c_values},
    {"kernel": ["rbf"], "C": c_values, "gamma": gamma_values},
]

# Sorted unique names give the same subject order (and therefore the same log
# layout) on every run; iterating a set of strings does not guarantee that.
for subject in sorted(data_df["subject_info_name"].unique()):
    logging.info("=======================================================")
    logging.info(f"test subject: {subject}")

    is_test_subject = data_df["subject_info_name"] == subject
    test_df = data_df[is_test_subject]    # held-out subject
    train_df = data_df[~is_test_subject]  # everybody else

    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.loc[:, 'subject_info_label'].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.loc[:, 'subject_info_label'].values
    logging.info(f"x_test: {x_test.shape}")
    logging.info(f"y_test: {y_test.shape}")
    logging.info(f"x_train: {x_train.shape}")
    logging.info(f"y_train: {y_train.shape}")

    # Record the start of training.
    start_time = time.perf_counter()

    # Grid search for the best SVR parameters on the training subjects.
    # NOTE(review): no cv/groups argument is passed, so the inner
    # cross-validation folds are not told which rows belong to which subject;
    # consider a group-aware splitter if hyper-parameters should also be
    # chosen subject-independently.
    svr = GridSearchCV(SVR(), param_grid=param_grid, n_jobs=-1)
    svr.fit(x_train, y_train)
    logging.info(f"Best params: {svr.best_params_}")

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    # Scores of the refitted best estimator on both splits.
    score_test = svr.score(x_test, y_test)
    score_train = svr.score(x_train, y_train)
    logging.info(f"train score: {score_train}")
    logging.info(f"test score: {score_test}")

    # Mean squared errors on both splits.
    test_pre = svr.predict(x_test)
    train_pre = svr.predict(x_train)
    logging.info(f"train mean squared error: {mean_squared_error(y_train, train_pre)}")
    logging.info(f"test mean squared error: {mean_squared_error(y_test, test_pre)}")

    # Per-sample comparison for the held-out subject.
    temp_predict_label_df = pd.DataFrame({'y_true': y_test,
                                          'y_predicted': test_pre,
                                          'diff': y_test - test_pre})
    logging.info(f"predicted label: \n{temp_predict_label_df}")