In [1]:
import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

from utils_logger import logger_init
import logging

from utils_model import form_params_grid

In [14]:
# Load the iFEMG curl data set from CSV (first row is the header, no index column).
data_df = pd.read_csv(
    r"E:\Code_R\health_iFEMG_US\Data\iFEMG\iFEMG_curl_cleaned.csv",
    index_col=None,
    header=0,
)
print(data_df.shape)

# Data cleaning: turn every 'MVC' marker into NaN, then drop any row that
# contains at least one NaN.
data_df = data_df.replace('MVC', np.nan).dropna(how='any')
print(data_df.shape)

# Keep only the rows whose label is 0, 1 or 2 (the 0 / 1 / 2 kg recordings).
filtered_df = data_df.loc[data_df['bicps_br_label'].isin([0, 1, 2])]
print(filtered_df.shape)
filtered_df.head()


(506, 30)
(506, 30)
(379, 30)


Unnamed: 0,bicps_br_subject_name,bicps_br_height.cm.,bicps_br_weight.kg.,bicps_br_gender,bicps_br_age,bicps_br_label,bicps_br_FMG,bicps_br_mav,bicps_br_rms,bicps_br_wave_length,...,tricps_br_medial_mean_freq,tricps_br_medial_mean_power_freq,tricps_br_lateral_FMG,tricps_br_lateral_mav,tricps_br_lateral_rms,tricps_br_lateral_wave_length,tricps_br_lateral_zero_crossing,tricps_br_lateral_slope_sign_change,tricps_br_lateral_mean_freq,tricps_br_lateral_mean_power_freq
0,chw,165,55,1,22,0.0,0.785424,0.619119,0.515783,0.599997,...,19.446411,26.535749,0.205748,3.774143,4.928638,2.928105,-0.12667,-0.096836,15.250331,21.728353
1,chw,165,55,1,22,0.0,0.720584,1.148243,1.40336,1.075985,...,16.658652,22.014694,0.150691,0.512222,0.363698,0.286566,-0.157561,0.18279,18.031269,35.60953
2,chw,165,55,1,22,0.0,0.636993,0.281217,0.295154,0.387338,...,18.510912,20.318615,0.149297,0.12377,-0.005632,0.29056,0.02446,0.103409,40.633424,51.415324
3,chw,165,55,1,22,0.0,0.839786,0.330342,0.332777,0.501974,...,20.998436,23.855836,0.201143,0.081555,0.384973,0.593592,0.536885,0.059259,51.219253,55.677856
4,chw,165,55,1,22,0.0,0.937684,0.355456,0.087872,0.744299,...,22.202039,22.144516,0.187731,0.169043,-0.034068,0.257089,-0.044391,0.036053,48.274587,57.545935


In [4]:
# Initialise the logging file.
# NOTE(review): logger_init is imported from the project-local utils_logger module;
# presumably it configures logging so that the logging.info(...) calls in later
# cells are written to a log named after "svm2" — confirm against utils_logger.
logger_init(log_file_name="svm2")

In [23]:
# Model training: random 80/20 hold-out split, then an SVM classifier (SVC)
# whose hyper-parameters are chosen by 5-fold cross-validated grid search.

# Feature columns fed to the classifier. They are defined in this cell so that
# it runs on a fresh kernel (Restart & Run All) instead of depending on a cell
# further down the notebook having been executed first.
columns_to_scale = ['bicps_br_height.cm.',
                    'bicps_br_weight.kg.',
                    'bicps_br_age',
                    'bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length']

x_data = filtered_df.loc[:, columns_to_scale].values
y_data = filtered_df.loc[:, 'bicps_br_label'].values
print(x_data.shape)
print(y_data.shape)

# NOTE: this is a row-wise random split, so rows recorded from the same subject
# can end up in both the training and the test partition.
train_data_r, test_data_r, train_label, test_label = train_test_split(
    x_data, y_data, test_size=0.2, random_state=0
)

# Fit the scaler on the training partition only and reuse it for the test
# partition, so no test statistics leak into the preprocessing.
scaler = preprocessing.StandardScaler().fit(train_data_r)
train_data = scaler.transform(train_data_r)
test_data = scaler.transform(test_data_r)

# Grid search for the best SVC parameters.
# The variable is still called `svr` for continuity with the rest of the
# notebook, but the estimator is a classifier (svm.SVC), not a regressor.
# NOTE: `gamma` has no effect on the linear kernel, so each linear candidate is
# fitted once per gamma value; the grid is left unchanged to keep results stable.

# Record the training start time.
start_time = time.perf_counter()

svr = GridSearchCV(
    svm.SVC(),
    param_grid={
        "kernel": ("linear", "rbf"),
        "C": np.logspace(-3, 3, 7),
        "gamma": np.logspace(-3, 3, 7),
    },
    n_jobs=-1,
    cv=5,
)
svr.fit(train_data, train_label)
print(f"Best params: {svr.best_params_}")

end_time = time.perf_counter()
print(f"training time(min): {(end_time - start_time)/60}")

# Accuracy of the refitted best estimator on both partitions.
score_test = svr.score(test_data, test_label)
score_train = svr.score(train_data, train_label)
print(f"train score: {score_train}")
print(f"test score: {score_test}")

# Per-class precision / recall / F1 on both partitions.
test_pre = svr.predict(test_data)
train_pre = svr.predict(train_data)
print(f"train: {classification_report(train_label, train_pre)}")
print(f"test: {classification_report(test_label, test_pre)}")

(379, 19)
(379,)
Best params: {'C': 10.0, 'gamma': 1.0, 'kernel': 'rbf'}
training time(min): 0.4622746099999858
train score: 1.0
test score: 0.5526315789473685
train:               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        99
         1.0       1.00      1.00      1.00       101
         2.0       1.00      1.00      1.00       103

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303

test:               precision    recall  f1-score   support

         0.0       0.71      0.50      0.59        24
         1.0       0.55      0.50      0.52        24
         2.0       0.49      0.64      0.55        28

    accuracy                           0.55        76
   macro avg       0.58      0.55      0.55        76
weighted avg       0.57      0.55      0.55        76



In [18]:
# Leave-one-subject-out evaluation: each subject in turn is held out as the
# test set while the data of all other subjects forms the training set.

# Names of the feature columns used as model input.
columns_to_scale = ['bicps_br_height.cm.',
                    'bicps_br_weight.kg.',
                    'bicps_br_age',
                    'bicps_br_FMG',
                    'bicps_br_mav',
                    'bicps_br_rms',
                    'bicps_br_wave_length',
                    'bicps_br_zero_crossing',
                    'bicps_br_slope_sign_change',
                    'bicps_br_mean_freq',
                    'bicps_br_mean_power_freq',
                    'tricps_br_medial_FMG',
                    'tricps_br_medial_mav',
                    'tricps_br_medial_rms',
                    'tricps_br_medial_wave_length',
                    'tricps_br_lateral_FMG',
                    'tricps_br_lateral_mav',
                    'tricps_br_lateral_rms',
                    'tricps_br_lateral_wave_length']

# Iterate over the subjects in sorted order. Iterating a plain set of strings
# yields an order that can change between kernel sessions, which would make the
# log non-reproducible from run to run.
for subject in sorted(filtered_df["bicps_br_subject_name"].unique()):
    logging.info("=======================================================")
    logging.info(f"test subject: {subject}")

    # Split into test set (the held-out subject) and training set (everyone else).
    is_test_subject = filtered_df["bicps_br_subject_name"] == subject
    test_df = filtered_df[is_test_subject]
    train_df = filtered_df[~is_test_subject]
    x_test = test_df.loc[:, columns_to_scale].values
    y_test = test_df.loc[:, 'bicps_br_label'].values
    x_train = train_df.loc[:, columns_to_scale].values
    y_train = train_df.loc[:, 'bicps_br_label'].values
    logging.info(f"x_test: {x_test.shape}")
    logging.info(f"y_test: {y_test.shape}")
    logging.info(f"x_train: {x_train.shape}")
    logging.info(f"y_train: {y_train.shape}")

    # Standardise the features: the scaler is fitted on the training subjects
    # only and then applied unchanged to the held-out subject.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    # Grid search for the best SVC parameters (5-fold CV on the training
    # subjects). The estimator is a classifier even though the variable is
    # named `svr`.

    # Record the training start time.
    start_time = time.perf_counter()

    svr = GridSearchCV(
        svm.SVC(),
        param_grid={
            "kernel": ("linear", "rbf"),
            "C": np.logspace(-3, 3, 7),
            "gamma": np.logspace(-3, 3, 7),
        },
        cv=5,
        n_jobs=-1,
    )
    svr.fit(x_train_scaled, y_train)
    logging.info(f"Best params: {svr.best_params_}")

    end_time = time.perf_counter()
    logging.info(f"training time(min): {(end_time - start_time)/60}")

    # Accuracy on the held-out subject and on the training subjects.
    score_test = svr.score(x_test_scaled, y_test)
    score_train = svr.score(x_train_scaled, y_train)
    logging.info(f"{str(svr)} train score: {str(score_train)}")
    logging.info(f"{str(svr)} test score: {str(score_test)}")

    # Confusion matrix for the held-out subject. `train_pre` is not logged
    # here; the name is still assigned so it stays available to other cells.
    test_pre = svr.predict(x_test_scaled)
    train_pre = svr.predict(x_train_scaled)
    logging.info(confusion_matrix(y_test, test_pre))