In [2]:
"""
模型训练
"""

import joblib
import time
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
# Load the dataset from disk; the first CSV column becomes the row index.
csv_path = r"D:\code\data\iFEMG_data_set\One_channel_dataset\20221007.csv"
data_df = pd.read_csv(csv_path, index_col=0)

# Report (rows, columns), then preview the first rows as the cell's output.
print(data_df.shape)
data_df.head()

(2155, 15)


Unnamed: 0,subject_name,height(cm),weight(kg),gender,age,sensor_channel,label(kg),FMG_increase,mav,rms,wave_length,zero_crossing,slope_sign_change,mean_freq,mean_power_freq
0,Wang Junhan,170,60,1,21,bicps_br,0.0,0.635723,10.453135,17.756081,3.921375,-0.437357,-0.24576,52.549982,57.153273
1,Wang Junhan,170,60,1,21,bicps_br,0.0,0.669451,2.126425,2.229017,0.939814,-0.458228,-0.143381,68.783704,91.455915
2,Wang Junhan,170,60,1,21,bicps_br,0.0,0.799892,1.900349,1.776664,1.31773,-0.397235,-0.123461,66.73004,91.044406
3,Wang Junhan,170,60,1,21,bicps_br,0.0,0.638235,1.742331,1.789428,0.92663,-0.372496,-0.129632,71.40615,96.157089
4,Wang Junhan,170,60,1,21,bicps_br,0.0,0.66184,2.419714,2.047832,1.420618,-0.386364,-0.172241,85.291018,106.993184


In [4]:
"""
数据清洗 remove the data bellow:
    1. NaN value
    2. label is MVC
"""
# replace 'MVC' with NaN
data_df = data_df.replace('MVC', np.nan)
# delete NaN value
data_df = data_df.dropna(how = 'any')

# label: str to num
data_df['label(kg)']= pd.to_numeric(data_df['label(kg)'])

print(data_df.shape)
data_df.dtypes


(1949, 15)


subject_name          object
height(cm)             int64
weight(kg)             int64
gender                 int64
age                    int64
sensor_channel        object
label(kg)            float64
FMG_increase         float64
mav                  float64
rms                  float64
wave_length          float64
zero_crossing        float64
slope_sign_change    float64
mean_freq            float64
mean_power_freq      float64
dtype: object

In [5]:
"""
1. get x and y data
2. split train and test
3. scale
"""
x_data = data_df[['height(cm)', 'weight(kg)', 'gender', 'age', 'label(kg)', 'FMG_increase', 'mav', 'rms', 'wave_length', 'zero_crossing', 'slope_sign_change', 'mean_freq', 'mean_power_freq']].values
y_data = data_df['label(kg)'].values

# train_data_r, test_data_r, train_label, test_label = train_test_split(x_data, y_data, test_size = 0.2, random_state = 0, stratify = y_data)
train_data_r, test_data_r, train_label, test_label = train_test_split(x_data, y_data, test_size = 0.2, random_state = 0)
# train_data = preprocessing.normalize(train_data_r, norm = 'max')
# test_data = preprocessing.normalize(test_data_r, norm = 'max')
scaler = preprocessing.StandardScaler().fit(train_data_r)
train_data = scaler.transform(train_data_r)
test_data = scaler.transform(test_data_r)

print("train data shape: ", train_data.shape)
print("train label length: ", train_label.shape)
print("test data shape: ", test_data.shape)
print("test label length: ", test_label.shape)

train data shape:  (1559, 13)
train label length:  (1559,)
test data shape:  (390, 13)
test label length:  (390,)


In [5]:
"""
Grid searching best parameters of SVR model
"""
# 记录开始训练时间
start_time = time.perf_counter()

# 自动选择合适的参数
svr = GridSearchCV(SVR(), param_grid={"kernel": ("rbf", "linear"), "C": np.logspace(-3, 3, 7), "gamma": np.logspace(-3, 3, 7)}, n_jobs=-1)
svr.fit(train_data, train_label)

print(svr.best_params_)

end_time = time.perf_counter()
print("运行时间(min): ", (end_time - start_time)/60)

start training, time:  2022--11--09 16:41:57
{'C': 1000.0, 'gamma': 0.001, 'kernel': 'rbf'}
finish, time:  2022--11--09 16:42:04


In [6]:

# Score the fitted grid search on both splits (for SVR with default scoring,
# .score() is the R^2 coefficient) and collect predictions for the MSE below.
score_train = svr.score(train_data, train_label)
score_test = svr.score(test_data, test_label)
train_pre = svr.predict(train_data)
test_pre = svr.predict(test_data)

# Each score line is prefixed with the full text representation of the search
# object, exactly as str(svr) renders it.
print(f"{svr!s}train score： {score_train!s}")
print(f"{svr!s}test score： {score_test!s}")

train_mse = mean_squared_error(train_label, train_pre)
test_mse = mean_squared_error(test_label, test_pre)
print("train mean squared error: ", train_mse)
print("test mean squared error: ", test_mse)


GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ('rbf', 'linear')})train score： 0.9996358359989818
GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ('rbf', 'linear')})test score： 0.9996304775099476
train mean squared error:  0.002440257399917376
test mean squared error:  0.0024763594616069875


In [1]:
"""
train model using the best parameters
"""

# 记录开始训练时间
start_time = time.time()
print("start training, time: ", time.strftime("%Y--%m--%d %H:%M:%S", time.localtime(start_time)))


regression_model = SVR(kernel='rbf', C=1000, gamma=0.001)
regression_model.fit(train_data, train_label)

end_time = time.time()
print("finish, time: ", time.strftime("%Y--%m--%d %H:%M:%S", time.localtime(end_time)))

NameError: name 'time' is not defined

In [6]:
# Persist the fitted grid-search object and the fitted scaler next to the
# notebook. Both are needed together at prediction time: inputs must be
# transformed by the scaler before being passed to the model.
# joblib files are pickle-based - only load them from trusted sources.
model_path = 'OneChannelRegression.pkl'
scaler_path = "OneChannelScaler.save"

joblib.dump(svr, model_path)
joblib.dump(scaler, scaler_path)

['OneChannelScaler.save']