In [None]:
import pandas as pd
import numpy as np
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from feature_engine.timeseries.forecasting import WindowFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score

# 定义滑窗长度列表
window_lengths = [0.6, 0.8, 1]

# 定义要处理的主题
train_subjects = ['Carpenter1', 'Carpenter2', 'Rebar1', 'Rebar2', 'Rebar3']
test_subjects = ['Masonry1', 'Masonry2']

# 初始化空数据框存储所有主题的数据
all_train_data = []
all_test_data = []

# 首先，我们定义一个函数来过滤出非数值型的列名
def get_numeric_columns(df):
    return [col for col in df.columns if df[col].dtype in ['int64', 'float64']]

# 遍历每个主题文件
for subject in train_subjects + test_subjects:
    # 读取数据
    df = pd.read_csv(f'../{subject}.csv')
    
    # 只选取前八个动作
    df = df[df['Label_2'] <= 8]
    
    # 重新编码标签
    label_mapping = {1: 1, 2: 2, 3: 3, 4: 3, 5: 4, 6: 3, 7: 3, 8: 5}
    df['Label_2'] = df['Label_2'].map(label_mapping)
    
    # 同步 xd 和 kd 的记录
    df_xd = df[df['position'] == 'xd'].copy()
    df_kd = df[df['position'] == 'kd'].copy()

    # 确保 xd 和 kd 有相同的时间戳
    common_times = set(df_xd['seconds_elapsed']).intersection(set(df_kd['seconds_elapsed']))
    df_xd = df_xd[df_xd['seconds_elapsed'].isin(common_times)]
    df_kd = df_kd[df_kd['seconds_elapsed'].isin(common_times)]

    # 将 xd 和 kd 的数据合并为一个数据单元
    df_combined = pd.merge(df_xd, df_kd, on='seconds_elapsed', suffixes=('_xd', '_kd'))

    # 在合并 df_xd 和 df_kd 之后，检查并转换数据类型
    for col in df_combined.columns:
        if df_combined[col].dtype == 'object':
            # 尝试将非数值列转换为数值
            try:
                df_combined[col] = pd.to_numeric(df_combined[col], errors='raise')
            except ValueError:
                # 如果转换失败，打印警告并删除这些列
                print(f"Warning: Column {col} could not be converted to a numeric type and will be dropped.")
                df_combined.drop(columns=[col], inplace=True)

    # 删除包含NaN的行
    df_combined.dropna(inplace=True)

    # 检查并替换inf值为NaN，然后删除包含NaN的行
    df_combined.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_combined.dropna(inplace=True)

    # 获取数值型列列表，排除特定的非数值型列
    numeric_columns = get_numeric_columns(df_combined)
    exclude_columns = ['seconds_elapsed', 'time_xd', 'Real Time_xd', 'Label_2_xd']
    numeric_columns = [col for col in numeric_columns if col not in exclude_columns]

    # 设置合理的数值范围（可以根据实际情况调整）
    lower_bound = -10000
    upper_bound = 10000

    # 创建一个新的空DataFrame来存储筛选后的结果
    df_filtered = pd.DataFrame()

    for col in numeric_columns:
        # 使用 & 操作符组合多个条件，并筛选出符合条件的行
        filtered_rows = df_combined[(df_combined[col] >= lower_bound) & (df_combined[col] <= upper_bound)]
        # 将筛选后的结果追加到新的DataFrame中
        df_filtered = pd.concat([df_filtered, filtered_rows], ignore_index=True)

    # 如果你想要替换原来的DataFrame
    df_combined = df_filtered

    # # 设置合理的 chunksize
    # chunksize = 1000  # 你可以根据实际情况调整这个值

    # # 使用 tsfresh 提取特征
    # extracted_features = extract_features(
    #     df_combined,
    #     column_id="seconds_elapsed",
    #     column_sort="seconds_elapsed",
    #     chunksize=chunksize  # 显式设置 chunksize
    # )
        
    impute(df_combined)
    # target = df_combined[['Label_2_xd']].drop_duplicates().set_index('seconds_elapsed')
    target = df_combined[['Label_2_xd']].drop_duplicates()
    # relevant_features_tsfresh = select_features(df_combined, target)

    # 创建不同滑窗长度的特征
    for window_length in window_lengths:
        window_features = WindowFeatures(
            variables=[col for col in df_combined.columns if col not in ['seconds_elapsed', 'time_xd', 'Real Time_xd', 'Label_2_xd']],
            window=f'{window_length}s',
            freq='1s'
        )
        df_combined = pd.concat([df_combined, window_features.fit_transform(df_combined)], axis=1)

    # 合并所有特征
    # all_features = pd.concat([df_combined, relevant_features_tsfresh], axis=1).dropna()

    # 分离特征和目标变量
    X = df_combined.drop(columns=['seconds_elapsed', 'time_xd', 'Real Time_xd', 'Label_2_xd'])
    y = df_combined['Label_2_xd']

    if subject in train_subjects:
        all_train_data.append((X, y))
    else:
        all_test_data.append((X, y))

# 合并训练数据和测试数据
X_train_all, y_train_all = pd.concat([data[0] for data in all_train_data]), pd.concat([data[1] for data in all_train_data])
X_test_all, y_test_all = pd.concat([data[0] for data in all_test_data]), pd.concat([data[1] for data in all_test_data])

# 使用SMOTE处理类别不平衡
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_all, y_train_all)

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 定义XGBoost和LightGBM的参数空间
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'tree_method': 'gpu_hist',
}

lgbm_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'tree_method': 'gpu_hist',
}

# 使用 RandomizedSearchCV 进行XGBoost模型调优
xgb_clf = XGBClassifier(n_jobs=-1, random_state=42)
xgb_random_search = RandomizedSearchCV(xgb_clf, xgb_param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
xgb_random_search.fit(X_train, y_train)

# 输出最佳参数
print("Best parameters for XGBoost:", xgb_random_search.best_params_)
best_xgb_clf = xgb_random_search.best_estimator_

# 使用 RandomizedSearchCV 进行LightGBM模型调优
lgbm_clf = LGBMClassifier(n_jobs=-1, random_state=42)
lgbm_random_search = RandomizedSearchCV(lgbm_clf, lgbm_param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
lgbm_random_search.fit(X_train, y_train)

# 输出最佳参数
print("Best parameters for LightGBM:", lgbm_random_search.best_params_)
best_lgbm_clf = lgbm_random_search.best_estimator_

# 在验证集上评估XGBoost模型
y_pred_xgb = best_xgb_clf.predict(X_val)
print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_val, y_pred_xgb))

# 在验证集上评估LightGBM模型
y_pred_lgbm = best_lgbm_clf.predict(X_val)
print("LightGBM Validation Accuracy:", accuracy_score(y_val, y_pred_lgbm))
print("LightGBM Classification Report:\n", classification_report(y_val, y_pred_lgbm))

# 对测试集进行预测
y_pred_xgb_test = best_xgb_clf.predict(X_test_all)
print("XGBoost Test Accuracy:", accuracy_score(y_test_all, y_pred_xgb_test))
print("XGBoost Test Classification Report:\n", classification_report(y_test_all, y_pred_xgb_test))

y_pred_lgbm_test = best_lgbm_clf.predict(X_test_all)
print("LightGBM Test Accuracy:", accuracy_score(y_test_all, y_pred_lgbm_test))
print("LightGBM Test Classification Report:\n", classification_report(y_test_all, y_pred_lgbm_test))

In [None]:
import pandas as pd
import numpy as np
from feature_engine.imputation import MeanMedianImputer
from feature_engine.timeseries.forecasting import WindowFeatures
from feature_engine.selection import SelectByShuffling
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score

# 定义滑窗长度列表
window_lengths = [0.6, 0.8, 1]

# 定义要处理的主题
train_subjects = ['Carpenter1', 'Carpenter2', 'Rebar1', 'Rebar2', 'Rebar3']
test_subjects = ['Masonry1', 'Masonry2']

# 初始化空数据框存储所有主题的数据
all_train_data = []
all_test_data = []

# 首先，我们定义一个函数来过滤出非数值型的列名
def get_numeric_columns(df):
    return [col for col in df.columns if df[col].dtype in ['int64', 'float64']]

# 遍历每个主题文件
for subject in train_subjects + test_subjects:
    # 读取数据
    df = pd.read_csv(f'../{subject}.csv')
    
    # 只选取前八个动作
    df = df[df['Label_2'] <= 8]
    
    # 重新编码标签
    label_mapping = {1: 1, 2: 2, 3: 3, 4: 3, 5: 4, 6: 3, 7: 3, 8: 5}
    df['Label_2'] = df['Label_2'].map(label_mapping)
    
    # 同步 xd 和 kd 的记录
    df_xd = df[df['position'] == 'xd'].copy()
    df_kd = df[df['position'] == 'kd'].copy()

    # 确保 xd 和 kd 有相同的时间戳
    common_times = set(df_xd['seconds_elapsed']).intersection(set(df_kd['seconds_elapsed']))
    df_xd = df_xd[df_xd['seconds_elapsed'].isin(common_times)]
    df_kd = df_kd[df_kd['seconds_elapsed'].isin(common_times)]

    # 将 xd 和 kd 的数据合并为一个数据单元
    df_combined = pd.merge(df_xd, df_kd, on='seconds_elapsed', suffixes=('_xd', '_kd'))

    # 在合并 df_xd 和 df_kd 之后，检查并转换数据类型
    for col in df_combined.columns:
        if df_combined[col].dtype == 'object':
            # 尝试将非数值列转换为数值
            try:
                df_combined[col] = pd.to_numeric(df_combined[col], errors='raise')
            except ValueError:
                # 如果转换失败，打印警告并删除这些列
                print(f"Warning: Column {col} could not be converted to a numeric type and will be dropped.")
                df_combined.drop(columns=[col], inplace=True)

    # 删除包含NaN的行
    df_combined.dropna(inplace=True)

    # 检查并替换inf值为NaN，然后删除包含NaN的行
    df_combined.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_combined.dropna(inplace=True)

    # 获取数值型列列表，排除特定的非数值型列
    numeric_columns = get_numeric_columns(df_combined)
    exclude_columns = ['seconds_elapsed', 'time_xd', 'Real Time_xd', 'Label_2_xd']
    numeric_columns = [col for col in numeric_columns if col not in exclude_columns]

    # 创建不同滑窗长度的特征
    for window_length in window_lengths:
        window_features = WindowFeatures(
            variables=[col for col in df_combined.columns if col not in ['seconds_elapsed', 'time_xd', 'Real Time_xd', 'Label_2_xd']],
            window=f'{window_length}s',
            freq='1s'
        )
        df_combined = pd.concat([df_combined, window_features.fit_transform(df_combined)], axis=1)

    # 合并所有特征
    all_features = df_combined

    # 分离特征和目标变量
    X = all_features.drop(columns=['seconds_elapsed', 'time_xd', 'Real Time_xd', 'Label_2_xd'])
    y = all_features['Label_2_xd']

    if subject in train_subjects:
        all_train_data.append((X, y))
    else:
        all_test_data.append((X, y))

# 合并训练数据和测试数据
X_train_all, y_train_all = pd.concat([data[0] for data in all_train_data]), pd.concat([data[1] for data in all_train_data])
X_test_all, y_test_all = pd.concat([data[0] for data in all_test_data]), pd.concat([data[1] for data in all_test_data])

# 使用SMOTE处理类别不平衡
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_all, y_train_all)

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 定义XGBoost和LightGBM的参数空间
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'tree_method': 'gpu_hist',
}

lgbm_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'tree_method': 'gpu_hist',
}

# 使用 RandomizedSearchCV 进行XGBoost模型调优
xgb_clf = XGBClassifier(n_jobs=-1, random_state=42)
xgb_random_search = RandomizedSearchCV(xgb_clf, xgb_param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
xgb_random_search.fit(X_train, y_train)

# 输出最佳参数
print("Best parameters for XGBoost:", xgb_random_search.best_params_)
best_xgb_clf = xgb_random_search.best_estimator_

# 使用 RandomizedSearchCV 进行LightGBM模型调优
lgbm_clf = LGBMClassifier(n_jobs=-1, random_state=42)
lgbm_random_search = RandomizedSearchCV(lgbm_clf, lgbm_param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
lgbm_random_search.fit(X_train, y_train)

# 输出最佳参数
print("Best parameters for LightGBM:", lgbm_random_search.best_params_)
best_lgbm_clf = lgbm_random_search.best_estimator_

# 在验证集上评估XGBoost模型
y_pred_xgb = best_xgb_clf.predict(X_val)
print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_val, y_pred_xgb))

# 在验证集上评估LightGBM模型
y_pred_lgbm = best_lgbm_clf.predict(X_val)
print("LightGBM Validation Accuracy:", accuracy_score(y_val, y_pred_lgbm))
print("LightGBM Classification Report:\n", classification_report(y_val, y_pred_lgbm))

# 对测试集进行预测
y_pred_xgb_test = best_xgb_clf.predict(X_test_all)
print("XGBoost Test Accuracy:", accuracy_score(y_test_all, y_pred_xgb_test))
print("XGBoost Test Classification Report:\n", classification_report(y_test_all, y_pred_xgb_test))

y_pred_lgbm_test = best_lgbm_clf.predict(X_test_all)
print("LightGBM Test Accuracy:", accuracy_score(y_test_all, y_pred_lgbm_test))
print("LightGBM Test Classification Report:\n", classification_report(y_test_all, y_pred_lgbm_test))

In [16]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

# 定义滑窗长度列表（假设采样频率为1Hz）
sampling_frequency = 1  # 每秒一个样本
window_lengths = [max(int(0.6 * sampling_frequency), 1), max(int(0.8 * sampling_frequency), 1), max(int(1 * sampling_frequency), 1)]

# 定义要处理的主题
train_subjects = ['Carpenter1', 'Carpenter2', 'Rebar1', 'Rebar2', 'Rebar3']
test_subjects = ['Masonry1', 'Masonry2']

# 初始化空数据框存储所有主题的数据
all_train_data = []
all_test_data = []

# 遍历每个主题文件
for subject in train_subjects + test_subjects:
    # 读取数据，设置 low_memory=False
    df = pd.read_csv(f'../{subject}.csv', low_memory=False)
    
    # 将 'Label_2' 列转换为数值类型
    df['Label_2'] = pd.to_numeric(df['Label_2'], errors='coerce')
    
    # 只选取前八个动作
    df = df[df['Label_2'] <= 8]
    
    # 重新编码标签
    label_mapping = {1: 1, 2: 2, 3: 3, 4: 3, 5: 4, 6: 3, 7: 3, 8: 5}
    df['Label_2'] = df['Label_2'].map(label_mapping)
    
    # 同步 xd 和 kd 的记录
    df_xd = df[df['position'] == 'xd'].copy()
    df_kd = df[df['position'] == 'kd'].copy()

    # 确保 xd 和 kd 有相同的时间戳
    common_times = set(df_xd['seconds_elapsed']).intersection(set(df_kd['seconds_elapsed']))
    df_xd = df_xd[df_xd['seconds_elapsed'].isin(common_times)]
    df_kd = df_kd[df_kd['seconds_elapsed'].isin(common_times)]

    # 将 xd 和 kd 的数据合并为一个数据单元
    df_combined = pd.merge(df_xd, df_kd, on='seconds_elapsed', suffixes=('_xd', '_kd'))

    # 删除非数值列
    df_combined = df_combined.select_dtypes(include=[np.number])

    # 删除包含NaN的行
    df_combined.dropna(inplace=True)

    # 检查并替换inf值为NaN，然后删除包含NaN的行
    df_combined.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_combined.dropna(inplace=True)

    # 创建不同滑窗长度的特征
    new_features = {}
    for window_length in window_lengths:
        for col in df_combined.columns:
            if col not in ['seconds_elapsed', 'Label_2_xd']:
                # 确保窗口大小至少为1
                window_length = max(window_length, 1)
                new_features[f'{col}_mean_{window_length}'] = df_combined[col].rolling(window=window_length, min_periods=1).mean()
                new_features[f'{col}_std_{window_length}'] = df_combined[col].rolling(window=window_length, min_periods=1).std()

    # 一次性将所有新特征添加到 DataFrame 中
    df_combined = pd.concat([df_combined, pd.DataFrame(new_features)], axis=1)

    # 分离特征和目标变量
    X = df_combined.drop(columns=['seconds_elapsed', 'Label_2_xd'])
    y = df_combined['Label_2_xd']

    if subject in train_subjects:
        all_train_data.append((X, y))
    else:
        all_test_data.append((X, y))

# 合并训练数据和测试数据
X_train_all, y_train_all = pd.concat([data[0] for data in all_train_data]), pd.concat([data[1] for data in all_train_data])
X_test_all, y_test_all = pd.concat([data[0] for data in all_test_data]), pd.concat([data[1] for data in all_test_data])

# 使用 SimpleImputer 处理缺失值
imputer = SimpleImputer(strategy='mean')  # 也可以选择其他策略，如 'median' 或 'most_frequent'
X_train_imputed = imputer.fit_transform(X_train_all)
X_test_imputed = imputer.transform(X_test_all)


 'Accelerometer.xlsx_x_xd_std_1' 'Gravity_z_xd_std_1' 'Gravity_y_xd_std_1'
 'Gravity_x_xd_std_1' 'Gyroscope_z_xd_std_1' 'Gyroscope_y_xd_std_1'
 'Gyroscope_x_xd_std_1' 'Magnetometer_z_xd_std_1'
 'Magnetometer_y_xd_std_1' 'Magnetometer_x_xd_std_1' 'yaw_xd_std_1'
 'qx_xd_std_1' 'qz_xd_std_1' 'roll_xd_std_1' 'qw_xd_std_1' 'qy_xd_std_1'
 'pitch_xd_std_1' 'time_xd_std_1' 'Accelerometer.xlsx_z_kd_std_1'
 'Accelerometer.xlsx_y_kd_std_1' 'Accelerometer.xlsx_x_kd_std_1'
 'Gravity_z_kd_std_1' 'Gravity_y_kd_std_1' 'Gravity_x_kd_std_1'
 'Gyroscope_z_kd_std_1' 'Gyroscope_y_kd_std_1' 'Gyroscope_x_kd_std_1'
 'Magnetometer_z_kd_std_1' 'Magnetometer_y_kd_std_1'
 'Magnetometer_x_kd_std_1' 'yaw_kd_std_1' 'qx_kd_std_1' 'qz_kd_std_1'
 'roll_kd_std_1' 'qw_kd_std_1' 'qy_kd_std_1' 'pitch_kd_std_1'
 'time_kd_std_1' 'Label_2_kd_std_1']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Accelerometer.xlsx_x_xd_std_1' 'Gravity_z_xd_std_1' 'Gravity_y_xd_std_1'
 'Gravity_x_xd_std_1' 'Gy

In [None]:

# # 使用 SMOTE 处理类别不平衡，并设置 n_neighbors 为 2
# smote = SMOTE(random_state=42, k_neighbors=2)
# X_resampled, y_resampled = smote.fit_resample(X_train_imputed, y_train_all)

# 输出结果
# print("Resampled training data shape:", X_resampled.shape, y_resampled.shape)

In [17]:
X_train_imputed

array([[ 2.08363057e-01,  8.01913179e-01, -8.36096896e-01,
        -2.51921628e+00, -8.80946968e+00,  3.49530646e+00,
         3.50175200e-03,  1.36099130e-01, -2.36295402e-01,
         1.51912842e+01, -6.44863968e+01,  2.73372650e+01,
        -5.48659956e-01,  5.58392365e-01,  2.76756300e-02,
         9.46284364e-01,  7.92261506e-01,  2.44445664e-01,
         1.11592127e+00,  1.63288000e+18, -4.39467120e-02,
        -2.85357740e-02,  9.60492210e-02,  2.08847205e-01,
        -9.00688143e+00, -3.87335067e+00, -2.17964370e-02,
        -4.28508070e-02, -8.99861530e-02,  4.21420288e+01,
        -4.67664719e+01, -7.88182545e+00,  3.08764323e+00,
         6.16336533e-01,  5.63572813e-01, -1.62466317e+00,
         4.14412132e-01,  3.61631749e-01,  1.16413445e+00,
         1.63288000e+18,  1.00000000e+00,  2.08363057e-01,
         8.01913179e-01, -8.36096896e-01, -2.51921628e+00,
        -8.80946968e+00,  3.49530646e+00,  3.50175200e-03,
         1.36099130e-01, -2.36295402e-01,  1.51912842e+0

In [None]:
# 合并训练数据和测试数据
X_train_all, y_train_all = pd.concat([data[0] for data in all_train_data]), pd.concat([data[1] for data in all_train_data])
X_test_all, y_test_all = pd.concat([data[0] for data in all_test_data]), pd.concat([data[1] for data in all_test_data])

# 使用SMOTE处理类别不平衡
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_all, y_train_all)

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 定义XGBoost和LightGBM的参数空间
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'tree_method': 'gpu_hist',
}

lgbm_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'tree_method': 'gpu_hist',
}

# 使用 RandomizedSearchCV 进行XGBoost模型调优
xgb_clf = XGBClassifier(n_jobs=-1, random_state=42)
xgb_random_search = RandomizedSearchCV(xgb_clf, xgb_param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
xgb_random_search.fit(X_train, y_train)

# 输出最佳参数
print("Best parameters for XGBoost:", xgb_random_search.best_params_)
best_xgb_clf = xgb_random_search.best_estimator_

# 使用 RandomizedSearchCV 进行LightGBM模型调优
lgbm_clf = LGBMClassifier(n_jobs=-1, random_state=42)
lgbm_random_search = RandomizedSearchCV(lgbm_clf, lgbm_param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
lgbm_random_search.fit(X_train, y_train)

# 输出最佳参数
print("Best parameters for LightGBM:", lgbm_random_search.best_params_)
best_lgbm_clf = lgbm_random_search.best_estimator_

# 在验证集上评估XGBoost模型
y_pred_xgb = best_xgb_clf.predict(X_val)
print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_val, y_pred_xgb))

# 在验证集上评估LightGBM模型
y_pred_lgbm = best_lgbm_clf.predict(X_val)
print("LightGBM Validation Accuracy:", accuracy_score(y_val, y_pred_lgbm))
print("LightGBM Classification Report:\n", classification_report(y_val, y_pred_lgbm))

# 对测试集进行预测
y_pred_xgb_test = best_xgb_clf.predict(X_test_all)
print("XGBoost Test Accuracy:", accuracy_score(y_test_all, y_pred_xgb_test))
print("XGBoost Test Classification Report:\n", classification_report(y_test_all, y_pred_xgb_test))

y_pred_lgbm_test = best_lgbm_clf.predict(X_test_all)
print("LightGBM Test Accuracy:", accuracy_score(y_test_all, y_pred_lgbm_test))
print("LightGBM Test Classification Report:\n", classification_report(y_test_all, y_pred_lgbm_test))

In [19]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# 定义滑窗长度列表（假设采样频率为1Hz）
sampling_frequency = 1  # 每秒一个样本
window_lengths = [max(int(0.6 * sampling_frequency), 1), max(int(0.8 * sampling_frequency), 1), max(int(1 * sampling_frequency), 1)]

# 定义要处理的主题
train_subjects = ['Carpenter1', 'Carpenter2', 'Rebar1', 'Rebar2', 'Rebar3']
test_subjects = ['Masonry1', 'Masonry2']

# 初始化空数据框存储所有主题的数据
all_train_data = []
all_test_data = []

# 遍历每个主题文件
for subject in train_subjects + test_subjects:
    # 读取数据，设置 low_memory=False
    df = pd.read_csv(f'../{subject}.csv', low_memory=False)
    
    # 将 'Label_2' 列转换为数值类型
    df['Label_2'] = pd.to_numeric(df['Label_2'], errors='coerce')
    
    # 只选取前八个动作
    df = df[df['Label_2'] <= 8]
    
    # 重新编码标签
    label_mapping = {1: 1, 2: 2, 3: 3, 4: 3, 5: 4, 6: 3, 7: 3, 8: 5}
    df['Label_2'] = df['Label_2'].map(label_mapping)
    
    # 同步 xd 和 kd 的记录
    df_xd = df[df['position'] == 'xd'].copy()
    df_kd = df[df['position'] == 'kd'].copy()

    # 确保 xd 和 kd 有相同的时间戳
    common_times = set(df_xd['seconds_elapsed']).intersection(set(df_kd['seconds_elapsed']))
    df_xd = df_xd[df_xd['seconds_elapsed'].isin(common_times)]
    df_kd = df_kd[df_kd['seconds_elapsed'].isin(common_times)]

    # 将 xd 和 kd 的数据合并为一个数据单元
    df_combined = pd.merge(df_xd, df_kd, on='seconds_elapsed', suffixes=('_xd', '_kd'))

    # 删除非数值列
    df_combined = df_combined.select_dtypes(include=[np.number])

    # 删除包含NaN的行
    df_combined.dropna(inplace=True)

    # 检查并替换inf值为NaN，然后删除包含NaN的行
    df_combined.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_combined.dropna(inplace=True)

    # 创建不同滑窗长度的特征
    new_features = {}
    for window_length in window_lengths:
        for col in df_combined.columns:
            if col not in ['seconds_elapsed', 'Label_2_xd']:
                # 确保窗口大小至少为1
                window_length = max(window_length, 1)
                new_features[f'{col}_mean_{window_length}'] = df_combined[col].rolling(window=window_length, min_periods=1).mean()
                new_features[f'{col}_std_{window_length}'] = df_combined[col].rolling(window=window_length, min_periods=1).std()

    # 一次性将所有新特征添加到 DataFrame 中
    df_combined = pd.concat([df_combined, pd.DataFrame(new_features)], axis=1)

    # 分离特征和目标变量
    X = df_combined.drop(columns=['seconds_elapsed', 'Label_2_xd'])
    y = df_combined['Label_2_xd']

    if subject in train_subjects:
        all_train_data.append((X, y))
    else:
        all_test_data.append((X, y))

# 合并训练数据和测试数据
X_train_all, y_train_all = pd.concat([data[0] for data in all_train_data]), pd.concat([data[1] for data in all_train_data])
X_test_all, y_test_all = pd.concat([data[0] for data in all_test_data]), pd.concat([data[1] for data in all_test_data])

# 使用 SimpleImputer 处理缺失值
imputer = SimpleImputer(strategy='mean')  # 也可以选择其他策略，如 'median' 或 'most_frequent'
X_train_imputed = imputer.fit_transform(X_train_all)
X_test_imputed = imputer.transform(X_test_all)

# # 使用SMOTE处理类别不平衡
# smote = SMOTE(random_state=42, k_neighbors=2)  # 设置 k_neighbors 为 2
# X_resampled, y_resampled = smote.fit_resample(X_train_imputed, y_train_all)

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_train_imputed, X_test_imputed, test_size=0.2, random_state=42)

# 定义XGBoost和LightGBM的参数空间
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

lgbm_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# 使用 RandomizedSearchCV 进行XGBoost模型调优
xgb_clf = XGBClassifier(n_jobs=-1, random_state=42)
xgb_random_search = RandomizedSearchCV(xgb_clf, xgb_param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
xgb_random_search.fit(X_train, y_train)

# 输出最佳参数
print("Best parameters for XGBoost:", xgb_random_search.best_params_)
best_xgb_clf = xgb_random_search.best_estimator_

# 使用 RandomizedSearchCV 进行LightGBM模型调优
lgbm_clf = LGBMClassifier(n_jobs=-1, random_state=42)
lgbm_random_search = RandomizedSearchCV(lgbm_clf, lgbm_param_grid, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
lgbm_random_search.fit(X_train, y_train)

# 输出最佳参数
print("Best parameters for LightGBM:", lgbm_random_search.best_params_)
best_lgbm_clf = lgbm_random_search.best_estimator_

# 在验证集上评估XGBoost模型
y_pred_xgb = best_xgb_clf.predict(X_val)
print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_val, y_pred_xgb))

# 在验证集上评估LightGBM模型
y_pred_lgbm = best_lgbm_clf.predict(X_val)
print("LightGBM Validation Accuracy:", accuracy_score(y_val, y_pred_lgbm))
print("LightGBM Classification Report:\n", classification_report(y_val, y_pred_lgbm))

# 对测试集进行预测
y_pred_xgb_test = best_xgb_clf.predict(X_test_imputed)
print("XGBoost Test Accuracy:", accuracy_score(y_test_all, y_pred_xgb_test))
print("XGBoost Test Classification Report:\n", classification_report(y_test_all, y_pred_xgb_test))

y_pred_lgbm_test = best_lgbm_clf.predict(X_test_imputed)
print("LightGBM Test Accuracy:", accuracy_score(y_test_all, y_pred_lgbm_test))
print("LightGBM Test Classification Report:\n", classification_report(y_test_all, y_pred_lgbm_test))

 'Accelerometer.xlsx_x_xd_std_1' 'Gravity_z_xd_std_1' 'Gravity_y_xd_std_1'
 'Gravity_x_xd_std_1' 'Gyroscope_z_xd_std_1' 'Gyroscope_y_xd_std_1'
 'Gyroscope_x_xd_std_1' 'Magnetometer_z_xd_std_1'
 'Magnetometer_y_xd_std_1' 'Magnetometer_x_xd_std_1' 'yaw_xd_std_1'
 'qx_xd_std_1' 'qz_xd_std_1' 'roll_xd_std_1' 'qw_xd_std_1' 'qy_xd_std_1'
 'pitch_xd_std_1' 'time_xd_std_1' 'Accelerometer.xlsx_z_kd_std_1'
 'Accelerometer.xlsx_y_kd_std_1' 'Accelerometer.xlsx_x_kd_std_1'
 'Gravity_z_kd_std_1' 'Gravity_y_kd_std_1' 'Gravity_x_kd_std_1'
 'Gyroscope_z_kd_std_1' 'Gyroscope_y_kd_std_1' 'Gyroscope_x_kd_std_1'
 'Magnetometer_z_kd_std_1' 'Magnetometer_y_kd_std_1'
 'Magnetometer_x_kd_std_1' 'yaw_kd_std_1' 'qx_kd_std_1' 'qz_kd_std_1'
 'roll_kd_std_1' 'qw_kd_std_1' 'qy_kd_std_1' 'pitch_kd_std_1'
 'time_kd_std_1' 'Label_2_kd_std_1']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Accelerometer.xlsx_x_xd_std_1' 'Gravity_z_xd_std_1' 'Gravity_y_xd_std_1'
 'Gravity_x_xd_std_1' 'Gy

ValueError: Found input variables with inconsistent numbers of samples: [10, 5]