In [72]:
import  pandas as pd 
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

读取数据

In [67]:
cleaned_merge_data_dir = os.getcwd() + '/cleaned_merge_data/'
cleaned_merge_data_files = ['cleaned_walk_merge_raw.csv', 'cleaned_bike_merge_raw.csv', 'cleaned_run_merge_raw.csv', 'cleaned_sit_merge_raw.csv', 'cleaned_syn_merge_raw.csv']

labeled_data_dir = os.getcwd() +'/feature_engineering_data/add_label/'
labeled_data_files = ['labeled_walk.csv', 'labeled_bike.csv', 'labeled_run.csv', 'labeled_sit.csv', 'labeled_syn.csv']





df_list = []
for i in range(len(cleaned_merge_data_files)):
    df = pd.read_csv(cleaned_merge_data_dir + cleaned_merge_data_files[i])
    df_list.append(df)


加标签

In [65]:
activity_list = ['walk', 'bike', 'run', 'sit']
for i in range(len(df_list)-1):
    df_list[i]['act'] = activity_list[i]
    df_list[i].to_csv(os.getcwd()+'/feature_engineering_data/add_label/' + 'labeled_' + activity_list[i] + '.csv', index=False)

df = df_list[4].copy()
df['dateTime'] = pd.to_datetime(df['dateTime'])
sit_begin = pd.Timestamp('2024-06-06 12:17:00')
sit_end = pd.Timestamp('2024-06-06 12:18:59')
run_end = pd.Timestamp('2024-06-06 12:19:59')
walk_end = pd.Timestamp('2024-06-06 12:20:20')
bike_end = pd.Timestamp('2024-06-06 12:24:05')

conditions = [
    (df['dateTime'] > sit_begin)& (df['dateTime'] <= sit_end),
    (df['dateTime'] > sit_end) & (df['dateTime'] <= run_end),
    (df['dateTime'] > run_end) & (df['dateTime'] <= walk_end),
    (df['dateTime'] > walk_end) & (df['dateTime'] <= bike_end),
]

choices = ['sit', 'Run', 'Walk', 'Bike']
    
df['act'] = pd.Series(pd.Categorical(pd.cut(df['dateTime'].map(pd.Timestamp.timestamp), 
                                            bins=[float('-inf'), sit_end.timestamp(), run_end.timestamp(), walk_end.timestamp(), float('inf')],
                                            labels=choices, right=False)))

df = df[(df['dateTime'] >= sit_begin) & (df['dateTime'] <= bike_end)]
df.to_csv(os.getcwd()+'/feature_engineering_data/add_label/' + 'labeled_' + 'syn' + '.csv', index=False)

最基础的特征工程：只对数据进行基本的标准化和标签化

In [89]:
def only_std_onehot(df):

    categorical_columns = ['usage', 'deviceStatus', 'act']
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    
    encoded = one_hot_encoder.fit_transform(df[categorical_columns])
    encoded_df = pd.DataFrame(encoded, columns=one_hot_encoder.get_feature_names_out(categorical_columns))

    df = pd.concat([df, encoded_df], axis=1)
    df.drop(columns=categorical_columns, inplace=True)
    
    float_columns = df.select_dtypes(include=['float64','float32']).columns
    scaler = MinMaxScaler()
    df[float_columns] = scaler.fit_transform(df[float_columns])
    return df

In [90]:
df_list = []
for i in range(len(labeled_data_files)):
    df = pd.read_csv(labeled_data_dir + labeled_data_files[i])
    df_list.append(df)

train = pd.concat(df_list[:4], axis=0, ignore_index=True)
test = df_list[4].copy()
train.drop(columns=['dateTime'], inplace=True)
test.drop(columns=['dateTime'], inplace=True)
train = only_std_onehot(train)
test = only_std_onehot(test)


train.to_csv(os.getcwd()+'/feature_engineering_data/only_std_onehot/' + 'train.csv', index=False)
test.to_csv(os.getcwd()+'/feature_engineering_data/only_std_onehot/' + 'test.csv', index=False)


特征工程2 1.重新采样 2.添加差分特征3.添加滚动窗口特征

In [96]:
def resample_and_sample(df, samples_per_second=20): #重新采样
    df['dateTime'] = pd.to_datetime(df['dateTime'])

# 提取秒级时间戳
    df['second'] = df['dateTime'].dt.floor('S')

# 对每个秒级时间戳组进行均匀抽样 20 行
    sampled_df = df.groupby('second').apply(lambda x: x.sample(n=min(samples_per_second, len(x)), random_state=42)).reset_index(drop=True)

# 删除临时的 second 列
    sampled_df.drop(columns=['second'], inplace=True)
    sampled_df = sampled_df.sort_values(by='dateTime').reset_index(drop=True)

    return sampled_df

def add_diff_features(df): #添加差分特征
    float_cols = df.select_dtypes(include=['float64', 'float32']).columns

    for col in float_cols:
        df[f'{col}_diff'] = df[col].diff()
    df = df.fillna(0)
    return df

def has_nan(df): #检查是否有缺失值

    print(df.isna().any().any())
    return df.isna().any().any()


def add_rolling_features_for_all_floats(df, window_size): #添加滑动窗口特征
    float_cols = df.select_dtypes(include=['float64', 'float32']).columns

    for col in float_cols:
        df[f'{col}_rolling_mean'] = df[col].rolling(window=window_size).mean()
        df[f'{col}_rolling_std'] = df[col].rolling(window=window_size).std()
        df[f'{col}_rolling_min'] = df[col].rolling(window=window_size).min()
        df[f'{col}_rolling_max'] = df[col].rolling(window=window_size).max()
    df = df.fillna(0)
    return df

In [100]:
df_list = []
test = None
train = None   

for i in range(len(labeled_data_files)):
    df = pd.read_csv(labeled_data_dir + labeled_data_files[i])
    df_list.append(df)

for i in range(len(df_list)-1):
    df_list[i] = resample_and_sample(df_list[i])
    df_list[i] = add_diff_features(df_list[i])
    df_list[i] = add_rolling_features_for_all_floats(df_list[i], 20)

train = pd.concat(df_list[:4], axis=0, ignore_index=True)

test= df_list[4].copy()
test = resample_and_sample(test)
test = add_diff_features(test)
test = add_rolling_features_for_all_floats(test, 20)

train.drop(columns=['dateTime'], inplace=True)
test.drop(columns=['dateTime'], inplace=True)
train = only_std_onehot(train)
test = only_std_onehot(test)

train.to_csv(os.getcwd()+'/feature_engineering_data/version1/'+'train.csv', index=False)
test.to_csv(os.getcwd()+'/feature_engineering_data/version1/'+'test.csv', index=False)


  df['second'] = df['dateTime'].dt.floor('S')
  sampled_df = df.groupby('second').apply(lambda x: x.sample(n=min(samples_per_second, len(x)), random_state=42)).reset_index(drop=True)
  df[f'{col}_rolling_min'] = df[col].rolling(window=window_size).min()
  df[f'{col}_rolling_max'] = df[col].rolling(window=window_size).max()
  df[f'{col}_rolling_mean'] = df[col].rolling(window=window_size).mean()
  df[f'{col}_rolling_std'] = df[col].rolling(window=window_size).std()
  df[f'{col}_rolling_min'] = df[col].rolling(window=window_size).min()
  df[f'{col}_rolling_max'] = df[col].rolling(window=window_size).max()
  df[f'{col}_rolling_mean'] = df[col].rolling(window=window_size).mean()
  df[f'{col}_rolling_std'] = df[col].rolling(window=window_size).std()
  df[f'{col}_rolling_min'] = df[col].rolling(window=window_size).min()
  df[f'{col}_rolling_max'] = df[col].rolling(window=window_size).max()
  df[f'{col}_rolling_mean'] = df[col].rolling(window=window_size).mean()
  df[f'{col}_rolling_std'] = d