In [15]:
import pandas as pd 
import numpy as np
import os
from scipy.spatial.transform import Rotation as R
from scipy.spatial.transform import Slerp
import matplotlib.pyplot as plt


读取数据

In [16]:
raw_merge_data_dir = os.getcwd() + '/raw_merge_data/'
raw_merge_data_files = ['walk_merge_raw.csv', 'bike_merge_raw.csv', 'run_merge_raw.csv', 'sit_merge_raw.csv', 'syn_merge_raw.csv']

#所有dataframe都存在这里
df_list = []

for i in range(len(raw_merge_data_files)):
    df = pd.read_csv(raw_merge_data_dir + raw_merge_data_files[i], low_memory=False)
    df_list.append(df)

清洗手环数据

In [17]:
def remove_outliers_z_thresh(df, column, z_threshold=3): #基于正态分布Z分数的异常值去除

    mean = df[column].mean()
    std = df[column].std()
    z_scores = (df[column] - mean) / std
    df.loc[np.abs(z_scores) >= z_threshold, column] = np.nan
    return df

#经过我的测试，对于心率数据，线性插值，样条插值，多项式插值效果都非常接近，对于加速度数据，多项式插值效果最好

columns_band = ['rate', 'rateZone', 'BandAccZ', 'BandAccX', 'BandAccY']


for i in range(df_list.__len__()):
    for column in columns_band:
        df_list[i] = remove_outliers_z_thresh(df_list[i], column)
        df_list[i][column] = df_list[i][column].interpolate(method='polynomial', order=3)#样条插值
        df_list[i][column] = df_list[i][column].ffill().bfill()#前向填充，后向填充
        has_nan = df_list[i][column].isnull().any()
        print(f"Column '{column}' has NaN: {has_nan}")


Column 'rate' has NaN: False
Column 'rateZone' has NaN: False
Column 'BandAccZ' has NaN: False
Column 'BandAccX' has NaN: False
Column 'BandAccY' has NaN: False
Column 'rate' has NaN: False
Column 'rateZone' has NaN: False
Column 'BandAccZ' has NaN: False
Column 'BandAccX' has NaN: False
Column 'BandAccY' has NaN: False
Column 'rate' has NaN: False
Column 'rateZone' has NaN: False
Column 'BandAccZ' has NaN: False
Column 'BandAccX' has NaN: False
Column 'BandAccY' has NaN: False
Column 'rate' has NaN: False
Column 'rateZone' has NaN: False
Column 'BandAccZ' has NaN: False
Column 'BandAccX' has NaN: False
Column 'BandAccY' has NaN: False
Column 'rate' has NaN: False
Column 'rateZone' has NaN: False
Column 'BandAccZ' has NaN: False
Column 'BandAccX' has NaN: False
Column 'BandAccY' has NaN: False


清洗手机系统数据

In [18]:
lengths = [len(df) for df in df_list]
# 将lengths中的每个值相加到一起
print(lengths)
print(sum(lengths))
# 读取df_list变量，将这个list按照df_list[0]、1、2、3、4的顺序纵向合并为一个dataframe，名为df_raw。
df_raw = pd.concat(df_list, axis=0, ignore_index=True)
# 输出df_raw的行数
print(len(df_raw))

# 处理 'usage' 列的空值
usage = 'use'
for i, row in df_raw.iterrows():
    if pd.isna(row['usage']):
        df_raw.at[i, 'usage'] = usage
    else:
        usage = row['usage']
        
# 处理 'deviceStatus' 列的空值
if pd.isna(df_raw.at[0, 'deviceStatus']):
    df_raw.at[0, 'deviceStatus'] = 'ACTIVITY_PAUSED'

df_raw['deviceStatus'] = df_raw['deviceStatus'].ffill()  # 向上填充

# 根据 'lengths' 列表将 'df_raw' 分割成 'df_list'
df_list = []
start = 0
for length in lengths:
    df_list.append(df_raw.iloc[start:start+length])
    start += length

print("啦啦啦啦啦啦啦")
lengths = [len(df) for df in df_list]
# 将lengths中的每个值相加到一起
print(lengths)
print(sum(lengths))

[34401, 29865, 12800, 14771, 37936]
129773
129773
啦啦啦啦啦啦啦
[34401, 29865, 12800, 14771, 37936]
129773


In [19]:
columns_band = ['usage', 'deviceStatus']
for i in range(df_list.__len__()):
    for column in columns_band:
        has_nan = df_list[i][column].isnull().any()
        print(f"Column '{column}' has NaN: {has_nan}")

Column 'usage' has NaN: False
Column 'deviceStatus' has NaN: False
Column 'usage' has NaN: False
Column 'deviceStatus' has NaN: False
Column 'usage' has NaN: False
Column 'deviceStatus' has NaN: False
Column 'usage' has NaN: False
Column 'deviceStatus' has NaN: False
Column 'usage' has NaN: False
Column 'deviceStatus' has NaN: False


清洗MATLAB数据

定义清洗函数

In [7]:
# Hyperparameters
# Define thresholds for specific columns
thresholds = {
    "latitude": (-90, 90),
    "longitude": (-180, 180),
    "altitude": (-500, 12000),
    "course": (0, 360),
    "hacc": (0, 100),  # Assuming max horizontal accuracy of 100 meters
    "speed": (0, 300),  # Assuming max speed of 300 m/s

}
columns_to_check = [
    "altitude",
    "course",
    "hacc",
    "latitude",
    "longitude",
    "speed",

]
geolocation_columns = ["altitude", "course", "latitude", "longitude"]



def remove_outliers_and_apply_thresholds(df, columns_to_check, thresholds): #移除异常值和阈值以外的值
    for column in columns_to_check:
        if column in df.columns:
            threshold = thresholds.get(column, None)
            if threshold is not None:
                # Filter outliers, retain NaN values
                df = df[(df[column].isna()) | ((df[column] >= threshold[0]) & (df[column] <= threshold[1]))]
    return df

def extract_XYZcolumns(df):#提XYZ和Type列
    return df[["dateTime", "X", "Y", "Z", "Type"]].copy()


def extract_geolocation_columns(df):#提取地理位置列
    return pd.concat([df["dateTime"], df[geolocation_columns]], axis=1).copy()


def extract_other_columns(df):#提取GPS速度和水平精度列
    return df[["dateTime", "hacc", "speed"]].copy()


def extract_remaining_columns(df):#提取手环中的加速度和心率列
    return df[
        ["dateTime", "BandAccX", "BandAccY", "BandAccZ", "rate", "rateZone"]
    ].copy()


def XYZsplit(df):#将TYPE中的列拆分为多个新列
    # Drop rows where 'Type' is NaN
    df = df.dropna(subset=["Type"]).copy()

    # Create columns for each type
    types = df["Type"].unique()
    for t in types:
        for col in ["X", "Y", "Z"]:
            df.loc[:, f"{t}_{col}"] = df.apply(
                lambda row: row[col] if row["Type"] == t else None, axis=1
            )

    # Explicitly list columns to retain, no need for Position columns
    columns_to_keep = ["dateTime"] + [
        f"{t}_{col}"
        for t in types
        for col in ["X", "Y", "Z"]
        if f"{t}_{col}" in df.columns
    ]
    df = df[columns_to_keep]
    # Round the numeric columns to the desired decimal places
    # Uncomment and modify the line below to set the desired decimal places
    df = df.round(6)

    return df


# Convert dateTime to pd.datetime
def convert_to_datetime(df): #将dateTime转换为pd.datetime并且创建出一个新的列，S
    df["dateTime"] = pd.to_datetime(df["dateTime"])
    # Create a new column for the second
    df["Second"] = df["dateTime"].dt.floor("S")
    return df


def linear_columns_interpolation(df, columns):#线性插值
    for col in columns:
        if df[col].notnull().sum() > 1:
            df[col] = df[col].interpolate(
                method="linear", limit_direction="both", limit_area="inside"
            )
    return df


def b_and_ffill_columns_interpolation(df, columns):#先用最近非空数值向后填充，再向前填充
    for col in columns:
        df[col] = df[col].bfill().ffill()
    return df


def limited_columns_interpolation(df, columns):#先用线性插值法对前后20个数值进行插值，剩余再用0填充
    for col in columns:
        if df[col].notnull().sum() > 1:
            df[col] = df[col].interpolate(method="linear", limit_direction="both", limit=20)
            df[col] = df[col].fillna(0)
    return df


def orientation_interpolation(df, columns):#对朝向进行插值
    # Check if the dataframe has the required columns
    length = len(df)
    if not all(col in df.columns for col in columns):
        raise ValueError("DataFrame does not contain all required columns")

    # Convert the orientation columns to Rotation objects, handling NaNs
    indices = []
    rotations = []
    for index, row in df[columns].iterrows():
        if not row.isnull().any():
            indices.append(index)
            rotations.append(R.from_euler("xyz", row, degrees=True))

    # Check if there are at least two valid data points to perform interpolation
    if len(indices) < 2:
        raise ValueError("Not enough valid data points to perform interpolation")

    slerp = Slerp(indices, R.from_quat([r.as_quat() for r in rotations]))

    for i in range(len(df)):
        if df.iloc[i][columns].isnull().any():
            # Ensure the interpolation index is within the valid range
            if i >= indices[0] and i <= indices[-1]:
                df.loc[i, columns] = slerp([i])[0].as_euler("xyz", degrees=True)

    # Fill any remaining NaNs (if interpolation limit is reached) with zeros or other strategy
    df[columns] = df[columns].fillna(0)
    df = df[:length]

    return df

distance_check_columns = [
    "Acceleration_X",
    "Acceleration_Y",
    "Acceleration_Z",
    "MagneticField_X",
    "MagneticField_Y",
    "MagneticField_Z",
    "Orientation_X",
    "Orientation_Y",
    "Orientation_Z",
    "AngularVelocity_X",
    "AngularVelocity_Y",
    "AngularVelocity_Z",
    "longitude",
    "latitude",
    "altitude",
]

distance_thresholds = {
    "Acceleration_X": 50.0,   # m/s^2
    "Acceleration_Y": 50.0,   # m/s^2
    "Acceleration_Z": 50.0,   # m/s^2
    "MagneticField_X": 60.0,  # μT
    "MagneticField_Y": 60.0,  # μT
    "MagneticField_Z": 60.0,  # μT
    "Orientation_X": 360.0,   # 方位角，° (0-360度)
    "Orientation_Y": 90.0,    # 俯仰，° (-90到90度)
    "Orientation_Z": 180.0,   # 滚转，° (-180到180度)
    "AngularVelocity_X": 500.0, # °/s
    "AngularVelocity_Y": 500.0, # °/s
    "AngularVelocity_Z": 500.0, # °/s
    "longitude": 180.0,       # 地理经度范围 (-180到180度)
    "latitude": 90.0,         # 地理纬度范围 (-90到90度)
    "altitude": 8848.0,       # 地理高度，假设不超过珠穆朗玛峰高度，单位为米
}

def distance_based_outlier_detection(df, columns, threshold):#基于距离的异常值检测
    if not all(col in df.columns for col in columns):
        raise ValueError("DataFrame does not contain all required columns")

    outlier_context_indices = set()
    all_outliers = set()

    for col in columns:
        non_nan_df = df[col].dropna()
        non_nan_indices = non_nan_df.index

        for i in range(len(non_nan_indices)):
            idx = non_nan_indices[i]
            if i >= 5:
                mean_value = non_nan_df.iloc[i-5:i].mean()
            else:
                mean_value = non_nan_df.iloc[:i].mean()

            if abs(non_nan_df.loc[idx] - mean_value) > threshold[col]:
                all_outliers.add(idx)
                context = [idx]
                if i - 1 >= 0:
                    context.insert(0, non_nan_indices[i - 1])
                if i - 2 >= 0:
                    context.insert(0, non_nan_indices[i - 2])
                if i + 1 < len(non_nan_indices):
                    context.append(non_nan_indices[i + 1])
                if i + 2 < len(non_nan_indices):
                    context.append(non_nan_indices[i + 2])

                if len(context) >= 4:
                    if len(context) == 4:
                        context.append(2)
                    else:
                        context = context[:4]
                        context.append(2)
                    outlier_context_indices.add(tuple(context))


    return sorted(all_outliers)

测试用例

In [8]:
data = {
    "Acceleration_X": [0, 1, np.nan, np.nan, np.nan, np.nan, 20, 3, 50, 1],
    "Acceleration_Y": [0, 0, 1, 0, 10, 0, 0, 1, 0, 2],
    "Acceleration_Z": [0, 1, 2, 0, 0, 10, 0, 1, 1, 3],
    "MagneticField_X": [30, 30, 30, 30, 30, 30, 30, 30, 100, 30],
    "MagneticField_Y": [40, 40, 40, 40, 40, 40, 40, 40, 150, 40],
    "MagneticField_Z": [50, 50, 50, 50, 50, 50, 50, 50, 200, 50],
    "Orientation_X": [0, 45, 90, 135, 180, 225, 270, 315, 360, 45],
    "Orientation_Y": [0, 10, 20, 30, 40, 50, 60, 70, 80, 10],
    "Orientation_Z": [0, -10, -20, -30, -40, -50, -60, -70, -80, -10],
    "AngularVelocity_X": [0, 0, 0, 0, 0, 0, 0, 0, 5000, 0],
    "AngularVelocity_Y": [0, 0, 0, 0, 0, 0, 0, 0, 3000, 0],
    "AngularVelocity_Z": [0, 0, 0, 0, 0, 0, 0, 0, 600, 0],
    "longitude": [0, 0, 0, 0, 0, 0, 0, 0, 180, 0],
    "latitude": [0, 0, 0, 0, 0, 0, 0, 0, 90, 0],
    "altitude": [0, 0, 0, 0, 0, 0, 0, 0, 9000, 0],
}

df = pd.DataFrame(data)
threshold = {
    "Acceleration_X": 1.0,
    "Acceleration_Y": 5.0,
    "Acceleration_Z": 5.0,
    "MagneticField_X": 50.0,
    "MagneticField_Y": 50.0,
    "MagneticField_Z": 50.0,
    "Orientation_X": 100.0,
    "Orientation_Y": 20.0,
    "Orientation_Z": 20.0,
    "AngularVelocity_X": 1000.0,
    "AngularVelocity_Y": 1000.0,
    "AngularVelocity_Z": 300.0,
    "longitude": 90.0,
    "latitude": 45.0,
    "altitude": 1000.0,
}

all_outliers= distance_based_outlier_detection(df, columns=threshold.keys(), threshold=threshold)
all_outliers

[4, 5, 6, 7, 8, 9]

XYZ变量命名规则：变量名_X
mat数据由：XYZ数据，地理数据和剩余数据以及时间戳构成。上述所有的data均有时间戳做主元。清洗方法给在下面
全部线性插值列：MagneticField_X	MagneticField_Y	MagneticField_Z
有限线性插值列： Acceleration_X	Acceleration_Y	Acceleration_Z AngularVelocity_X	AngularVelocity_Y	AngularVelocity_Z
方位角特殊插值： Orientation_X	Orientation_Y	Orientation_Z 

In [21]:
def process_mat_data(df):
    df = remove_outliers_and_apply_thresholds(df, columns_to_check, thresholds)
    data = convert_to_datetime(df)
    XYZdata = extract_XYZcolumns(data)
    XYZdata_splited = XYZsplit(XYZdata)
    remain_data = data.drop(columns=["X", "Y", "Z", "Type"])
    XYZdata_splited.set_index("dateTime", inplace=True)
    XYZdata_splited = XYZdata_splited.groupby("dateTime").mean()
    XYZdata_splited.drop(
        columns=["Position_X", "Position_Y", "Position_Z"], inplace=True
    )
    XYZdata_splited.reset_index(inplace=True)
    t = pd.merge(remain_data, XYZdata_splited, on="dateTime", how="outer")
    context_indices = distance_based_outlier_detection(t, distance_check_columns, distance_thresholds)
    print(context_indices)
    t=t.drop(context_indices)
    t = linear_columns_interpolation(
        t,
        columns=[
            "MagneticField_X",
            "MagneticField_Y",
            "MagneticField_Z",
            "latitude",
            "longitude",
            "altitude",
            "course",
            "speed",
        ],
    )
    t = limited_columns_interpolation(
        t,
        columns=[
            "Acceleration_X",
            "Acceleration_Y",
            "Acceleration_Z",
            "AngularVelocity_X",
            "AngularVelocity_Y",
            "AngularVelocity_Z",
        ],
    )
    t = orientation_interpolation(
        t, columns=["Orientation_X", "Orientation_Y", "Orientation_Z"]
    )
    t = b_and_ffill_columns_interpolation(t, columns=["hacc"])
    mat_columns = [
        "MagneticField_X",
        "MagneticField_Y",
        "MagneticField_Z",
        "Acceleration_X",
        "Acceleration_Y",
        "Acceleration_Z",
        "AngularVelocity_X",
        "AngularVelocity_Y",
        "AngularVelocity_Z",
        "Orientation_X",
        "Orientation_Y",
        "Orientation_Z",
        "latitude",
        "longitude",
        "altitude",
        "course",
        "hacc",
        "speed",
    ]
    t[mat_columns] = t[mat_columns].bfill().ffill()
    t.drop(columns=["Second"], inplace=True)
    return t

In [10]:
test = process_mat_data(df_list[0])

  df["Second"] = df["dateTime"].dt.floor("S")


KeyboardInterrupt: 

In [22]:
for i in range(df_list.__len__()):
    df_list[i] = process_mat_data(df_list[i])

  df["Second"] = df["dateTime"].dt.floor("S")


[3293, 3401, 3989, 4082, 4160, 4793, 4822, 4853, 4884, 4885, 4976, 5054, 5069, 5115, 5146, 5223, 5224, 5379, 5395, 5458, 5535, 5581, 5597, 5612, 5689, 5736, 5799, 5815, 5875, 5891, 5892, 5921, 5922, 5951, 6092, 6934, 6951, 7044, 7059, 7105, 7120, 7151, 7166, 7243, 7259, 7367, 7368, 7413, 7429, 7475, 7491, 7506, 7552, 7569, 7660, 7768, 7783, 7908, 7923, 7984, 8077, 8247, 8510, 8679, 8851, 9021, 9114, 9190, 9206, 9207, 9222, 9267, 9361, 9532, 10042, 10211, 10380, 10720, 11061, 11167, 11183, 11321, 11492, 11662, 11676, 11677, 11827, 11843, 11858, 11997, 12105, 12198, 13523, 13772, 14078, 14230, 14306, 14551, 14799, 15045, 15212, 15289, 15460, 15538, 15706, 15707, 15772, 16244, 16361, 16719, 16748, 17167, 19451, 20225, 21957, 22018, 22112, 22128, 22282, 22299, 22313, 22360, 22421, 22436, 22651, 22728]


  df["Second"] = df["dateTime"].dt.floor("S")


[26264, 26477, 26478, 26492, 26631, 26648, 26664]


  df["Second"] = df["dateTime"].dt.floor("S")


[3654, 3681, 3712, 3739, 3768, 3793, 3823, 3825, 3879, 3966, 3995, 4001, 4023, 4051, 4058, 4079, 4106, 4107, 4134, 4166, 4173, 4191, 4221, 4240, 4250, 4270, 4277, 4278, 4306, 4335, 4343, 4362, 4390, 4418, 4476, 4503, 4533, 4562, 4648, 4705, 4735, 4761, 4792, 4848, 4877, 4931, 5018, 5048, 5102, 5130, 5188, 5219, 5275, 5303, 5331, 5443, 5473, 5502, 5670, 5700, 5729, 5730, 5815, 5843, 5874, 5904, 5958, 5987, 6102, 6128, 6158, 6597, 6614, 6876, 6891, 6983, 6998]


  df["Second"] = df["dateTime"].dt.floor("S")


[]


  df["Second"] = df["dateTime"].dt.floor("S")


[8561, 8686, 8980, 9090, 9153, 9260, 9601, 9661, 13501, 17488, 19175, 19191, 19252, 19268, 19406, 35186, 35189, 35192, 35207]


In [23]:
df_list[0].describe()

Unnamed: 0,dateTime,BandAccX,BandAccY,BandAccZ,altitude,course,hacc,latitude,longitude,speed,...,Acceleration_Z,MagneticField_X,MagneticField_Y,MagneticField_Z,Orientation_X,Orientation_Y,Orientation_Z,AngularVelocity_X,AngularVelocity_Y,AngularVelocity_Z
count,34275,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0,...,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0,34275.0
mean,2024-06-06 11:51:34.376754176,3216.697575,1894.099976,245.700238,69.448,0.0,19.642,52.33552,4.8633,0.0,...,3.108203,-0.351952,-23.004456,-14.385113,15.351362,-44.599906,-38.774608,0.002457,0.003039,0.001087
min,2024-06-06 11:47:48.119000,-17238.810865,-7455.12146,-2397.964055,69.448,0.0,19.642,52.33552,4.8633,0.0,...,-39.928432,-44.15625,-66.337502,-49.181252,-179.974899,-89.977197,-179.985885,-0.780572,-1.933587,-2.590588
25%,2024-06-06 11:49:40.766500096,3208.663364,1191.412205,-64.201167,69.448,0.0,19.642,52.33552,4.8633,0.0,...,-0.047956,-16.139532,-36.9375,-33.375,-67.48736,-79.191614,-119.447541,-0.007435,-0.005256,0.0
50%,2024-06-06 11:51:37.135000064,4003.707353,1717.656182,334.474862,69.448,0.0,19.642,52.33552,4.8633,0.0,...,3.44387,2.76,-27.026252,-17.807813,26.370841,-52.678864,-7.21334,0.0,0.0,0.0
75%,2024-06-06 11:53:27.738999808,4609.612535,2460.589452,647.71203,69.448,0.0,19.642,52.33552,4.8633,0.0,...,7.785048,15.431251,-14.483438,3.534375,106.30927,-24.531233,0.409368,0.004312,0.003763,0.008976
max,2024-06-06 11:55:23.448000,17568.931588,11040.522262,3873.115102,69.448,0.0,19.642,52.33552,4.8633,0.0,...,28.344778,38.193752,54.037502,45.956253,179.972149,89.731335,179.980627,1.401754,2.053029,1.120924
std,,2464.909747,1032.36102,637.46011,0.0,0.0,0.0,7.105531e-15,0.0,0.0,...,5.703934,17.734701,22.651828,21.468563,98.289757,41.891892,77.667123,0.111675,0.15767,0.154381


查看是否有空值

In [24]:
for i in range(df_list.__len__()):
    
    # 检查整个数据框是否有 NaN 值
    print("DataFrame contains NaN values:\n", df_list[i].isnull().any().any())

    # 打印出含有 NaN 值的行
    nan_rows = df_list[i][df_list[i].isnull().any(axis=1)]
    print("\nRows with NaN values:\n", nan_rows)

    # 统计各列中 NaN 值的数量
    nan_counts = df_list[i].isnull().sum()
    print("\nCount of NaN values in each column:\n", nan_counts)

DataFrame contains NaN values:
 False

Rows with NaN values:
 Empty DataFrame
Columns: [dateTime, BandAccX, BandAccY, BandAccZ, altitude, course, hacc, latitude, longitude, speed, rate, rateZone, usage, deviceStatus, Acceleration_X, Acceleration_Y, Acceleration_Z, MagneticField_X, MagneticField_Y, MagneticField_Z, Orientation_X, Orientation_Y, Orientation_Z, AngularVelocity_X, AngularVelocity_Y, AngularVelocity_Z]
Index: []

[0 rows x 26 columns]

Count of NaN values in each column:
 dateTime             0
BandAccX             0
BandAccY             0
BandAccZ             0
altitude             0
course               0
hacc                 0
latitude             0
longitude            0
speed                0
rate                 0
rateZone             0
usage                0
deviceStatus         0
Acceleration_X       0
Acceleration_Y       0
Acceleration_Z       0
MagneticField_X      0
MagneticField_Y      0
MagneticField_Z      0
Orientation_X        0
Orientation_Y        0
Orien

In [25]:
cleaned_merge_data_dir = os.getcwd() + '/cleaned_merge_data/'
cleaned_merge_data_files = ['cleaned_walk_merge_raw.csv', 'cleaned_bike_merge_raw.csv', 'cleaned_run_merge_raw.csv', 'cleaned_sit_merge_raw.csv', 'cleaned_syn_merge_raw.csv']
for i in range(df_list.__len__()):
    df_list[i].to_csv(cleaned_merge_data_dir + cleaned_merge_data_files[i], index=False)