In [164]:
import pandas as pd 
import numpy as np
import os
from scipy.spatial.transform import Rotation as R
from scipy.spatial.transform import Slerp

In [165]:
# Show the current working directory; the raw-data directory used by the
# loading cell is built from os.getcwd(), so this confirms where files are read from.
print(os.getcwd())

/home/taoyida/QS4ML-VU-100/final_data


In [166]:
# Directory (relative to the current working directory) holding the merged raw CSV files.
raw_merge_data_dir = os.getcwd() + '/raw_merge_data/'
raw_merge_data_files = [
    'walk_merge_raw.csv',
    'bike_merge_raw.csv',
    'run_merge_raw.csv',
    'sit_merge_raw.csv',
    'syn_merge_raw.csv',
]

# Every loaded DataFrame is collected here, in the same order as raw_merge_data_files.
df_list = []

for csv_name in raw_merge_data_files:
    # low_memory=False makes pandas read each file in one pass instead of in chunks.
    df = pd.read_csv(raw_merge_data_dir + csv_name, low_memory=False)
    df_list.append(df)

清洗手环数据

清洗手机系统数据

清洗MATLAB数据

In [167]:
# Hyperparameters
# Inclusive (lower, upper) bounds per column; values outside a column's
# bounds cause the row to be discarded by the outlier filter.
thresholds = dict(
    latitude=(-90, 90),
    longitude=(-180, 180),
    altitude=(-500, 12000),
    course=(0, 360),
    hacc=(0, 100),  # Assuming max horizontal accuracy of 100 meters
    speed=(0, 300),  # Assuming max speed of 300 m/s
)

# Columns that are range-checked against `thresholds`.
columns_to_check = ["altitude", "course", "hacc", "latitude", "longitude", "speed"]

# Columns returned (next to dateTime) by extract_geolocation_columns.
geolocation_columns = ["altitude", "course", "latitude", "longitude"]

In [168]:
def remove_outliers_and_apply_thresholds(df, columns_to_check, thresholds):
    """Drop rows whose checked columns fall outside their configured bounds.

    Args:
        df: DataFrame to filter.
        columns_to_check: Column names to range-check; names missing from
            ``df`` are skipped.
        thresholds: Mapping of column name to an inclusive ``(lower, upper)``
            pair; columns without an entry are skipped.

    Returns:
        The filtered DataFrame. Rows with NaN in a checked column are kept.
        If no column qualifies for checking, the input object is returned as is.
    """
    for name in columns_to_check:
        bounds = thresholds.get(name, None)
        if name not in df.columns or bounds is None:
            continue
        values = df[name]
        is_missing = values.isna()
        is_within = (values >= bounds[0]) & (values <= bounds[1])
        # Missing values are not outliers: keep them alongside in-range rows.
        df = df[is_missing | is_within]
    return df

def extract_XYZcolumns(df):
    """Return an independent copy of the timestamp, X/Y/Z and Type columns."""
    wanted = ["dateTime", "X", "Y", "Z", "Type"]
    subset = df[wanted]
    return subset.copy()


def extract_geolocation_columns(df):
    """Return a copy of dateTime followed by the module-level ``geolocation_columns``."""
    timestamps = df["dateTime"]
    geo_part = df[geolocation_columns]
    combined = pd.concat([timestamps, geo_part], axis=1)
    return combined.copy()


def extract_other_columns(df):
    """Return an independent copy of the dateTime, hacc and speed columns."""
    wanted = ["dateTime", "hacc", "speed"]
    subset = df[wanted]
    return subset.copy()


def extract_remaining_columns(df):
    """Return an independent copy of dateTime plus the band accelerometer and rate columns."""
    wanted = ["dateTime", "BandAccX", "BandAccY", "BandAccZ", "rate", "rateZone"]
    subset = df[wanted]
    return subset.copy()


def XYZsplit(df):
    """Spread the shared X/Y/Z columns into one ``<Type>_<axis>`` column per type.

    Args:
        df: DataFrame with ``dateTime``, ``X``, ``Y``, ``Z`` and ``Type``
            columns. It is not modified.

    Returns:
        A new DataFrame holding ``dateTime`` and, for every distinct non-null
        ``Type`` value ``t`` (in order of first appearance), the columns
        ``t_X``, ``t_Y``, ``t_Z``. A row carries values only in the columns of
        its own type and NaN elsewhere. Numeric columns are rounded to 6
        decimal places. Rows whose ``Type`` is NaN are dropped.
    """
    axes = ["X", "Y", "Z"]

    # Drop rows where 'Type' is NaN; copy so the caller's frame stays untouched.
    df = df.dropna(subset=["Type"]).copy()

    types = df["Type"].unique()
    for t in types:
        # Plain boolean array: avoids any index alignment when masking below.
        belongs_to_type = (df["Type"] == t).to_numpy()
        for col in axes:
            # Vectorised replacement for a per-row apply: keep the value on
            # rows of this type, NaN on every other row.
            df[f"{t}_{col}"] = df[col].where(belongs_to_type)

    # Keep the timestamp plus every per-type column created above.
    columns_to_keep = ["dateTime"] + [f"{t}_{col}" for t in types for col in axes]

    # Round the numeric columns to 6 decimal places.
    return df[columns_to_keep].round(6)


# Convert dateTime to pd.datetime
def convert_to_datetime(df):
    """Parse ``dateTime`` and add a ``Second`` column floored to whole seconds.

    Note: ``df`` is modified in place (both columns are assigned on the
    passed object) and the same object is returned.

    Args:
        df: DataFrame with a ``dateTime`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame, with ``dateTime`` converted to datetimes and a new
        ``Second`` column containing ``dateTime`` rounded down to the second.
    """
    df["dateTime"] = pd.to_datetime(df["dateTime"])
    # Lowercase "s" is the seconds alias; the uppercase "S" spelling is
    # deprecated since pandas 2.2 and emits a FutureWarning there.
    df["Second"] = df["dateTime"].dt.floor("s")
    return df


def linear_columns_interpolation(df, columns):
    """Linearly interpolate interior gaps of the given columns, in place.

    Only NaNs surrounded by valid values are filled (``limit_area="inside"``);
    leading and trailing NaNs stay. Columns with fewer than two non-null
    values are left untouched.

    Args:
        df: DataFrame to modify.
        columns: Names of the columns to interpolate.

    Returns:
        The same DataFrame object.
    """
    for name in columns:
        series = df[name]
        if series.notnull().sum() <= 1:
            # Nothing to interpolate between.
            continue
        df[name] = series.interpolate(
            method="linear",
            limit_direction="both",
            limit_area="inside",
        )
    return df


def b_and_ffill_columns_interpolation(df, columns):
    """Fill NaNs in the given columns by back-filling, then forward-filling, in place.

    Args:
        df: DataFrame to modify.
        columns: Names of the columns to fill.

    Returns:
        The same DataFrame object.
    """
    for name in columns:
        # Back-fill first so each gap takes the next valid value; the forward
        # fill afterwards only affects NaNs left at the end of the column.
        backfilled = df[name].bfill()
        df[name] = backfilled.ffill()
    return df


def limited_columns_interpolation(df, columns):
    """Interpolate short gaps linearly and zero-fill what remains, in place.

    For each column with at least two non-null values, up to 20 consecutive
    NaNs are filled from each side of a gap by linear interpolation; any NaN
    still left afterwards is replaced with 0. Columns with fewer than two
    non-null values are left untouched (their NaNs remain).

    Args:
        df: DataFrame to modify.
        columns: Names of the columns to process.

    Returns:
        The same DataFrame object.
    """
    for name in columns:
        has_enough_points = df[name].notnull().sum() > 1
        if not has_enough_points:
            continue
        bridged = df[name].interpolate(method="linear", limit_direction="both", limit=20)
        df[name] = bridged.fillna(0)
    return df


def orientation_interpolation(df, columns):
    """Fill missing orientation samples by spherical linear interpolation (SLERP).

    Rows whose orientation columns are all present act as key frames. Every
    row with at least one missing value that lies between the first and last
    key frame gets all of its orientation columns replaced by the SLERP
    result. The interpolation parameter is the row position, not the
    timestamp, so rows are treated as evenly spaced. Missing values that
    remain (before the first or after the last key frame) are set to 0.

    All row addressing is positional, so the result does not depend on the
    DataFrame's index labels and no rows are ever added.

    Args:
        df: DataFrame holding the orientation columns; modified in place.
        columns: The three Euler-angle columns, passed to SciPy as "xyz"
            angles in degrees.

    Returns:
        The same DataFrame with the orientation columns filled.

    Raises:
        ValueError: If a required column is missing, or fewer than two rows
            have complete orientation values.
    """
    columns = list(columns)
    if not all(col in df.columns for col in columns):
        raise ValueError("DataFrame does not contain all required columns")

    angles = df[columns].to_numpy(dtype=float, copy=True)
    is_complete = ~np.isnan(angles).any(axis=1)
    key_positions = np.flatnonzero(is_complete)

    # Check if there are at least two valid data points to perform interpolation
    if len(key_positions) < 2:
        raise ValueError("Not enough valid data points to perform interpolation")

    key_rotations = R.from_euler("xyz", angles[is_complete], degrees=True)
    slerp = Slerp(key_positions, key_rotations)

    # Slerp cannot extrapolate: only rows strictly between the first and the
    # last key frame are interpolated.
    gap_positions = np.flatnonzero(~is_complete)
    inside = gap_positions[
        (gap_positions > key_positions[0]) & (gap_positions < key_positions[-1])
    ]
    if inside.size:
        angles[inside] = slerp(inside).as_euler("xyz", degrees=True)

    # Whatever is still missing lies outside the key-frame range: fall back to 0.
    angles[np.isnan(angles)] = 0.0

    df[columns] = angles
    return df

XYZ变量命名规则：变量名_X
mat数据由XYZ数据、地理数据、其余数据以及时间戳构成，所有数据均以时间戳（dateTime）作为主键。清洗方法如下：
全部线性插值列：MagneticField_X	MagneticField_Y	MagneticField_Z	latitude	longitude	altitude	course	speed
有限线性插值列： Acceleration_X	Acceleration_Y	Acceleration_Z AngularVelocity_X	AngularVelocity_Y	AngularVelocity_Z
方位角特殊插值： Orientation_X	Orientation_Y	Orientation_Z 

In [169]:
def process_mat_data(df):
    """Clean one raw merged recording and return it with sensor gaps filled.

    Steps, in order:
      1. Drop rows whose geolocation/speed values violate the module-level
         ``thresholds``.
      2. Parse ``dateTime``.
      3. Split the shared X/Y/Z/Type columns into per-type columns, average
         them per timestamp, and merge them back onto the remaining columns
         with an outer join on ``dateTime``.
      4. Fill gaps: linear interpolation for the magnetic field and
         geolocation/speed columns, limited linear interpolation for
         acceleration and angular velocity, SLERP for orientation, and
         back/forward fill for ``hacc``.
      5. Back/forward fill whatever is still missing in those columns.

    Args:
        df: Raw merged DataFrame. It is not modified; processing runs on a copy.

    Returns:
        The cleaned DataFrame.
    """
    # Report only the size: str(df) would dump the whole frame into the output.
    print(f"Processing DataFrame with {len(df)} rows...")

    magnetic_columns = ["MagneticField_X", "MagneticField_Y", "MagneticField_Z"]
    motion_columns = [
        "Acceleration_X",
        "Acceleration_Y",
        "Acceleration_Z",
        "AngularVelocity_X",
        "AngularVelocity_Y",
        "AngularVelocity_Z",
    ]
    orientation_columns = ["Orientation_X", "Orientation_Y", "Orientation_Z"]

    df = remove_outliers_and_apply_thresholds(df, columns_to_check, thresholds)
    # convert_to_datetime assigns columns in place; hand it a private copy so
    # neither the caller's frame nor a filtered view of it gets written to.
    data = convert_to_datetime(df.copy())

    XYZdata = extract_XYZcolumns(data)
    XYZdata_splited = XYZsplit(XYZdata)
    remain_data = data.drop(columns=["X", "Y", "Z", "Type"])

    # Collapse the per-type readings to one averaged row per timestamp.
    XYZdata_splited = XYZdata_splited.groupby("dateTime").mean()
    # Position readings are not used further; tolerate recordings without them.
    XYZdata_splited = XYZdata_splited.drop(
        columns=["Position_X", "Position_Y", "Position_Z"], errors="ignore"
    ).reset_index()

    t = pd.merge(remain_data, XYZdata_splited, on="dateTime", how="outer")
    t = linear_columns_interpolation(
        t,
        columns=magnetic_columns
        + ["latitude", "longitude", "altitude", "course", "speed"],
    )
    t = limited_columns_interpolation(t, columns=motion_columns)
    t = orientation_interpolation(t, columns=orientation_columns)
    t = b_and_ffill_columns_interpolation(t, columns=["hacc"])

    mat_columns = (
        magnetic_columns
        + motion_columns
        + orientation_columns
        + ["latitude", "longitude", "altitude", "course", "hacc", "speed"]
    )
    # Final pass: the interpolators above can leave NaNs (e.g. leading and
    # trailing gaps); fill them from the nearest valid value.
    t[mat_columns] = t[mat_columns].bfill().ffill()
    # "Second" was added by convert_to_datetime and is not part of the result.
    t = t.drop(columns=["Second"])
    return t

In [170]:
# Clean every loaded recording; results keep the order of df_list.
processed_dfs = list(map(process_mat_data, df_list))