In [15]:
import pandas as pd 
import numpy as np
import os
from scipy.spatial.transform import Rotation as R
from scipy.spatial.transform import Slerp

In [16]:
# Show the working directory; the data paths built later in this notebook are derived from it.
print(os.getcwd())

/home/taoyida/QS4ML-VU-100/final_data


In [17]:
# Folder (under the working directory) that holds the merged raw CSV files.
raw_merge_data_dir = os.getcwd() + '/raw_merge_data/'
raw_merge_data_files = [
    'walk_merge_raw.csv',
    'bike_merge_raw.csv',
    'run_merge_raw.csv',
    'sit_merge_raw.csv',
    'syn_merge_raw.csv',
]

# All loaded DataFrames are kept here, in the same order as raw_merge_data_files.
df_list = [
    pd.read_csv(raw_merge_data_dir + file_name, low_memory=False)
    for file_name in raw_merge_data_files
]

清洗手环数据

清洗手机系统数据

In [None]:
# Remember how many rows each file contributed so the combined frame can be
# split back into per-file frames afterwards.
lengths = [len(df) for df in df_list]
print(lengths)
print(sum(lengths))

# Stack the frames vertically, in df_list order, into a single frame.
df_raw = pd.concat(df_list, axis=0, ignore_index=True)
print(len(df_raw))

# Fill missing 'usage' values with the most recent earlier value; rows that
# come before the first recorded value fall back to 'use'.
df_raw['usage'] = df_raw['usage'].ffill().fillna('use')

# Same forward fill for 'deviceStatus'; rows before the first recorded value
# fall back to 'ACTIVITY_PAUSED'.
df_raw['deviceStatus'] = df_raw['deviceStatus'].ffill().fillna('ACTIVITY_PAUSED')

# Split df_raw back into one frame per source file using the saved row counts.
df_list = []
start = 0
for length in lengths:
    df_list.append(df_raw.iloc[start:start + length])
    start += length

print("啦啦啦啦啦啦啦")
# Sanity check: the row counts after the split should match the ones printed above.
lengths = [len(df) for df in df_list]
print(lengths)
print(sum(lengths))

In [None]:
# 保存 'df_list' 到 '../test' 文件夹
# Write each frame in df_list to the '../temp' folder under its original file name.
for idx, file_name in enumerate(raw_merge_data_files):
    df_list[idx].to_csv(os.getcwd() + '/../temp/' + file_name, index=False)

清洗MATLAB数据

In [18]:
# Hyperparameters
# Define thresholds for specific columns
# Inclusive (lower, upper) bounds per column; rows with a value outside the
# bounds are treated as outliers by the cleaning step.
thresholds = dict(
    latitude=(-90, 90),
    longitude=(-180, 180),
    altitude=(-500, 12000),
    course=(0, 360),
    hacc=(0, 100),  # Assuming max horizontal accuracy of 100 meters
    speed=(0, 300),  # Assuming max speed of 300 m/s
)

# Columns that are range-checked against `thresholds`.
columns_to_check = ["altitude", "course", "hacc", "latitude", "longitude", "speed"]

# Columns that make up the geolocation group.
geolocation_columns = ["altitude", "course", "latitude", "longitude"]

In [19]:
def remove_outliers_and_apply_thresholds(df, columns_to_check, thresholds):
    """Drop rows whose values fall outside the configured bounds.

    Args:
        df: Input DataFrame.
        columns_to_check: Column names to range-check; names that are not
            columns of ``df`` are skipped.
        thresholds: Mapping of column name to an inclusive ``(lower, upper)``
            pair; columns without an entry are skipped.

    Returns:
        The filtered DataFrame. Rows whose checked value is NaN are kept.
        If no column is checked, ``df`` itself is returned.
    """
    checkable = [
        name
        for name in columns_to_check
        if name in df.columns and thresholds.get(name, None) is not None
    ]
    for name in checkable:
        bounds = thresholds[name]
        values = df[name]
        # Missing values are not outliers: keep them alongside in-range values.
        in_range = values.between(bounds[0], bounds[1])
        df = df[values.isna() | in_range]
    return df

def extract_XYZcolumns(df):
    """Return an independent copy of the timestamp, X/Y/Z and Type columns."""
    wanted = ["dateTime", "X", "Y", "Z", "Type"]
    return df.loc[:, wanted].copy()


def extract_geolocation_columns(df):
    """Return an independent copy of 'dateTime' followed by the columns named in ``geolocation_columns``."""
    wanted = ["dateTime", *geolocation_columns]
    return df[wanted].copy()


def extract_other_columns(df):
    """Return an independent copy of the 'dateTime', 'hacc' and 'speed' columns."""
    wanted = ["dateTime", "hacc", "speed"]
    return df.loc[:, wanted].copy()


def extract_remaining_columns(df):
    """Return an independent copy of 'dateTime' plus the BandAcc* and rate/rateZone columns."""
    wanted = ["dateTime", "BandAccX", "BandAccY", "BandAccZ", "rate", "rateZone"]
    return df.loc[:, wanted].copy()


def XYZsplit(df):
    """Spread the shared X/Y/Z columns into one column triple per 'Type' value.

    Rows whose 'Type' is NaN are dropped. For every remaining distinct type
    ``t`` the columns ``t_X``, ``t_Y`` and ``t_Z`` are created; each holds the
    row's X/Y/Z value where the row's 'Type' equals ``t`` and NaN elsewhere.

    Args:
        df: DataFrame with 'dateTime', 'X', 'Y', 'Z' and 'Type' columns.
            It is not modified.

    Returns:
        A new DataFrame with 'dateTime' followed by the per-type columns (in
        order of first appearance of each type), numeric values rounded to
        6 decimal places.
    """
    # Drop rows where 'Type' is NaN
    df = df.dropna(subset=["Type"]).copy()

    axes = ["X", "Y", "Z"]
    types = df["Type"].unique()

    # Vectorized masking: one boolean mask per type instead of evaluating a
    # Python lambda on every row for every (type, axis) pair.
    for t in types:
        is_type = df["Type"] == t
        for col in axes:
            df[f"{t}_{col}"] = df[col].where(is_type)

    # Keep only the timestamp and the freshly created per-type columns.
    columns_to_keep = ["dateTime"] + [f"{t}_{col}" for t in types for col in axes]

    # Round the numeric columns to 6 decimal places.
    return df[columns_to_keep].round(6)


# Convert dateTime to pd.datetime
def convert_to_datetime(df):
    """Parse 'dateTime' into pandas datetimes and add a whole-second bucket column.

    Args:
        df: DataFrame with a 'dateTime' column. It is modified in place.

    Returns:
        The same DataFrame, with 'dateTime' converted by ``pd.to_datetime``
        and a new 'Second' column holding 'dateTime' floored to the second.
    """
    df["dateTime"] = pd.to_datetime(df["dateTime"])
    # Use the lower-case "s" alias: the upper-case "S" spelling is deprecated
    # in recent pandas releases and emits a FutureWarning.
    df["Second"] = df["dateTime"].dt.floor("s")
    return df


def linear_columns_interpolation(df, columns):
    """Linearly interpolate interior gaps of the given columns, in place.

    Only NaNs surrounded by valid values are filled (``limit_area="inside"``);
    leading and trailing NaNs are left untouched. Columns with fewer than two
    non-null values are skipped.

    Returns:
        The same DataFrame that was passed in.
    """
    for name in columns:
        series = df[name]
        if series.notnull().sum() <= 1:
            # Nothing to interpolate between.
            continue
        df[name] = series.interpolate(
            method="linear", limit_direction="both", limit_area="inside"
        )
    return df


def b_and_ffill_columns_interpolation(df, columns):
    """Fill NaNs in the given columns by back-filling first, then forward-filling, in place.

    Returns:
        The same DataFrame that was passed in.
    """
    for name in columns:
        backfilled = df[name].bfill()
        # Forward fill covers trailing NaNs that have no later value to copy.
        df[name] = backfilled.ffill()
    return df


def limited_columns_interpolation(df, columns, limit=20):
    """Linearly interpolate short gaps in the given columns and zero-fill the rest, in place.

    For each column with at least two non-null values, up to ``limit``
    consecutive NaNs are filled from each side of a gap (and at the edges);
    any NaNs still remaining in that column are replaced with 0. Columns with
    fewer than two non-null values are left untouched.

    Args:
        df: DataFrame to modify.
        columns: Names of the columns to process.
        limit: Maximum number of consecutive NaNs to interpolate per
            direction. Defaults to 20.

    Returns:
        The same DataFrame that was passed in.
    """
    for col in columns:
        if df[col].notnull().sum() > 1:
            interpolated = df[col].interpolate(
                method="linear", limit_direction="both", limit=limit
            )
            # Gaps longer than the limit are treated as "no signal".
            df[col] = interpolated.fillna(0)
    return df


def orientation_interpolation(df, columns):
    """Fill missing Euler-angle rows by spherical linear interpolation (Slerp), in place.

    Rows in which every one of ``columns`` is present act as keyframes. The
    angles are read as "xyz" Euler angles in degrees. Any row with a missing
    value that lies between the first and last keyframe is replaced entirely
    by the Slerp result at its row position. NaNs that remain afterwards
    (rows outside the keyframe range) are set to 0.

    Row positions, not index labels, are used as the interpolation time axis,
    so the function behaves the same for any index.

    Args:
        df: DataFrame containing the orientation columns. It is modified.
        columns: The three orientation column names, in x, y, z order.

    Returns:
        The same DataFrame that was passed in.

    Raises:
        ValueError: If a required column is missing, or fewer than two rows
            have all orientation values present.
    """
    columns = list(columns)
    if not all(col in df.columns for col in columns):
        raise ValueError("DataFrame does not contain all required columns")

    angles = df[columns].to_numpy(dtype=float)
    complete = ~np.isnan(angles).any(axis=1)
    key_positions = np.flatnonzero(complete)

    # Slerp needs at least two keyframes.
    if len(key_positions) < 2:
        raise ValueError("Not enough valid data points to perform interpolation")

    key_rotations = R.from_euler("xyz", angles[key_positions], degrees=True)
    slerp = Slerp(key_positions, key_rotations)

    # Slerp cannot extrapolate: only fill incomplete rows inside the keyframe range.
    positions = np.arange(len(angles))
    fillable = (
        ~complete
        & (positions >= key_positions[0])
        & (positions <= key_positions[-1])
    )
    target_positions = np.flatnonzero(fillable)
    if len(target_positions) > 0:
        angles[target_positions] = slerp(target_positions).as_euler(
            "xyz", degrees=True
        )

    # Whatever is still missing (rows outside the keyframe range) becomes 0.
    angles[np.isnan(angles)] = 0.0

    df[columns] = angles
    return df

XYZ变量命名规则：变量名_X
mat数据由XYZ数据、地理数据、剩余数据以及时间戳构成。上述所有数据均以时间戳作为主键。清洗方法如下：
全部线性插值列：MagneticField_X	MagneticField_Y	MagneticField_Z
有限线性插值列： Acceleration_X	Acceleration_Y	Acceleration_Z AngularVelocity_X	AngularVelocity_Y	AngularVelocity_Z
方位角特殊插值： Orientation_X	Orientation_Y	Orientation_Z 

In [20]:
def process_mat_data(df):
    """Clean one merged recording and return it with one column per sensor axis.

    Steps, in order:
      1. Drop rows that violate the module-level ``thresholds`` for
         ``columns_to_check``.
      2. Parse 'dateTime' (a temporary 'Second' column is added and removed
         again before returning).
      3. Spread the shared X/Y/Z/Type columns into per-type columns, average
         rows sharing a 'dateTime', and discard the Position_* columns.
      4. Outer-merge the per-type columns back onto the remaining columns by
         'dateTime'.
      5. Fill gaps: interior linear interpolation for the magnetic-field and
         location columns, limited interpolation plus zero fill for
         acceleration and angular velocity, Slerp for orientation, and
         back/forward fill for 'hacc'; finally back/forward fill every sensor
         and location column.

    Args:
        df: Raw merged DataFrame with 'dateTime', 'X', 'Y', 'Z', 'Type' and
            the location columns.

    Returns:
        The cleaned DataFrame.
    """
    magnetic_columns = ["MagneticField_X", "MagneticField_Y", "MagneticField_Z"]
    motion_columns = [
        "Acceleration_X",
        "Acceleration_Y",
        "Acceleration_Z",
        "AngularVelocity_X",
        "AngularVelocity_Y",
        "AngularVelocity_Z",
    ]
    orientation_columns = ["Orientation_X", "Orientation_Y", "Orientation_Z"]
    position_columns = ["Position_X", "Position_Y", "Position_Z"]

    df = remove_outliers_and_apply_thresholds(df, columns_to_check, thresholds)
    data = convert_to_datetime(df)

    XYZdata = extract_XYZcolumns(data)
    XYZdata_splited = XYZsplit(XYZdata)
    remain_data = data.drop(columns=["X", "Y", "Z", "Type"])

    # One row per timestamp: average samples that share the same 'dateTime'.
    XYZdata_splited = XYZdata_splited.groupby("dateTime").mean()
    # A recording without any "Position" samples has no Position_* columns;
    # errors="ignore" keeps such a recording from failing here.
    XYZdata_splited = XYZdata_splited.drop(columns=position_columns, errors="ignore")
    XYZdata_splited = XYZdata_splited.reset_index()

    t = pd.merge(remain_data, XYZdata_splited, on="dateTime", how="outer")

    t = linear_columns_interpolation(
        t,
        columns=magnetic_columns
        + ["latitude", "longitude", "altitude", "course", "speed"],
    )
    t = limited_columns_interpolation(t, columns=motion_columns)
    t = orientation_interpolation(t, columns=orientation_columns)
    t = b_and_ffill_columns_interpolation(t, columns=["hacc"])

    # Final pass: close any leading/trailing gaps the steps above left open.
    mat_columns = (
        magnetic_columns
        + motion_columns
        + orientation_columns
        + ["latitude", "longitude", "altitude", "course", "hacc", "speed"]
    )
    t[mat_columns] = t[mat_columns].bfill().ffill()

    # 'Second' was only a helper column added during datetime conversion.
    t = t.drop(columns=["Second"])
    return t

In [21]:
# Run the cleaning pipeline on every frame, keeping the order of df_list.
processed_dfs = list(map(process_mat_data, df_list))

In [22]:
# Summary statistics of the first processed frame, as a quick check of the cleaning result.
processed_dfs[0].describe()

Unnamed: 0,dateTime,BandAccX,BandAccY,BandAccZ,altitude,course,hacc,latitude,longitude,speed,...,Acceleration_Z,MagneticField_X,MagneticField_Y,MagneticField_Z,Orientation_X,Orientation_Y,Orientation_Z,AngularVelocity_X,AngularVelocity_Y,AngularVelocity_Z
count,34401,2126.0,2126.0,2126.0,34401.0,34401.0,34401.0,34401.0,34401.0,34401.0,...,34401.0,34401.0,34401.0,34401.0,34401.0,34401.0,34401.0,34401.0,34401.0,34401.0
mean,2024-06-06 11:51:34.047408128,3605.002226,1805.011312,362.367654,69.448,0.0,19.642,52.33552,4.8633,0.0,...,3.094431,-0.317992,-22.997045,-14.332205,18.212454,-47.60036,-37.074738,0.002452,0.003017,0.001085
min,2024-06-06 11:47:48.119000,-4481.4,-1898.2,-2124.2,69.448,0.0,19.642,52.33552,4.8633,0.0,...,-39.928432,-44.15625,-66.337502,-49.181252,-179.974899,-89.977197,-179.989661,-0.780572,-1.933587,-2.590588
25%,2024-06-06 11:49:40.358000128,3379.25,1151.275,26.658333,69.448,0.0,19.642,52.33552,4.8633,0.0,...,-0.058026,-16.096876,-36.971251,-33.300003,-70.783164,-79.72432,-121.14707,-0.007002,-0.005007,0.0
50%,2024-06-06 11:51:36.500999936,4070.5,1641.0,386.533333,69.448,0.0,19.642,52.33552,4.8633,0.0,...,3.412763,2.817188,-27.075,-17.681252,32.468168,-56.098621,-8.566053,0.0,0.0,0.0
75%,2024-06-06 11:53:27.348999936,4685.5,2308.2,668.1,69.448,0.0,19.642,52.33552,4.8633,0.0,...,7.773683,15.450001,-14.4825,3.58125,109.514174,-31.823968,1.741572,0.003998,0.003519,0.00862
max,2024-06-06 11:55:23.448000,8269.0,7045.4,6013.8,69.448,0.0,19.642,52.33552,4.8633,0.0,...,28.344778,38.193752,54.037502,45.956253,179.972149,89.750073,179.996905,1.401754,2.053029,1.120924
std,,1860.071295,950.985917,675.714709,1.47795e-11,0.0,5.332701e-12,1.46445e-11,2.079256e-12,0.0,...,5.705926,17.731397,22.719992,21.466977,101.155354,41.412413,83.066344,0.111538,0.157522,0.154141
