In [483]:
import pandas as pd 
import numpy as np
import os

In [484]:
print(os.getcwd())

/home/taoyida/QS4ML-VU-100/final_data


In [485]:
raw_merge_data_dir = os.getcwd() + '/raw_merge_data/'
raw_merge_data_files = ['walk_merge_raw.csv', 'bike_merge_raw.csv', 'run_merge_raw.csv', 'sit_merge_raw.csv', 'syn_merge_raw.csv']

#所有dataframe都存在这里
df_list = []

for i in range(len(raw_merge_data_files)):
    df = pd.read_csv(raw_merge_data_dir + raw_merge_data_files[i], low_memory=False)
    df_list.append(df)

清洗数据

In [486]:
# Hyperparameters
# Define thresholds for specific columns
thresholds = {
    "latitude": (-90, 90),
    "longitude": (-180, 180),
    "altitude": (-500, 12000),
    "course": (0, 360),
    "hacc": (0, 100),  # Assuming max horizontal accuracy of 100 meters
    "speed": (0, 300),  # Assuming max speed of 300 m/s
}
XYZ_columns = ["dateTime", "X", "Y", "Z", "Type"]
relevant_columns = [
    "dateTime",
    "X",
    "Y",
    "Z",
    "Type",
    "altitude",
    "course",
    "hacc",
    "latitude",
    "longitude",
    "speed",
]
columns_to_check = [
    "X",
    "Y",
    "Z",
    "altitude",
    "course",
    "hacc",
    "latitude",
    "longitude",
    "speed",
]
geolocation_columns = ["altitude", "course", "latitude", "longitude"]

In [487]:
def remove_outliers_and_apply_thresholds(df, columns,thresholds):
    for col in columns:
        if col in thresholds:
            df = df[(df[col] >= thresholds[col][0]) & (df[col] <= thresholds[col][1])]
        else:
            df = df[(df[col] >= df[col].min()) & (df[col] <= df[col].max())]
    return df

In [488]:
def extract_XYZcolumns(df):
    return df[["dateTime", "X", "Y", "Z", "Type"]].copy()


def extract_geolocation_columns(df):
    return pd.concat([df["dateTime"], df[geolocation_columns]], axis=1).copy()


def extract_remaining_columns(df):
    return df[["dateTime","hacc", "speed"]].copy()

XYZ变量命名规则：变量名_X

In [489]:
def XYZsplit(df):
    # Drop rows where 'Type' is NaN
    df = df.dropna(subset=['Type']).copy()

    # Create columns for each type
    types = df["Type"].unique()
    for t in types:
        for col in ["X", "Y", "Z"]:
            df.loc[:, f"{t}_{col}"] = df.apply(
                lambda row: row[col] if row["Type"] == t else None, axis=1
            )

    # Explicitly list columns to retain, no need for Position columns
    columns_to_keep = ["dateTime"] + [
        f"{t}_{col}"
        for t in types
        for col in ["X", "Y", "Z"]
        if f"{t}_{col}" in df.columns
    ]
    df = df[columns_to_keep]
    # Round the numeric columns to the desired decimal places
    # Uncomment and modify the line below to set the desired decimal places
    df = df.round(6)

    return df

In [490]:
test=df_list[0]

In [491]:
test.describe()

Unnamed: 0,BandAccX,BandAccY,BandAccZ,X,Y,Z,altitude,course,hacc,latitude,longitude,speed,rate,rateZone
count,2126.0,2126.0,2126.0,30099.0,30099.0,30099.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0,88.0
mean,3605.002226,1805.011312,362.367654,2.080908,-4.754735,-3.582044,,,,,,,118.647727,0.619773
std,1860.071295,950.985917,675.714709,28.818155,24.109581,27.370891,,,,,,,15.768468,0.083059
min,-4481.4,-1898.2,-2124.2,-179.864453,-89.637921,-179.962289,,,,,,,80.0,0.42
25%,3379.25,1151.275,26.658333,-1.289133,-14.242152,-3.766303,,,,,,,112.0,0.58
50%,4070.5,1641.0,386.533333,0.681356,5.53101,1.079761,,,,,,,120.0,0.63
75%,4685.5,2308.2,668.1,3.59043,8.86481,6.930213,,,,,,,131.0,0.68
max,8269.0,7045.4,6013.8,179.533886,88.752277,179.929709,,,,,,,142.0,0.74


In [492]:
# Convert dateTime to pd.datetime
def convert_to_datetime(df):
    df['dateTime'] = pd.to_datetime(df['dateTime'])
    #Create a new column for the second
    df['Second'] = df['dateTime'].dt.floor('S')
    return df

思路：将总数据表拆分为XYZ数据，相同秒内前向插值；和地理数据，线性插值；和其他数据（手环和手机），保持时间戳以横向合并

In [493]:
data = convert_to_datetime(test)
XYZdata=extract_XYZcolumns(data)
geodata=extract_geolocation_columns(data)
otherdata=extract_remaining_columns(data)

In [497]:
XYZdata_splited = XYZsplit(XYZdata.copy())
XYZdata_splited.head()

Unnamed: 0,dateTime,MagneticField_X,MagneticField_Y,MagneticField_Z,Acceleration_X,Acceleration_Y,Acceleration_Z,Orientation_X,Orientation_Y,Orientation_Z,AngularVelocity_X,AngularVelocity_Y,AngularVelocity_Z
1,2024-06-06 11:48:12.536,-6.0375,-10.06875,-45.375,,,,,,,,,
2,2024-06-06 11:48:12.539,,,,4.573282,-0.102891,8.361719,,,,,,
3,2024-06-06 11:48:12.559,,,,4.464409,-0.059821,8.483752,,,,,,
4,2024-06-06 11:48:12.579,,,,4.690531,0.062213,8.280363,,,,,,
5,2024-06-06 11:48:12.596,-6.46875,-10.21875,-45.712502,,,,,,,,,


In [499]:
from scipy.spatial.transform import Rotation as R
from scipy.spatial.transform import Slerp


def linear_columns_interpolation(df, columns):
    for col in columns:
        df[col] = df[col].interpolate(
            method="linear", limit_direction="both", limit_area="inside"
        )
    return df


def b_and_ffill_columns_interpolation(df, columns):
    for col in columns:
        df[col] = df[col].bfill().ffill()
    return df


def limited_columns_interpolation(df, columns):
    for col in columns:
        # 对整个列进行插值
        df[col] = df[col].interpolate(method="linear", limit_direction="both", limit=5)
        # 对仍然为 NaN 的部分设为 0
        df[col] = df[col].fillna(0)
    return df


def interpolate_orientation(df, columns):
    # Check if the dataframe has the required columns
    length = len(df)
    if not all(col in df.columns for col in columns):
        raise ValueError("DataFrame does not contain all required columns")
    
    # Convert the orientation columns to Rotation objects, handling NaNs
    indices = []
    rotations = []
    for index, row in df[columns].iterrows():
        if not row.isnull().any():
            indices.append(index)
            rotations.append(R.from_euler('xyz', row, degrees=True))
    
    # Check if there are at least two valid data points to perform interpolation
    if len(indices) < 2:
        raise ValueError("Not enough valid data points to perform interpolation")
    
    slerp = Slerp(indices, R.from_quat([r.as_quat() for r in rotations]))
    
    for i in range(len(df)):
        if df.iloc[i][columns].isnull().any():
            # Ensure the interpolation index is within the valid range
            if i >= indices[0] and i <= indices[-1]:
                df.loc[i, columns] = slerp([i])[0].as_euler('xyz', degrees=True)
    
    # Fill any remaining NaNs (if interpolation limit is reached) with zeros or other strategy
    df[columns] = df[columns].fillna(0)
    df=df[:length]
    
    return df


全部线性插值列：MagneticField_X	MagneticField_Y	MagneticField_Z
有限线性插值列： Acceleration_X	Acceleration_Y	Acceleration_Z
方位角特殊插值： Orientation_X	Orientation_Y	Orientation_Z AngularVelocity_X	AngularVelocity_Y	AngularVelocity_Z

In [500]:
t = XYZdata_splited.copy()
t = linear_columns_interpolation(t, t.columns[1:4])

In [501]:
t=limited_columns_interpolation(t, t.columns[4:7])

In [502]:
t = interpolate_orientation(t, t.columns[7:10])

In [503]:
t=limited_columns_interpolation(t, t.columns[10:])

In [504]:
t.describe()

Unnamed: 0,dateTime,MagneticField_X,MagneticField_Y,MagneticField_Z,Acceleration_X,Acceleration_Y,Acceleration_Z,Orientation_X,Orientation_Y,Orientation_Z,AngularVelocity_X,AngularVelocity_Y,AngularVelocity_Z
count,30099,30099.0,30099.0,30099.0,30099.0,30099.0,30099.0,30099.0,30099.0,30099.0,30099.0,30099.0,30099.0
mean,2024-06-06 11:51:42.952599296,-0.63178,-23.609118,-12.466921,0.789002,6.263739,2.729571,17.020694,-44.857148,-35.812008,0.000777,0.000927,2.3e-05
min,2024-06-06 11:48:12.536000,-44.15625,-66.337502,-49.181252,-18.704695,-42.055054,-39.928432,-179.974899,-89.977197,-179.989661,-0.780572,-1.933587,-2.590588
25%,2024-06-06 11:49:56.288000,-16.6875,-37.58344,-30.431252,-0.65683,4.949554,-0.215653,-49.113598,-79.730513,-121.080871,0.0,0.0,0.0
50%,2024-06-06 11:51:45.400999936,2.6625,-29.653126,-14.400001,0.610768,7.007381,2.749353,23.458512,-56.115129,-5.603095,0.0,0.0,0.0
75%,2024-06-06 11:53:28.969499904,15.004688,-15.770625,5.683125,2.224328,9.690333,7.089933,102.350185,-25.688131,1.465309,0.0,0.0,0.0
max,2024-06-06 11:55:12.708000,38.193752,54.037502,45.956253,19.944775,42.606003,28.344778,179.972149,89.750073,179.996905,1.401754,2.053029,1.068992
std,,17.835665,23.368788,20.982571,2.821309,5.945643,5.691013,95.089513,43.472541,83.503838,0.06669,0.09366,0.085756


In [505]:
geodata.head()

Unnamed: 0,dateTime,altitude,course,latitude,longitude
0,2024-06-06 11:48:12.529,,,,
1,2024-06-06 11:48:12.536,,,,
2,2024-06-06 11:48:12.539,,,,
3,2024-06-06 11:48:12.559,,,,
4,2024-06-06 11:48:12.579,,,,


In [506]:
u = geodata.copy()
u = linear_columns_interpolation(u, u.columns[1:])

In [507]:
otherdata.head()

Unnamed: 0,dateTime,hacc,speed
0,2024-06-06 11:48:12.529,,
1,2024-06-06 11:48:12.536,,
2,2024-06-06 11:48:12.539,,
3,2024-06-06 11:48:12.559,,
4,2024-06-06 11:48:12.579,,


In [508]:
y=otherdata.copy()
y=b_and_ffill_columns_interpolation(y, y.columns)

In [510]:
merged_df = pd.merge(t, y, on='dateTime', how='inner')
merged_df = pd.merge(merged_df, u, on='dateTime', how='inner')


In [509]:
# XYZdata_splited_interpolated = linear_columns_interpolation(XYZdata_splited, XYZdata_splited.columns[1:4])
# XYZdata_splited_interpolated = limited_columns_interpolation(XYZdata_splited_interpolated, XYZdata_splited.columns[4:7])
# XYZdata_splited_interpolated = orientation_interpolation(XYZdata_splited_interpolated, XYZdata_splited.columns[7:10])

In [511]:
merged_df.head()

Unnamed: 0,dateTime,MagneticField_X,MagneticField_Y,MagneticField_Z,Acceleration_X,Acceleration_Y,Acceleration_Z,Orientation_X,Orientation_Y,Orientation_Z,AngularVelocity_X,AngularVelocity_Y,AngularVelocity_Z,hacc,speed,altitude,course,latitude,longitude
0,2024-06-06 11:48:12.536,-6.0375,-10.06875,-45.375,4.573282,-0.102891,8.361719,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
1,2024-06-06 11:48:12.539,-6.145312,-10.10625,-45.459376,4.573282,-0.102891,8.361719,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
2,2024-06-06 11:48:12.559,-6.253125,-10.14375,-45.543751,4.464409,-0.059821,8.483752,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
3,2024-06-06 11:48:12.579,-6.360938,-10.18125,-45.628127,4.690531,0.062213,8.280363,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
4,2024-06-06 11:48:12.596,-6.46875,-10.21875,-45.712502,4.783053,0.110867,8.202397,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
