In [301]:
import pandas as pd
import numpy as np

In [302]:


def remove_outliers_and_apply_thresholds(df, columns,thresholds):
    """Keep only the rows whose values fall inside an inclusive range, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Column names to screen, processed in the given order.
    thresholds : mapping of str -> sequence
        For a column present here, the first two entries are used as the
        inclusive lower and upper limits.

    Returns
    -------
    pandas.DataFrame
        The rows that passed every per-column check.

    Notes
    -----
    A column without an entry in ``thresholds`` is compared against its own
    current minimum and maximum. Every non-missing value passes that test, so
    the only rows it can remove are those where the column is NaN. NaN rows
    are likewise rejected by the explicit thresholds, because comparisons
    with NaN evaluate to False.
    """
    kept = df
    for name in columns:
        values = kept[name]
        if name in thresholds:
            limits = thresholds[name]
            lower, upper = limits[0], limits[1]
        else:
            # Self-derived limits: see the Notes section above.
            lower, upper = values.min(), values.max()
        above_lower = values >= lower
        below_upper = values <= upper
        kept = kept[above_lower & below_upper]
    return kept

# Inclusive (lower, upper) limits, keyed by column name, for the columns that
# have a fixed valid range.
thresholds = dict(
    latitude=(-90, 90),
    longitude=(-180, 180),
    altitude=(-500, 12000),
    course=(0, 360),
    hacc=(0, 100),  # Assuming max horizontal accuracy of 100 meters
    speed=(0, 300),  # Assuming max speed of 300 m/s
)

# Columns to screen: the three sensor axes first, then the geolocation fields.
# X, Y and Z have no entry in `thresholds`.
columns_to_check = ["X", "Y", "Z"] + [
    "altitude",
    "course",
    "hacc",
    "latitude",
    "longitude",
    "speed",
]


In [303]:
def merge_and_average(df, decimals=6):
    """Spread the X/Y/Z readings into one set of columns per sensor type.

    Despite the name, no rows are merged and nothing is averaged: the result
    has exactly one row per input row. For every distinct value ``t`` found in
    the ``Type`` column, three columns ``t_X``, ``t_Y`` and ``t_Z`` are
    created. Each holds the row's X/Y/Z value where the row's ``Type`` equals
    ``t`` and is missing (NaN) everywhere else.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Timestamp``, ``X``, ``Y``, ``Z``, ``Type``,
        ``altitude``, ``course``, ``hacc``, ``latitude``, ``longitude`` and
        ``speed``. Any other column is ignored. The frame is not modified.
    decimals : int, default 6
        Number of decimal places the numeric columns are rounded to.

    Returns
    -------
    pandas.DataFrame
        ``Timestamp`` and the six geolocation columns, followed by the
        per-type axis columns in order of first appearance of each type.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    shared_columns = ["Timestamp", "altitude", "course", "hacc", "latitude", "longitude", "speed"]
    axis_columns = ["X", "Y", "Z"]

    # Copy the needed columns so the caller's frame is never touched.
    df = df[["Timestamp", "X", "Y", "Z", "Type", "altitude", "course", "hacc", "latitude", "longitude", "speed"]].copy()

    # One boolean mask per type, reused for all three axes. Series.where keeps
    # the value where the mask is True and inserts NaN elsewhere, which avoids
    # evaluating a Python lambda once per row for every type/axis pair.
    types = df["Type"].unique()
    for t in types:
        is_type = df["Type"] == t
        for col in axis_columns:
            df[f"{t}_{col}"] = df[col].where(is_type)

    # Explicitly list columns to retain (drops the raw X/Y/Z and Type columns).
    columns_to_keep = shared_columns + [
        f"{t}_{col}"
        for t in types
        for col in axis_columns
    ]

    return df[columns_to_keep].round(decimals)

In [304]:
# import os

# files=os.listdir('./raw/')
# for file in files:
#     original_file = file
#     data=pd.read_csv('./raw/'+original_file)
#     '''
#     NOTE: label columns (string dtype) will be present in the future
#     '''
#     
#     cleaned_data = remove_outliers_and_apply_thresholds(data, columns_to_check, thresholds)
#     # Apply the function to the data
#     cleaned_merged_data = merge_and_average(data)
#     cleaned_merged_data = cleaned_merged_data.drop(['Position_X', 'Position_Y', 'Position_Z'], axis=1)
#     cleaned_merged_data = cleaned_merged_data.bfill().interpolate()
#     file_name=original_file.replace('-raw','')
#     cleaned_merged_data.to_csv(file_name,index=False)


In [305]:
import pandas as pd

# Load the dataset.
# Base names of the available recordings; the matching raw file is
# "./raw/<name>-raw.csv".
list_of_files = [
    "sensorlog_20240606_114748 walk",
    "sensorlog_20240606_115847 cycling",
    "sensorlog_20240606_120745 run",
    "sensorlog_20240606_121202 sit",
    "sensorlog_20240606_121611 synthesis",
]

# Index 4 selects the "synthesis" recording; change the index to process another one.
file_name = list_of_files[4]
file_path = f"./raw/{file_name}-raw.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Timestamp,X,Y,Z,Type,altitude,course,hacc,latitude,longitude,speed
0,06-Jun-2024 12:16:11.345,,,,Position,51.469,0.0,16.86,52.334767,4.861806,0.0
1,06-Jun-2024 12:16:11.499,0.212961,3.571288,9.809975,Acceleration,,,,,,
2,06-Jun-2024 12:16:11.519,0.29013,3.616751,9.575478,Acceleration,,,,,,
3,06-Jun-2024 12:16:11.539,0.607179,3.382853,8.596215,Acceleration,,,,,,
4,06-Jun-2024 12:16:11.557,8.025001,-21.356251,-35.043751,MagneticField,,,,,,


In [306]:
# Convert Timestamp to datetime format so the .dt accessor can be used below.
# NOTE: this cell updates `data` in place (parsed Timestamp, new 'Second'
# column, interpolated geolocation columns); later cells read that state.
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

# Bucket every row by the whole second its timestamp falls in.
# The lowercase 's' alias is used because the uppercase 'S' offset alias is
# deprecated in recent pandas releases.
data['Second'] = data['Timestamp'].dt.floor('s')

# Linearly interpolate the geolocation columns between known samples.
# limit_area='inside' restricts filling to gaps that have a valid value on both
# sides, so NaNs before the first or after the last valid sample of a column
# are NOT filled here, even though limit_direction='both' is given.
geolocation_columns = ['altitude', 'course', 'hacc', 'latitude', 'longitude', 'speed']

data[geolocation_columns] = data[geolocation_columns].interpolate(
    method='linear', limit_direction='both', limit_area='inside'
)

# Group by the new 'Second' column
grouped = data.groupby('Second')

def process_group(group):
    """Fill missing X/Y/Z values inside one group of rows.

    For each of the columns ``X``, ``Y`` and ``Z``:

    1. a missing first value is replaced by the second row's value;
    2. a missing last value is replaced by the second-to-last row's value;
    3. every remaining missing value is replaced by the mean of the rows
       directly before and after it. If either neighbour is itself missing,
       the mean is NaN and the value stays missing.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain the columns ``X``, ``Y`` and ``Z``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the fills applied. Groups with fewer than two
        rows are returned unchanged, since they have no neighbour to borrow from.
    """
    # Work on a copy: functions passed to groupby.apply must not mutate the
    # object they receive.
    group = group.copy()
    if len(group) < 2:
        return group

    other_columns = ['X', 'Y', 'Z']
    for column in other_columns:
        values = group[column].copy()
        # Edge rows have only one neighbour, so they borrow it directly.
        # Positional access keeps this correct even if index labels repeat.
        if pd.isna(values.iloc[0]):
            values.iloc[0] = values.iloc[1]
        if pd.isna(values.iloc[-1]):
            values.iloc[-1] = values.iloc[-2]
        # Interior gaps: average of the previous and next rows.
        neighbour_mean = (values.shift() + values.shift(-1)) / 2
        group[column] = values.fillna(neighbour_mean)

    return group

# Apply the process_group function to each one-second group. reset_index(drop=True)
# discards whatever index groupby.apply produces (it may prepend the group key).
processed_data = grouped.apply(process_group).reset_index(drop=True)

# 'Second' was only a grouping helper. errors='ignore' keeps this line working on
# pandas versions where groupby.apply no longer hands the grouping column to the
# applied function, in which case the column is already absent from the result.
processed_data = processed_data.drop(columns=['Second'], errors='ignore')
processed_data.head()
# # Save the processed data to a new CSV file
# processed_file_path = './cleaned/sensorlog_20240606_114748 walk.csv'
# processed_data.to_csv(processed_file_path, index=False)


Unnamed: 0,Timestamp,X,Y,Z,Type,altitude,course,hacc,latitude,longitude,speed
0,2024-06-06 12:16:11.345,0.212961,3.571288,9.809975,Position,51.469,0.0,16.86,52.334767,4.861806,0.0
1,2024-06-06 12:16:11.499,0.212961,3.571288,9.809975,Acceleration,51.4815,0.0,16.8119,52.334767,4.861806,0.0
2,2024-06-06 12:16:11.519,0.29013,3.616751,9.575478,Acceleration,51.494,0.0,16.7638,52.334767,4.861806,0.0
3,2024-06-06 12:16:11.539,0.607179,3.382853,8.596215,Acceleration,51.5065,0.0,16.7157,52.334767,4.861806,0.0
4,2024-06-06 12:16:11.557,8.025001,-21.356251,-35.043751,MagneticField,51.519,0.0,16.6676,52.334767,4.861806,0.0


In [312]:
# Step 1: drop the rows that violate the per-column thresholds.
processed_data_cleaned = remove_outliers_and_apply_thresholds(processed_data, columns_to_check, thresholds)

# Step 2: reshape the *filtered* frame into per-type axis columns.
# Passing `data` here would overwrite the result of step 1, so the threshold
# filtering would have no effect on the output.
processed_data_cleaned = merge_and_average(processed_data_cleaned)

# The per-type columns generated for the "Position" type are not wanted in the output.
processed_data_cleaned = processed_data_cleaned.drop(['Position_X', 'Position_Y', 'Position_Z'], axis=1)

# Each per-type column is populated only on rows of its own type; fill the gaps by
# linear interpolation, then forward/backward fill whatever is still missing.
processed_data_cleaned = processed_data_cleaned.interpolate('linear', limit_direction='both').ffill().bfill()
processed_data_cleaned.head()

Unnamed: 0,Timestamp,altitude,course,hacc,latitude,longitude,speed,Acceleration_X,Acceleration_Y,Acceleration_Z,MagneticField_X,MagneticField_Y,MagneticField_Z,Orientation_X,Orientation_Y,Orientation_Z,AngularVelocity_X,AngularVelocity_Y,AngularVelocity_Z
0,2024-06-06 12:16:11.345,51.469,0.0,16.86,52.334767,4.861806,0.0,0.212961,3.571288,9.809975,8.025001,-21.356251,-35.043751,-134.235852,-21.904164,-3.961019,-0.03036,-0.061785,-0.033289
1,2024-06-06 12:16:11.499,51.4815,0.0,16.8119,52.334767,4.861806,0.0,0.212961,3.571288,9.809975,8.025001,-21.356251,-35.043751,-134.235852,-21.904164,-3.961019,-0.03036,-0.061785,-0.033289
2,2024-06-06 12:16:11.519,51.494,0.0,16.7638,52.334767,4.861806,0.0,0.29013,3.616751,9.575478,8.025001,-21.356251,-35.043751,-134.235852,-21.904164,-3.961019,-0.03036,-0.061785,-0.033289
3,2024-06-06 12:16:11.539,51.5065,0.0,16.7157,52.334767,4.861806,0.0,0.607179,3.382853,8.596215,8.025001,-21.356251,-35.043751,-134.235852,-21.904164,-3.961019,-0.03036,-0.061785,-0.033289
4,2024-06-06 12:16:11.557,51.519,0.0,16.6676,52.334767,4.861806,0.0,0.769891,3.322434,8.275876,8.025001,-21.356251,-35.043751,-134.235852,-21.904164,-3.961019,-0.03036,-0.061785,-0.033289


In [308]:
def check_for_outliers(df, max_steps=None):
    """Report implausibly large jumps between consecutive rows.

    For every checked column the absolute difference between each row and the
    row before it is compared with a limit. Using the absolute value means
    jumps in either direction (increase or decrease) are detected. When a
    column has at least one offending row, a message and the offending rows
    are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``max_steps``.
    max_steps : mapping of str -> number, optional
        Largest allowed absolute row-to-row change per column. Defaults to
        ``{'longitude': 0.001, 'latitude': 0.001, 'altitude': 200, 'speed': 1000}``.

    Returns
    -------
    bool
        True if no checked column exceeds its limit, False otherwise.
    """
    if max_steps is None:
        max_steps = {
            'longitude': 0.001,
            'latitude': 0.001,
            'altitude': 200,
            'speed': 1000,
        }

    all_within_limits = True
    for column, limit in max_steps.items():
        # The first row has no predecessor: its diff is NaN, and NaN > limit
        # is False, so it is never flagged.
        step_size = df[column].diff().abs()
        flagged = step_size > limit
        if flagged.any():
            all_within_limits = False
            print(f"{column.capitalize()} difference between consecutive timestamps exceeds the threshold.")
            print(df[flagged])

    # Return True if all checks pass, False otherwise
    return all_within_limits

# Prints True when no consecutive-row jump was reported, False otherwise.
print(check_for_outliers(df=processed_data_cleaned))

True


In [309]:
# Write the cleaned frame to ./cleaned/<file_name>.csv without the index column.
processed_data_cleaned.to_csv(f"./cleaned/{file_name}.csv", index=False)