In [None]:
import numpy as np
import gzip
import pandas as pd
from sklearn.impute import KNNImputer 

In [1]:
def parse_header_of_csv(csv_df):
    """Split the columns of a features/labels frame into feature and label names.

    The label section starts at the first column whose name begins with
    'label:' and runs up to, but not including, the last column.

    Side effect: the label columns of ``csv_df`` are renamed in place so that
    they no longer carry the 'label:' prefix.

    NOTE(review): column 0 is skipped when collecting feature names and the
    last column is skipped when collecting labels -- presumably these are the
    timestamp and a label-source column; confirm against the CSV layout.
    When the timestamp has already been loaded as the index (read_user_data
    passes index_col=0), column 0 is a sensor feature, so the returned
    feature_names is one entry shorter than the feature block;
    parse_body_of_csv compensates by slicing ``n_features + 1`` columns.

    Args:
        csv_df: pandas DataFrame with at least one 'label:*' column.

    Returns:
        (feature_names, label_names): a pandas Index holding columns
        1..first label (exclusive), and a list of label names without the
        'label:' prefix.

    Raises:
        ValueError: if no column name starts with 'label:'.
    """
    label_prefix = 'label:'
    columns = csv_df.columns

    # Position of the first label column; None when the frame has no labels.
    first_label_ind = next(
        (ci for (ci, col) in enumerate(columns) if col.startswith(label_prefix)),
        None,
    )
    if first_label_ind is None:
        raise ValueError("no column starting with 'label:' was found")

    # Use the start of the label section to split features from labels.
    feature_names = columns[1:first_label_ind]
    raw_label_names = list(columns[first_label_ind:-1])

    # Strip only the leading prefix; str.replace would also remove any later
    # occurrence of 'label:' inside the name.
    label_names = []
    for label in raw_label_names:
        assert label.startswith(label_prefix)
        label_names.append(label[len(label_prefix):])

    csv_df.rename(columns=dict(zip(raw_label_names, label_names)), inplace=True)

    return (feature_names, label_names)

"""
This method takes a DataFrame and the number of features as input and
returns the sensor matrix, label matrix, missing-label matrix and timestamps (the index).
"""
def parse_body_of_csv(csv_df,n_features):
    """Split a features/labels frame into sensor data, labels and a missing-label mask.

    Args:
        csv_df: pandas DataFrame whose index holds the timestamps.
        n_features: the first ``n_features + 1`` columns are returned as
            sensor data; the columns after them, except the last one, are
            treated as label columns (values expected to be 0., 1. or NaN).

    Returns:
        (X, Y, M, timestamps):
            X -- DataFrame with the first ``n_features + 1`` columns,
            Y -- boolean numpy array, True where a label value is > 0
                 (missing labels count as False),
            M -- boolean DataFrame, True where a label value is missing,
            timestamps -- the index of ``csv_df``.
    """
    split_at = n_features + 1
    all_columns = csv_df.columns

    # Sensor features come first, labels follow, the last column is left out.
    sensor_columns = all_columns[:split_at]
    label_columns = all_columns[split_at:-1]

    X = csv_df[sensor_columns]
    label_values = csv_df[label_columns]

    # M flags the missing labels; they are zeroed before thresholding into Y.
    M = label_values.isna()
    Y = np.where(M, 0, label_values) > 0.

    return (X, Y, M, csv_df.index)

'''
Read the data (precomputed sensor features and labels) for a user.
This function assumes the user's data file is present.
It takes the subject's UUID as input and returns the sensor matrix, label matrix,
missing-label matrix and timestamps (the index) from parse_body_of_csv(), followed by the feature names and label names from parse_header_of_csv().

'''
def read_user_data(uuid):
    """Load the precomputed sensor features and labels of one user.

    Reads 'Datasets/<uuid>.features_labels.csv.gz' (the file must exist),
    using the first CSV column as the index, then delegates to
    parse_header_of_csv and parse_body_of_csv.

    Args:
        uuid: identifier of the subject; interpolated into the file name.

    Returns:
        (X, Y, M, timestamps, feature_names, label_names) -- the four values
        produced by parse_body_of_csv followed by the two produced by
        parse_header_of_csv.
    """
    user_data_file = 'Datasets/%s.features_labels.csv.gz' % uuid

    # The archive is decompressed on the fly; column 0 becomes the index.
    with gzip.open(user_data_file,'rb') as gz_stream:
        csv_df = pd.read_csv(gz_stream, delimiter=',', index_col=0)

    header = parse_header_of_csv(csv_df)
    feature_names, label_names = header

    body = parse_body_of_csv(csv_df, len(feature_names))
    X, Y, M, timestamps = body

    return (X, Y, M, timestamps, feature_names, label_names)

# Build uuid_list: one subject UUID per line of 'UUID List.txt'.
# The file is read inside a `with` block so the handle is closed right away.
with open('UUID List.txt', 'r') as f:
    uuid_list = [line.strip() for line in f]

# Build main_feature: one feature name per line of 'Main Feature.txt'.
# It is read once, independently of the UUID list, so main_feature is defined
# even when the UUID list is empty and the file is not reopened per UUID.
with open('Main Feature.txt', 'r') as f:
    main_feature = [line.strip() for line in f]

"""
Calling this method returns a list of DataFrames, one per user, each containing that user's sensor data.
Note (3.6, v0): label lists may be returned as well later.
Author: chen
"""
def _read_stripped_lines(path):
    """Return the non-blank lines of the text file at ``path``, whitespace-stripped."""
    with open(path, 'r') as fid:
        stripped = (line.strip() for line in fid)
        # Blank lines are skipped: an empty UUID names no data file, and an
        # empty feature prefix would match every column.
        return [line for line in stripped if line]


def get_df_list():
    """Load the main-feature columns of every user into a list of DataFrames.

    UUIDs are read from 'UUID List.txt' and main features from
    'Main Feature.txt' (one entry per line). For each user, read_user_data
    supplies the sensor frame X; every main-feature entry is treated as a
    column-name prefix, and the matching columns are collected in the order
    of the feature file.

    Returns:
        list of pandas DataFrame, one per UUID, in the order of the UUID file.

    Raises:
        ValueError: if there are users to load but 'Main Feature.txt'
            contains no feature.
    """
    uuid_list = _read_stripped_lines('UUID List.txt')
    main_feature = _read_stripped_lines('Main Feature.txt')

    instance = []
    for uuid in uuid_list:
        # Only the sensor frame is needed here; the other values are unused.
        X = read_user_data(uuid)[0]

        # The first feature is selected the same way as the others, so a
        # prefix entry no longer produces an all-NaN column.
        groups = [X.loc[:, X.columns.str.startswith(prefix)] for prefix in main_feature]
        if not groups:
            raise ValueError("'Main Feature.txt' does not list any feature")

        # One concat per user instead of growing the frame feature by feature.
        Main_X = pd.concat(groups, axis=1)
        instance.append(Main_X)
    return instance


In [13]:
# Fill the missing values of every non-watch column with that column's mean
def non_watch_value_imputer(df):
    """Mean-impute all columns whose name does not start with 'watch_'.

    Columns starting with 'watch_' are passed through untouched. The input
    frame is not modified.

    Args:
        df: pandas DataFrame with string column names.

    Returns:
        A new DataFrame with the mean-imputed non-watch columns first,
        followed by the watch columns, on the same index as ``df``.
        A non-watch column with no observed value stays NaN, since its mean
        is NaN.
    """
    is_watch = df.columns.str.startswith('watch_')
    non_watch_values = df.loc[:, ~is_watch]
    watch_values = df.loc[:, is_watch]

    # Vectorised fill: each column is filled with its own mean, without the
    # per-column concat and placeholder column.
    valid_data = non_watch_values.fillna(non_watch_values.mean())

    # Re-attach the watch columns after the non-watch ones.
    combine_data = pd.concat([valid_data, watch_values], axis=1, ignore_index=False)
    return combine_data
# Fill the watch data with KNN imputation based on the other sensors' data
def KNN_for_watch_data(df,K):
    """Impute the remaining missing values of ``df`` with a K-nearest-neighbours imputer.

    The frame is modified in place and also returned.

    Args:
        df: pandas DataFrame of numeric columns.
        K: number of neighbours passed to KNNImputer.

    Returns:
        The same ``df`` object, with missing values filled in every column
        that has at least one observed value. Columns that are entirely NaN
        are left as they are.
    """
    # By default KNNImputer discards features that contain only missing
    # values, so its output would have fewer columns than df and the
    # assignment below would fail. Only impute columns with observed data.
    has_observed = df.notna().any(axis=0).values
    observed = list(df.columns[has_observed])
    if not observed:
        return df

    imputer = KNNImputer(n_neighbors=K)
    df[observed] = imputer.fit_transform(df[observed])
    return df

In [15]:
# Load the main-feature frame of every user.
instance = get_df_list()

# Run the two imputation stages on the first user only: column means for the
# non-watch columns, then KNN with 10 neighbours for what is still missing.
test_example = instance[0]
cleaned_data = KNN_for_watch_data(non_watch_value_imputer(test_example), 10)
cleaned_data

Unnamed: 0,raw_acc:magnitude_stats:mean,raw_acc:3d:mean_x,raw_acc:3d:mean_y,raw_acc:3d:mean_z,proc_gyro:magnitude_stats:mean,proc_gyro:3d:mean_x,proc_gyro:3d:mean_y,proc_gyro:3d:mean_z,raw_magnet:magnitude_stats:mean,raw_magnet:3d:mean_x,...,audio_naive:mfcc8:mean,audio_naive:mfcc9:mean,audio_naive:mfcc10:mean,audio_naive:mfcc11:mean,audio_naive:mfcc12:mean,watch_acceleration:magnitude_stats:mean,watch_acceleration:3d:mean_x,watch_acceleration:3d:mean_y,watch_acceleration:3d:mean_z,watch_heading:mean_cos
1444079161,0.996815,0.002331,0.004614,-0.996790,0.002558,0.000412,0.000448,-0.000541,618.751929,223.246192,...,-0.503573,-0.518072,-0.907876,-0.681724,-0.683515,1032.508157,-0.592000,-55.824000,-1030.912000,0.239999
1444079221,0.996864,0.003557,0.005495,-0.996832,0.002279,0.000091,-0.000082,0.000002,618.834334,223.362545,...,-0.665443,-0.580521,-0.865482,-0.703950,-0.718397,1031.283026,0.000000,-56.096000,-1029.664000,0.280546
1444079281,0.996825,0.004180,0.005776,-0.996785,0.002579,0.000011,0.000179,0.000026,618.632716,223.252828,...,-0.670674,-0.580025,-0.710762,-0.694606,-0.720886,1031.643125,2.912000,-57.696000,-1029.920000,0.280546
1444079341,0.996874,0.004179,0.005551,-0.996836,0.002497,-0.000006,-0.000020,0.000042,618.757055,223.281584,...,-0.614741,-0.531608,-0.836680,-0.704615,-0.719910,1032.123385,2.640000,-56.400000,-1030.496000,0.239999
1444079431,0.997371,-0.008383,-0.000009,-0.994184,0.105566,-0.001074,-0.000934,0.006930,613.681143,86.943267,...,-0.577766,-0.579184,-0.746564,-0.592549,-0.506650,1042.973796,66.458947,-101.187368,-1016.269474,-0.279508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1444750610,1.002052,0.004934,0.022278,-1.001781,0.002418,-0.000052,0.000125,0.000206,500.899097,76.519650,...,-1.052019,-0.793894,-1.123509,-0.746483,-0.777447,1023.383704,3.996800,509.691200,-670.745600,0.043374
1444750670,1.002993,0.007210,-0.008207,-0.996744,0.126576,0.000681,-0.002937,-0.001118,506.277105,71.156095,...,-0.840079,-0.925808,-0.888791,-0.734246,-0.584388,1022.034562,16.086400,526.772800,-687.646400,0.043374
1444750731,1.002894,0.009963,-0.208783,-0.884378,0.351290,-0.015156,0.010175,0.020482,505.831997,73.140524,...,-0.884990,-0.909655,-0.877803,-0.772583,-0.684588,1022.646160,0.955200,520.224000,-663.926400,0.066573
1444750768,1.002477,0.009740,-0.003631,-1.002408,0.003065,-0.000143,0.000361,-0.000267,518.997585,65.111670,...,-1.000558,-0.962642,-0.873980,-0.663058,-0.763480,1020.716008,-9.876800,581.459200,-641.929600,0.184809


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=51389dd1-61f3-47fa-93db-d3cd8b3bb8b3' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>