In [1]:
import pandas as pd
import gzip
import numpy as np

In [2]:
def parse_header_of_csv(csv_df):
    """Split the columns of a features/labels frame into feature and label names.

    The label columns are the run of columns that starts at the first column
    whose name begins with ``'label:'`` and ends just before the last column.
    Those columns are renamed in place on ``csv_df`` so that the leading
    ``'label:'`` prefix is dropped.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Frame with string column names. It is modified in place (the label
        columns are renamed).

    Returns
    -------
    tuple
        ``(feature_names, label_names)`` where ``feature_names`` is the
        ``pandas.Index`` slice ``csv_df.columns[1:first_label]`` and
        ``label_names`` is a list of label names without the prefix.

    Raises
    ------
    ValueError
        If no column name starts with ``'label:'``.
    AssertionError
        If a column between the first label column and the last column does
        not start with ``'label:'``.
    """
    label_prefix = 'label:'

    # Position of the first label column, or None when the frame has none.
    first_label_ind = next(
        (ci for (ci, col) in enumerate(csv_df.columns) if col.startswith(label_prefix)),
        None,
    )
    if first_label_ind is None:
        raise ValueError("no column starting with 'label:' was found in the frame")

    # NOTE(review): the slice starts at 1, so column 0 is never reported as a
    # feature. parse_body_of_csv compensates by taking n_features + 1 feature
    # columns, so the start is deliberately left unchanged here -- confirm
    # whether column 0 should be listed when the first CSV column is the index.
    feature_names = csv_df.columns[1:first_label_ind]
    prefixed_labels = list(csv_df.columns[first_label_ind:-1])

    label_names = []
    for label in prefixed_labels:
        assert label.startswith(label_prefix), 'unexpected non-label column %r' % (label,)
        # Strip only the leading prefix; a later 'label:' inside the name is kept.
        label_names.append(label[len(label_prefix):])

    # Rename on the caller's frame so later column look-ups use the bare names.
    csv_df.rename(columns=dict(zip(prefixed_labels, label_names)), inplace=True)

    return (feature_names, label_names)
def parse_body_of_csv(csv_df, n_features):
    """Separate a features/labels frame into features, labels and a missing mask.

    The first ``n_features + 1`` columns are taken as the sensor features;
    every following column except the last one is taken as a label column.
    Label values are expected to be 0., 1. or NaN.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Frame whose index identifies the records.
    n_features : int
        Feature count as reported by the header parser; one more column than
        this is sliced off as features.

    Returns
    -------
    tuple
        ``(X, Y, M, timestamps)``: ``X`` is the feature sub-frame, ``Y`` is a
        boolean array that is True where a label value is greater than zero
        (missing labels count as False), ``M`` is the frame flagging missing
        label values, and ``timestamps`` is ``csv_df.index``.
    """
    all_columns = csv_df.columns
    split_at = n_features + 1

    # The frame's index serves as the record key.
    timestamps = csv_df.index

    # Sensor readings come first ...
    X = csv_df[all_columns[:split_at]]
    # ... followed by the label columns, up to (not including) the last column.
    label_values = csv_df[all_columns[split_at:-1]]

    # M flags the labels that were not reported.
    M = pd.isna(label_values)
    # Unreported labels are treated as 0 before thresholding into booleans.
    filled_labels = np.where(M, 0, label_values)
    Y = filled_labels > 0.

    return (X, Y, M, timestamps)
def read_user_data(uuid):
    """Load one user's gzipped features/labels CSV and split it into its parts.

    The file ``Datasets/<uuid>.features_labels.csv.gz`` is read with its first
    column as the index, the header is parsed into feature and label names,
    and the body is split into features, labels and the missing-label mask.

    Parameters
    ----------
    uuid : str
        User identifier used to build the file name.

    Returns
    -------
    tuple
        ``(X, Y, M, timestamps, feature_names, label_names)`` as produced by
        ``parse_body_of_csv`` and ``parse_header_of_csv``.
    """
    gz_path = 'Datasets/%s.features_labels.csv.gz' % uuid

    # The first CSV column becomes the frame's index.
    with gzip.open(gz_path, 'rb') as gz_file:
        user_df = pd.read_csv(gz_file, delimiter=',', index_col=0)

    # The header pass also renames the label columns on user_df.
    feature_names, label_names = parse_header_of_csv(user_df)
    X, Y, M, timestamps = parse_body_of_csv(user_df, len(feature_names))

    return (X, Y, M, timestamps, feature_names, label_names)
def get_df(uuid):
    """Build a frame holding only the "main feature" columns of one user.

    Feature prefixes are read from ``'Main Feature.txt'`` (one per line,
    surrounding whitespace stripped, blank lines ignored). For every prefix,
    in file order, the feature columns of the user's data whose name starts
    with that prefix are selected, and the selections are joined side by side.

    Parameters
    ----------
    uuid : str
        User identifier passed on to ``read_user_data``.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns, indexed like the user's feature frame.

    Raises
    ------
    ValueError
        If ``'Main Feature.txt'`` does not list any feature.
    """
    # Context manager so the handle is released even if reading fails.
    with open('Main Feature.txt', 'r') as feature_file:
        # Blank lines are dropped: an empty prefix would match every column.
        main_feature = [line.strip() for line in feature_file if line.strip()]

    if not main_feature:
        raise ValueError("'Main Feature.txt' does not list any feature")

    # Only the feature frame is needed; the rest of the tuple is ignored.
    X = read_user_data(uuid)[0]

    # Every prefix, including the first, goes through the same startswith
    # selection. Wrapping the first selection in DataFrame(..., columns=[prefix])
    # would re-index it to a single column named exactly like the prefix,
    # which is all-NaN whenever no column carries that exact name.
    selected = [X.loc[:, X.columns.str.startswith(prefix)] for prefix in main_feature]

    # One concat for all selections instead of growing the frame step by step.
    return pd.concat(selected, axis=1)

In [3]:
# Read the user ids to process, one per line. The context manager closes the
# file, and blank lines are skipped so an empty id never becomes a file name.
with open('UUID List.txt', 'r') as uuid_file:
    uuid_list = [line.strip() for line in uuid_file if line.strip()]

# Users that are left out of the combined frame.
droped_uuid = ['40E170A7-607B-4578-AF04-F021C3B0384A' ,
                '61359772-D8D8-480D-B623-7C636EAD0C81' ,
                '61976C24-1C50-4355-9C49-AAE44A7D09F6' ,
                '7D9BB102-A612-4E2A-8E22-3159752F55D8' ,
                'CCAF77F0-FABB-4F2F-9E24-D56AD0C5A82F' ,
                'ECECC2AB-D32F-4F90-B74C-E12A1C69BBE2' ,
                'F50235E0-DD67-4F2A-B00B-1F31ADA998B9' ,
                'FDAA70A1-42A3-4E3F-9AE3-3FDA412E03BF']

# Collect the per-user frames first and concatenate once at the end; growing
# Main_X with pd.concat inside the loop re-copies all earlier rows every time.
user_frames = []
for uuid in uuid_list:
    if uuid not in droped_uuid:
        df = get_df(uuid)
        user_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# user was selected.
Main_X = pd.concat(user_frames, ignore_index=True) if user_frames else pd.DataFrame()