In [None]:
import os
import numpy as np
import gzip
import pandas as pd
import seaborn as sns

def parse_header_of_csv(csv_df):
    """Split the column header of a features/labels dataframe.

    The first column whose name starts with ``'label:'`` marks the start of
    the label section. Column 0 and the very last column are excluded from
    the returned names (``columns[1:first_label]`` and
    ``columns[first_label:-1]``).

    NOTE(review): because column 0 is skipped, ``feature_names`` is one
    shorter than the number of columns that precede the first label column;
    ``parse_body_of_csv`` compensates by using ``n_features + 1``.

    Side effect: ``csv_df`` is modified in place - the label columns are
    renamed so that they no longer carry the ``'label:'`` prefix.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Frame whose column names are strings.

    Returns
    -------
    (feature_names, label_names)
        ``feature_names`` is a pandas Index slice, ``label_names`` a list of
        label names without the ``'label:'`` prefix.

    Raises
    ------
    ValueError
        If no column name starts with ``'label:'``.
    """
    columns = csv_df.columns

    # Position of the first label column, or None when there is no label section.
    first_label_ind = next(
        (ci for (ci, col) in enumerate(columns) if col.startswith('label:')),
        None,
    )
    if first_label_ind is None:
        raise ValueError("no column starting with 'label:' was found in the dataframe")

    # Use the start of the label section to split features from labels.
    feature_names = columns[1:first_label_ind]
    prefixed_names = list(columns[first_label_ind:-1])

    # In the CSV the label names carry the prefix 'label:'; drop that prefix
    # only (count=1), never a later occurrence inside the label name itself.
    label_names = []
    for label in prefixed_names:
        assert label.startswith('label:')
        label_names.append(label.replace('label:', '', 1))

    csv_df.rename(columns=dict(zip(prefixed_names, label_names)), inplace=True)

    return (feature_names, label_names)

def parse_body_of_csv(csv_df,n_features):
    """
    Split a features/labels dataframe into its sensor part, its label part,
    a missing-label mask and its index.

    The first ``n_features + 1`` columns form the sensor part; every column
    after those, except the very last one, is treated as a label column.

    Returns a tuple ``(X, Y, M, timestamps)``:
      X          - dataframe holding the first ``n_features + 1`` columns
      Y          - boolean numpy array, True where the label value is > 0
                   (a missing label counts as False)
      M          - boolean dataframe, True where the label value is missing
      timestamps - the index of ``csv_df``
    """
    all_columns = csv_df.columns
    n_sensor_columns = n_features + 1

    # Label values: this should have values of either 0., 1. or NaN
    label_values = csv_df[all_columns[n_sensor_columns:-1]]

    # Missing labels are remembered in the mask and counted as 0 before thresholding
    missing_mask = label_values.isna()
    label_mat = np.greater(np.where(missing_mask, 0, label_values), 0.)

    sensor_values = csv_df[all_columns[:n_sensor_columns]]

    return (sensor_values, label_mat, missing_mask, csv_df.index)

def read_user_data(uuid):
    '''
    Read the data (precomputed sensor-features and labels) for a user.
    This function assumes the user's data file is present at
    Dataset/<uuid>.features_labels.csv.gz.

    Takes the id of the subject as input and returns
    (X, Y, M, timestamps, feature_names, label_names): the sensor matrix,
    label matrix, missing-label matrix and timestamps (the frame index) from
    parse_body_of_csv(), followed by the names from parse_header_of_csv().
    '''
    archive_name = '%s.features_labels.csv.gz' % uuid
    archive_path = os.path.join('Dataset', archive_name)

    # The first CSV column becomes the index of the frame
    with gzip.open(archive_path, mode='rb') as handle:
        frame = pd.read_csv(handle, sep=',', index_col=0)

    # The header pass also renames the label columns of `frame` in place
    feature_names, label_names = parse_header_of_csv(frame)
    X, Y, M, timestamps = parse_body_of_csv(frame, len(feature_names))

    return (X, Y, M, timestamps, feature_names, label_names)


# Read the subject ids, one per line, with surrounding whitespace stripped.
# The context manager guarantees the file handle is closed after reading.
with open('Dataset/UUID List.txt', 'r') as f:
    uuid_list = [line.strip() for line in f]

In [None]:
# Load the cleaned sensor table; the first two CSV columns form its row index.
# The archive is read through gzip even though its file name ends in .zip.
with gzip.open('cleaned_data.zip', mode='rb') as file:
    cleaned_data = pd.read_csv(file, index_col=[0, 1])

In [None]:
# Original labels grouped into six coarse categories. The position of a group
# in this list is the numeric status code written to the output, and earlier
# groups win when a timestamp carries labels from more than one group.
main_label_list = [['SLEEPING'],
                   ['LAB_WORK', 'IN_CLASS', 'IN_A_MEETING', 'LOC_main_workplace','COMPUTER_WORK','AT_SCHOOL', 'WITH_CO-WORKERS'],
                   ['FIX_walking', 'FIX_running', 'BICYCLING','OR_exercise'],
                   ['COOKING', 'BATHING_-_SHOWER', 'CLEANING', 'DOING_LAUNDRY', 'WASHING_DISHES', 'EATING', 'TOILET', 'GROOMING', 'DRESSING'],
                   ['FIX_restaurant','SHOPPING', 'STROLLING', 'DRINKING__ALCOHOL_','WATCHING_TV', 'SURFING_THE_INTERNET', 'AT_A_PARTY', 'AT_A_BAR', 'LOC_beach', 'SINGING', 'WITH_FRIENDS'],
                   ['IN_A_CAR', 'ON_A_BUS', 'DRIVE_-_I_M_THE_DRIVER', 'DRIVE_-_I_M_A_PASSENGER','STAIRS_-_GOING_DOWN', 'ELEVATOR']]

new_label_list = ['sleep','efficiency','exercise','life_activity','entertainment','on_the_way']
new_label_dict = {'sleep':0, 'efficiency':1, 'exercise':2, 'life_activity':3, 'entertainment':4, 'on_the_way':5, 'Normal':6}

# Flat list of every original label that belongs to one of the groups above.
all_label_list = [label for group in main_label_list for label in group]

# Set version of each group, built once so the per-timestamp test is a plain
# set intersection.
main_label_sets = [set(group) for group in main_label_list]

# One 'Status' frame per user; they are concatenated once after the loop
# instead of re-copying a growing frame on every iteration.
per_user_frames = []
for uuid in cleaned_data.groupby('uuid').count().index:
    # Only Y, timestamps and label_names are used below.
    X,Y,M,timestamps,feature_names,label_names = read_user_data(uuid)

    new_label = []
    new_index = []
    for timestamp, label_row in zip(timestamps, Y):  # walk every timestamp
        # Names of the labels that are true at this timestamp.
        active_labels = {label_names[j] for j in np.flatnonzero(label_row)}
        if not active_labels:
            # Timestamps without any true label are left out entirely.
            continue

        # First group (in priority order) sharing a label with this
        # timestamp; 'Normal' when the true labels belong to no group.
        status = next(
            (num for num, group in enumerate(main_label_sets) if group & active_labels),
            new_label_dict['Normal'],
        )
        new_label.append(status)
        new_index.append(timestamp)

    muti_index = pd.MultiIndex.from_product([[uuid], new_index], names=['uuid','timestamps'])
    per_user_frames.append(pd.DataFrame(data = new_label, index = muti_index, columns = ['Status']))

if per_user_frames:
    new_label_data = pd.concat(per_user_frames, axis=0, ignore_index=False)
else:
    # No users at all: keep the empty frame the loop would have started from.
    new_label_data = pd.DataFrame()

new_label_data.value_counts()


In [None]:
# Put the cleaned X and the regrouped Y side by side; the inner join keeps
# only the index entries that are present in both frames.
Final_data = pd.concat(objs=[cleaned_data, new_label_data], axis='columns', join='inner')

def TS_data(ts, lag, n_ahead, target_index=-1):
    """
    Build the X and Y arrays for training a time series model from a 2-D array.

    Input:
    ts is a 2-D np.array with one row per time step and one column per feature
    lag is the number of lags (time steps back) used as model input
    n_ahead is the number of steps ahead to forecast
    target_index selects the column of ts that is forecast (default: last)

    Output is the tuple (X, Y):
    X has shape (n_windows, lag, n_features); window i holds rows i .. i+lag-1
    Y holds, for window i, rows i+lag .. i+lag+n_ahead-1 of the target column

    Every window whose forecast rows fit inside ts is produced, including the
    last one that ends exactly on the final row. When ts has exactly lag
    rows, X is that single window and Y is empty. When there is no room for
    any window, X has shape (0, lag, n_features) and Y is empty.

    Raises ValueError when ts has fewer than lag rows.
    """
    # Number of rows and number of features in the array that was passed in
    n_rows, n_features = len(ts), ts.shape[1]

    if n_rows < lag:
        # Previously this surfaced as an obscure reshape error further down.
        raise ValueError(
            f"TS_data needs at least lag={lag} rows to build a window, got {n_rows}"
        )

    if n_rows == lag:
        # Exactly one input window and nothing left over to forecast
        windows, targets = [ts], []
    else:
        # The last valid start satisfies start + lag + n_ahead == n_rows, so
        # there are n_rows - lag - n_ahead + 1 windows (none if that is <= 0).
        n_windows = max(n_rows - lag - n_ahead + 1, 0)
        windows = [ts[i:(i + lag)] for i in range(n_windows)]
        targets = [ts[(i + lag):(i + lag + n_ahead), target_index] for i in range(n_windows)]

    X, Y = np.array(windows), np.array(targets)

    # Reshaping the X array to an RNN input shape (also fixes the empty case)
    X = np.reshape(X, (X.shape[0], lag, n_features))

    return X, Y


def TS_split(Final_data, lag, n_ahead, n_train=38, n_val=7):
    """
    Split Final_data by subject into train / validation / test sets and turn
    each set into windowed (X, Y) arrays with TS_data().

    Input:
    Final_data is a dataframe whose first index level identifies the subject
    lag and n_ahead are passed straight through to TS_data()
    n_train is the number of shuffled subjects used for training (default 38)
    n_val is the number of subjects used for validation (default 7);
    all remaining subjects form the test set

    Output is the tuple (x_train, y_train, x_val, y_val, x_test, y_test).

    The subject ids are taken from Final_data itself, so every id that gets
    selected is guaranteed to have rows in Final_data, and no file has to be
    read again. The shuffle uses the fixed seed 505.

    NOTE(review): the rows of all subjects in a set are stacked before
    windowing, so some windows span the boundary between two subjects -
    confirm that this is intended.
    """
    import random

    # Sorted unique subject ids found in the first index level
    idlist = Final_data.index.get_level_values(0).unique().sort_values()

    # Fixed seed keeps the subject shuffle reproducible. This reseeds the
    # module-level random generator, exactly as before.
    random.seed(505)
    randid = random.sample(list(range(0,len(idlist))), len(idlist))

    def _stack_subjects(positions):
        # Rows of the chosen subjects, stacked in the shuffled order
        frames = [Final_data.loc[idlist[i]] for i in positions]
        return pd.concat(frames, axis = 0) if frames else pd.DataFrame()

    val_end = n_train + n_val
    train_data = _stack_subjects(randid[0:n_train])
    validation_data = _stack_subjects(randid[n_train:val_end])
    test_data = _stack_subjects(randid[val_end:])

    x_train, y_train = TS_data(train_data.values, lag, n_ahead)

    x_val, y_val = TS_data(validation_data.values, lag, n_ahead)

    x_test, y_test = TS_data(test_data.values, lag, n_ahead)

    return x_train, y_train, x_val, y_val, x_test, y_test

In [None]:
# Build the windowed train / validation / test arrays: 3 lagged steps in,
# 1 step ahead out.
(x_train, y_train,
 x_val, y_val,
 x_test, y_test) = TS_split(Final_data, lag=3, n_ahead=1)

# Report the shape of every array, one per line.
print(
    f"Shape of training X: {x_train.shape}",
    f"Shape of the training Y data: {y_train.shape}",
    f"Shape of validation X: {x_val.shape}",
    f"Shape of the validation Y data: {y_val.shape}",
    f"Shape of testing X: {x_test.shape}",
    f"Shape of the testing Y: {y_test.shape}",
    sep="\n",
)