In [4]:
# ========================
# library
# ========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler,RobustScaler
import pickle
import glob
from tqdm import tqdm
%matplotlib inline

In [5]:
# ========================
# constant
# ========================
# Path to the tdcsfog metadata CSV. The cells shown here do not read it;
# they load a pre-built parquet version of the metadata instead.
TDCSFOG_META_PATH = "../data/tdcsfog_metadata.csv"
# Glob pattern for the CSV files under train/tdcsfog (expanded with glob.glob).
TDCSFOG_FOLDER = "../data/train/tdcsfog/*.csv"

In [6]:
# ========================
# settings
# ========================
# Run identifier: names the output directory and prefixes every file saved by
# this notebook.
fe = "022"
# Creating the nested `save` directory also creates its parents. exist_ok=True
# makes the cell safe to re-run and repairs the case where `fe{fe}` already
# exists but `save` is missing (the previous exists-check skipped both).
os.makedirs(f"../output/fe/fe{fe}/save", exist_ok=True)

In [7]:
# glob.glob returns matches in a filesystem-dependent order; sort them so the
# list of recording files is the same on every machine and every run.
data_list = sorted(glob.glob(TDCSFOG_FOLDER))

In [10]:
# Recording-level metadata, presumably written by the fe001 step (TODO confirm).
# The windowing loop reads its "Id" and "sub_id" columns.
meta = pd.read_parquet("../output/fe/fe001/fe001_tdcsfog_meta.parquet")

In [11]:
# Raw accelerometer channels read from every recording.
cols = ["AccV", "AccML", "AccAP"]
# Model inputs: the raw channels first, then the three derived features of
# each channel (lag diff, lead diff, cumulative sum), grouped channel by channel.
num_cols = cols + [
    f"{channel}_{suffix}"
    for channel in cols
    for suffix in ("lag_diff", "lead_diff", "cumsum")
]
# Label columns copied into the target array.
target_cols = ["StartHesitation", "Turn", "Walking"]
# seq_len: rows per window; shift: stride between consecutive window starts;
# offset: leading rows of a window that are excluded from the prediction mask.
seq_len, shift, offset = 1000, 500, 250

In [12]:
# Accumulators filled by the windowing loop: one array per recording in each
# of the five *_array lists ...
num_array, target_array, mask_array = [], [], []
pred_use_array, time_array = [], []
# ... and one entry per window in the two id lists.
subject_list, id_list = [], []

In [13]:
def _slice_into_windows(num, target, time, seq_len, shift, offset):
    """Cut one recording into overlapping, zero-padded windows.

    Window ``w`` starts at row ``w * shift``. Every window except the last
    takes ``seq_len`` rows; the last one takes all remaining rows and is
    zero-padded up to ``seq_len``.

    Parameters
    ----------
    num : np.ndarray, shape (n_rows, n_features)
        Scaled input features of the recording.
    target : np.ndarray, shape (n_rows, n_targets)
        Label columns of the recording.
    time : np.ndarray, shape (n_rows,)
        Time index of every row (stored in an integer array).
    seq_len : int
        Number of rows per window.
    shift : int
        Stride between the starts of consecutive windows.
    offset : int
        Leading rows of a non-first window that are left out of ``pred_use``.

    Returns
    -------
    tuple of np.ndarray
        ``(num_w, target_w, time_w, mask_w, pred_use_w)``, each with the
        number of windows as first axis and ``seq_len`` as second axis.
        ``mask_w`` is 1 on real (non-padded) rows. ``pred_use_w`` is 1 on the
        rows this window is responsible for, so that the windows of a
        recording cover every row exactly once.
    """
    n_rows = len(num)
    # Always emit at least one window: a recording of `shift` rows or fewer
    # becomes a single padded window instead of being silently dropped.
    n_windows = max((n_rows - 1) // shift, 1)

    # Widths come from the inputs rather than hard-coded 12 / 3.
    num_w = np.zeros([n_windows, seq_len, num.shape[1]])
    target_w = np.zeros([n_windows, seq_len, target.shape[1]])
    time_w = np.zeros([n_windows, seq_len], dtype=int)
    mask_w = np.zeros([n_windows, seq_len], dtype=int)
    pred_use_w = np.zeros([n_windows, seq_len], dtype=int)

    for w in range(n_windows):
        start = w * shift
        is_first = w == 0
        is_last = w == n_windows - 1
        # The last window swallows every remaining row.
        rows = slice(start, None if is_last else start + seq_len)
        n_valid = len(num[rows])

        num_w[w, :n_valid] = num[rows]
        target_w[w, :n_valid] = target[rows]
        time_w[w, :n_valid] = time[rows]
        mask_w[w, :n_valid] = 1

        # The first window also owns its leading `offset` rows, even when it
        # is the only window (previously those rows were never flagged for a
        # single-window recording). The last window owns everything up to the
        # end of the data.
        pred_from = 0 if is_first else offset
        pred_to = n_valid if is_last else offset + shift
        pred_use_w[w, pred_from:pred_to] = 1

    return num_w, target_w, time_w, mask_w, pred_use_w


# Build the windowed arrays recording by recording.
for i, s in tqdm(zip(meta["Id"].values, meta["sub_id"].values), total=len(meta)):
    path = f"../data/train/tdcsfog/{i}.csv"
    df = pd.read_csv(path)

    # Derived features per channel: backward diff, forward diff, running sum.
    for c in cols:
        df[f"{c}_lag_diff"] = df[c].diff()
        df[f"{c}_lead_diff"] = df[c].diff(-1)
        df[f"{c}_cumsum"] = df[c].cumsum()

    # Scale each recording on its own, then zero the NaNs left by diff() at
    # the first / last row.
    sc = RobustScaler()
    df[num_cols] = sc.fit_transform(df[num_cols].values)
    df[num_cols] = df[num_cols].fillna(0)

    num = df[num_cols].values
    target = df[target_cols].values
    time = df["Time"].values
    (num_array_, target_array_, time_array_,
     mask_array_, pred_use_array_) = _slice_into_windows(
        num, target, time, seq_len, shift, offset
    )
    batch = len(num_array_)

    num_array.append(num_array_)
    target_array.append(target_array_)
    mask_array.append(mask_array_)
    pred_use_array.append(pred_use_array_)
    time_array.append(time_array_)
    # One id / subject entry per window keeps the lists aligned with the arrays.
    subject_list += [s] * batch
    id_list += [i] * batch

833it [00:19, 41.81it/s]


In [14]:
def _stack_once(chunks):
    """Concatenate a list of per-recording arrays along axis 0.

    Anything that is not a list is returned unchanged. This keeps the cell
    safe to re-run: concatenating an already stacked array a second time
    would iterate over its windows and silently merge the window axis into
    the row axis.
    """
    if isinstance(chunks, list):
        return np.concatenate(chunks, axis=0)
    return chunks


# Collapse the per-recording lists into single arrays (windows on axis 0).
num_array = _stack_once(num_array)
target_array = _stack_once(target_array)
mask_array = _stack_once(mask_array)
pred_use_array = _stack_once(pred_use_array)
time_array = _stack_once(time_array)

In [15]:
# One row per window, in the same order as the windows were appended:
# the recording Id and the subject it belongs to.
df_id = pd.DataFrame({"Id": id_list, "subject": subject_list})

In [16]:
# Write each array to ../output/fe/fe{fe}/fe{fe}_<name>_array.npy.
for _name, _arr in (
    ("num", num_array),
    ("target", target_array),
    ("mask", mask_array),
    ("time", time_array),
    ("pred_use", pred_use_array),
):
    np.save(f"../output/fe/fe{fe}/fe{fe}_{_name}_array.npy", _arr)

In [17]:
# Store the per-window Id / subject table in the same output directory as the
# .npy arrays.
df_id.to_parquet(f"../output/fe/fe{fe}/fe{fe}_id.parquet")