In [1]:
# ========================
# library
# ========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle
import glob
from tqdm import tqdm
%matplotlib inline

In [2]:
# ========================
# constant
# ========================
# Location of the defog metadata CSV. It is not read in the cells visible
# here; the metadata used below comes from a parquet file instead.
DEFOG_META_PATH = "../data/defog_metadata.csv"
# Glob pattern for the per-recording defog training CSV files.
DEFOG_FOLDER = "../data/train/defog/*.csv"

In [3]:
# ========================
# settings
# ========================
# Experiment tag; it is interpolated into the output directory name below.
fe = "047"
# Create the output tree for this experiment. Requesting the deepest directory
# also creates its parents, and exist_ok=True keeps the cell safe to re-run.
# An exists()-check on the parent alone would skip creating "save" whenever
# the parent directory was already present without it.
os.makedirs(f"../output/fe/fe{fe}/save", exist_ok=True)

In [7]:
# Load the defog recording metadata from the fe039 output directory.
# The windowing loop further down reads its "Id" and "sub_id" columns.
meta = pd.read_parquet("../output/fe/fe039/fe039_defog_meta.parquet")

In [8]:
# Raw accelerometer channels read from each recording.
cols = ["AccV", "AccML", "AccAP"]
# Numeric input features: the raw channels first, then one lag/lead
# difference pair per channel.
num_cols = [
    *cols,
    "AccV_lag_diff", "AccV_lead_diff",
    "AccML_lag_diff", "AccML_lead_diff",
    "AccAP_lag_diff", "AccAP_lead_diff",
]
# Label columns copied into the target array.
target_cols = ["StartHesitation", "Turn", "Walking"]
# Window length, stride between window starts, and the margin used when
# marking which rows of a window are used for prediction (all in rows).
seq_len, shift, offset = 5000, 2500, 1250

In [9]:
# Accumulators filled by the windowing loop. Each name gets its own
# independent empty list.
(
    num_array,
    target_array,
    subject_list,
    valid_array,
    id_list,
    mask_array,
    pred_use_array,
    time_array,
    d_list,
) = ([] for _ in range(9))

In [10]:
# Every file path matching DEFOG_FOLDER. The windowing loop only processes
# metadata rows whose expected CSV path appears in this list.
data_list = glob.glob(DEFOG_FOLDER)

In [11]:
# Cut every available defog recording into overlapping fixed-length windows.
#
# For each metadata row whose CSV is present:
#   * lag/lead differences are added for every channel in `cols`;
#   * the `num_cols` features are standardised with a scaler fitted on that
#     recording alone, and remaining NaNs (e.g. the edge rows of the diffs)
#     are set to 0;
#   * the recording is split into `batch` windows that start every `shift`
#     rows. Every window but the last holds exactly `seq_len` rows; the last
#     one takes all remaining rows and is zero-padded up to `seq_len`.
# Per window, `mask` flags the rows that hold real data and `pred_use` flags
# the rows that this window is responsible for.
# `d_list` gets 1 for a metadata row whose CSV was found and 0 otherwise.
available_paths = set(data_list)  # constant-time membership test
for i, s in tqdm(zip(meta["Id"].values, meta["sub_id"].values), total=len(meta)):
    path = f"../data/train/defog/{i}.csv"
    if path not in available_paths:
        d_list.append(0)
        continue
    d_list.append(1)

    df = pd.read_csv(path)
    # A row counts as valid only when both the "Valid" and "Task" flags are set.
    df["valid"] = (df["Valid"] & df["Task"]).astype(int)
    # Number of windows. Note that a recording with at most `shift` rows gives
    # batch == 0 and therefore contributes no windows at all.
    batch = (len(df) - 1) // shift
    for c in cols:
        df[f"{c}_lag_diff"] = df[c].diff()
        df[f"{c}_lead_diff"] = df[c].diff(-1)

    sc = StandardScaler()
    df[num_cols] = sc.fit_transform(df[num_cols].values)
    df[num_cols] = df[num_cols].fillna(0)

    num = df[num_cols].values
    target = df[target_cols].values
    valid = df["valid"].values
    time = df["Time"].values

    # Feature/label widths follow the column lists instead of literal 9 / 3.
    num_array_ = np.zeros([batch, seq_len, len(num_cols)])
    target_array_ = np.zeros([batch, seq_len, len(target_cols)])
    time_array_ = np.zeros([batch, seq_len], dtype=int)
    mask_array_ = np.zeros([batch, seq_len], dtype=int)
    pred_use_array_ = np.zeros([batch, seq_len], dtype=int)
    valid_array_ = np.zeros([batch, seq_len], dtype=int)

    for b in range(batch):
        is_first = b == 0
        is_last = b == batch - 1
        start = b * shift
        # The last window absorbs everything that is left in the recording.
        stop = len(df) if is_last else start + seq_len
        n_rows = stop - start

        num_array_[b, :n_rows, :] = num[start:stop]
        target_array_[b, :n_rows, :] = target[start:stop]
        time_array_[b, :n_rows] = time[start:stop]
        valid_array_[b, :n_rows] = valid[start:stop]
        mask_array_[b, :n_rows] = 1

        # Rows this window is responsible for: the first window starts at its
        # first row, the last window runs to its final real row, and interior
        # boundaries sit `offset` rows into the overlap. "First" and "last"
        # are evaluated independently so that a recording made of a single
        # window is covered from row 0; testing the last-window case first
        # would leave its leading `offset` rows unmarked.
        pred_from = 0 if is_first else offset
        pred_to = n_rows if is_last else shift + offset
        pred_use_array_[b, pred_from:pred_to] = 1

    num_array.append(num_array_)
    target_array.append(target_array_)
    mask_array.append(mask_array_)
    pred_use_array.append(pred_use_array_)
    time_array.append(time_array_)
    valid_array.append(valid_array_)
    # One subject / recording id entry per window, aligned with the arrays.
    subject_list += [s] * batch
    id_list += [i] * batch

137it [00:18,  7.44it/s]


In [12]:
# Merge the per-recording window blocks into one array per quantity by
# stacking them along the first (window) axis.
num_array = np.vstack(num_array)
target_array = np.vstack(target_array)
mask_array = np.vstack(mask_array)
pred_use_array = np.vstack(pred_use_array)
time_array = np.vstack(time_array)
valid_array = np.vstack(valid_array)

In [13]:
# One row per window: the recording id and the subject it belongs to.
df_id = pd.DataFrame({"Id": id_list, "subject": subject_list})

In [14]:
# Write each stacked array to its own .npy file in this experiment's
# output directory.
outputs = [
    (f"../output/fe/fe{fe}/fe{fe}_num_array.npy", num_array),
    (f"../output/fe/fe{fe}/fe{fe}_target_array.npy", target_array),
    (f"../output/fe/fe{fe}/fe{fe}_mask_array.npy", mask_array),
    (f"../output/fe/fe{fe}/fe{fe}_time_array.npy", time_array),
    (f"../output/fe/fe{fe}/fe{fe}_pred_use_array.npy", pred_use_array),
    (f"../output/fe/fe{fe}/fe{fe}_valid_array.npy", valid_array),
]
for out_path, arr in outputs:
    np.save(out_path, arr)

In [15]:
# Persist the per-window id/subject table in the same output directory as
# the saved arrays.
df_id.to_parquet(f"../output/fe/fe{fe}/fe{fe}_id.parquet")