In [1]:
# ========================
# library
# ========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
import pickle
import glob
from tqdm import tqdm
%matplotlib inline

In [3]:
# ------------------------------------------------------------
# Input locations (relative paths)
# ------------------------------------------------------------
# Glob pattern for the defog training CSV files.
DEFOG_FOLDER = "../data/train/defog/*.csv"
# CSV with the defog metadata; its Id and Subject columns are used below.
DEFOG_META_PATH = "../data/defog_metadata.csv"

In [4]:
# ========================
# settings
# ========================
# Run identifier; it is embedded in every output path written by this notebook.
fe = "039"
# Create the output directory and its "save" subdirectory.
# makedirs builds the intermediate directories as well, and exist_ok=True makes
# the call safe to repeat. It also repairs the case where the run directory
# already exists but "save" is missing, which an "if not exists" guard on the
# parent directory would skip.
os.makedirs(f"../output/fe/fe{fe}/save", exist_ok=True)

In [5]:
# Load the defog metadata table from disk.
meta = pd.read_csv(filepath_or_buffer=DEFOG_META_PATH)

In [6]:
# Give every distinct Subject a dense integer id (0, 1, 2, ...) in the order
# the subjects first appear in the metadata.
sub_dict = {
    subject: idx
    for idx, subject in enumerate(meta["Subject"].unique())
}

In [7]:
# Attach the integer subject id to each metadata row.
subject_ids = meta["Subject"].map(sub_dict)
meta["sub_id"] = subject_ids

In [8]:
# Display the metadata table, which now includes the derived sub_id column.
meta

Unnamed: 0,Id,Subject,Visit,Medication,sub_id
0,02ab235146,e1f62e,2,on,0
1,02ea782681,ae2d35,2,on,1
2,06414383cf,8c1f5e,2,off,2
3,092b4c1819,2874c5,1,off,3
4,0a900ed8a2,0e3d49,2,on,4
...,...,...,...,...,...
132,f3a921edee,1a778d,1,off,8
133,f40e8c6ebe,575c60,1,off,38
134,f8ddbdd98d,107712,1,on,39
135,f9efef91fb,5d9cae,2,off,44


In [10]:
# Write the Subject -> integer id mapping to disk as a pickle.
sub_id_path = f'../output/fe/fe{fe}/fe{fe}_sub_id.pkl'
with open(sub_id_path, mode='wb') as sub_id_file:
    pickle.dump(sub_dict, sub_id_file)

In [11]:
# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the order in which the recordings are loaded (and therefore the row order of
# the concatenated frame) reproducible across machines and runs.
data_list = sorted(glob.glob(DEFOG_FOLDER))

In [13]:
# Read every defog CSV found by the glob and stack them into one frame with a
# fresh 0..N-1 index.
frames = [pd.read_csv(csv_path) for csv_path in tqdm(data_list)]
df_all = pd.concat(frames).reset_index(drop=True)

100%|██████████| 91/91 [00:10<00:00,  8.85it/s]


In [14]:
# Total number of rows across all concatenated defog CSVs.
print(df_all.shape[0])

13525702


In [15]:
# Mean and standard deviation of each acceleration column over all loaded
# rows, stored as {column: [mean, std]}. Series.std() uses pandas' default
# (sample standard deviation, ddof=1).
mean_std_dict = {}
for col in ["AccV","AccML","AccAP"]:
    column = df_all[col]
    stats = [column.mean(), column.std()]
    mean_std_dict[col] = stats
    print(col, stats[0], stats[1])

AccV -0.9401728865187032 0.08606513545195248
AccML 0.0011727847170217099 0.12000798550598998
AccAP -0.13061518435795438 0.28238873112045165


In [16]:
# Show the collected {column: [mean, std]} statistics.
mean_std_dict

{'AccV': [-0.9401728865187032, 0.08606513545195248],
 'AccML': [0.0011727847170217099, 0.12000798550598998],
 'AccAP': [-0.13061518435795438, 0.28238873112045165]}

In [17]:
# Write the per-column [mean, std] statistics to disk as a pickle.
scaler_path = f'../output/fe/fe{fe}/save/fe{fe}_sc.pkl'
with open(scaler_path, mode='wb') as scaler_file:
    pickle.dump(mean_std_dict, scaler_file)

In [24]:
def _to_padded_sequences(values, n_seq, seq_len):
    """Zero-pad `values` along axis 0 and split it into fixed-length sequences.

    Parameters
    ----------
    values : array-like, shape (n_rows, ...)
        Per-row values; n_rows must not exceed n_seq * seq_len.
    n_seq : int
        Number of sequences to produce.
    seq_len : int
        Length of each sequence.

    Returns
    -------
    numpy.ndarray of float64, shape (n_seq, seq_len, ...)
        Row r of `values` lands at [r // seq_len, r % seq_len]; positions past
        the end of `values` stay zero.
    """
    values = np.asarray(values)
    padded = np.zeros((n_seq * seq_len,) + values.shape[1:])
    padded[:len(values)] = values
    return padded.reshape((n_seq, seq_len) + values.shape[1:])


d_list = []        # one flag per metadata row: 1 if its CSV was found, else 0
num_array = []     # per recording: normalized inputs, (n_seq, seq_len, 3)
target_array = []  # per recording: target columns, (n_seq, seq_len, 3)
valid_array = []   # per recording: Valid & Task flag, (n_seq, seq_len)
subject_list = []  # integer subject id, one entry per sequence
id_list = []       # recording Id, one entry per sequence
mask_array = []    # per recording: 1 for real rows, 0 for padding
num_cols = ["AccV","AccML","AccAP"]
target_cols = ["StartHesitation","Turn","Walking"]
seq_len = 1000

# Index the globbed files by file stem (the recording Id). Matching on the stem
# instead of comparing whole path strings is independent of the path separator
# glob happens to return, and gives O(1) lookups.
path_by_id = {
    os.path.splitext(os.path.basename(csv_path))[0]: csv_path
    for csv_path in data_list
}

# zip() has no length, so pass total explicitly to get a real progress bar.
for i,s in tqdm(zip(meta["Id"].values,
                    meta["sub_id"].values),
                total=len(meta)):
    path = path_by_id.get(str(i))
    if path is None:
        # Metadata row without a matching CSV in the training folder.
        d_list.append(0)
        continue
    d_list.append(1)

    df = pd.read_csv(path)
    # A row counts as valid only when both the Valid and Task flags are set.
    df["valid"] = (df["Valid"] & df["Task"]).astype(int)
    # Standardize each input column with the statistics computed over all files.
    for c in num_cols:
        df[c] = (df[c] - mean_std_dict[c][0]) / mean_std_dict[c][1]

    # Ceiling division: just enough sequences to hold every row. Using
    # len(df) // seq_len + 1 would add an extra all-padding, fully masked
    # sequence whenever len(df) is an exact multiple of seq_len.
    n_seq = -(-len(df) // seq_len)

    num_array.append(_to_padded_sequences(df[num_cols].values, n_seq, seq_len))
    target_array.append(_to_padded_sequences(df[target_cols].values, n_seq, seq_len))
    valid_array.append(_to_padded_sequences(df["valid"].values, n_seq, seq_len))
    # The mask marks which positions hold real rows rather than padding.
    mask_array.append(_to_padded_sequences(np.ones(len(df)), n_seq, seq_len))

    subject_list += [s] * n_seq
    id_list += [i] * n_seq

137it [00:11, 12.40it/s]


In [25]:
# Stack the per-recording blocks along the sequence axis, turning each list of
# arrays into a single array.
num_array, target_array, mask_array = (
    np.concatenate(blocks, axis=0)
    for blocks in (num_array, target_array, mask_array)
)

In [26]:
# Stack the per-recording valid flags along the first axis (np.concatenate's
# default axis is 0).
valid_array = np.concatenate(valid_array)

In [27]:
# One row per sequence: the recording Id and the integer subject id, in the
# same order in which the sequences were appended to the arrays.
df_id = pd.DataFrame({"Id": id_list, "subject": subject_list})

In [28]:
# Record, per metadata row, whether its CSV was found (1) or not (0).
meta = meta.assign(data_is=d_list)

In [29]:
# Write each stacked array to its own .npy file in the run's output directory.
arrays_by_path = {
    f"../output/fe/fe{fe}/fe{fe}_num_array.npy": num_array,
    f"../output/fe/fe{fe}/fe{fe}_target_array.npy": target_array,
    f"../output/fe/fe{fe}/fe{fe}_mask_array.npy": mask_array,
    f"../output/fe/fe{fe}/fe{fe}_valid_array.npy": valid_array,
}
for out_path, array in arrays_by_path.items():
    np.save(out_path, array)

In [30]:
# Write the per-sequence Id/subject table as parquet.
id_table_path = f"../output/fe/fe{fe}/fe{fe}_id.parquet"
df_id.to_parquet(id_table_path)

In [31]:
# Write the metadata table, including the columns added above, as parquet.
meta_out_path = f"../output/fe/fe{fe}/fe{fe}_defog_meta.parquet"
meta.to_parquet(meta_out_path)