In [47]:
import glob
import json
import os
import warnings

import matplotlib.pyplot as plt
import numpy  as np
import pandas as pd
import seaborn as sns
import torch
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from torch import nn
from torchinfo import summary

warnings.filterwarnings("ignore")

In [48]:
# Root folder of this experiment; every artefact below is written inside it.
save_folder = "../data/jikken1/"

# Output locations, in order: windowed features, per-window class ids,
# class-id -> label-name mapping, and the k-fold train/val/test index split.
(
    feature_save_file,
    label_save_file,
    label_name_save_file,
    kfold_split_save_file,
) = [
    os.path.join(save_folder, file_name)
    for file_name in (
        "features.npy",
        "label.npy",
        "label_name.json",
        "kfold_train_val_test.npy",
    )
]

## Data setup

In [49]:
# Glob pattern matching every raw CSV under <save_folder>/raw.
_data_file = os.path.join(save_folder, "raw", "*.csv")

# glob gives no ordering guarantee, so sort in place to get a stable file order.
data_files = glob.glob(_data_file)
data_files.sort()

data_files

['../data/jikken1/raw/Conv-sensorA-1.csv',
 '../data/jikken1/raw/Conv-sensorB-1.csv',
 '../data/jikken1/raw/Conv-sensorC-1.csv',
 '../data/jikken1/raw/Conv-sensorD-1.csv',
 '../data/jikken1/raw/Conv-sensorE-1.csv']

In [50]:
# Load every sensor CSV, keep only its labelled rows, and place the sensors
# side by side (one column group per sensor) in a single frame.
data_list = []
label = None

for data_file in data_files:
    # Sensor letter taken from the file name, e.g. "Conv-sensorA-1.csv" -> "A".
    # NOTE: position -7 assumes a one-character run number right before ".csv".
    identifier = os.path.basename(data_file)[-7]

    df = pd.read_csv(data_file, encoding="shift-jis", low_memory=False)

    # Only rows that carry a label are used.
    df_nona = df.dropna(subset="label")
    if df_nona.empty:
        raise ValueError(f"{data_file} has no labelled rows")
    print("***df_nona first", df_nona.index[0])

    # Re-index from 0 so that the sensors line up row by row in the
    # column-wise concat below (concat aligns on the index).
    df_nona = df_nona.reset_index(drop=True)

    # Every sensor must contribute the same number of labelled rows; otherwise
    # the concat would pad with NaN and the rows would no longer correspond.
    sensor_label = df_nona.loc[:, "label"]
    if label is not None and len(sensor_label) != len(label):
        raise ValueError(
            f"{data_file} has {len(sensor_label)} labelled rows, "
            f"expected {len(label)} like the previous sensor files"
        )
    label = sensor_label

    df_nona = df_nona.drop(["ts", "label"], axis=1)

    # Suffix each column with the sensor letter, e.g. "ax" -> "ax_A".
    df_nona = df_nona.rename(lambda x: x + f"_{identifier}", axis=1)

    print("len(df_filter_start_na)", len(df_nona))

    data_list.append(df_nona)

if not data_list:
    raise ValueError("data_files is empty: no sensor CSV files to load")

data_df = pd.concat(data_list, axis=1)
# The label column is the one read from the last sensor file.
# NOTE(review): labels are assumed identical across sensors — only the row
# counts are checked above.
data_df["label"] = label

data_df

***df_nona first 25422
len(df_filter_start_na) 74221
***df_nona first 24679
len(df_filter_start_na) 74221
***df_nona first 24396
len(df_filter_start_na) 74221
***df_nona first 24161
len(df_filter_start_na) 74221
***df_nona first 23883
len(df_filter_start_na) 74221


Unnamed: 0,ax_A,ay_A,az_A,gx_A,gy_A,gz_A,ax_B,ay_B,az_B,gx_B,...,gx_D,gy_D,gz_D,ax_E,ay_E,az_E,gx_E,gy_E,gz_E,label
0,-7935,1954,3465,-2788,323,-3586,-11585,-2828,1490,-6382,...,-4613,10980,-530,-8155,1505,3740,120,-1570,3440,階段降り
1,-7847,1998,3445,-2855,158,-3629,-11443,-2667,1373,-7248,...,-5308,11516,-280,-7842,1520,3598,241,-1710,3196,階段降り
2,-7647,2051,3387,-3008,-48,-3592,-11199,-2462,1299,-8083,...,-5363,11230,-129,-7432,1407,3462,223,-1685,2915,階段降り
3,-7495,2100,3309,-3124,-310,-3531,-10965,-2296,1021,-8907,...,-6266,10797,194,-7208,1354,3354,34,-1649,2635,階段降り
4,-7315,1954,3245,-3191,-548,-3421,-10652,-2233,782,-9650,...,-7906,11059,322,-7017,1158,3261,-264,-1673,2366,階段降り
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74216,-6358,6031,4749,278,-487,-196,-5145,-6197,5645,20,...,44,-99,-183,-8536,1476,4800,217,45,-108,座っている
74217,-6280,5997,4847,351,-554,-202,-5174,-6188,5577,130,...,-4,-129,-183,-8497,1529,4721,205,45,-120,座っている
74218,-6265,6011,4837,400,-652,-214,-5193,-6144,5543,160,...,-29,-123,-219,-8511,1515,4707,205,15,-169,座っている
74219,-6236,6085,4881,436,-749,-244,-5149,-6222,5572,142,...,-10,-135,-201,-8482,1520,4746,199,-16,-157,座っている


In [51]:
# Japanese activity label -> English display name.
# Insertion order defines the class ids used elsewhere (position in label_list).
eng_label_dict = {
    '歩いている': 'Walking',
    '立っている': 'Standing',
    '走っている': 'Jogging',
    '階段降り': 'DownStair',
    '階段上り': 'Upstair',
    '座っている': 'Sit',
}

# Japanese labels in class-id order.
label_list = list(eng_label_dict)

# English names in the same class-id order.
eng_label_list = [eng_label_dict[japanese_name] for japanese_name in label_list]

In [52]:
def segment(data_df, label_list, Window_size = 200, over_lap = 0.5, margin = 200):
    """Cut a labelled time series into fixed-size, overlapping windows.

    The frame is scanned top to bottom. Whenever a new (non-missing) label is
    met, `margin` rows are skipped, the label found at that position becomes
    the current activity, and windows are emitted for as long as the row
    located `Window_size` rows after the window start still carries that label.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with a "label" column. Rows are addressed by position, so the
        returned starts can be used directly with
        `data_df.iloc[start: start + Window_size]`.
    label_list : list
        Known labels; a window's class id is the label's position in this list.
    Window_size : int
        Number of rows per window.
    over_lap : float
        Fraction of a window shared with the next one; consecutive windows
        start `Window_size - int(Window_size * over_lap)` rows apart.
    margin : int
        Rows skipped at the start of every new activity before windows are cut.

    Returns
    -------
    (list, list)
        Window start positions and, in the same order, the class id of each
        window (taken from the row at `start + Window_size`).

    Raises
    ------
    ValueError
        If the window step is not positive (the scan could never advance), or
        if a windowed label is not in `label_list`.

    Notes
    -----
    Only the two ends of a window are compared with the current label; rows in
    between are not inspected.
    """
    step = Window_size - int(Window_size * over_lap)
    if step <= 0:
        # The original loop would spin forever here (e.g. over_lap >= 1).
        raise ValueError(
            f"Window_size={Window_size} with over_lap={over_lap} gives a "
            f"non-positive step ({step}); windows would never advance"
        )

    # Same count as stepping one row at a time while `count < margin`.
    skip = max(0, int(np.ceil(margin)))

    n_rows = data_df.shape[0]
    # Positional label access: fast, and consistent with the positional bounds
    # checks below regardless of the frame's index.
    labels = data_df['label'].to_numpy()

    data_df_index_list = []
    index_label_list = []

    index = 0
    current_label = None

    while index + Window_size < n_rows:
        row_label = labels[index]

        # Move past unlabelled rows and the tail of the activity just windowed.
        if pd.isna(row_label) or (current_label is not None and row_label == current_label):
            index += 1
            continue

        # A new activity starts here: skip its first `margin` rows.
        index += skip
        if index + Window_size >= n_rows:
            break
        current_label = labels[index]

        while index + Window_size < n_rows:
            end_label = labels[index + Window_size]
            if pd.isna(end_label) or end_label != current_label:
                break

            # Window covers rows [index, index + Window_size); its class is the
            # label of the row right after the window.
            data_df_index_list.append(index)
            index_label_list.append(label_list.index(end_label))

            index += step

    return data_df_index_list, index_label_list

In [53]:
# Window length in rows; reused later when slicing each window out of data_df.
window_size = 200
# Start row of every window and, in the same order, each window's class id
# (position of its label in label_list).
data_df_index_list, index_label_list = segment(data_df, label_list, Window_size=window_size)

In [54]:
# Sanity check: print every (window start row, class id) pair.
print(*zip(data_df_index_list, index_label_list))

(200, 3) (300, 3) (400, 3) (500, 3) (600, 3) (700, 3) (800, 3) (900, 3) (1000, 3) (1100, 3) (1200, 3) (1300, 3) (1400, 3) (1500, 3) (1600, 3) (1700, 3) (1800, 3) (1900, 3) (2000, 3) (2100, 3) (2200, 3) (2300, 3) (2400, 3) (2500, 3) (2600, 3) (2700, 3) (2800, 3) (2900, 3) (3000, 3) (3100, 3) (3200, 3) (3300, 3) (3400, 3) (3500, 3) (3600, 3) (3700, 3) (3800, 3) (3900, 3) (4000, 3) (4100, 3) (4200, 3) (4300, 3) (4400, 3) (4500, 3) (4600, 3) (4700, 3) (4800, 3) (4900, 3) (5000, 3) (5100, 3) (5200, 3) (5300, 3) (5400, 3) (5500, 3) (5600, 3) (5700, 3) (5800, 3) (6233, 4) (6333, 4) (6433, 4) (6533, 4) (6633, 4) (6733, 4) (6833, 4) (6933, 4) (7033, 4) (7133, 4) (7233, 4) (7333, 4) (7433, 4) (7533, 4) (7633, 4) (7733, 4) (7833, 4) (7933, 4) (8033, 4) (8133, 4) (8233, 4) (8333, 4) (8433, 4) (8533, 4) (8633, 4) (8733, 4) (8833, 4) (8933, 4) (9033, 4) (9133, 4) (9233, 4) (9333, 4) (9433, 4) (9533, 4) (9633, 4) (9733, 4) (9833, 4) (9933, 4) (10033, 4) (10133, 4) (10233, 4) (10333, 4) (10433, 4) (10

In [55]:
# Number of windows returned by segment().
len(data_df_index_list)

699

In [56]:
# Build the feature tensor: one (window_size, n_features) slice per window start.
# The label column is dropped once up front instead of once per window.
feature_values = data_df.drop(["label"], axis=1).values

data_list = np.array([
    feature_values[start: start + window_size]
    for start in data_df_index_list
])

In [57]:
# Expected layout: (number of windows, window_size, number of feature columns).
data_list.shape

(699, 200, 30)

In [58]:
# Convert the per-window class ids to a NumPy array.
# NOTE: this rebinds index_label_list from a list to an ndarray.
index_label_list = np.array(index_label_list)
index_label_list.shape

(699,)

In [59]:
# Class id (position in label_list) -> label name.
label_dict = {class_id: label_name for class_id, label_name in enumerate(label_list)}
label_dict

{0: '歩いている', 1: '立っている', 2: '走っている', 3: '階段降り', 4: '階段上り', 5: '座っている'}

In [60]:
# Persist the windowed feature array to feature_save_file.
np.save(feature_save_file, data_list)

In [61]:
# Persist the per-window class ids to label_save_file (same order as the features).
np.save(label_save_file, index_label_list)

In [62]:
# Write the class-id -> label-name mapping as JSON.
# NOTE: JSON object keys are always strings, so the integer ids are stored as "0", "1", ...
# json.dump escapes non-ASCII characters by default (ensure_ascii=True).
with open(label_name_save_file, "w", encoding="shift-jis") as f:
    json.dump(label_dict, f)

In [63]:
# Read the mapping back from disk, rebinding label_dict.
# NOTE: after this reload the keys are strings ("0", "1", ...) rather than the
# integers used when label_dict was first built — convert with int(key) if
# integer lookups are needed.
with open(label_name_save_file, encoding="shift-jis") as f:
    label_dict = json.load(f)


## KFold setup

In [64]:
# the dataset label and index list correspond to each other
dataset_label_array = np.array(index_label_list)
dataset_index_list = np.array(range(len(data_list)))

#ラベル名ごとにカウント
index_data_all = []

# separate dataset_index_list and fill into index_data_all base on the label in dataset_label_array
# the order of index_data_all will follow label_list index
for activity_label in label_list:
    print("activity_label:", activity_label)
    
    one_activity_data = dataset_index_list[dataset_label_array == label_list.index(activity_label)]
    index_data_all.append(one_activity_data)

index_data_all

activity_label: 歩いている
activity_label: 立っている
activity_label: 走っている
activity_label: 階段降り
activity_label: 階段上り
activity_label: 座っている


[array([235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247,
        248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
        261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273,
        274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286,
        287, 288, 289, 290, 291, 292, 293, 580, 581, 582, 583, 584, 585,
        586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598,
        599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611,
        612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624,
        625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637,
        638, 639, 640, 641]),
 array([114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
        127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
        153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
        166, 167, 168

In [65]:
# KFold is imported in the notebook's import cell.
n_split = 10

# KFold configuration: split into 10 shuffled folds with a fixed seed.
kf = KFold(n_splits = n_split, shuffle = True, random_state = 42)

# Each class is split on its own so that every fold contains all classes.
# k_index_list[label][fold] -> (train_positions, test_positions), where the
# positions index into index_data_all[label]
# (size: label number (6) x kfold number (10) x train or test (2)).
k_index_list = [list(kf.split(activity_data)) for activity_data in index_data_all]

k_index_list[0][0]

(array([  0,   1,   2,   3,   5,   6,   7,   8,   9,  12,  13,  14,  15,
         16,  17,  19,  20,  21,  22,  23,  24,  25,  27,  28,  29,  30,
         31,  32,  33,  34,  35,  36,  37,  38,  39,  41,  42,  43,  45,
         46,  48,  49,  50,  51,  52,  53,  54,  56,  57,  58,  59,  60,
         61,  63,  65,  66,  67,  68,  69,  70,  71,  72,  74,  75,  76,
         77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
         90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 106, 107, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120]),
 array([  4,  10,  11,  18,  26,  40,  44,  47,  55,  62,  64,  73, 108]))

In [66]:
# One (train, val, test) tuple of window indices per fold.
kfold_train_test_index_list = []

for k in range(n_split):
    # Per-class index pieces for fold k; merged into flat arrays below.
    train_x, val_x, test_x = [], [], []

    for i, activity_data in enumerate(index_data_all):
        # Map the fold's positions back to window indices for this class.
        fold_train_positions, fold_test_positions = k_index_list[i][k]
        activity_train_index = activity_data[fold_train_positions]
        activity_test_index = activity_data[fold_test_positions]

        # Hold out 15% of this class's training windows for validation.
        activity_train, activity_val = train_test_split(
            activity_train_index, test_size=0.15, random_state=42
        )

        train_x.append(activity_train)
        val_x.append(activity_val)
        test_x.append(activity_test_index)

    train_x, val_x, test_x = (
        np.concatenate(parts, axis=0) for parts in (train_x, val_x, test_x)
    )

    kfold_train_test_index_list.append((train_x, val_x, test_x))

In [67]:
# Inspect the per-fold (train, val, test) index arrays.
kfold_train_test_index_list

[(array([609, 640, 285, 250, 277, 597, 266, 262, 271, 291, 634, 254, 287,
         621, 622, 247, 273, 587, 637, 268, 283, 241, 584, 641, 281, 276,
         255, 611, 274, 599, 243, 286, 598, 605, 267, 258, 623, 639, 265,
         601, 293, 251, 264, 238, 256, 280, 244, 617, 242, 603, 627, 633,
         586, 580, 606, 289, 631, 593, 638, 618, 284, 590, 292, 626, 589,
         608, 272, 635, 591, 596, 624, 278, 269, 630, 236, 582, 260, 237,
         263, 620, 632, 607, 619, 615, 259, 592, 604, 252, 625, 581, 636,
         488, 475, 161, 128, 154, 478, 144, 139, 149, 176, 509, 131, 163,
         502, 501, 125, 151, 479, 514, 146, 158, 120, 173, 494, 157, 153,
         133, 491, 152, 122, 162, 474, 521, 145, 136, 511, 504, 143, 471,
         168, 129, 142, 117, 134, 156, 123, 496, 121, 480, 506, 498, 174,
         169, 483, 165, 484, 470, 517, 497, 160, 467, 167, 518, 466, 486,
         150, 513, 468, 473, 503, 155, 147, 508, 115, 172, 138, 116, 141,
         500, 510, 485, 499, 495, 137,

In [68]:
# train_data_df_index_list, val_data_df_index_list, test_data_df_index_list
# Each fold holds (train, val, test) index arrays of different lengths, so the
# list is ragged. dtype=object is required: NumPy >= 1.24 raises ValueError
# when asked to build an array from ragged sequences without it.
np.array(kfold_train_test_index_list, dtype=object).shape

(10, 3)

In [69]:
# The (train, val, test) arrays differ in length, so store them as an explicit
# object array; passing the ragged list directly fails on NumPy >= 1.24.
# Object arrays are pickled: load with np.load(..., allow_pickle=True).
np.save(kfold_split_save_file, np.array(kfold_train_test_index_list, dtype=object))