In [111]:
import torch
from torch import nn
from torchinfo import summary

from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy  as np

import json
import os
import glob

import warnings
warnings.filterwarnings("ignore")

In [112]:
# Root folder of this experiment; every artefact below is stored inside it.
save_folder = "../data/jikken2/"

# Output files, in order: window features, window labels, label-number -> name
# mapping, and the k-fold train/val/test split.
(
    feature_save_file,
    label_save_file,
    label_name_save_file,
    kfold_split_save_file,
) = (
    os.path.join(save_folder, file_name)
    for file_name in (
        "features.npy",
        "label.npy",
        "label_name.json",
        "kfold_train_val_test.npy",
    )
)

## Data setup

In [113]:
# Glob pattern matching every CSV inside <save_folder>/raw.
_data_file = os.path.join(save_folder, "raw", "*.csv")

# Sort so the files are always processed in the same order.
data_files = glob.glob(_data_file)
data_files.sort()

data_files

['../data/jikken2/raw/Conv-sensorA-1.csv',
 '../data/jikken2/raw/Conv-sensorB-1.csv',
 '../data/jikken2/raw/Conv-sensorC-1.csv',
 '../data/jikken2/raw/Conv-sensorD-1.csv',
 '../data/jikken2/raw/Conv-sensorE-1.csv']

In [114]:
# Load every sensor CSV, keep only the rows that carry a label, and place the
# sensors side by side (one column group per sensor) in a single frame.
if not data_files:
    raise FileNotFoundError("No sensor CSV files were found; check `data_files`.")

data_list = []

for data_file in data_files:
    # Character 7 from the end of the file name is used as the sensor tag,
    # e.g. "Conv-sensorA-1.csv" -> "A".
    identifier = os.path.basename(data_file)[-7]

    df = pd.read_csv(data_file, encoding="shift-jis", low_memory=False)

    # Drop the rows without a label (list form of `subset` works on every
    # pandas version; the bare-string form needs a recent one).
    df_nona = df.dropna(subset=["label"])
    print("***df_nona first", df_nona.index[0])

    # Re-number from 0 so the sensors are concatenated row by row below.
    df_nona = df_nona.reset_index(drop=True)

    label = df_nona.loc[:, "label"]
    df_nona = df_nona.drop(["ts", "label"], axis=1)

    # Suffix every column with the sensor tag to keep names unique.
    df_nona = df_nona.rename(lambda x: x + f"_{identifier}", axis=1)

    print("len(df_filter_start_na)", len(df_nona))

    data_list.append(df_nona)

data_df = pd.concat(data_list, axis=1)

# `label` still holds the label column of the LAST file read in the loop.
# NOTE(review): this assumes all sensors carry the same labels row for row
# after the reset above -- confirm against the recording setup.
data_df["label"] = label

data_df

***df_nona first 26804
len(df_filter_start_na) 74124
***df_nona first 27075
len(df_filter_start_na) 74124
***df_nona first 27232
len(df_filter_start_na) 74124
***df_nona first 27397
len(df_filter_start_na) 74124
***df_nona first 27612
len(df_filter_start_na) 74124


Unnamed: 0,ax_A,ay_A,az_A,gx_A,gy_A,gz_A,ax_B,ay_B,az_B,gx_B,...,gx_D,gy_D,gz_D,ax_E,ay_E,az_E,gx_E,gy_E,gz_E,label
0,-9409,2901,1448,1113,-268,-153,-9378,-2975,1382,2849,...,1471,-105,-117,-9678,1105,1772,217,58,-200,階段降り
1,-9453,2930,1512,1046,-237,-177,-9432,-2887,1368,2824,...,2752,-166,-74,-9703,1051,1723,199,9,-224,階段降り
2,-9492,2984,1473,955,-213,-177,-9441,-2823,1329,2678,...,3233,-312,-68,-9732,934,1636,223,-22,-279,階段降り
3,-9434,2945,1483,827,-219,-183,-9388,-2755,1324,2464,...,2801,-251,-56,-9805,929,1645,217,-155,-346,階段降り
4,-9400,2940,1502,741,-182,-190,-9305,-2613,1402,2105,...,1697,-111,-74,-9903,900,1650,205,-344,-383,階段降り
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74119,-5386,5933,5882,125,-36,-104,-7298,-5099,4161,51,...,81,-50,-92,-9962,338,-551,95,-83,-102,座っている
74120,-5449,5894,5950,144,-60,-98,-7308,-5133,4102,32,...,69,-25,-86,-10001,226,-566,132,-83,-127,座っている
74121,-5430,5894,5980,150,-36,-62,-7269,-5123,4180,26,...,75,-1,-92,-9996,309,-517,95,-89,-96,座っている
74122,-5430,5894,5892,205,-36,-74,-7337,-5128,4190,51,...,75,-38,-92,-9986,299,-531,114,-95,-120,座っている


In [115]:
# Japanese activity label -> English display name. The insertion order of this
# mapping defines the label numbering used everywhere else.
eng_label_dict = {
    '歩いている': 'Walking',
    '立っている': 'Standing',
    '走っている': 'Jogging',
    '階段降り': 'DownStair',
    '階段上り': 'Upstair',
    '座っている': 'Sit',
}

# Position in this list == label number.
label_list = list(eng_label_dict.keys())

# English names in the same order as label_list.
eng_label_list = [eng_label_dict[japanese_name] for japanese_name in label_list]

In [116]:
def segment(data_df, label_list, Window_size = 200, over_lap = 0.5, margin = 200):
    """Find the start rows of fixed-length windows inside single-label runs.

    The label column is scanned from top to bottom. When a labelled row is
    reached whose label differs from the run processed last, `margin` rows are
    skipped and the label found at the resulting row becomes the current
    label. Windows are then emitted for as long as the row located
    `Window_size` rows after the window start carries the current label; the
    start moves forward by `Window_size - int(Window_size * over_lap)` rows
    after each window.

    Only the row at `start + Window_size` is compared with the current label;
    the rows in between are not inspected.

    Args:
        data_df: DataFrame with a 'label' column. Rows are addressed by
            position, so the returned starts can be used with `iloc`.
        label_list: Known labels; the position of a label in this list is the
            number stored for a window.
        Window_size: Number of rows per window (positive int).
        over_lap: Fraction of a window shared with the next one. Must leave a
            forward step of at least one row.
        margin: Rows skipped at the beginning of every new run (int;
            negative values behave like 0).

    Returns:
        (data_df_index_list, index_label_list): the window start rows and,
        in the same order, the label number taken from row
        `start + Window_size`.

    Raises:
        ValueError: if `Window_size` is not positive, if the overlap leaves no
            forward step (the scan would never terminate), or if a window
            label is not contained in `label_list`.
    """
    if Window_size <= 0:
        raise ValueError("Window_size must be a positive number of rows")

    step = Window_size - int(Window_size * over_lap)
    if step <= 0:
        raise ValueError("over_lap must leave a forward step of at least one row")

    skip = max(int(margin), 0)

    # Read the label column once; positional access matches the `iloc`
    # slicing applied to the returned start rows.
    labels = data_df['label'].to_numpy()
    n_rows = len(labels)

    data_df_index_list = []
    index_label_list = []

    index = 0
    current_label = None

    # Every window needs the row at `index + Window_size` to exist.
    while index + Window_size < n_rows:
        row_label = labels[index]

        # Still inside the run that was just processed: move on.
        if current_label is not None and row_label == current_label:
            index += 1
            continue

        # Unlabelled row: move on.
        if pd.isna(row_label):
            index += 1
            continue

        # A new run starts here; skip its first `margin` rows.
        index += skip
        if index + Window_size >= n_rows:
            break
        current_label = labels[index]

        while True:
            end_label = labels[index + Window_size]
            if pd.isna(end_label) or not (end_label == current_label):
                break

            data_df_index_list.append(index)
            # The window is labelled with the row right after its last row.
            index_label_list.append(label_list.index(end_label))

            index += step
            if index + Window_size >= n_rows:
                break

    return data_df_index_list, index_label_list

In [117]:
# Rows per window; reused below when the windows are cut out of data_df.
window_size = 200

data_df_index_list, index_label_list = segment(
    data_df=data_df,
    label_list=label_list,
    Window_size=window_size,
)

In [118]:
# Show every (window start row, label number) pair returned by segment().
print(*zip(data_df_index_list, index_label_list))

(200, 3) (300, 3) (400, 3) (500, 3) (600, 3) (700, 3) (800, 3) (900, 3) (1000, 3) (1100, 3) (1200, 3) (1300, 3) (1400, 3) (1500, 3) (1600, 3) (1700, 3) (1800, 3) (1900, 3) (2000, 3) (2100, 3) (2200, 3) (2300, 3) (2400, 3) (2500, 3) (2600, 3) (2700, 3) (2800, 3) (2900, 3) (3000, 3) (3100, 3) (3200, 3) (3300, 3) (3400, 3) (3500, 3) (3600, 3) (3700, 3) (3800, 3) (3900, 3) (4000, 3) (4100, 3) (4200, 3) (4300, 3) (4400, 3) (4500, 3) (4600, 3) (4700, 3) (4800, 3) (4900, 3) (5356, 4) (5456, 4) (5556, 4) (5656, 4) (5756, 4) (5856, 4) (5956, 4) (6056, 4) (6156, 4) (6256, 4) (6356, 4) (6456, 4) (6556, 4) (6656, 4) (6756, 4) (6856, 4) (6956, 4) (7056, 4) (7156, 4) (7256, 4) (7356, 4) (7456, 4) (7556, 4) (7656, 4) (7756, 4) (7856, 4) (7956, 4) (8056, 4) (8156, 4) (8256, 4) (8356, 4) (8456, 4) (8556, 4) (8656, 4) (8756, 4) (8856, 4) (8956, 4) (9056, 4) (9156, 4) (9256, 4) (9356, 4) (9456, 4) (9556, 4) (9656, 4) (9756, 4) (9856, 4) (9956, 4) (10056, 4) (10156, 4) (10256, 4) (10356, 4) (10456, 4) (10

In [119]:
# Number of windows found.
len(data_df_index_list)

698

In [120]:
# Cut one block of `window_size` consecutive rows out of the feature columns
# for every window start. Dropping the label column does not depend on the
# window, so it is done once instead of once per window.
feature_values = data_df.drop(["label"], axis=1).values

data_list = np.array([
    feature_values[start: start + window_size]
    for start in data_df_index_list
])

In [121]:
# Sanity check: (number of windows, rows per window, feature columns).
data_list.shape

(698, 200, 30)

In [122]:
# Turn the window labels into an ndarray (one label number per window) and
# display its shape.
index_label_list = np.array(index_label_list)
index_label_list.shape

(698,)

In [123]:
# Label number -> label name, numbered by position in label_list.
label_dict = {number: name for number, name in enumerate(label_list)}
label_dict

{0: '歩いている', 1: '立っている', 2: '走っている', 3: '階段降り', 4: '階段上り', 5: '座っている'}

In [124]:
# Persist the window array to feature_save_file.
np.save(feature_save_file, data_list)

In [125]:
# Persist the per-window label numbers to label_save_file.
np.save(label_save_file, index_label_list)

In [126]:
# Write the label-number -> label-name mapping as JSON.
# NOTE(review): json.dump is called with its defaults, which presumably escape
# non-ASCII characters, so the shift-jis encoding likely has no visible effect
# on the file contents -- confirm.
with open(label_name_save_file, "w", encoding="shift-jis") as f:
    json.dump(label_dict, f)

In [127]:
# Read the mapping back from label_name_save_file and rebind label_dict to the
# loaded value.
# NOTE(review): JSON object keys are strings, so the reloaded dict is
# presumably keyed by "0".."5" rather than by ints -- confirm before indexing
# it with label numbers.
with open(label_name_save_file, encoding="shift-jis") as f:
    label_dict = json.load(f)


## KFold setup

In [128]:
# the dataset label and index list correspond to each other
dataset_label_array = np.array(index_label_list)
dataset_index_list = np.array(data_list)

#ラベル名ごとにカウント
index_data_all = []

# separate dataset_index_list and fill into index_data_all base on the label in dataset_label_array
# the order of index_data_all will follow label_list index
for activity_label in label_list:
    print("activity_label:", activity_label)
    
    one_activity_data = dataset_index_list[dataset_label_array == label_list.index(activity_label)]
    index_data_all.append(one_activity_data)

index_data_all

activity_label: 歩いている
activity_label: 立っている
activity_label: 走っている
activity_label: 階段降り
activity_label: 階段上り
activity_label: 座っている


[array([24056, 24156, 24256, 24356, 24456, 24556, 24656, 24756, 24856,
        24956, 25056, 25156, 25256, 25356, 25456, 25556, 25656, 25756,
        25856, 25956, 26056, 26156, 26256, 26356, 26456, 26556, 26656,
        26756, 26856, 26956, 27056, 27156, 27256, 27356, 27456, 27556,
        27656, 27756, 27856, 27956, 28056, 28156, 28256, 28356, 28456,
        28556, 28656, 28756, 28856, 28956, 29056, 29156, 29256, 29356,
        29456, 29556, 29656, 29756, 61927, 62027, 62127, 62227, 62327,
        62427, 62527, 62627, 62727, 62827, 62927, 63027, 63127, 63227,
        63327, 63427, 63527, 63627, 63727, 63827, 63927, 64027, 64127,
        64227, 64327, 64427, 64527, 64627, 64727, 64827, 64927, 65027,
        65127, 65227, 65327, 65427, 65527, 65627, 65727, 65827, 65927,
        66027, 66127, 66227, 66327, 66427, 66527, 66627, 66727, 66827,
        66927, 67027, 67127, 67227, 67327, 67427, 67527, 67627, 67727]),
 array([11484, 11584, 11684, 11784, 11884, 11984, 12084, 12184, 12284,
    

In [129]:
from sklearn.model_selection import KFold

n_split = 10

# KFold configuration (split into 10 parts), shuffled with a fixed seed.
kf = KFold(n_splits=n_split, shuffle=True, random_state=42)

# k_index_list[label][fold] is the (train positions, test positions) pair that
# KFold produced for that label's entry in index_data_all.
k_index_list = [
    list(kf.split(activity_data))
    for activity_data in index_data_all
]

k_index_list[0][0]

(array([  0,   1,   2,   3,   5,   6,   7,   8,   9,  12,  13,  14,  15,
         16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  27,  28,  29,
         30,  31,  32,  33,  34,  35,  37,  38,  39,  41,  43,  45,  46,
         47,  48,  49,  50,  51,  52,  54,  55,  56,  57,  58,  59,  60,
         61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  73,  74,
         75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  86,  87,  88,
         89,  90,  91,  92,  93,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        116]),
 array([ 4, 10, 11, 26, 36, 40, 42, 44, 53, 72, 85, 94]))

In [130]:
# One (train, val, test) tuple per fold, each part concatenated over all labels.
kfold_train_test_index_list = []

for k in range(n_split):
    # Per-label pieces of fold k, collected before concatenation.
    train_x, val_x, test_x = [], [], []

    for activity_data, activity_folds in zip(index_data_all, k_index_list):
        # Positions chosen by KFold for this label in fold k.
        train_positions, test_positions = activity_folds[k]
        activity_train_index = activity_data[train_positions]
        activity_test_index = activity_data[test_positions]

        # Carve a validation part (15 %) out of this label's training part.
        activity_train, activity_val = train_test_split(
            activity_train_index,
            test_size=0.15,
            random_state=42,
        )

        train_x.append(activity_train)
        val_x.append(activity_val)
        test_x.append(activity_test_index)

    # Merge the per-label pieces into one array per part.
    train_x, val_x, test_x = (
        np.concatenate(pieces, axis=0) for pieces in (train_x, val_x, test_x)
    )

    kfold_train_test_index_list.append((train_x, val_x, test_x))

In [131]:
# Display the (train, val, test) tuples of every fold.
kfold_train_test_index_list

[(array([65227, 27856, 25556, 27056, 67227, 62527, 26556, 64827, 29256,
         64327, 25856, 29056, 28856, 25256, 65827, 25456, 67627, 65027,
         27256, 65127, 24656, 63227, 62627, 28756, 28156, 25956, 63727,
         27956, 24856, 29156, 63927, 64027, 27156, 26256, 65727, 26956,
         64427, 61927, 25656, 26856, 24356, 26056, 28656, 24956, 65327,
         24756, 63827, 28356, 66427, 65627, 62427, 62027, 64127, 29556,
         67327, 63127, 67427, 67027, 28956, 62827, 29756, 66327, 62727,
         64727, 27756, 67127, 62927, 63427, 66127, 28556, 27356, 66627,
         24156, 62227, 26456, 24256, 26756, 66027, 66827, 64527, 65927,
         65427, 26356, 63027, 64227, 25756, 66527, 62127, 67527, 51364,
         55164, 16384, 12884, 15584, 50664, 14384, 13984, 14984, 17084,
         51764, 13184, 16584, 53164, 53264, 12584, 15184, 49664, 51064,
         14684, 16084, 12084, 49464, 50364, 15884, 15384, 13384, 54664,
         15284, 50864, 12284, 16484, 50764, 51464, 14484, 13684,

In [132]:
# Each fold holds: train_data_df_index_list, val_data_df_index_list, test_data_df_index_list
# The three parts have different lengths, so the nested list is ragged and has
# to be wrapped as an object array explicitly; recent NumPy versions refuse to
# build a ragged array implicitly.
np.array(kfold_train_test_index_list, dtype=object).shape

(10, 3)

In [133]:
# The split is ragged (train/val/test differ in length), so convert it to an
# object array explicitly before saving; passing the nested list directly
# fails on recent NumPy versions. Load it back with
# np.load(kfold_split_save_file, allow_pickle=True).
np.save(kfold_split_save_file, np.array(kfold_train_test_index_list, dtype=object))