In [32]:
import pandas as pd
import numpy as np
import glob
import os

In [67]:
save_x_file = os.path.join("..", "data_tran", "extracted_feature.csv")

_data_file = os.path.join("..", "data_tran", "tran_labeled", "*.csv")
data_files = sorted(glob.glob(_data_file))

data_files

['../data_tran/tran_labeled/Conv-sensorA-1.csv',
 '../data_tran/tran_labeled/Conv-sensorB-1.csv',
 '../data_tran/tran_labeled/Conv-sensorC-1.csv',
 '../data_tran/tran_labeled/Conv-sensorD-1.csv',
 '../data_tran/tran_labeled/Conv-sensorE-1.csv']

In [34]:
data_list = []

for data_file in data_files:
    identifier = os.path.basename(data_file)[-7]
    
    df = pd.read_csv(data_file, encoding="shift-jis")
    
    df_nona = df.dropna(subset="label")
    print("***df_nona first", df_nona.index[0])
    
    df_filter_start_na = df.iloc[df_nona.index[0]:df_nona.index[-1], :]
    
    df_filter_start_na = df_filter_start_na.reset_index(drop=True)
    
    label = df_filter_start_na.loc[:, "label"]
    df_filter_start_na = df_filter_start_na.drop(["ts", "label"], axis=1)
    
    df_filter_start_na = df_filter_start_na.rename(lambda x: x + f"_{identifier}", axis=1)
    
    print("len(df_filter_start_na)", len(df_filter_start_na))
  
    data_list.append(df_filter_start_na)

data_df = pd.concat(data_list, axis=1)
data_df["label"] = label

data_df

***df_nona first 25422
len(df_filter_start_na) 90432
***df_nona first 24679
len(df_filter_start_na) 90432
***df_nona first 24396
len(df_filter_start_na) 90432
***df_nona first 24161
len(df_filter_start_na) 90432
***df_nona first 23883
len(df_filter_start_na) 90432


Unnamed: 0,ax_A,ay_A,az_A,gx_A,gy_A,gz_A,ax_B,ay_B,az_B,gx_B,...,gx_D,gy_D,gz_D,ax_E,ay_E,az_E,gx_E,gy_E,gz_E,label
0,-7935,1954,3465,-2788,323,-3586,-11585,-2828,1490,-6382,...,-4613,10980,-530,-8155,1505,3740,120,-1570,3440,階段降り
1,-7847,1998,3445,-2855,158,-3629,-11443,-2667,1373,-7248,...,-5308,11516,-280,-7842,1520,3598,241,-1710,3196,階段降り
2,-7647,2051,3387,-3008,-48,-3592,-11199,-2462,1299,-8083,...,-5363,11230,-129,-7432,1407,3462,223,-1685,2915,階段降り
3,-7495,2100,3309,-3124,-310,-3531,-10965,-2296,1021,-8907,...,-6266,10797,194,-7208,1354,3354,34,-1649,2635,階段降り
4,-7315,1954,3245,-3191,-548,-3421,-10652,-2233,782,-9650,...,-7906,11059,322,-7017,1158,3261,-264,-1673,2366,階段降り
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90427,-6387,6016,4700,199,-353,-147,-5169,-6183,5640,-16,...,69,-92,-189,-8550,1476,4765,205,76,-66,座っている
90428,-6358,6031,4749,278,-487,-196,-5145,-6197,5645,20,...,44,-99,-183,-8536,1476,4800,217,45,-108,座っている
90429,-6280,5997,4847,351,-554,-202,-5174,-6188,5577,130,...,-4,-129,-183,-8497,1529,4721,205,45,-120,座っている
90430,-6265,6011,4837,400,-652,-214,-5193,-6144,5543,160,...,-29,-123,-219,-8511,1515,4707,205,15,-169,座っている


## Train test split

In [35]:
data_df["label"].value_counts()

歩いている    12856
立っている    12566
階段上り     12278
走っている    12233
階段降り     12222
座っている    12065
Name: label, dtype: int64

In [36]:
label_list = list(data_df["label"].value_counts().keys())
label_list

['歩いている', '立っている', '階段上り', '走っている', '階段降り', '座っている']

In [37]:
from sklearn.model_selection import train_test_split

train_data = []
test_data = []

for label in label_list:
    df_label = data_df[data_df["label"] == label]
    
    _train_df, _test_df = train_test_split(df_label, test_size=0.2, random_state=42)
    
    train_data.append(_train_df)
    test_data.append(_test_df)
    
train_df = pd.concat(train_data, axis=0)
test_df = pd.concat(test_data, axis=0)

print("train_df.shape", train_df.shape)
print("test_df.shape", test_df.shape)

train_df.shape (59373, 31)
test_df.shape (14847, 31)


## preprocessing

In [38]:
%%writefile preprocessing.py

from scipy.fftpack import fft
from scipy import signal
import numpy as np
import pandas as pd


def Calc_Time(signalData):
    Mean = np.mean(signalData) # 平均値
    Std = np.std(signalData - Mean)  #標準偏差

    return [Mean, Std]

# RMSを求める関数
def Calc_RMS(signalData):
    a = signalData * signalData  # 二乗
    sum_a = np.sum(a)  # 総和
    sqrt_a = np.sqrt(sum_a) # 平方根
    RMS = np.mean(sqrt_a) # 平均値
    
    return RMS

def Calc_Freq(signalData):
    L = len(signalData) # 信号長
    Win = signal.hann(L) #ハニング窓を使用

    S = np.fft.fft(Win * signalData) # 振幅スペクトル
    PowerSpectrum = (S * S.conjugate()) # パワースペクトル
    Energy = np.sum(PowerSpectrum.real)

    P = PowerSpectrum.real/np.sum(Energy)
    Entropy = -np.sum(P*np.log(P))


    return [Energy, Entropy]


Overwriting preprocessing.py


In [39]:
import preprocessing

In [40]:
# angular_data = [f"g{axis}_{sensor_type}" for axis in "xyz" for sensor_type in "ABCDE"]
# accelleration_data = [f"a{axis}_{sensor_type}" for axis in "xyz" for sensor_type in "ABCDE"]

# for data_name in angular_data:
#     exec("data_df[data_name + '_mean'] = 0")
#     exec("data_df[data_name + '_std'] = 0")
#     exec("data_df[data_name + '_RMS'] = 0")
    
# for data_name in accelleration_data:
#     exec("data_df[data_name + '_mean'] = 0")
#     exec("data_df[data_name + '_RMS'] = 0")
#     exec("data_df[data_name + '_energy'] = 0")
#     exec("data_df[data_name + '_entropy'] = 0")


In [41]:
data_df.head()

Unnamed: 0,ax_A,ay_A,az_A,gx_A,gy_A,gz_A,ax_B,ay_B,az_B,gx_B,...,gx_D,gy_D,gz_D,ax_E,ay_E,az_E,gx_E,gy_E,gz_E,label
0,-7935,1954,3465,-2788,323,-3586,-11585,-2828,1490,-6382,...,-4613,10980,-530,-8155,1505,3740,120,-1570,3440,階段降り
1,-7847,1998,3445,-2855,158,-3629,-11443,-2667,1373,-7248,...,-5308,11516,-280,-7842,1520,3598,241,-1710,3196,階段降り
2,-7647,2051,3387,-3008,-48,-3592,-11199,-2462,1299,-8083,...,-5363,11230,-129,-7432,1407,3462,223,-1685,2915,階段降り
3,-7495,2100,3309,-3124,-310,-3531,-10965,-2296,1021,-8907,...,-6266,10797,194,-7208,1354,3354,34,-1649,2635,階段降り
4,-7315,1954,3245,-3191,-548,-3421,-10652,-2233,782,-9650,...,-7906,11059,322,-7017,1158,3261,-264,-1673,2366,階段降り


In [53]:
# data: 100Hz 
Window_size = 200 # 2s
over_lap = 0.5

overlap_data = Window_size * over_lap

margin = 300
index = 0
loop = 0

feature_list = []


while index < data_df.shape[0]:
    if pd.isna(data_df['label'][index]):
        index = index + 1
        continue

    count = 0
    while count < margin:
        count, index = count + 1, index + 1
        
        
    if index  >= data_df.shape[0]: break
    current_label = data_df.loc[index, 'label']
    
    while not pd.isna(data_df['label'][index + Window_size]) and data_df['label'][index + Window_size] == current_label:
        signal_data = data_df.iloc[index: index + Window_size, :]
        index = index + Window_size

        for sensor_name in "ABCDE":
            for axis in "xyz":
                exec(f"data_df.loc[index, 'a{axis}_{sensor_name}_mean'], data_df.loc[index, 'a{axis}_{sensor_name}_std'] = preprocessing.Calc_Time(signal_data.loc[:, 'a{axis}_{sensor_name}'])")
                exec(f"data_df.loc[index, 'a{axis}_{sensor_name}_RMS'] = preprocessing.Calc_RMS(signal_data.loc[:, 'a{axis}_{sensor_name}'])")
                exec(f"data_df.loc[index, 'a{axis}_{sensor_name}_energy'], data_df.loc[index, 'a{axis}_{sensor_name}_entropy'] = preprocessing.Calc_Freq(signal_data.loc[:, 'a{axis}_{sensor_name}'])")
                
        index = index - int(overlap_data)
        if index + Window_size >= data_df.shape[0]: break
        
data_df

Unnamed: 0,ax_A,ay_A,az_A,gx_A,gy_A,gz_A,ax_B,ay_B,az_B,gx_B,...,ay_E_mean,ay_E_std,ay_E_RMS,ay_E_energy,ay_E_entropy,az_E_mean,az_E_std,az_E_RMS,az_E_energy,az_E_entropy
0,-7935,1954,3465,-2788,323,-3586,-11585,-2828,1490,-6382,...,,,,,,,,,,
1,-7847,1998,3445,-2855,158,-3629,-11443,-2667,1373,-7248,...,,,,,,,,,,
2,-7647,2051,3387,-3008,-48,-3592,-11199,-2462,1299,-8083,...,,,,,,,,,,
3,-7495,2100,3309,-3124,-310,-3531,-10965,-2296,1021,-8907,...,,,,,,,,,,
4,-7315,1954,3245,-3191,-548,-3421,-10652,-2233,782,-9650,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90427,-6387,6016,4700,199,-353,-147,-5169,-6183,5640,-16,...,,,,,,,,,,
90428,-6358,6031,4749,278,-487,-196,-5145,-6197,5645,20,...,,,,,,,,,,
90429,-6280,5997,4847,351,-554,-202,-5174,-6188,5577,130,...,,,,,,,,,,
90430,-6265,6011,4837,400,-652,-214,-5193,-6144,5543,160,...,,,,,,,,,,


In [54]:
feature_df = data_df[~ pd.isna(data_df["ay_D_mean"])]
feature_df

Unnamed: 0,ax_A,ay_A,az_A,gx_A,gy_A,gz_A,ax_B,ay_B,az_B,gx_B,...,ay_E_mean,ay_E_std,ay_E_RMS,ay_E_energy,ay_E_entropy,az_E_mean,az_E_std,az_E_RMS,az_E_energy,az_E_entropy
500,-7232,2388,1644,-12429,2244,8895,-7239,-1613,640,-6480,...,1806.145000,1795.656586,36018.168568,1.110045e+11,3.056860,3081.645000,1990.262201,51880.014524,2.080846e+11,2.106770
556,-9546,3716,-904,7772,-201,340,-12557,-1979,4610,28836,...,1706.808594,2366.021737,46678.486854,1.745063e+11,3.181850,3136.132812,2178.539603,61096.883210,3.458789e+11,2.148186
600,-13570,2652,760,31290,-3085,-11007,-6204,-1442,1036,6160,...,1706.790000,2538.833273,43263.856717,1.574565e+11,3.429951,3067.520000,2349.352196,54642.720814,2.184749e+11,2.321673
684,-8565,3194,-406,-15063,8737,7852,-11355,-2413,4590,5092,...,1509.148438,2764.306774,50390.909517,2.745205e+11,3.636137,3219.917969,2560.565715,65822.779773,3.814031e+11,2.482349
700,-8145,665,2195,-24069,7036,3468,-9847,-10382,-102,-13809,...,1464.050000,2965.510483,46771.133897,1.341559e+11,3.441022,3302.915000,2665.429948,60022.936124,2.246754e+11,2.352513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90160,-6338,5977,4935,113,-24,-80,-5394,-6237,5328,106,...,1512.519531,31.488583,24205.556325,5.614137e+10,0.876080,5131.332031,47.572181,82104.840728,6.456568e+11,0.872914
90256,-6358,6055,5008,-57,311,-14,-5369,-6222,5425,51,...,1525.745000,37.712597,21583.883084,3.473797e+10,0.875050,5089.265000,45.620662,71975.967496,3.861778e+11,0.873098
90288,-6177,6001,5032,223,-237,-74,-5354,-6256,5308,112,...,1523.816406,39.023149,24389.055886,5.712010e+10,0.876621,5094.851562,47.695009,81521.196863,6.329898e+11,0.871624
90356,-5879,6031,5374,-75,664,206,-5584,-6295,5254,-236,...,1527.960000,74.706281,21634.429921,3.456917e+10,0.881159,5051.960000,66.585422,71451.708811,3.841805e+11,0.874286


In [59]:
for sensor_name in "ABCDE":
    for axis in "xyz":
        feature_df = feature_df.drop([f"a{axis}_{sensor_name}"], axis=1)
        feature_df = feature_df.drop([f"g{axis}_{sensor_name}"], axis=1)

In [60]:
feature_df = feature_df.reset_index(drop=True)
feature_df.head()

Unnamed: 0,label,ax_A_mean,ax_A_std,ax_A_RMS,ax_A_energy,ax_A_entropy,ay_A_mean,ay_A_std,ay_A_RMS,ay_A_energy,...,ay_E_mean,ay_E_std,ay_E_RMS,ay_E_energy,ay_E_entropy,az_E_mean,az_E_std,az_E_RMS,az_E_energy,az_E_entropy
0,階段降り,-10095.115,3193.308573,149738.816948,1589442000000.0,1.291517,2333.925,2153.205443,44907.68216,114313300000.0,...,1806.145,1795.656586,36018.168568,111004500000.0,3.05686,3081.645,1990.262201,51880.014524,208084600000.0,2.10677
1,階段降り,-10094.328125,3333.864469,170089.943336,2651143000000.0,1.374896,2014.660156,2457.361669,50842.456382,181056400000.0,...,1706.808594,2366.021737,46678.486854,174506300000.0,3.18185,3136.132812,2178.539603,61096.88321,345878900000.0,2.148186
2,階段降り,-9774.82,3475.480257,146714.736172,1608769000000.0,1.477448,1566.395,2285.059553,39179.306923,137626100000.0,...,1706.79,2538.833273,43263.856717,157456500000.0,3.429951,3067.52,2349.352196,54642.720814,218474900000.0,2.321673
3,階段降り,-9947.925781,4048.729614,171844.349174,2704598000000.0,1.440465,2117.660156,2588.711144,53512.549379,289836900000.0,...,1509.148438,2764.306774,50390.909517,274520500000.0,3.636137,3219.917969,2560.565715,65822.779773,381403100000.0,2.482349
4,階段降り,-9877.695,4102.551451,151261.222344,1672245000000.0,1.481259,2008.595,2648.684368,47010.600411,239731900000.0,...,1464.05,2965.510483,46771.133897,134155900000.0,3.441022,3302.915,2665.429948,60022.936124,224675400000.0,2.352513


In [57]:
len(feature_df)

1209

In [71]:
feature_df.to_csv(save_x_file, index=False)