In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
main_directory = "./data"
subdirectories = [f.path for f in os.scandir(main_directory) if f.is_dir()]
dfs = []

for directory in subdirectories:
    files = [f for f in os.scandir(directory) if f.name.endswith('.csv')]
    
    for file in files:
        file_path = file.path
        df_temp = pd.read_csv(file_path)
        df_temp['folder'] = os.path.basename(directory)
        df_temp['file'] = os.path.splitext(file.name)[0]
        dfs.append(df_temp)

df = pd.concat(dfs, ignore_index=True)

df.head()

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,folder,file
0,1.000776,4.616021,8.576031,idle,idle-1
1,0.718261,4.209007,8.446744,idle,idle-1
2,-0.909797,-0.282516,9.203311,idle,idle-1
3,5.09965,0.148441,8.418014,idle,idle-1
4,1.762132,-0.162806,9.251195,idle,idle-1


In [3]:
df.folder.unique()

array(['idle', 'running', 'stairs', 'walking'], dtype=object)

In [4]:
features = ['accelerometer_X', 'accelerometer_Y', 'accelerometer_Z']
def custom_normalize(data, features):
    for feature in features:
        feature_data = data[feature]
        mean = feature_data.mean()
        std = feature_data.std()
        normalized_column = (feature_data - mean) / std
        data.loc[:, feature + '_n'] = normalized_column

custom_normalize(df, features)

df.describe()

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,accelerometer_X_n,accelerometer_Y_n,accelerometer_Z_n
count,193860.0,193860.0,193860.0,193860.0,193860.0,193860.0
mean,1.92355,1.598343,1.804896,8.561992000000001e-17,9.383005e-18,4.2223520000000006e-17
std,8.404867,12.474041,7.19159,1.0,1.0,1.0
min,-39.188293,-39.188293,-39.188293,-4.891433,-3.269721,-5.700156
25%,-2.494758,-8.327033,-2.494758,-0.5256844,-0.7956825,-0.5978725
50%,0.248997,-0.009577,0.905008,-0.1992361,-0.1289013,-0.1251306
75%,4.668694,8.671799,7.187394,0.3266136,0.5670541,0.7484434
max,39.188293,39.188293,39.188293,4.43371,3.013454,5.19821


In [5]:
train = pd.DataFrame()
files = df.file.unique()
dfs_tr = []

sample = np.random.choice(files, size=int(len(files)*0.3))
for filename in sample:
    dfs_tr.append(df[df['file']==filename])
train = pd.concat(dfs_tr, ignore_index=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58140 entries, 0 to 58139
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accelerometer_X    58140 non-null  float64
 1   accelerometer_Y    58140 non-null  float64
 2   accelerometer_Z    58140 non-null  float64
 3   folder             58140 non-null  object 
 4   file               58140 non-null  object 
 5   accelerometer_X_n  58140 non-null  float64
 6   accelerometer_Y_n  58140 non-null  float64
 7   accelerometer_Z_n  58140 non-null  float64
dtypes: float64(6), object(2)
memory usage: 3.5+ MB


In [6]:
test = pd.DataFrame()
dfs_te = []
test_filenames = [filename for filename in files if filename not in sample]
for filename in test_filenames:
    dfs_te.append(df[df['file']==filename])
test = pd.concat(dfs_te, ignore_index=True)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143490 entries, 0 to 143489
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   accelerometer_X    143490 non-null  float64
 1   accelerometer_Y    143490 non-null  float64
 2   accelerometer_Z    143490 non-null  float64
 3   folder             143490 non-null  object 
 4   file               143490 non-null  object 
 5   accelerometer_X_n  143490 non-null  float64
 6   accelerometer_Y_n  143490 non-null  float64
 7   accelerometer_Z_n  143490 non-null  float64
dtypes: float64(6), object(2)
memory usage: 8.8+ MB


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193860 entries, 0 to 193859
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   accelerometer_X    193860 non-null  float64
 1   accelerometer_Y    193860 non-null  float64
 2   accelerometer_Z    193860 non-null  float64
 3   folder             193860 non-null  object 
 4   file               193860 non-null  object 
 5   accelerometer_X_n  193860 non-null  float64
 6   accelerometer_Y_n  193860 non-null  float64
 7   accelerometer_Z_n  193860 non-null  float64
dtypes: float64(6), object(2)
memory usage: 11.8+ MB
