## 1. Time Series Classification Part 1: Feature Creation/Extraction

### (a) Download Data

Package imports

In [210]:
import numpy as np
import os
import pandas as pd
from scipy.stats import bootstrap

import warnings
warnings.filterwarnings('ignore')

Get the AReM Data Set

Inconsistencies in dataset is manually handled: <br>
bending/dataset4.csv - spaces are replaced with commas to fit in .csv format <br>
cycling/dataset9.csv and cycling/dataset14.csv - removed extra comma at the end of file

### (b) Test and Train Data

In [212]:
folders = os.listdir("../data/AReM/")
folders

['bendingType.pdf',
 '.DS_Store',
 'bending1',
 'walking',
 'sensorsPlacement.pdf',
 'bending2',
 'standing',
 'sitting',
 'lying',
 'cycling']

In [214]:
def train_test_split(file_path):
    
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()
    all_data = []
    
    folders = os.listdir(file_path)

    for folder in folders:
        if 'pdf' in folder or 'DS_Store' in folder:
            continue
        path = os.path.join(file_path, folder)
        files = os.listdir(path)
        files = sorted(files)
        if folder == 'bending1' or folder == 'bending2':
            for file in files:
                data = pd.read_csv(os.path.join(file_path, folder, file), skiprows=4, usecols=range(7))
                if file == 'dataset1.csv' or file == 'dataset2.csv':
                    df_test = pd.concat([df_test, data], ignore_index = True)
                else:
                    df_train = pd.concat([df_train, data], ignore_index = True)       
        else:
            for file in files:
                data = pd.read_csv(os.path.join(file_path, folder, file), skiprows=4 ,usecols=range(7))
                if file == 'dataset1.csv' or file == 'dataset2.csv' or file == 'dataset3.csv':
                    df_test = pd.concat([df_test, data], ignore_index = True)
                else:
                    df_train = pd.concat([df_train, data], ignore_index = True)
                    
        # all data for feature extraction
        for file in files:
            all_data.append(pd.read_csv(os.path.join(path, file), skiprows=4 ,usecols=range(7)))
    
    return df_train, df_test,all_data

In [216]:
filepath = "../data/AReM/"
df_train, df_test,all_data = train_test_split(filepath)
print(df_train.head())
print(df_test.head())
print(df_train.shape)
print(df_test.shape)

   # Columns: time  avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  \
0                0      42.00       0.71      21.25       0.43      30.00   
1              250      41.50       0.50      20.25       1.48      31.25   
2              500      41.50       0.50      14.25       1.92      33.00   
3              750      40.75       0.83      15.75       0.43      33.00   
4             1000      40.00       0.71      20.00       2.74      32.75   

   var_rss23  
0       0.00  
1       1.09  
2       0.00  
3       0.00  
4       0.43  
   # Columns: time  avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  \
0                0      39.25       0.43      22.75       0.43      33.75   
1              250      39.25       0.43      23.00       0.00      33.00   
2              500      39.25       0.43      23.25       0.43      33.00   
3              750      39.50       0.50      23.00       0.71      33.00   
4             1000      39.50       0.50      24.00       0.00

### (c) Feature Extraction

#### i. Research

1. Basic Statistical Features:<br>
Minimum,
Maximum,
Mean,
Median,
Standard Deviation,
Variance,
Range <br>
2. Percentile-Based Features:<br>
First Quartile (Q1),
Third Quartile (Q3),
Interquartile Range (IQR)<br>
3. Shape-Based Features:<br>
Skewness, Peak-to-Peak,
Kurtosis<br>
4. Energy-Based Features:<br>
Root Mean Square (RMS),
Signal Energy, Power
Absolute Sum of Changes<br>
5. Other Useful Features:<br>
Autocorrelation,
Zero Crossing Rate,
Entropy,slope

#### ii. Extraction

In [218]:
print("length = ",len(all_data))
print(all_data)

length =  88
[     # Columns: time  avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  \
0                  0      39.25       0.43      22.75       0.43      33.75   
1                250      39.25       0.43      23.00       0.00      33.00   
2                500      39.25       0.43      23.25       0.43      33.00   
3                750      39.50       0.50      23.00       0.71      33.00   
4               1000      39.50       0.50      24.00       0.00      33.00   
..               ...        ...        ...        ...        ...        ...   
475           118750      43.33       0.47      25.00       0.00      30.00   
476           119000      43.50       0.50      25.50       0.50      30.00   
477           119250      43.50       0.50      24.75       0.43      30.00   
478           119500      43.50       0.50      24.33       0.47      30.00   
479           119750      43.50       0.50      24.25       0.43      30.00   

     var_rss23  
0          1.3  
1  

In [220]:
time_series_strings = ['min', 'max', 'mean', 'median', 'standard deviation', '1st quartile', '3rd quartile' ]
columns = []
for i in range (1,7):
    columns.append([a+str(i) for a in time_series_strings])
columns = [item for sublist in columns for item in sublist]
df = pd.DataFrame(columns = columns)


for d in all_data:
    summary = d.describe()
    #print(summary)
    row = []
    for i in range(1,7):
        row.append(summary.iloc[3,i])
        row.append(summary.iloc[7,i])
        row.append(summary.iloc[1,i])
        row.append(summary.iloc[5,i])
        row.append(summary.iloc[2,i])
        row.append(summary.iloc[4,i])
        row.append(summary.iloc[6,i])
    #print(row)
    df = df._append(pd.DataFrame([row], columns = columns), ignore_index = True)

df

Unnamed: 0,min1,max1,mean1,median1,standard deviation1,1st quartile1,3rd quartile1,min2,max2,mean2,...,standard deviation5,1st quartile5,3rd quartile5,min6,max6,mean6,median6,standard deviation6,1st quartile6,3rd quartile6
0,37.25,45.00,40.624792,40.50,1.476967,39.25,42.0000,0.0,1.30,0.358604,...,2.188449,33.0000,36.00,0.0,1.92,0.570583,0.430,0.582915,0.0000,1.3000
1,38.00,45.67,42.812812,42.50,1.435550,42.00,43.6700,0.0,1.22,0.372437,...,1.995255,32.0000,34.50,0.0,3.11,0.571083,0.430,0.601010,0.0000,1.3000
2,35.00,47.40,43.954500,44.33,1.558835,43.00,45.0000,0.0,1.70,0.426250,...,1.999604,35.3625,36.50,0.0,1.79,0.493292,0.430,0.513506,0.0000,0.9400
3,33.00,47.75,42.179812,43.50,3.670666,39.15,45.0000,0.0,3.00,0.696042,...,3.849448,30.4575,36.33,0.0,2.18,0.613521,0.500,0.524317,0.0000,1.0000
4,33.00,45.75,41.678063,41.75,2.243490,41.33,42.7500,0.0,2.83,0.535979,...,2.411026,28.4575,31.25,0.0,1.79,0.383292,0.430,0.389164,0.0000,0.5000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,26.50,44.33,36.687292,36.00,3.529404,34.25,39.3725,0.0,12.89,2.973042,...,2.978238,14.6700,18.50,0.0,8.19,3.073313,2.690,1.629675,1.9125,4.0875
84,25.33,45.00,37.114313,36.25,3.710385,34.50,40.2500,0.0,10.84,2.730000,...,2.847876,14.7500,18.50,0.0,9.50,3.076354,2.770,1.824534,1.7000,4.0375
85,26.75,44.75,36.863375,36.33,3.555787,34.50,39.7500,0.0,11.68,2.757312,...,2.655906,15.0000,18.67,0.0,8.81,2.773313,2.590,1.569919,1.6400,3.6325
86,26.25,44.25,36.957458,36.29,3.434863,34.50,40.2500,0.0,8.64,2.420083,...,2.851673,14.0000,18.25,0.0,8.34,2.934625,2.525,1.631380,1.6600,4.0300


#### iii. Standard Deviation

In [222]:
df.describe().loc[['std']]

Unnamed: 0,min1,max1,mean1,median1,standard deviation1,1st quartile1,3rd quartile1,min2,max2,mean2,...,standard deviation5,1st quartile5,3rd quartile5,min6,max6,mean6,median6,standard deviation6,1st quartile6,3rd quartile6
std,9.569975,4.394362,5.335718,5.440054,1.772153,6.15359,5.138925,0.0,5.062729,1.574164,...,1.024898,6.096465,5.53172,0.045838,2.518921,1.154812,1.086474,0.517617,0.758584,1.523599


In [224]:
results = []

for feature in columns:
    data = df[feature].values
    std_dev = np.std(data)
    
    bootstrap_ci = bootstrap((data,), np.std, confidence_level=0.90, method='percentile')
    ci_lower, ci_upper = bootstrap_ci.confidence_interval
    
    results.append({
        'feature': feature,
        'std_dev': std_dev,
        'lower': ci_lower,
        'upper': ci_upper
    })

results_df = pd.DataFrame(results)
print(results_df)

                feature   std_dev     lower      upper
0                  min1  9.515445  8.220548  10.694922
1                  max1  4.369322  3.311964   5.260374
2                 mean1  5.305314  4.680345   5.847455
3               median1  5.409056  4.759619   5.968435
4   standard deviation1  1.762056  1.559918   1.938384
5         1st quartile1  6.118526  5.529733   6.605998
6         3rd quartile1  5.109643  4.307624   5.804707
7                  min2  0.000000  0.000000   0.000000
8                  max2  5.033882  4.607541   5.373213
9                 mean2  1.565194  1.389432   1.697396
10              median2  1.404197  1.227991   1.536024
11  standard deviation2  0.879068  0.798933   0.936296
12        1st quartile2  0.940994  0.828508   1.032128
13        3rd quartile2  2.113157  1.879435   2.281515
14                 min3  2.939616  2.743061   3.087828
15                 max3  4.847358  4.149476   5.424585
16                mean3  3.985540  3.410057   4.465586
17        

#### iv. Select Features

Mean: Represents the central tendency while considering outliers.<br>
Median: A robust measure of central tendency, unaffected by outliers.<br>
Standard Deviation: Indicates data spread and deviation from the mean.