# Data Preprocessing

### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy import signal
from collections import Counter
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from pathlib import Path

### Function for detecting the sensor errors

In [2]:
def sensor_errors(data, threshold=1000):
    """Replace implausibly large samples with a neighbouring valid sample.

    A sample counts as a sensor error when it is strictly greater than
    ``threshold``. The array is repaired in place:

    * a bad first sample is replaced by the first later sample that is not
      above the threshold (it is left untouched if no such sample exists);
    * every other bad sample is replaced by the (already repaired) sample
      immediately before it.

    Parameters
    ----------
    data : 1-D array supporting ``data.shape`` and integer indexing
        Signal to repair; modified in place.
    threshold : float, optional
        Values strictly above this are treated as errors. Defaults to 1000,
        the constant the function has always used.

    Returns
    -------
    The same ``data`` object, after repair.
    """
    n_samples = data.shape[0]
    if n_samples == 0:
        return data

    # The first sample has no predecessor to copy from, so look ahead for the
    # first valid value instead.
    if data[0] > threshold:
        for k in range(1, n_samples):
            if not data[k] > threshold:
                data[0] = data[k]
                break

    # Forward-fill: each bad sample takes the value just before it, which has
    # already been repaired by the time we reach it.
    for i in range(1, n_samples):
        if data[i] > threshold:
            data[i] = data[i - 1]

    return data

### Function for preprocessing the data

In [3]:
def data_preprocessing(dataset):
    """Build the acceleration-magnitude / label frame from the 'Ax.1', 'Ay.1', 'Az.1' columns.

    Steps:
      1. Compute the Euclidean norm of the three accelerometer axes per row.
      2. Repair out-of-range samples with ``sensor_errors``.
      3. Pair the magnitudes with the activity labels from column
         'Unnamed: 69' and fix the misspelled label 'upsatirs'.
      4. Append the last 1000 rows in reverse order (presumably so the
         1000-sample sliding windows of ``feature_extraction`` can reach the
         end of the recording -- confirm against the intended protocol).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw recording containing 'Ax.1', 'Ay.1', 'Az.1' and 'Unnamed: 69'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'a' (acceleration magnitude) and 'y' (label), with a fresh
        0..n-1 index.
    """
    # Positional NumPy arrays are used throughout, so the caller's index
    # (possibly duplicated after a concat) is irrelevant and left untouched.
    rpc_values = dataset[['Ax.1', 'Ay.1', 'Az.1']].to_numpy(dtype=float)

    # Row-wise vector norm in one vectorised call.
    acceleration = sensor_errors(np.linalg.norm(rpc_values, axis=1))

    participant = pd.DataFrame({
        'a': acceleration,
        'y': dataset['Unnamed: 69'].to_numpy(),
    })

    # Single .loc call: chained `participant['y'].loc[mask] = ...` may write
    # to a temporary copy and raises SettingWithCopyWarning.
    participant.loc[participant['y'] == 'upsatirs', 'y'] = 'upstairs'

    # Mirror the final 1000 rows onto the end of the frame.
    mirrored_tail = participant.iloc[::-1].iloc[:1000]
    return pd.concat([participant, mirrored_tail], ignore_index=True)

### Function for preprocessing the data of the left pocket for each participant

In [4]:
def data_preprocessing_left_pocket(dataset):
    """Build the acceleration-magnitude / label frame from the 'Ax', 'Ay', 'Az' columns.

    Steps:
      1. Compute the Euclidean norm of the three accelerometer axes per row.
      2. Repair out-of-range samples with ``sensor_errors``.
      3. Pair the magnitudes with the activity labels from column
         'Unnamed: 69' and fix the misspelled label 'upsatirs'.
      4. Append the last 1000 rows in reverse order (presumably so the
         1000-sample sliding windows of ``feature_extraction`` can reach the
         end of the recording -- confirm against the intended protocol).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw recording containing 'Ax', 'Ay', 'Az' and 'Unnamed: 69'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'a' (acceleration magnitude) and 'y' (label), with a fresh
        0..n-1 index.
    """
    # Positional NumPy arrays are used throughout, so the caller's index is
    # irrelevant and left untouched.
    rpc_values = dataset[['Ax', 'Ay', 'Az']].to_numpy(dtype=float)

    # Row-wise vector norm in one vectorised call.
    acceleration = sensor_errors(np.linalg.norm(rpc_values, axis=1))

    participant = pd.DataFrame({
        'a': acceleration,
        'y': dataset['Unnamed: 69'].to_numpy(),
    })

    # Single .loc call: chained `participant['y'].loc[mask] = ...` may write
    # to a temporary copy and raises SettingWithCopyWarning.
    participant.loc[participant['y'] == 'upsatirs', 'y'] = 'upstairs'

    # Mirror the final 1000 rows onto the end of the frame.
    mirrored_tail = participant.iloc[::-1].iloc[:1000]
    return pd.concat([participant, mirrored_tail], ignore_index=True)

### Function for preprocessing the data of the wrist for each participant

In [5]:
def data_preprocessing_wrist(dataset):
    """Build the acceleration-magnitude / label frame from the 'Ax.2', 'Ay.2', 'Az.2' columns.

    Steps:
      1. Compute the Euclidean norm of the three accelerometer axes per row.
      2. Repair out-of-range samples with ``sensor_errors``.
      3. Pair the magnitudes with the activity labels from column
         'Unnamed: 69' and fix the misspelled label 'upsatirs'.
      4. Append the last 1000 rows in reverse order (presumably so the
         1000-sample sliding windows of ``feature_extraction`` can reach the
         end of the recording -- confirm against the intended protocol).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw recording containing 'Ax.2', 'Ay.2', 'Az.2' and 'Unnamed: 69'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'a' (acceleration magnitude) and 'y' (label), with a fresh
        0..n-1 index.
    """
    # Positional NumPy arrays are used throughout, so the caller's index is
    # irrelevant and left untouched.
    rpc_values = dataset[['Ax.2', 'Ay.2', 'Az.2']].to_numpy(dtype=float)

    # Row-wise vector norm in one vectorised call.
    acceleration = sensor_errors(np.linalg.norm(rpc_values, axis=1))

    participant = pd.DataFrame({
        'a': acceleration,
        'y': dataset['Unnamed: 69'].to_numpy(),
    })

    # Single .loc call: chained `participant['y'].loc[mask] = ...` may write
    # to a temporary copy and raises SettingWithCopyWarning.
    participant.loc[participant['y'] == 'upsatirs', 'y'] = 'upstairs'

    # Mirror the final 1000 rows onto the end of the frame.
    mirrored_tail = participant.iloc[::-1].iloc[:1000]
    return pd.concat([participant, mirrored_tail], ignore_index=True)

### Function for returning the most common label in each window

In [6]:
def most_common(List):
    """Return the element that occurs most often in ``List``.

    Ties are resolved by ``collections.Counter.most_common``; an empty input
    raises ``IndexError``.
    """
    ranked = Counter(List).most_common(1)
    winner, _occurrences = ranked[0]
    return winner

### Function for the feature extraction of our dataset

In [7]:
def feature_extraction(participant):
    """Extract sliding-window features from a preprocessed recording.

    Windows are 1000 rows long and start every 50 rows, for every start
    ``w`` in ``range(0, len(participant) - 1000, 50)``. For each window the
    following are computed on the first column of ``participant``:

    * mean, standard deviation, skewness, maximum, minimum, max - min;
    * the Welch power spectral density (``nperseg=128``), one feature per
      frequency bin, named 'Accwelch0', 'Accwelch1', ...;
    * the most frequent value of the 'y' column, used as the window label.

    Parameters
    ----------
    participant : pandas.DataFrame
        Frame whose first column is the signal and which has a 'y' label
        column (the layout produced by the ``data_preprocessing*``
        functions in this file).

    Returns
    -------
    pandas.DataFrame
        One row per window with columns 'Mean', 'Std', 'Skew', 'Max', 'Min',
        'Min-Max', the 'Accwelch<k>' columns and 'y'. If the input is too
        short to hold a single window, an empty frame without 'Accwelch'
        columns is returned instead of raising NameError.
    """
    mean_list = []
    std_list = []
    skew_list = []
    max_list = []
    min_list = []
    min_max_list = []
    y_list = []
    # One list per Welch frequency bin; created on the first window because
    # the number of bins is only known once welch() has run.
    welch_lists = None

    labels = participant['y']

    for w in range(0, participant.shape[0] - 1000, 50):
        end = w + 1000
        # Slice the window once and reuse it for every statistic.
        window = participant.iloc[w:end, 0]

        highest = np.max(window)
        lowest = np.min(window)

        mean_list.append(np.mean(window))
        std_list.append(np.std(window))
        skew_list.append(skew(window))
        max_list.append(highest)
        min_list.append(lowest)
        min_max_list.append(highest - lowest)

        _, power = signal.welch(window, nperseg=128)

        if welch_lists is None:
            welch_lists = [[] for _ in range(power.shape[0])]

        for bin_values, value in zip(welch_lists, power):
            bin_values.append(value)

        y_list.append(most_common(labels.iloc[w:end]))

    columns = {
        'Mean': mean_list,
        'Std': std_list,
        'Skew': skew_list,
        'Max': max_list,
        'Min': min_list,
        'Min-Max': min_max_list,
    }

    # No windows -> no spectral columns.
    for k, bin_values in enumerate(welch_lists or []):
        columns['Accwelch' + str(k)] = bin_values

    columns['y'] = y_list

    return pd.DataFrame(columns)

### Code Snippet for creating the train and test set for each fold of the LOSO Cross Validation

In [8]:
# Build the train/test CSVs for leave-one-subject-out cross validation:
# fold j tests on participant j and trains on the other nine.
os.makedirs('train_test_dataset', exist_ok=True)

filenames = ['dataset/Participant_' + str(i + 1) + '.csv' for i in range(0, 10)]

# Read every participant exactly once; the folds below only recombine these
# frames instead of re-parsing each CSV for every fold.
raw_frames = [pd.read_csv(name, header=1) for name in filenames]

for j in range(len(filenames)):
    current_participant = raw_frames[j]

    # Single concat of the nine remaining participants (avoids growing a
    # DataFrame by repeated concatenation onto an empty frame).
    train = pd.concat([frame for t, frame in enumerate(raw_frames) if t != j])

    participant = data_preprocessing(current_participant)
    train = data_preprocessing(train)

    test = feature_extraction(participant)
    train_features = feature_extraction(train)

    fold = str(j + 1)

    X_train = train_features.drop('y', axis=1)
    X_train.to_csv('train_test_dataset/X_train_fold_' + fold + '.csv', index=False)
    X_test = test.drop('y', axis=1)
    X_test.to_csv('train_test_dataset/X_test_fold_' + fold + '.csv', index=False)
    y_train = train_features['y']
    y_train.to_csv('train_test_dataset/y_train_fold_' + fold + '.csv', index=False)
    y_test = test['y']
    y_test.to_csv('train_test_dataset/y_test_fold_' + fold + '.csv', index=False)

"\nimport os\nos.makedirs('train_test_dataset', exist_ok=True)\n\nfilenames = []\n    \nfor i in range(0,10):\n    filenames.append('dataset/Participant_' +str(i+1)+ '.csv')\n    \nfor j in range(len(filenames)):\n    current_participant = pd.read_csv(filenames[j], header=1)\n    train = pd.DataFrame()\n        \n    for t in range(len(filenames)):\n        if (t==j):\n            continue\n        train = pd.concat([train, pd.read_csv(filenames[t], header=1)])\n        \n    participant = data_preprocessing(current_participant)\n    train = data_preprocessing(train)\n        \n    test = feature_extraction(participant)\n    train_features = feature_extraction(train)\n        \n    X_train = train_features.drop('y', axis=1)\n    X_train.to_csv('train_test_dataset/X_train_fold_' +str(j+1)+ '.csv', index=False)\n    X_test = test.drop('y', axis=1)\n    X_test.to_csv('train_test_dataset/X_test_fold_' +str(j+1)+ '.csv', index=False)\n    y_train = train_features['y']\n    y_train.to_csv('t

### Code Snippet for joining the 'Standing' and 'Sitting' labels

In [8]:
# Merge the 'standing' and 'sitting' labels into 'standing_sitting' for the
# label files of every cross-validation fold.
os.makedirs('regrouped_dataset', exist_ok=True)

merged_labels = {'standing': 'standing_sitting', 'sitting': 'standing_sitting'}

for i in range(0, 10):
    fold = str(i + 1)

    # Read the labels of THIS fold (previously fold 1 was read on every
    # iteration, so all ten regrouped files were copies of fold 1).
    y_train = pd.read_csv('train_test_dataset/y_train_fold_' + fold + '.csv')
    y_test = pd.read_csv('train_test_dataset/y_test_fold_' + fold + '.csv')

    # Column-wise replace instead of chained element assignment
    # (`y_train['y'][t] = ...`), which raises SettingWithCopyWarning.
    y_train['y'] = y_train['y'].replace(merged_labels)
    y_test['y'] = y_test['y'].replace(merged_labels)

    y_train.to_csv('regrouped_dataset/y_train_' + fold + '_regrouped.csv', index=False)
    y_test.to_csv('regrouped_dataset/y_test_' + fold + '_regrouped.csv', index=False)

### Code Snippet for creating the train set with all participants and the test set with left pocket and wrist dataset

We preprocessed and extracted the features for each participant using the left pocket and the wrist columns. After training the model, you can test it for each of the 10 participants using either the left pocket values or the wrist values.

In [11]:
# Build (a) one training set from all ten participants using the columns read
# by data_preprocessing, and (b) per-participant feature files computed from
# the left-pocket and wrist columns for testing.
os.makedirs('10_Subject_Dataset', exist_ok=True)
os.makedirs('dataset/Left_Pocket_Dataset', exist_ok=True)
# NOTE(review): 'Wrsit_Dataset' looks like a typo for 'Wrist_Dataset'; the
# path is kept as-is because other code may already read from it.
os.makedirs('dataset/Wrsit_Dataset', exist_ok=True)

filenames = ['dataset/Participant_' + str(i + 1) + '.csv' for i in range(10)]

# Parse each CSV once and reuse the frames for all three datasets.
raw_frames = [pd.read_csv(name, header=1) for name in filenames]

# Single concat instead of growing a DataFrame inside a loop.
data = pd.concat(raw_frames)

dataset = data_preprocessing(data)
data_features = feature_extraction(dataset)

X_train = data_features.drop('y', axis=1)
y_train = data_features['y']

X_train.to_csv('10_Subject_Dataset/X_train.csv', index=False)
y_train.to_csv('10_Subject_Dataset/y_train.csv', index=False)

for i in range(10):
    participant_l = data_preprocessing_left_pocket(raw_frames[i])
    participant_w = data_preprocessing_wrist(raw_frames[i])

    participant_l_features = feature_extraction(participant_l)
    participant_w_features = feature_extraction(participant_w)

    participant_l_features.to_csv('dataset/Left_Pocket_Dataset/Participant_' + str(i + 1) + '_left.csv', index=False)
    participant_w_features.to_csv('dataset/Wrsit_Dataset/Participant_' + str(i + 1) + '_wrist.csv', index=False)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
