## Real World Data Preprocessing and Conversion to HAR dataset Format

The real world data is generated using physics toolbox Suite app and the data is saved in the file named vikash_walking_data.csv.
Then this time series is transformed using the similar procedure on which the HAR data is generated. For transformation the data segmented into fixed-length windows of 2.56 seconds with 50% overlap and sampling rate of 50Hz. After transformation the data is splitted into two parts one for training and other for testing. The data for training was added to the main dataframe with subject_id 31. 

In [1]:
import pandas as pd

In [2]:
# Load data from the CSV file
data = pd.read_csv('vikash_walking_data.csv')

In [3]:
data = data.iloc[:,:-1]
data.head()

Unnamed: 0,time,ax,ay,az,wx,wy,wz
0,0.003,0.28,0.5,-1.46,0.11,0.34,0.09
1,0.004,0.18,0.67,-1.78,0.11,0.34,0.09
2,0.004,0.18,0.67,-1.78,0.05,0.35,0.09
3,0.005,0.13,0.76,-1.95,0.05,0.35,0.09
4,0.005,0.13,0.76,-1.95,-0.01,0.35,0.09


In [4]:
# Rename columns to match the HAR dataset feature names
data = data.rename(columns={'time': 'timestamp', 
                            'ax': 'body_acc_x', 
                            'ay': 'body_acc_y', 
                            'az': 'body_acc_z', 
                            'wx': 'body_gyro_x', 
                            'wy': 'body_gyro_y', 
                            'wz': 'body_gyro_z'})

In [5]:
data.head()

Unnamed: 0,timestamp,body_acc_x,body_acc_y,body_acc_z,body_gyro_x,body_gyro_y,body_gyro_z
0,0.003,0.28,0.5,-1.46,0.11,0.34,0.09
1,0.004,0.18,0.67,-1.78,0.11,0.34,0.09
2,0.004,0.18,0.67,-1.78,0.05,0.35,0.09
3,0.005,0.13,0.76,-1.95,0.05,0.35,0.09
4,0.005,0.13,0.76,-1.95,-0.01,0.35,0.09


In [6]:
# Convert timestamp column to datetime format
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
data.head()

Unnamed: 0,timestamp,body_acc_x,body_acc_y,body_acc_z,body_gyro_x,body_gyro_y,body_gyro_z
0,1970-01-01 00:00:00.003,0.28,0.5,-1.46,0.11,0.34,0.09
1,1970-01-01 00:00:00.004,0.18,0.67,-1.78,0.11,0.34,0.09
2,1970-01-01 00:00:00.004,0.18,0.67,-1.78,0.05,0.35,0.09
3,1970-01-01 00:00:00.005,0.13,0.76,-1.95,0.05,0.35,0.09
4,1970-01-01 00:00:00.005,0.13,0.76,-1.95,-0.01,0.35,0.09


In [7]:
# Resample data to 50 Hz
data = data.set_index('timestamp')
data = data.resample('20ms').mean()
data = data.interpolate(method='linear', limit_direction='both')
data = data.reset_index()
data.head()

Unnamed: 0,timestamp,body_acc_x,body_acc_y,body_acc_z,body_gyro_x,body_gyro_y,body_gyro_z
0,1970-01-01 00:00:00.000,0.073889,0.587222,-1.480556,-0.167222,0.376667,0.082222
1,1970-01-01 00:00:00.020,-0.196667,0.335833,-0.830833,-0.530833,0.381667,0.09
2,1970-01-01 00:00:00.040,-0.18375,0.24625,-0.52,-0.525,0.24625,0.09375
3,1970-01-01 00:00:00.060,0.118125,-0.0125,0.366875,-0.286875,0.205,0.07375
4,1970-01-01 00:00:00.080,-0.0725,0.045,0.035,0.001667,0.160833,0.073333


In [8]:
# Segment the data into fixed-length windows of 2.56 seconds with 50% overlap
window_length = 2.56  # in seconds
window_overlap = 0.5  # as a fraction of the window length
sampling_rate = 50    # in Hz
window_size = int(window_length * sampling_rate)
step_size = int(window_size * window_overlap * sampling_rate)
num_windows = int((len(data) - window_size) / step_size) + 1

dict = {'window_length':window_length,'window_overlap':window_overlap, 'sampling_rate':sampling_rate, 'window_size':window_size, 'step_size':step_size,'num_windows':num_windows}
dict

{'window_length': 2.56,
 'window_overlap': 0.5,
 'sampling_rate': 50,
 'window_size': 128,
 'step_size': 3200,
 'num_windows': 1}

In [9]:
# Add a new 'activity' column with the label 'walking'
data['activity'] = 'WALKING'
data.head()

Unnamed: 0,timestamp,body_acc_x,body_acc_y,body_acc_z,body_gyro_x,body_gyro_y,body_gyro_z,activity
0,1970-01-01 00:00:00.000,0.073889,0.587222,-1.480556,-0.167222,0.376667,0.082222,WALKING
1,1970-01-01 00:00:00.020,-0.196667,0.335833,-0.830833,-0.530833,0.381667,0.09,WALKING
2,1970-01-01 00:00:00.040,-0.18375,0.24625,-0.52,-0.525,0.24625,0.09375,WALKING
3,1970-01-01 00:00:00.060,0.118125,-0.0125,0.366875,-0.286875,0.205,0.07375,WALKING
4,1970-01-01 00:00:00.080,-0.0725,0.045,0.035,0.001667,0.160833,0.073333,WALKING


In [10]:
# Initialize empty lists for the segmented data and labels
X = []
y = []

In [11]:
# Loop through the data and segment it into fixed-length windows
for i in range(num_windows):
    start = i * step_size
    end = start + window_size
    X.append(data[['body_acc_x', 'body_acc_y', 'body_acc_z', 
                   'body_gyro_x', 'body_gyro_y', 'body_gyro_z']].iloc[start:end].values.flatten())
    y.append(data['activity'].iloc[start])

In [12]:
# Convert the segmented data and labels to a pandas DataFrame
columns = [f'feat_{i}' for i in range(len(X[0]))]
segmented_data = pd.DataFrame(X, columns=columns)
segmented_data['activity'] = y

In [13]:
# Save the segmented data and labels to a new CSV file
segmented_data.to_csv('transformed_vikash_walking_data.csv', index=False)

In [14]:
import pandas as pd

# Load the time series data from a CSV file
mobile_data = pd.read_csv('vikash_walking_data.csv')
mobile_data = mobile_data.iloc[:,:-1]
mobile_data.head()

Unnamed: 0,time,ax,ay,az,wx,wy,wz
0,0.003,0.28,0.5,-1.46,0.11,0.34,0.09
1,0.004,0.18,0.67,-1.78,0.11,0.34,0.09
2,0.004,0.18,0.67,-1.78,0.05,0.35,0.09
3,0.005,0.13,0.76,-1.95,0.05,0.35,0.09
4,0.005,0.13,0.76,-1.95,-0.01,0.35,0.09


In [15]:
# Rename the columns to match the format of the HAR dataset
har_cols = ['mean_ax', 'mean_ay', 'mean_az', 'mean_wx', 'mean_wy', 'mean_wz']
mobile_data.columns = ['time', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
mobile_data.head()

Unnamed: 0,time,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z
0,0.003,0.28,0.5,-1.46,0.11,0.34,0.09
1,0.004,0.18,0.67,-1.78,0.11,0.34,0.09
2,0.004,0.18,0.67,-1.78,0.05,0.35,0.09
3,0.005,0.13,0.76,-1.95,0.05,0.35,0.09
4,0.005,0.13,0.76,-1.95,-0.01,0.35,0.09


In [16]:
# Convert the time values to integer
mobile_data['time'] = mobile_data['time'].astype(int)
mobile_data.head()

Unnamed: 0,time,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z
0,0,0.28,0.5,-1.46,0.11,0.34,0.09
1,0,0.18,0.67,-1.78,0.11,0.34,0.09
2,0,0.18,0.67,-1.78,0.05,0.35,0.09
3,0,0.13,0.76,-1.95,0.05,0.35,0.09
4,0,0.13,0.76,-1.95,-0.01,0.35,0.09


In [17]:
# Partition the data into fixed-width sliding windows of 2.56 seconds each with 50% overlap, resulting in 128 readings per window
window_size = 2.56  # seconds
window_overlap = 0.5  # percentage
sampling_rate = 50  # Hz
window_width = int(window_size * sampling_rate)
window_step = int(window_width * (1 - window_overlap))

In [18]:
# Group the data into windows and compute the features
windows = []
for i in range(0, len(mobile_data) - window_width + 1, window_step):
    window = mobile_data.iloc[i:i+window_width]
    window_features = []
    window_features.extend(window.mean().values[1:])  # Mean of gyro and acc values
    windows.append(window_features)

In [30]:
# Convert the list of windows to a dataframe and add the subject and activity columns
har_data = pd.DataFrame(windows, columns=har_cols)
har_data['subject'] = 31  # Replace with the subject identifier
har_data['activity_label'] = 'WALKING'  # Replace with the activity label

In [50]:
har_data

Unnamed: 0,tBodyAccmeanX,tBodyAccmeanY,tBodyAccmeanZ,tBodyGyromeanX,tBodyGyromeanY,tBodyGyromeanZ,subject_id,activity_label
0,0.055469,0.330469,-0.471406,-0.230859,0.243047,0.098203,31,WALKING
1,0.073203,0.329375,-0.386797,-0.150391,0.231719,0.113125,31,WALKING
2,-0.021953,0.251484,-0.388594,-0.170859,0.180000,0.095547,31,WALKING
3,-0.021875,0.079531,-0.055703,-0.195156,0.046016,-0.039609,31,WALKING
4,0.321484,0.196172,0.704297,0.014531,0.001953,-0.234453,31,WALKING
...,...,...,...,...,...,...,...,...
482,-0.073281,-0.047109,-0.102109,-0.421484,0.263828,0.034219,31,WALKING
483,-0.023906,-0.002969,-0.097578,-0.169766,0.077422,0.012266,31,WALKING
484,0.039375,0.076094,0.002734,-0.002266,-0.037656,-0.001953,31,WALKING
485,0.037344,0.097188,-0.014141,0.100703,-0.072344,0.027266,31,WALKING


#### Renaming the columns to match the name with our dataset

In [51]:
# create a list of new column names
new_cols = ['tBodyAccmeanX', 'tBodyAccmeanY', 'tBodyAccmeanZ', 'tBodyGyromeanX',
       'tBodyGyromeanY', 'tBodyGyromeanZ', 'subject_id',
       'activity_label']

# rename the columns using the list of new names
har_data = har_data.set_axis(new_cols,axis = 1)

# print the updated dataframe
har_data

Unnamed: 0,tBodyAccmeanX,tBodyAccmeanY,tBodyAccmeanZ,tBodyGyromeanX,tBodyGyromeanY,tBodyGyromeanZ,subject_id,activity_label
0,0.055469,0.330469,-0.471406,-0.230859,0.243047,0.098203,31,WALKING
1,0.073203,0.329375,-0.386797,-0.150391,0.231719,0.113125,31,WALKING
2,-0.021953,0.251484,-0.388594,-0.170859,0.180000,0.095547,31,WALKING
3,-0.021875,0.079531,-0.055703,-0.195156,0.046016,-0.039609,31,WALKING
4,0.321484,0.196172,0.704297,0.014531,0.001953,-0.234453,31,WALKING
...,...,...,...,...,...,...,...,...
482,-0.073281,-0.047109,-0.102109,-0.421484,0.263828,0.034219,31,WALKING
483,-0.023906,-0.002969,-0.097578,-0.169766,0.077422,0.012266,31,WALKING
484,0.039375,0.076094,0.002734,-0.002266,-0.037656,-0.001953,31,WALKING
485,0.037344,0.097188,-0.014141,0.100703,-0.072344,0.027266,31,WALKING


### Appending Real World Generated data to main dataframe

In [52]:
# loading data_for_testing.csv
test_data = pd.read_csv(r'D:\Dataset\2020-03-13\Untitled Folder\Data_for_testing.csv', index_col = None)
test_data = test_data.iloc[:,1:]
test_data

Unnamed: 0,tBodyAccmeanX,tBodyAccmeanY,tBodyAccmeanZ,tBodyGyromeanX,tBodyGyromeanY,tBodyGyromeanZ,subject_id,activity_label
0,0.282022,-0.037696,-0.134897,-0.479730,0.082034,0.256443,1,WALKING
1,0.255841,-0.064550,-0.095186,0.094091,-0.309153,0.086441,1,WALKING
2,0.254867,0.003815,-0.123658,0.211201,-0.272905,0.101986,1,WALKING
3,0.343370,-0.014446,-0.167377,0.096082,-0.163394,0.025859,1,WALKING
4,0.276240,-0.029638,-0.142616,0.008742,0.011661,0.004175,1,WALKING
...,...,...,...,...,...,...,...,...
4667,0.310155,-0.053391,-0.099109,-0.142473,0.025443,0.202862,24,WALKING_UPSTAIRS
4668,0.363385,-0.039214,-0.105915,0.062107,-0.043156,0.113594,24,WALKING_UPSTAIRS
4669,0.349966,0.030077,-0.115788,-0.123715,0.086320,0.261423,24,WALKING_UPSTAIRS
4670,0.237594,0.018467,-0.096499,-0.335912,0.099347,0.355058,24,WALKING_UPSTAIRS


In [53]:
# appending first 200 values to main dataset for traing the model.
test_data = test_data.append(har_data.iloc[:200], ignore_index=True)
test_data

Unnamed: 0,tBodyAccmeanX,tBodyAccmeanY,tBodyAccmeanZ,tBodyGyromeanX,tBodyGyromeanY,tBodyGyromeanZ,subject_id,activity_label
0,0.282022,-0.037696,-0.134897,-0.479730,0.082034,0.256443,1,WALKING
1,0.255841,-0.064550,-0.095186,0.094091,-0.309153,0.086441,1,WALKING
2,0.254867,0.003815,-0.123658,0.211201,-0.272905,0.101986,1,WALKING
3,0.343370,-0.014446,-0.167377,0.096082,-0.163394,0.025859,1,WALKING
4,0.276240,-0.029638,-0.142616,0.008742,0.011661,0.004175,1,WALKING
...,...,...,...,...,...,...,...,...
4867,-0.295625,0.858984,1.289922,-0.332109,0.139609,0.031719,31,WALKING
4868,2.368359,-0.118359,0.873203,-0.152578,0.191250,0.051719,31,WALKING
4869,1.100938,-1.361094,-0.526719,-0.018672,0.041016,0.288984,31,WALKING
4870,-1.327656,-1.121484,-1.628594,-0.390547,-0.222266,0.269531,31,WALKING


In [55]:
# Save the data as csv for training the model.
test_data.to_csv('Data_for_testing.csv', index=False)

In [80]:
# appending remaining values to new test_input_data for testing the model.
test_input_data = har_data.iloc[200:,:-2]
test_input_data.head()

Unnamed: 0,tBodyAccmeanX,tBodyAccmeanY,tBodyAccmeanZ,tBodyGyromeanX,tBodyGyromeanY,tBodyGyromeanZ
200,-0.2525,1.672578,-0.682266,0.155859,0.0025,0.334062
201,1.300234,1.802188,-0.884453,-0.108203,-0.216875,0.204688
202,1.951172,-1.585781,-0.232734,-0.781016,-0.32,-0.392344
203,0.955469,-2.228672,-0.466875,0.921953,-0.017969,-0.910156
204,-1.889609,-0.259531,-0.767656,-0.058516,0.563203,-0.518594


### Applying PCA on test_inputs

In [81]:
from sklearn.decomposition import PCA

# Apply PCA to the training and test data separately
pca = PCA(n_components=6) # Choosing 6 number of components to retain
df_pca = pca.fit_transform(test_input_data)

# Convert the PCA transformed data into a dataframe
X = pd.DataFrame(df_pca, columns=[f"PC{i+1}" for i in range(pca.n_components_)])

In [82]:
X.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
0,-1.290586,1.298209,0.149169,0.368897,0.029282,0.054873
1,-0.31832,1.969425,1.243563,0.410254,0.03471,-0.034482
2,2.14328,-0.545909,1.277312,-0.430811,-0.154395,-0.160471
3,2.122603,-1.234726,-0.209543,0.79042,0.529122,-0.433921
4,-1.44293,-1.184292,-0.708963,0.070367,0.640229,-0.295906


In [83]:
# Save these features for testing the model
X.to_csv('test_input_data.csv', index=False)