# import needed packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
import os
import csv
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import scipy.stats as stats
from sklearn.model_selection import train_test_split

# Data pre-processing

In [2]:
path_phone_accel = "/Users/stevenyuan/Documents/McGill/Research/HAR/wisdm-dataset/raw/phone/accel/"
files_phone_accel = os.listdir(path_phone_accel)
path_phone_gyro = "/Users/stevenyuan/Documents/McGill/Research/HAR/wisdm-dataset/raw/phone/gyro/"
files_phone_gyro = os.listdir(path_phone_gyro)
path_watch_accel = "/Users/stevenyuan/Documents/McGill/Research/HAR/wisdm-dataset/raw/watch/accel/"
files_watch_accel = os.listdir(path_watch_accel)
path_watch_gyro = "/Users/stevenyuan/Documents/McGill/Research/HAR/wisdm-dataset/raw/watch/gyro/"
files_watch_gyro = os.listdir(path_watch_gyro)

In [89]:
files_phone_accel.sort()
files_phone_gyro.sort()
files_watch_accel.sort()
files_watch_gyro.sort()
files_phone_accel = files_phone_accel[1:]
files_phone_gyro = files_phone_gyro[1:]
files_watch_accel = files_watch_accel[1:]
files_watch_gyro = files_watch_gyro[1:]

In [90]:
# phone accelerator
df_phone_accel = pd.DataFrame()
for file in files_phone_accel:
    temp_df = pd.read_csv(path_phone_accel + file, sep=",", header=None)
    temp_df.columns = ["id", "actCode", "Timestamp", "a_x", "a_y", "a_z"]
    temp_df['a_z'] = temp_df['a_z'].str.replace(';', '')
    df_phone_accel = pd.concat([df_phone_accel, temp_df], sort=False)
print(df_phone_accel)

          id actCode        Timestamp       a_x        a_y         a_z
0       1600       A  252207666810782 -0.364761   8.793503   1.0550842
1       1600       A  252207717164786 -0.879730   9.768784   1.0169983
2       1600       A  252207767518790  2.001495  11.109070    2.619156
3       1600       A  252207817872794  0.450623  12.651642  0.18455505
4       1600       A  252207868226798 -2.164352  13.928436  -4.4224854
...      ...     ...              ...       ...        ...         ...
160774  1650       S  357734078421000 -8.234077  -4.275110  -1.4394068
160775  1650       S  357734098619000 -8.080796  -4.418811  -1.4154567
160776  1650       S  357734118715000 -7.994575  -4.457131  -1.4418018
160777  1650       S  357734138868000 -8.040081  -4.366121  -1.4968873
160778  1650       S  357734158964000 -8.164622  -4.179309  -1.5304176

[4804403 rows x 6 columns]


In [91]:
# phone gyroscope
df_phone_gyro = pd.DataFrame()
for file in files_phone_gyro:
    temp_df = pd.read_csv(path_phone_gyro + file, sep=",")
    temp_df.columns = ["id", "actCode", "Timestamp", "g_x", "g_y", "g_z"]
    temp_df['g_z'] = temp_df['g_z'].str.replace(';', '')
    df_phone_gyro = pd.concat([df_phone_gyro, temp_df], sort=False)
print(df_phone_gyro)

         id actCode        Timestamp       g_x       g_y          g_z
0      1600       A  252207968934806 -0.875137  0.015472   0.16223145
1      1600       A  252208019288809 -0.720169  0.388489  -0.28401184
2      1600       A  252208069642813 -0.571640  1.227402   -0.2416687
3      1600       A  252208119996817 -0.380493  1.202835  -0.21313477
4      1600       A  252208170350821 -0.225784  0.558136    0.1244812
...     ...     ...              ...       ...       ...          ...
80342  1650       S  357733977635000  0.438889  0.171508   0.16937704
80343  1650       S  357734017925000  0.444215  0.158724   0.19707392
80344  1650       S  357734058325000  0.519849  0.037284   0.19600865
80345  1650       S  357734098619000  0.545415 -0.086286   0.17896444
80346  1650       S  357734138868000  0.471912 -0.201335   0.23222765

[3608584 rows x 6 columns]


In [92]:
# smartwatch accelerator
df_watch_accel = pd.DataFrame()
for file in files_watch_accel:
    temp_df = pd.read_csv(path_watch_accel + file, sep=",")
    temp_df.columns = ["id", "actCode", "Timestamp", "a_x", "a_y", "a_z"]
    temp_df['a_z'] = temp_df['a_z'].str.replace(';', '')
    df_watch_accel = pd.concat([df_watch_accel, temp_df], sort=False)
print(df_watch_accel)

         id actCode         Timestamp       a_x        a_y        a_z
0      1600       A    90426757696641  4.972757  -0.158317  6.6967316
1      1600       A    90426807196641  3.253720  -0.191835   6.107758
2      1600       A    90426856696641  2.801216  -0.155922   5.997625
3      1600       A    90426906196641  3.770868  -1.051354   7.731027
4      1600       A    90426955696641  4.661511   0.169689   9.684695
...     ...     ...               ...       ...        ...        ...
66586  1650       S  2426184421424592  1.486500 -13.302334   5.355231
66587  1650       S  2426184471411932  4.017171 -13.146711  5.1182046
66588  1650       S  2426184521399272  4.412214 -15.318253  3.3704374
66589  1650       S  2426184571385262  4.426579 -13.012636  2.5755625
66590  1650       S  2426184621371252  3.765780 -11.798776   2.166154

[3776995 rows x 6 columns]


In [93]:
# smartwatch gyroscope
df_watch_gyro = pd.DataFrame()
for file in files_watch_gyro:
    temp_df = pd.read_csv(path_watch_gyro + file, sep=",")
    temp_df.columns = ["id", "actCode", "Timestamp", "g_x", "g_y", "g_z"]
    temp_df['g_z'] = temp_df['g_z'].str.replace(';', '')
    df_watch_gyro = pd.concat([df_watch_gyro, temp_df], sort=False)
print(df_watch_gyro)

         id actCode         Timestamp       g_x       g_y            g_z
0      1600       A    90426807196641  0.387382 -0.618541   -0.048971802
1      1600       A    90426856696641  0.070999 -0.209480     -0.1959783
2      1600       A    90426906196641  0.037975  0.254976     -0.1565635
3      1600       A    90426955696641  0.073129  0.719431  -0.0010349044
4      1600       A    90427005196641 -0.101574  1.082686    -0.13419296
...     ...     ...               ...       ...       ...            ...
66550  1650       S  2426184371437252 -2.856065  2.057101     0.85393304
66551  1650       S  2426184421424592 -2.911458  1.817417      1.2811041
66552  1650       S  2426184471411932 -3.015854  1.550036      2.0576818
66553  1650       S  2426184521399272 -3.218254  1.595842      3.0185504
66554  1650       S  2426184571385262 -2.455525  0.866136      3.3541086

[3440291 rows x 6 columns]


In [94]:
# do the one-hot encoding for the labels
onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()
df_phone_accel['label'] = label_encoder.fit_transform(df_phone_accel['actCode'])
df_phone_gyro['label'] = label_encoder.fit_transform(df_phone_gyro['actCode'])
df_watch_accel['label'] = label_encoder.fit_transform(df_watch_accel['actCode'])
df_watch_gyro['label'] = label_encoder.fit_transform(df_watch_gyro['actCode'])

In [95]:
# standarize the value of each axis
scaler = StandardScaler()
df_phone_accel[['a_x', 'a_y', 'a_z']] = scaler.fit_transform(df_phone_accel[['a_x', 'a_y', 'a_z']])
df_phone_gyro[['g_x', 'g_y', 'g_z']] = scaler.fit_transform(df_phone_gyro[['g_x', 'g_y', 'g_z']])
df_watch_accel[['a_x', 'a_y', 'a_z']] = scaler.fit_transform(df_watch_accel[['a_x', 'a_y', 'a_z']])
df_watch_gyro[['g_x', 'g_y', 'g_z']] = scaler.fit_transform(df_watch_gyro[['g_x', 'g_y', 'g_z']])

In [96]:
print(df_phone_accel)

          id actCode        Timestamp       a_x       a_y       a_z  label
0       1600       A  252207666810782 -0.086555  1.522463  0.137514      0
1       1600       A  252207717164786 -0.174591  1.664156  0.130082      0
2       1600       A  252207767518790  0.317969  1.858878  0.442712      0
3       1600       A  252207817872794  0.052839  2.082989 -0.032353      0
4       1600       A  252207868226798 -0.394204  2.268487 -0.931326      0
...      ...     ...              ...       ...       ...       ...    ...
160774  1650       S  357734078421000 -1.431855 -0.376197 -0.349237     17
160775  1650       S  357734098619000 -1.405650 -0.397075 -0.344563     17
160776  1650       S  357734118715000 -1.390911 -0.402642 -0.349704     17
160777  1650       S  357734138868000 -1.398690 -0.389420 -0.360453     17
160778  1650       S  357734158964000 -1.419981 -0.362279 -0.366996     17

[4804403 rows x 7 columns]


In [97]:
print(df_phone_gyro)

         id actCode        Timestamp       g_x       g_y       g_z  label
0      1600       A  252207968934806 -1.035478  0.021521  0.250015      0
1      1600       A  252208019288809 -0.851231  0.412987 -0.431194      0
2      1600       A  252208069642813 -0.674640  1.293393 -0.366555      0
3      1600       A  252208119996817 -0.447379  1.267611 -0.322997      0
4      1600       A  252208170350821 -0.263441  0.591025  0.192388      0
...     ...     ...              ...       ...       ...       ...    ...
80342  1650       S  357733977635000  0.526811  0.185274  0.260923     17
80343  1650       S  357734017925000  0.533144  0.171858  0.303203     17
80344  1650       S  357734058325000  0.623067  0.044412  0.301577     17
80345  1650       S  357734098619000  0.653464 -0.085271  0.275559     17
80346  1650       S  357734138868000  0.566073 -0.206010  0.356867     17

[3608584 rows x 7 columns]


In [98]:
print(df_watch_accel)

         id actCode         Timestamp       a_x       a_y       a_z  label
0      1600       A    90426757696641  0.658894  0.810991  0.927623      0
1      1600       A    90426807196641  0.412107  0.804699  0.814942      0
2      1600       A    90426856696641  0.347145  0.811440  0.793871      0
3      1600       A    90426906196641  0.486349  0.643363  1.125502      0
4      1600       A    90426955696641  0.614211  0.872559  1.499273      0
...     ...     ...               ...       ...       ...       ...    ...
66586  1650       S  2426184421424592  0.158403 -1.656213  0.670970     17
66587  1650       S  2426184471411932  0.521709 -1.627002  0.625623     17
66588  1650       S  2426184521399272  0.578422 -2.034612  0.291244     17
66589  1650       S  2426184571385262  0.580484 -1.601835  0.139170     17
66590  1650       S  2426184621371252  0.485619 -1.373987  0.060843     17

[3776995 rows x 7 columns]


In [99]:
print(df_watch_gyro)

         id actCode         Timestamp       g_x       g_y       g_z  label
0      1600       A    90426807196641  0.264375 -0.396801 -0.044583      0
1      1600       A    90426856696641  0.062580 -0.121778 -0.148378      0
2      1600       A    90426906196641  0.041517  0.190488 -0.120549      0
3      1600       A    90426955696641  0.063939  0.502755 -0.010737      0
4      1600       A    90427005196641 -0.047490  0.746981 -0.104754      0
...     ...     ...               ...       ...       ...       ...    ...
66550  1650       S  2426184371437252 -1.804354  1.402108  0.592922     17
66551  1650       S  2426184421424592 -1.839685  1.240961  0.894531     17
66552  1650       S  2426184471411932 -1.906271  1.061193  1.442842     17
66553  1650       S  2426184521399272 -2.035365  1.091990  2.121273     17
66554  1650       S  2426184571385262 -1.548883  0.601388  2.358198     17

[3440291 rows x 7 columns]


In [100]:
df_phone_accel_x, df_phone_accel_y = df_phone_accel[['a_x', 'a_y', 'a_z']], df_phone_accel['label']

In [101]:
df_phone_gyro_x, df_phone_gyro_y = df_phone_gyro[['g_x', 'g_y', 'g_z']], df_phone_gyro['label']

In [102]:
df_watch_accel_x, df_watch_accel_y = df_watch_accel[['a_x', 'a_y', 'a_z']], df_watch_accel['label']

In [103]:
df_watch_gyro_x, df_watch_gyro_y = df_watch_gyro[['g_x', 'g_y', 'g_z']], df_watch_gyro['label']

In [104]:
df_phone_accel_x_train, df_phone_accel_x_test, df_phone_accel_y_train, df_phone_accel_y_test = train_test_split(df_phone_accel_x, df_phone_accel_y, test_size = 0.2)

In [105]:
print(df_phone_accel_x_train)
print(df_phone_accel_x_test)
print(df_phone_accel_y_train)
print(df_phone_accel_y_test)

             a_x       a_y       a_z
108525  0.102730 -1.178937 -0.093601
42493  -1.652692 -0.035523  0.245158
79037  -0.973281 -0.304170 -1.367573
26770   0.599791 -1.180676 -0.452519
142830 -0.181422  1.704592  0.027907
...          ...       ...       ...
140325  0.211231 -1.090903  0.128853
65334  -1.447823 -0.032067 -0.871724
50653   1.581575  0.419638  0.207624
44347   0.569044 -0.077687 -1.890107
38387   1.107827  0.260387  1.327573

[3843522 rows x 3 columns]
            a_x       a_y       a_z
77014 -0.946667 -0.364715 -1.336728
25921  0.013881 -1.409285 -0.368398
53975 -1.462501 -0.482827  0.115811
21632  1.374862 -1.096819 -0.115099
22246 -1.079650 -0.814167  0.274221
...         ...       ...       ...
3903  -0.403694  2.075042 -1.264122
5596  -0.204773 -1.527996 -0.281771
16374  1.577423 -0.159064  0.180613
94882  0.005692  0.322502  1.872034
29103  0.047155 -1.110486  0.344420

[960881 rows x 3 columns]
108525    13
42493     11
79037      8
26770      2
142830    17
    

In [106]:
df_phone_gyro_x_train, df_phone_gyro_x_test, df_phone_gyro_y_train, df_phone_gyro_y_test = train_test_split(df_phone_gyro_x, df_phone_gyro_y, test_size = 0.2)

In [107]:
print(df_phone_gyro_x_train)
print(df_phone_gyro_x_test)
print(df_phone_gyro_y_train)
print(df_phone_gyro_y_test)

             g_x       g_y       g_z
14712  -0.061288  0.015964 -0.063907
30313  -0.006373  0.017918  0.003387
31866   0.003641  0.005411  0.001756
118719  0.015995  0.011368 -0.006117
48412  -0.003864 -0.002543 -0.000890
...          ...       ...       ...
33153  -0.000065  0.045529  0.003988
31989   0.042192 -0.036752  0.025143
400     2.139515  0.043940  2.763580
30804   0.013619  0.007621  0.000359
3649    2.072735 -0.789179 -1.128544

[2886867 rows x 3 columns]
            g_x       g_y       g_z
78862 -0.241972  0.392095  0.288568
64357 -0.623196 -0.485498 -0.225302
35706 -0.114824  0.069994 -0.294812
14439  0.015143  0.010808  0.010538
58797  0.002679  0.010183 -0.001854
...         ...       ...       ...
35021  0.004512  0.002977  0.009420
24361 -0.099912 -0.030203 -0.184076
55421  0.261484  0.061539  0.010124
24909  0.036665  0.009755  0.047895
12419  0.649664 -0.616299 -0.818854

[721717 rows x 3 columns]
14712      6
30313      8
31866      8
118719    15
48412     10
    

In [108]:
df_watch_accel_x_train, df_watch_accel_x_test, df_watch_accel_y_train, df_watch_accel_y_test = train_test_split(df_watch_accel_x, df_watch_accel_y, test_size = 0.2)

In [109]:
print(df_watch_accel_x_train)
print(df_watch_accel_x_test)
print(df_watch_accel_y_train)
print(df_watch_accel_y_test)

             a_x       a_y       a_z
2157    0.646456  0.568228  0.015153
60659  -0.069049 -0.926605 -0.160396
143784 -1.152853  0.931519 -1.395090
48838  -0.148877 -1.215375  0.647380
17756   1.170684  0.144524  0.197544
...          ...       ...       ...
16269   1.317943  0.671001 -0.016968
132523  0.923888 -0.329942 -0.176253
62132  -0.242410  1.353984  0.091132
14285   0.324396  1.083218  1.465520
7536    1.651990 -0.526494  0.055232

[3021596 rows x 3 columns]
             a_x       a_y       a_z
2214    1.083767  0.563369 -0.787557
16943   0.683641  1.190148 -1.968131
61574  -0.988355  0.880789 -0.564886
40646   1.325355  0.493149  0.032873
17543   1.243830  0.308500 -0.270186
...          ...       ...       ...
16476   1.218782  0.224995 -0.814925
135491 -0.782547 -0.232838  0.620115
18083  -0.623868 -0.492620  0.630232
2314    0.869182  1.613094 -0.372360
100600 -0.401764 -0.489362 -1.541827

[755399 rows x 3 columns]
2157       0
60659     15
143784    16
48838     12
17756

In [110]:
df_watch_gyro_x_train, df_watch_gyro_x_test, df_watch_gyro_y_train, df_watch_gyro_y_test = train_test_split(df_watch_gyro_x, df_watch_gyro_y, test_size = 0.2)

In [111]:
print(df_watch_gyro_x_train)
print(df_watch_gyro_x_test)
print(df_watch_gyro_y_train)
print(df_watch_gyro_y_test)

            g_x       g_y       g_z
30299 -0.005126  0.051291 -0.019032
2864   0.629990 -0.063599  0.802087
27822 -0.112357  0.138933  0.079177
81558  0.013171  0.014827 -0.021983
47569 -0.614200 -0.397435 -0.878797
...         ...       ...       ...
46125 -1.825736  0.616210  0.642107
16739  0.009822  0.019061 -0.000980
14193  0.048214  0.023137 -0.013336
23567 -0.038628 -0.023581  0.008020
58792 -0.128165 -0.809979  0.062494

[2752232 rows x 3 columns]
            g_x       g_y       g_z
31114 -0.336173  0.389263 -0.245924
13643  0.021057  0.019991 -0.005472
54653  1.219914  0.106438  0.233688
23992 -0.210787  0.048415 -0.233545
54071  0.354301  0.163019 -0.245426
...         ...       ...       ...
24127  0.395072 -0.080434  0.243601
46404 -0.079263  0.087490 -0.004774
60823  0.581577 -0.069575  0.128833
8121  -3.784199 -2.534932 -3.038126
1518   0.653901 -0.125894  1.466398

[688059 rows x 3 columns]
30299     6
2864      0
27822     7
81558     4
47569    13
         ..
46125    