In [None]:
import sys
!{sys.executable} -m pip install numpy pandas scikit-learn tqdm

In [6]:
# Define motion columns found in the SHL documentation
MOTION_COLUMNS = [
    'Time',
    'Acceleration X',
    'Acceleration Y',
    'Acceleration Z',
    'Gyroscope X',
    'Gyroscope Y',
    'Gyroscope Z',
    'Magnetometer X',
    'Magnetometer Y',
    'Magnetometer Z',
    'Orientation w',
    'Orientation x',
    'Orientation y',
    'Orientation z',
    'Gravity X',
    'Gravity Y',
    'Gravity Z',
    'Linear acceleration X',
    'Linear acceleration Y',
    'Linear acceleration Z',
    'Pressure',
    'Altitude',
    'Temperature',
]

In [7]:
# Define label columns found in the SHL documentation
LABEL_COLUMNS = [
    'Time',
    'Coarse Label',
    'Fine Label',
    'Road Label',
    'Traffic Label',
    'Tunnels Label',
    'Social Label',
    'Food Label',
]

In [11]:
from collections import OrderedDict

# Define labels found in the SHL documentation
# Note: There are coarse labels and fine labels
# but we'll only use coarse labels
COARSE_LABELS = OrderedDict({
    0: 'Null',
    1: 'Still',
    2: 'Walking',
    3: 'Run',
    4: 'Bike',
    5: 'Car',
    6: 'Bus',
    7: 'Train',
    8: 'Subway',
})
COARSE_LABEL_IDS = COARSE_LABELS.keys()
COARSE_LABEL_VALUES = COARSE_LABELS.values()

In [12]:
import pandas as pd

from pathlib import Path

# Define constants and helpers to load and save files

release_dir = Path('shl-dataset/release')
records_dir = release_dir / 'User1'
record_dirs = [r for r in records_dir.iterdir() if r.is_dir()]

assert record_dirs

def load_motion_data(record_dir: Path) -> pd.DataFrame:
    motion_data = pd.read_csv(
        record_dir / 'Hips_Motion.txt',
        sep=' ',
        header=None,
        names=MOTION_COLUMNS,
        low_memory=False
    )
    return motion_data


def load_labels_data(record_dir: Path) -> pd.DataFrame:
    labels_data = pd.read_csv(
        record_dir / 'Label.txt',
        sep=' ',
        header=None,
        names=LABEL_COLUMNS,
        low_memory=False
    )
    labels_data.replace({
        'Coarse Label': COARSE_LABELS,
    }, inplace=True)
    return labels_data

In [3]:
WINDOW_LENGTH = 200 # 2 seconds

In [21]:
record_dir = record_dirs[0]
try:
    motion_data = load_motion_data(record_dir)
    labels_data = load_labels_data(record_dir)
except FileNotFoundError:
    print(f'Missing file for dir {record_dir}')

In [22]:
# In the documentation, it is said that every line
# in Hips_Motion.txt corresponds to the exact same line
# in Label.txt, therefore we can just merge them together
try:
    data = pd.merge(motion_data, labels_data)
except ValueError:
    print(f'Data under dir {record_dir} has erroneous format')

In [23]:
# Drop rows that contain any NaN
data.dropna(how='any', inplace=True)

In [25]:
# Drop columns that are all NaN
data.dropna(axis='columns', how='all', inplace=True)

In [74]:
from tqdm.notebook import tqdm

def create_window_generator(data: pd.DataFrame):
    height, _ = data.shape
    i_min = 0
    i_max = (height - 1) - WINDOW_LENGTH
    for i in tqdm(range(i_min, i_max, 10)):
        j = i + WINDOW_LENGTH
        window = data.iloc[i:j]
        linacc_cols = window[[
            'Linear acceleration X',
            'Linear acceleration Y',
            'Linear acceleration Z'
        ]]
        sample = linacc_cols.sum(axis=1).to_numpy()
        label = window.iloc[-1]['Coarse Label']
        yield sample, label

window_generator = create_window_generator(data)

X = []
y = []
for i, (sample, label) in enumerate(window_generator):
    X.append(sample)
    y.append(label)

print(f'Extracted {len(y)} windows')



  0%|          | 0/322317 [00:00<?, ?it/s]

In [73]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

scaler = StandardScaler()
sgd = SGDClassifier(loss="hinge", penalty="l2")
pipeline = make_pipeline(StandardScaler(), SGDClassifier())
cross_val_score(clf, X, y, cv=5)



array([0.32940447, 0.36984176, 0.40583307, 0.46633571, 0.28637915])