# Summary
### Transforming the raw data so that the features have timestamps will probably take a non-negligible amount of time since a python module that can handle the signal processing will have to be identified.  

### Unlabeled data for WISDM 2.0 is available as transformed features, but of course, no timestamps

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
from time import time

In [2]:
cat ./datasets/WISDM_v2/WISDM_at_v2.0_raw.txt | head -n 2

1679,Walking,1370520469556,0.2941316,-0.6356053,-0.22693644;
1679,Walking,1370520469606,-0.49968776,-0.6044512,-0.22602014;
cat: write error: Broken pipe


# Reading the Raw Data

### Labeled

In [3]:
rows = []
bad_lines = []
with open('./datasets/WISDM_v2/WISDM_at_v2.0_raw.txt', 'r') as fIn:
    lines = [line.strip()[:-1].split(",") for line in fIn.readlines()]
    for ind, line in enumerate(lines):
        try:
            row = {'user' : line[0],
                   'class' : line[1],
                   'timestamp' : dt.datetime.fromtimestamp(int(line[2]) / 1e3),
                   'x-acc' : line[3],
                   'y-acc' : line[4],
                   'z-acc' : line[5]}
            rows.append(row)
        except ValueError as ve:
            #print("could not parse line #%s" % ind)
            #print("line[2] : %s" % line[2])
            #print(ve)
            bad_lines.append(ind)

col_names = ['user', 'class', 'timestamp', 'x-acc', 'y-acc', 'z-acc']
raw_df = pd.DataFrame(rows, columns=col_names)

raw_df['x-acc'] = pd.to_numeric(raw_df['x-acc'])
raw_df['y-acc'] = pd.to_numeric(raw_df['y-acc'])
raw_df['z-acc'] = pd.to_numeric(raw_df['z-acc'])

### Unlabeled

In [4]:
unlabeled_rows = []
unlabeled_bad_lines = []
with open('./datasets/WISDM_v2/WISDM_at_v2.0_unlabeled_raw.txt', 'r') as fIn:
    lines = [line.strip()[:-1].split(",") for line in fIn.readlines()]
    for ind, line in enumerate(lines):
        try:
            row = {'user' : line[0],
                   'class' : line[1],
                   'timestamp' : dt.datetime.fromtimestamp(int(line[2]) / 1e3),
                   'x-acc' : line[3],
                   'y-acc' : line[4],
                   'z-acc' : line[5]}
            rows.append(row)
        except ValueError as ve:
            #print("could not parse line #%s" % ind)
            #print("line[2] : %s" % line[2])
            #print(ve)
            bad_lines.append(ind)

unlabeled_raw_df = pd.DataFrame(rows, columns=col_names)

unlabeled_raw_df['x-acc'] = pd.to_numeric(unlabeled_raw_df['x-acc'])
unlabeled_raw_df['y-acc'] = pd.to_numeric(unlabeled_raw_df['y-acc'])
unlabeled_raw_df['z-acc'] = pd.to_numeric(unlabeled_raw_df['z-acc'])

In [5]:
len(raw_df)

2980763

In [6]:
len(unlabeled_raw_df)

41190491

# Save it all as one Dataframe

In [7]:
all_raw_df = pd.concat([raw_df, unlabeled_raw_df])

In [8]:
all_raw_df.to_pickle('./datasets/WISDM_v2/all_raw_data.dataframe.pickle')

# Load it back in 

In [6]:
loaded_df = pd.read_pickle("./datasets/WISDM_v2/all_raw_data.dataframe.pickle")

In [13]:
loaded_df.sort_values(['user', 'timestamp'], ascending=True, inplace=True)

In [15]:
loaded_df.head(20)

Unnamed: 0,user,class,timestamp,x-acc,y-acc,z-acc
32214283,1058,NoLabel,2013-05-27 17:08:06.700,1.532916,-0.270577,0.112918
32214284,1058,NoLabel,2013-05-27 17:08:06.750,2.914563,-1.339037,0.462325
32214285,1058,NoLabel,2013-05-27 17:08:06.800,0.294013,-3.097789,-0.123571
32214286,1058,NoLabel,2013-05-27 17:08:06.850,0.833037,-2.727077,0.110787
32214287,1058,NoLabel,2013-05-27 17:08:06.900,-0.033023,-2.704706,0.111853
32214288,1058,NoLabel,2013-05-27 17:08:06.950,0.188552,-2.597115,-0.215183
32214289,1058,NoLabel,2013-05-27 17:08:07.000,0.322775,-0.902279,-0.798948
32214290,1058,NoLabel,2013-05-27 17:08:07.050,-0.007457,0.448476,-0.705205
32214291,1058,NoLabel,2013-05-27 17:08:07.100,0.169377,0.735032,0.210922
32214292,1058,NoLabel,2013-05-27 17:08:07.150,1.601093,-1.036502,0.153398


In [8]:
users = loaded_df['user'].unique()

# Data Segmented into 10s windows by segment_raw_data.py

In [3]:
import os

In [6]:
raw_dataframes_loc = './datasets/WISDM_v2/temporary_user_dataframes/'
raw_dataframe_files = [f for f in os.listdir(raw_dataframes_loc) if '.pickle' in f]
len(raw_dataframe_files)

325

# load first user's raw data

In [25]:
user_0_df = pd.read_pickle(raw_dataframes_loc + raw_dataframe_files[0])
user_0_df.dropna(inplace=True)
user_0_df['segment_id'] = user_0_df['segment_id'].astype(int)

In [31]:
from collections import Counter

Counter({0: 144, 1: 144, 2: 162, 3: 84})

In [39]:
user_0_df.head()

Unnamed: 0,user,class,timestamp,x-acc,y-acc,z-acc,segment_id
1688375,633,Walking,2011-10-06 12:23:59.255,-6.588843,14.097059,0.459687,0
1688376,633,Walking,2011-10-06 12:23:59.356,-6.129156,13.637373,-2.604891,0
1688377,633,Walking,2011-10-06 12:23:59.504,-1.225831,17.314867,0.919373,0
1688378,633,Walking,2011-10-06 12:23:59.604,-1.225831,2.145205,0.306458,0
1688379,633,Walking,2011-10-06 12:23:59.759,2.298433,11.798626,7.661446,0


## segment 0

In [26]:
segment_0 = user_0_df[user_0_df['segment_id'] == 0]

In [27]:
segment_0['timestamp'].max() - segment_0['timestamp'].min()

Timedelta('0 days 00:00:08.949000')

In [28]:
segment_0['timestamp'].max()

Timestamp('2011-10-06 12:24:08.204000')

## Segment 1

In [18]:
segment_1 = user_0_df[user_0_df['segment_id'] == 1.0]

In [19]:
len(segment_1)

144

In [21]:
segment_1['timestamp'].max() - segment_1['timestamp'].min()

Timedelta('0 days 00:00:09.299000')

# The user ids of those who were in the initial work

In [33]:
from wisdm import wisdm
wisdm.set_data(version="2")
previous_user_ids = wisdm.user_ids

# Define method(s) to decide whether a user's data is worth considering

In [34]:
def goodUser(user_id, user_df):
    for user_file in raw_dataframe_files:
        if user_id in user_file:
            return True
    else:
        False

# Define fillBins() method

In [36]:
cat ./datasets/WISDM_v2/WISDM_at_v2.0_transformed_about.txt

transformed_about.txt for WISDM_at_v2.0 dataset

See readme.txt for information about the WISDM Lab, rights,
and other general information.

Associated tasks: classification
Number of examples: 5435 
Number of attributes: 46
Missing attribute values: No
        A "?" may appear for the "Time between peaks attribute" when there
        are no identifiable peaks in a sample.  Please see the following
        paper, near the end of section 2.2, for further explanation

        Jennifer R. Kwapisz, Gary M. Weiss and Samuel A. Moore (2010).
                "Activity Recognition using Cell Phone Accelerometers,"
                Proceedings of the Fourth International Workshop on
                Knowledge Discovery from Sensor Data (at KDD-10), Washington
                DC.

Class distribution: {
        Walking -> 2185 -> 40.2%,
        Jogging -> 130 -> 2.4%,
        Stairs -> 251 -> 4.6%,
        Sitting -> 1410 -> 25.9%
        Standing -> 840 -> 15.5%
        Ly

In [61]:
bins = list(np.arange(-2.5, 20, 2.5))
bins = [-np.inf] + bins
bins = bins + [np.inf]
bins

[-inf, -2.5, 0.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, inf]

In [62]:
x = segment_0['x-acc']

In [63]:
counts, bins = np.histogram(x, bins=bins, density=False)
counts

array([72, 51, 12,  3,  6,  0,  0,  0,  0,  0])

In [64]:
len(counts)

10

In [48]:
np.sum([1 for t in x if t < -2.5 ])

72

In [49]:
np.sum([1 for t in x if t > 20 ])

0.0

In [None]:
def fillBins(segment_df):
    reference_bins = np.arange(-2.5, 22.5, 2.5)
    x_bins = {b : 0 for b in reference_bin}
    y_bins = {b : 0 for b in reference_bin}
    z_bins = {b : 0 for b in reference_bin}
    
    for ind, row in segment_df:
        if row['x-acc'] < reference_bins[0]
    