# Summary
### Transforming the raw data so that the features have timestamps will probably take a non-negligible amount of time since a python module that can handle the signal processing will have to be identified.  

### Unlabeled data for WISDM 2.0 is available as transformed features, but of course, no timestamps

In [7]:
import pandas as pd
import numpy as np
import datetime as dt
from time import time

In [2]:
cat ./datasets/WISDM_v2/WISDM_at_v2.0_raw.txt | head -n 2

1679,Walking,1370520469556,0.2941316,-0.6356053,-0.22693644;
1679,Walking,1370520469606,-0.49968776,-0.6044512,-0.22602014;
cat: write error: Broken pipe


# Reading the Raw Data

### Labeled

In [3]:
rows = []
bad_lines = []
with open('./datasets/WISDM_v2/WISDM_at_v2.0_raw.txt', 'r') as fIn:
    lines = [line.strip()[:-1].split(",") for line in fIn.readlines()]
    for ind, line in enumerate(lines):
        try:
            row = {'user' : line[0],
                   'class' : line[1],
                   'timestamp' : dt.datetime.fromtimestamp(int(line[2]) / 1e3),
                   'x-acc' : line[3],
                   'y-acc' : line[4],
                   'z-acc' : line[5]}
            rows.append(row)
        except ValueError as ve:
            #print("could not parse line #%s" % ind)
            #print("line[2] : %s" % line[2])
            #print(ve)
            bad_lines.append(ind)

col_names = ['user', 'class', 'timestamp', 'x-acc', 'y-acc', 'z-acc']
raw_df = pd.DataFrame(rows, columns=col_names)

raw_df['x-acc'] = pd.to_numeric(raw_df['x-acc'])
raw_df['y-acc'] = pd.to_numeric(raw_df['y-acc'])
raw_df['z-acc'] = pd.to_numeric(raw_df['z-acc'])

### Unlabeled

In [4]:
unlabeled_rows = []
unlabeled_bad_lines = []
with open('./datasets/WISDM_v2/WISDM_at_v2.0_unlabeled_raw.txt', 'r') as fIn:
    lines = [line.strip()[:-1].split(",") for line in fIn.readlines()]
    for ind, line in enumerate(lines):
        try:
            row = {'user' : line[0],
                   'class' : line[1],
                   'timestamp' : dt.datetime.fromtimestamp(int(line[2]) / 1e3),
                   'x-acc' : line[3],
                   'y-acc' : line[4],
                   'z-acc' : line[5]}
            rows.append(row)
        except ValueError as ve:
            #print("could not parse line #%s" % ind)
            #print("line[2] : %s" % line[2])
            #print(ve)
            bad_lines.append(ind)

unlabeled_raw_df = pd.DataFrame(rows, columns=col_names)

unlabeled_raw_df['x-acc'] = pd.to_numeric(unlabeled_raw_df['x-acc'])
unlabeled_raw_df['y-acc'] = pd.to_numeric(unlabeled_raw_df['y-acc'])
unlabeled_raw_df['z-acc'] = pd.to_numeric(unlabeled_raw_df['z-acc'])

In [5]:
len(raw_df)

2980763

In [6]:
len(unlabeled_raw_df)

41190491

# Save it all as one Dataframe

In [7]:
all_raw_df = pd.concat([raw_df, unlabeled_raw_df])

In [8]:
all_raw_df.to_pickle('./datasets/WISDM_v2/all_raw_data.dataframe.pickle')

# Load it back in 

In [6]:
loaded_df = pd.read_pickle("./datasets/WISDM_v2/all_raw_data.dataframe.pickle")

In [13]:
loaded_df.sort_values(['user', 'timestamp'], ascending=True, inplace=True)

In [15]:
loaded_df.head(20)

Unnamed: 0,user,class,timestamp,x-acc,y-acc,z-acc
32214283,1058,NoLabel,2013-05-27 17:08:06.700,1.532916,-0.270577,0.112918
32214284,1058,NoLabel,2013-05-27 17:08:06.750,2.914563,-1.339037,0.462325
32214285,1058,NoLabel,2013-05-27 17:08:06.800,0.294013,-3.097789,-0.123571
32214286,1058,NoLabel,2013-05-27 17:08:06.850,0.833037,-2.727077,0.110787
32214287,1058,NoLabel,2013-05-27 17:08:06.900,-0.033023,-2.704706,0.111853
32214288,1058,NoLabel,2013-05-27 17:08:06.950,0.188552,-2.597115,-0.215183
32214289,1058,NoLabel,2013-05-27 17:08:07.000,0.322775,-0.902279,-0.798948
32214290,1058,NoLabel,2013-05-27 17:08:07.050,-0.007457,0.448476,-0.705205
32214291,1058,NoLabel,2013-05-27 17:08:07.100,0.169377,0.735032,0.210922
32214292,1058,NoLabel,2013-05-27 17:08:07.150,1.601093,-1.036502,0.153398


In [8]:
users = loaded_df['user'].unique()

# Segmenting The Data into 10s windows

In [96]:
def assign_segments_by_user(user_id, windowSize=10):
    td = pd.Timedelta(str(windowSize) + ' seconds')
    user_df = loaded_df[loaded_df['user'] == user_id]
    segment_col = pd.Series(index=user_df.index, dtype='int32')

    print("Segmenting data for user #%s" % user_id)
    print("\t %s has %s rows" % (user_id, len(user_df)))
    start_time = time()

    last_timestamp = user_df['timestamp'].max()
    beginning_of_segment = user_df['timestamp'].min()
    end_of_segment = beginning_of_segment + td
    segment_df = user_df[(user_df['timestamp'] > beginning_of_segment) & \
                      (user_df['timestamp'] < end_of_segment)]
    
    segment_id = 0
    while beginning_of_segment < last_timestamp:
        #print("Segment #%s" % segment_id)
        if len(segment_df) < 1:
            # set beginning of segment to reflect next timestamp
            beginning_of_segment = user_df[user_df['timestamp'] > beginning_of_segment]['timestamp'].min()
            end_of_segment = beginning_of_segment + td
            segment_df = user_df[(user_df['timestamp'] > beginning_of_segment) & \
                      (user_df['timestamp'] < end_of_segment)]
            continue
        segment_col.loc[segment_df.index] = segment_id
        #print("\tupdated segment column")
        # make updates for next iteration
        segment_id += 1
        beginning_of_segment = end_of_segment
        end_of_segment = beginning_of_segment + td
        segment_df = user_df[(user_df['timestamp'] > beginning_of_segment) & \
                      (user_df['timestamp'] < end_of_segment)]
    finish_time = time()
    print("finished in %s seconds" % (finish_time - start_time))
    user_df['segment_id'] = segment_col
    return user_df

In [97]:
assign_segments_by_user(user_ids[0])

Segmenting data for user #1058
Segment #0
	updated segment column
Segment #1
	updated segment column
Segment #2
	updated segment column
Segment #3
	updated segment column
Segment #4
	updated segment column
Segment #5
Segment #5
	updated segment column
Segment #6
	updated segment column
Segment #7
	updated segment column
Segment #8
	updated segment column
Segment #9
	updated segment column
Segment #10
	updated segment column
Segment #11
	updated segment column
Segment #12
	updated segment column
Segment #13
	updated segment column
Segment #14
	updated segment column
Segment #15
	updated segment column
Segment #16
	updated segment column
Segment #17
	updated segment column
Segment #18
	updated segment column
Segment #19
	updated segment column
Segment #20
	updated segment column
Segment #21
	updated segment column
Segment #22
	updated segment column
Segment #23
	updated segment column
Segment #24
Segment #24
	updated segment column
Segment #25
	updated segment column
Segment #26
	updated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,user,class,timestamp,x-acc,y-acc,z-acc,segment_id
32214283,1058,NoLabel,2013-05-27 17:08:06.700,1.532916,-0.270577,0.112918,
32214284,1058,NoLabel,2013-05-27 17:08:06.750,2.914563,-1.339037,0.462325,0.0
32214285,1058,NoLabel,2013-05-27 17:08:06.800,0.294013,-3.097789,-0.123571,0.0
32214286,1058,NoLabel,2013-05-27 17:08:06.850,0.833037,-2.727077,0.110787,0.0
32214287,1058,NoLabel,2013-05-27 17:08:06.900,-0.033023,-2.704706,0.111853,0.0
32214288,1058,NoLabel,2013-05-27 17:08:06.950,0.188552,-2.597115,-0.215183,0.0
32214289,1058,NoLabel,2013-05-27 17:08:07.000,0.322775,-0.902279,-0.798948,0.0
32214290,1058,NoLabel,2013-05-27 17:08:07.050,-0.007457,0.448476,-0.705205,0.0
32214291,1058,NoLabel,2013-05-27 17:08:07.100,0.169377,0.735032,0.210922,0.0
32214292,1058,NoLabel,2013-05-27 17:08:07.150,1.601093,-1.036502,0.153398,0.0


In [98]:
def assign_segments(raw_df, samplingRate=20, windowSize=10, window_movement='discrete'):
    users = raw_df['user'].unique()
    segment_id = 0
    
    td = pd.Timedelta(str(windowSize) + ' seconds')
    
    segment_col = pd.Series(index=raw_df.index, dtype="int32")
    
    for user_id in users:
        print("Segmenting data for user #%s" % user_id)
        start_time = time()
        user_df = raw_df[raw_df['user'] == user_id]
        
        last_timestamp = user_df['timestamp'].max()
        beginning_of_segment = user_df['timestamp'].min()
        end_of_segment = beginning_of_segment + td
        segment_df = user_df[(user_df['timestamp'] > beginning_of_segment) & \
                          (user_df['timestamp'] < end_of_segment)]

        while beginning_of_segment < last_timestamp:
            if len(segment_df) < 1:
                # set beginning of segment to reflect next timestamp
                beginning_of_segment = user_df[user_df['timestamp'] > beginning_of_segment]['timestamp'].min()
                end_of_segment = beginning_of_segment + td
                segment_df = user_df[(user_df['timestamp'] > beginning_of_segment) & \
                          (user_df['timestamp'] < end_of_segment)]
                continue
            segment_col.loc[segment_df.index] = segment_id
            
            # make updates for next iteration
            segment_id += 1
            beginning_of_segment = end_of_segment
            end_of_segment = beginning_of_segment + td
            segment_df = user_df[(user_df['timestamp'] > beginning_of_segment) & \
                          (user_df['timestamp'] < end_of_segment)]
        finish_time = time()
        print("finished in %s seconds" % (finish_time - start_time))
    
    raw_df['segment_id'] = segment_col
    return raw_df

In [99]:
loaded_df = assign_segments(loaded_df)

Segmenting data for user #1058
finished in 356.7577426433563 seconds
Segmenting data for user #1064
finished in 2512.589770555496 seconds
Segmenting data for user #1100
finished in 248.4491949081421 seconds
Segmenting data for user #1104


KeyboardInterrupt: 