In [1]:
from wisdm import wisdm
import random
import numpy as np
import pandas as pd
from collections import Counter
import time
from scipy import stats

In [2]:
wisdm.set_data(version="2", make_compatible=True)

In [3]:
user_dfs = {}

for user_id in wisdm.user_ids:
    try:
        user_df = pd.read_pickle('./datasets/WISDM_v2/temporary_user_dataframes/' + user_id + '_raw_segmented.pickle')
        user_dfs[user_id] = user_df
    except FileNotFoundError as fnfe:
        print("%s not in set" % user_id)
        continue

1097 not in set
1809 not in set
1813 not in set
1814 not in set


# Extracting Features

In [4]:
user_df = user_dfs[wisdm.user_ids[0]]

## Handle NaN Values in Segment_id

In [5]:
print("%s/%s missing values" % (np.sum(np.isnan(user_df['segment_id'])), len(user_df)))

208/472179 missing values


In [6]:
user_df['segment_id'] = user_df['segment_id'].fillna(method="bfill", limit=1)

In [7]:
print("%s/%s missing values" % (np.sum(np.isnan(user_df['segment_id'])), len(user_df)))

0/472179 missing values


## Other NaN Values?

### ToDo : Check for all users, The first user seems fine for now

In [8]:
segment_0 = user_df[user_df['segment_id'] == 0]
len(segment_0)

186

## Fill Bins

In [9]:
bin_ranges = list(np.arange(-2.5, 20, 2.5))
bin_ranges = [-np.inf] + bin_ranges
bin_ranges = bin_ranges + [np.inf]
bin_ranges

[-inf, -2.5, 0.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, inf]

In [10]:
counts, _ = np.histogram(segment_0['x-acc'], bins=bin_ranges, density=False)
counts

array([  0, 140,  46,   0,   0,   0,   0,   0,   0,   0])

In [11]:
def fill_bins(segment_df):
    counts_x, _ = np.histogram(segment_df['x-acc'], bins=bin_ranges, density=False)
    counts_y, _ = np.histogram(segment_df['y-acc'], bins=bin_ranges, density=False)
    counts_z, _ = np.histogram(segment_df['z-acc'], bins=bin_ranges, density=False)
    return counts_x, counts_y, counts_z

In [12]:
start = time.time()

user_segments = user_df['segment_id'].unique()

bin_rows = []
for segment in user_segments:
    segment_df = user_df[user_df['segment_id'] == segment]
    bins = fill_bins(segment_df)
    bin_rows.append(bin_rows)

finish = time.time()
print("Took %s seconds" % (finish - start))

Took 2.7780745029449463 seconds


## Get XAVG, YAVG, ZAVG
the average x, y, and z values over the 200 records in the example.

In [13]:
def get_avg(segment_df):
    xavg = segment_df['x-acc'].mean()
    yavg = segment_df['y-acc'].mean()
    zavg = segment_df['z-acc'].mean()
    return xavg, yavg, zavg

In [14]:
start = time.time()

user_segments = user_df['segment_id'].unique()

f_rows = []
for segment in user_segments:
    segment_df = user_df[user_df['segment_id'] == segment]
    f = get_avg(segment_df)
    f_rows.append(f)

finish = time.time()
print("Took %s seconds" % (finish - start))

Took 2.592491388320923 seconds


## Get XPEAK, YPEAK, ZPEAK
approximations of the dominant
frequency. First, the greatest value in the series is
identified, then all local peak values within 10% of
its amplitude are identified. If the number of peaks
is less than 3, then the threshhold is lowered until
at least 3 peaks can be found. The times between
consecutive peaks are summed and divided by the number
of peaks.

In [15]:
def get_peak_times(segment_df):
    axes = ['x-acc', 'y-acc', 'z-acc']
    
    peak_feature_values = {axis : None for axis in axes}
    for axis in axes:
        sorted_indeces = segment_df[axis].argsort()
        max_peak = segment_df[axis].iloc[sorted_indeces.iloc[-1]]
        threshold = max_peak * 0.9
        peaks_df = segment_df[segment_df[axis] > threshold]
        if len(peaks_df) < 3:
            peaks_df = segment_df.iloc[sorted_indeces.iloc[-3:]]
        
        peaks_df.sort_values('timestamp', ascending=False, inplace=True)

        peak_diffs = []
        #print(len(peaks_df))
        iter_peaks = peaks_df['timestamp'].iteritems()
        _, last_ts = next(iter_peaks)

        for _, pt in peaks_df['timestamp'].iteritems():
            peak_diffs.append(pt.timestamp() - last_ts.timestamp())
            last_ts = pt
        peak_feature_values[axis] = np.mean(peak_diffs)

    return peak_feature_values['x-acc'], peak_feature_values['y-acc'], peak_feature_values['z-acc']

In [16]:
start = time.time()

user_segments = user_df['segment_id'].unique()

f_rows = []
for segment in user_segments:
    segment_df = user_df[user_df['segment_id'] == segment]
    f = get_peak_times(segment_df)
    f_rows.append(f)

finish = time.time()
print("Took %s seconds" % (finish - start))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Took 133.62716960906982 seconds


## XABSOLDEV, YABSOLDEV, ZABSOLDEV
the average absolute deviations from the mean value for each axis.


In [17]:
def get_absdev(segment_df):
    x_mean = segment_df['x-acc'].mean()
    y_mean = segment_df['y-acc'].mean()
    z_mean = segment_df['z-acc'].mean()
    
    x_absdev = np.mean([np.absolute(x_mean - x) for x in segment_df['x-acc']])
    y_absdev = np.mean([np.absolute(y_mean - y) for y in segment_df['y-acc']])
    z_absdev = np.mean([np.absolute(z_mean - z) for z in segment_df['z-acc']])
    
    return x_absdev, y_absdev, z_absdev

In [18]:
start = time.time()

user_segments = user_df['segment_id'].unique()

f_rows = []
for segment in user_segments:
    segment_df = user_df[user_df['segment_id'] == segment]
    f = get_absdev(segment_df)
    f_rows.append(f)

finish = time.time()
print("Took %s seconds" % (finish - start))

Took 5.244106769561768 seconds


## XSTANDDEV, YSTANDDEV, ZSTANDDEV 
the standard deviations for each axis.

In [19]:
def get_sd(segment_df):
    x_std = segment_df['x-acc'].std()
    y_std = segment_df['y-acc'].std()
    z_std = segment_df['z-acc'].std()
    
    return x_std, y_std, z_std

## RESULTANT 
average of the square roots of the sum of the values
   of each axis squared �(xi^2 + yi^2 + zi^2).

In [20]:
def get_resultant(segment_df):
    values = []
    
    for ind, row in segment_df.iterrows():
        sum_val = (row['x-acc']**2) + (row['y-acc'] ** 2) + (row['z-acc'] ** 2)
        values.append(sum_val)
    return np.mean(values)

In [21]:
start = time.time()

user_segments = user_df['segment_id'].unique()

f_rows = []
for segment in user_segments:
    segment_df = user_df[user_df['segment_id'] == segment]
    f = get_resultant(segment_df)
    f_rows.append(f)

finish = time.time()
print("Took %s seconds" % (finish - start))

Took 49.957321882247925 seconds


# Putting it all together

In [22]:
feature_names = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9',
                 'Y0', 'Y1', 'Y2', 'Y3', 'Y4', 'Y5', 'Y6', 'Y7', 'Y8', 'Y9',
                 'Z0', 'Z1', 'Z2', 'Z3', 'Z4', 'Z5', 'Z6', 'Z7', 'Z8', 'Z9',
                 'XAVG', 'YAVG', 'ZAVG',
                 'XPEAK', 'YPEAK', 'ZPEAK',
                 'XABSOLDEV', 'YABSOLDEV', 'ZABSOLDEV',
                 'XSTANDDEV', 'YSTANDDEV', 'ZSTANDDEV',
                 'RESULTANT']

In [23]:
def make_bins_df(x_bin_list, y_bin_list, z_bin_list):
    rows = []
    
    for bin_sets in zip(x_bin_list, y_bin_list, z_bin_list):
        row = np.hstack(bin_sets)
        rows.append(row)
    bins_df = pd.DataFrame(rows, columns=feature_names[:30])
    return bins_df

In [25]:
def get_timestamp(segment_df):
    return segment_df['timestamp'].min()

In [26]:
def clean_user_df(user_df):
    ts = user_df['timestamp'].unique()
    print("%s timestamps" % len(ts))
    rows = []
    for ind, t in enumerate(ts):
        #if (ind % 1000) == 0:
            #print("At %s" % ind)
        ts_df = user_df[user_df['timestamp'] == ts[0]]
        class_labels = ts_df['class'].unique()
        
        ts_df = ts_df[ts_df['class'] != 'NoLabel']
        ts_df.reset_index(drop=True, inplace=True)
        ts_df = ts_df.drop(ts_df.index[1:]) # maybe we should keep which ever one has a label?
        rows.append(ts_df)
    user_df = pd.concat(rows)
    return user_df

In [27]:
import warnings

In [31]:
def get_features(user_df):
    start = time.time()
    user_id = user_df['user'].unique()[0]
    segments = user_df['segment_id'].unique()
    timestamps = []
    class_labels = []
    
    x_bin_list = []
    y_bin_list = []
    z_bin_list = []

    x_avg_list = []
    y_avg_list = []
    z_avg_list = []

    x_peak_list = []
    y_peak_list = []
    z_peak_list = []

    x_absdev_list = []
    y_absdev_list = []
    z_absdev_list = []

    x_sd_list = []
    y_sd_list = []
    z_sd_list = []

    resultant_list = []

    for segment in segments:
        segment_df = user_df[user_df['segment_id'] == segment]
        class_label = str(stats.mode(list(segment_df['class']))[0][0])
        timestamp = get_timestamp(segment_df)
        x_bins, y_bins, z_bins = fill_bins(segment_df)
        x_avg, y_avg, z_avg = get_avg(segment_df)
        x_peak, y_peak, z_peak = get_peak_times(segment_df)
        x_absdev, y_absdev, z_absdev = get_absdev(segment_df)
        x_sd, y_sd, z_sd = get_sd(segment_df)
        resultant = get_resultant(segment_df)
        
        class_labels.append(class_label)
        timestamps.append(timestamp)
        
        x_bin_list.append(x_bins)
        y_bin_list.append(y_bins)
        z_bin_list.append(z_bins)

        x_avg_list.append(x_avg)
        y_avg_list.append(y_avg)
        z_avg_list.append(z_avg)

        x_peak_list.append(x_peak)
        y_peak_list.append(y_peak)
        z_peak_list.append(z_peak)

        x_absdev_list.append(x_absdev)
        y_absdev_list.append(y_absdev)
        z_absdev_list.append(z_absdev)

        x_sd_list.append(x_sd)
        y_sd_list.append(y_sd)
        z_sd_list.append(z_sd)
        
        resultant_list.append(resultant)
    
    f_df = make_bins_df(x_bin_list, y_bin_list, z_bin_list)
    f_df['XAVG'] = x_avg_list
    f_df['YAVG'] = y_avg_list
    f_df['ZAVG'] = z_avg_list

    f_df['XPEAK'] = x_peak_list
    f_df['YPEAK'] = y_peak_list
    f_df['ZPEAK'] = z_peak_list

    f_df['XABSOLDEV'] = x_absdev_list
    f_df['YABSOLDEV'] = x_absdev_list
    f_df['ZABSOLDEV'] = y_absdev_list

    f_df['XSTANDDEV'] = x_sd_list
    f_df['YSTANDDEV'] = x_sd_list
    f_df['ZSTANDDEV'] = y_sd_list

    f_df['RESULTANT'] = resultant_list
    f_df['user_id'] = [user_id] * len(segments)
    f_df['segment_id'] = segments
    f_df['timestamps'] = timestamps
    f_df['class'] = class_labels
    return f_df

In [32]:
import warnings

In [36]:
start = time.time()
user_df = clean_user_df(user_df)
finished_cleaning = time.time()
print("Took %s to finish cleaning" % (finished_cleaning - start))
features_df = get_features(user_df)
finish = time.time()

print("Took %s seconds" % (finish - start))

355248 timestamps


NameError: name 'finish_cleaning' is not defined

In [38]:
finished_cleaning = time.time()
print("Took %s to finish cleaning" % (finished_cleaning - start))
features_df = get_features(user_df)
finish = time.time()

print("Took %s seconds" % (finish - start))

Took 1203.8877215385437 to finish cleaning




Took 1249.0434832572937 seconds


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [39]:
features_df['class'].unique()

array(['Standing'], dtype=object)

# ToDo
* Fix clean_user_df() so that it keeps the NoLabel frames that have ONLY NoLabel
* Test again