In [1]:
import re
import scipy
import copy
import math
import numpy as np

# Not a class, just a bunch of useful functions.

def get_chapter(module_path):
    return re.search('_ch._', 'crowdsignals_ch3_outliers.py').group(0).strip('_')

def normalize_dataset(data_table, columns):
    dt_norm = copy.deepcopy(data_table)
    for col in columns:
        dt_norm[col] = (data_table[col] - data_table[col].mean()) / (data_table[col].max() - data_table[col].min())
    return dt_norm

# Calculate the distance between rows.
def distance(rows, d_function='euclidean'):
    if d_function == 'euclidean':
        # Assumes m rows and n columns (attributes), returns and array where each row represents
        # the distances to the other rows (except the own row).
        return scipy.spatial.distance.pdist(rows, 'euclidean') # todo: replace with numpy?
    else:
        raise ValueError("Unknown distance value '" + d_function + "'")

def print_statistics(dataset, describe=True):

    if describe:
        # .describe() gives number of values, mean, standard deviation, min and max for each column in one table.
        print(dataset.describe().round(3).to_string())
        return

    print('\ncolumn \t\t % missing \t\t mean \t\t standard deviation \t\t min \t\t max')
    dataset_length = len(dataset.index)
    for col in dataset.columns:
        print('\t\t'.join([f'{col}',
                           f'{(dataset_length - dataset[col].count()) / dataset_length * 100:3.1f}%',
                           f'{dataset[col].mean():6.3f}',
                           f'{dataset[col].std():6.3f}',
                           f'{dataset[col].min():6.3f}',
                           f'{dataset[col].max():6.3f}']))

def print_table_cell(value1, value2):
    print("{0:.2f}".format(value1), ' / ', "{0:.2f}".format(value2), end='')

def print_latex_table_statistics_two_datasets(dataset1, dataset2):
    print('attribute, fraction missing values, mean, standard deviation, min, max')
    dataset1_length = len(dataset1.index)
    dataset2_length = len(dataset2.index)
    for col in dataset1.columns:
        print(col, '& ', end='')
        print_table_cell((float((dataset1_length - dataset1[col].count()))/dataset1_length)*100, (float((dataset2_length - dataset2[col].count()))/dataset2_length)*100)
        print(' & ', end='')
        print_table_cell(dataset1[col].mean(), dataset2[col].mean())
        print(' & ', end='')
        print_table_cell(dataset1[col].std(), dataset2[col].std())
        print(' & ', end='')
        print_table_cell(dataset1[col].min(), dataset2[col].min())
        print(' & ', end='')
        print_table_cell(dataset1[col].max(), dataset2[col].max())
        print('\\\\')

def print_latex_statistics_clusters(dataset, cluster_col, input_cols, label_col):
    label_cols = [c for c in dataset.columns if label_col == c[0:len(label_col)]]

    clusters = dataset[cluster_col].unique()

    for c in input_cols:
        print('\multirow{2}{*}{', c, '} & mean ', end='')
        for cluster in clusters:
            print(' & ', "{0:.2f}".format(dataset.loc[dataset[cluster_col] == cluster, c].mean()), end='')
        print('\\\\')
        print(' & std ', end='')
        for cluster in clusters:
            print(' & ', "{0:.2f}".format(dataset.loc[dataset[cluster_col] == cluster, c].std()), end='')
        print('\\\\')

    for l in label_cols:
        print(l, ' & percentage ', end='')
        for cluster in clusters:
            print(' & ', "{0:.2f}".format((float(dataset.loc[dataset[cluster_col] == cluster, l].sum())/len(dataset[dataset[l] == 1].index) * 100)), '\%', end='')
        print('\\\\')

def print_table_row_performances(row_name, training_len, test_len, values):
    scores_over_sd = []
    print(row_name, end='')

    for val in values:
        print(' & ', end='')
        sd_train = math.sqrt((val[0]*(1-val[0]))/training_len)
        print("{0:.4f}".format(val[0]), end='')
        print('\\emph{(', "{0:.4f}".format(val[0]-2*sd_train), '-', "{0:.4f}".format(val[0]+2*sd_train), ')}', ' & ', end='')
        sd_test = math.sqrt((val[1]*(1-val[1]))/test_len)
        print("{0:.4f}".format(val[1]), end='')
        print('\\emph{(', "{0:.4f}".format(val[1]-2*sd_test), '-', "{0:.4f}".format(val[1]+2*sd_test), ')}', end='')
        scores_over_sd.append([val[0], sd_train, val[1], sd_test])
    print('\\\\\\hline')
    return scores_over_sd

def print_table_row_performances_regression(row_name, training_len, test_len, values):
    print(row_name),

    for val in values:
        print(' & ', end='')
        print("{0:.4f}".format(val[0]), end='')
        print('\\emph{(', "{0:.4f}".format(val[1]), ')}', ' & ', end='')
        print("{0:.4f}".format(val[2]), end='')
        print('\\emph{(', "{0:.4f}".format(val[3]), ')}', end='')
    print('\\\\\\hline')

def print_pearson_correlations(correlations):
    for i in range(0, len(correlations)):
        if np.isfinite(correlations[i][1]):
            print(correlations[i][0], ' & ', "{0:.4f}".format(correlations[i][1]), '\\\\\\hline')

In [2]:
import pandas as pd
import numpy as np
import re
import copy
from datetime import datetime, timedelta
import matplotlib.pyplot as plot
import matplotlib.dates as md


class CreateDataset:

    base_dir = ''
    granularity = 0
    data_table = None

    def __init__(self, base_dir, granularity):
        self.base_dir = base_dir
        self.granularity = granularity

    # Create an initial data table with entries from start till end time, with steps
    # of size granularity. Granularity is specified in milliseconds
    def create_timestamps(self, start_time, end_time):
        return pd.date_range(start_time, end_time, freq=str(self.granularity)+'ms')

    def create_dataset(self, start_time, end_time, cols, prefix):
        c = copy.deepcopy(cols)
        if not prefix == '':
            for i in range(0, len(c)):
                c[i] = str(prefix) + str(c[i])
        timestamps = self.create_timestamps(start_time, end_time)
        self.data_table = pd.DataFrame(index=timestamps, columns=c)

    # Add numerical data, we assume timestamps in the form of nanoseconds from the epoch
    def add_numerical_dataset(self, file, timestamp_col, value_cols, aggregation='avg', prefix=''):
        print(f'Reading data from {file}')
        dataset = pd.read_csv(self.base_dir / file, skipinitialspace=True)

        # Convert timestamps to dates
        dataset[timestamp_col] = pd.to_datetime(dataset[timestamp_col])

        # Create a table based on the times found in the dataset
        if self.data_table is None:
            self.create_dataset(min(dataset[timestamp_col]), max(dataset[timestamp_col]), value_cols, prefix)
        else:
            for col in value_cols:
                self.data_table[str(prefix) + str(col)] = np.nan

        # Over all rows in the new table
        for i in range(0, len(self.data_table.index)):
            # Select the relevant measurements.
            relevant_rows = dataset[
                (dataset[timestamp_col] >= self.data_table.index[i]) &
                (dataset[timestamp_col] < (self.data_table.index[i] +
                                           timedelta(milliseconds=self.granularity)))
            ]
            for col in value_cols:
                # Take the average value
                if len(relevant_rows) > 0:
                    if aggregation == 'avg':
                        self.data_table.loc[self.data_table.index[i], str(prefix)+str(col)] = np.average(relevant_rows[col])
                    else:
                        raise ValueError(f"Unknown aggregation {aggregation}")
                else:
                    self.data_table.loc[self.data_table.index[i], str(prefix)+str(col)] = np.nan

    # Remove undesired value from the names.
    def clean_name(self, name):
        return re.sub('[^0-9a-zA-Z]+', '', name)

    # Add data in which we have rows that indicate the occurrence of a certain event with a given start and end time.
    # 'aggregation' can be 'sum' or 'binary'.
    def add_event_dataset(self, file, start_timestamp_col, end_timestamp_col, value_col, aggregation='sum'):
        print(f'Reading data from {file}')
        dataset = pd.read_csv(self.base_dir / file)

        # Convert timestamps to datetime.
        dataset[start_timestamp_col] = pd.to_datetime(dataset[start_timestamp_col])
        dataset[end_timestamp_col] = pd.to_datetime(dataset[end_timestamp_col])

        # Clean the event values in the dataset
        dataset[value_col] = dataset[value_col].apply(self.clean_name)
        event_values = dataset[value_col].unique()

        # Add columns for all possible values (or create a new dataset if empty), set the default to 0 occurrences
        if self.data_table is None:
            self.create_dataset(min(dataset[start_timestamp_col]), max(dataset[end_timestamp_col]), event_values, value_col)
        for col in event_values:
            self.data_table[(str(value_col) + str(col))] = 0

        # Now we need to start counting by passing along the rows....
        for i in range(0, len(dataset.index)):
            # identify the time points of the row in our dataset and the value
            start = dataset[start_timestamp_col][i]
            end = dataset[end_timestamp_col][i]
            value = dataset[value_col][i]
            border = (start - timedelta(milliseconds=self.granularity))

            # get the right rows from our data table
            relevant_rows = self.data_table[(start <= (self.data_table.index +timedelta(milliseconds=self.granularity))) & (end > self.data_table.index)]

            # and add 1 to the rows if we take the sum
            if aggregation == 'sum':
                self.data_table.loc[relevant_rows.index, str(value_col) + str(value)] += 1
            # or set to 1 if we just want to know it happened
            elif aggregation == 'binary':
                self.data_table.loc[relevant_rows.index, str(value_col) + str(value)] = 1
            else:
                raise ValueError("Unknown aggregation '" + aggregation + "'")

    # This function returns the column names that have one of the strings expressed by 'ids' in the column name.
    def get_relevant_columns(self, ids):
        relevant_dataset_cols = []
        cols = list(self.data_table.columns)

        for id in ids:
            relevant_dataset_cols.extend([col for col in cols if id in col])

        return relevant_dataset_cols

In [3]:
from pathlib import Path
import copy
import os
import sys

# Chapter 2: Initial exploration of the dataset.

"""
First, we set some module-level constants to store our data locations. These are saved as a pathlib.Path object, the
preferred way to handle OS paths in Python 3 (https://docs.python.org/3/library/pathlib.html). Using the Path's methods,
you can execute most path-related operations such as making directories.
sys.argv contains a list of keywords entered in the command line, and can be used to specify a file path when running
a script from the command line. For example:
$ python3 crowdsignals_ch2.py my/proj/data/folder my_dataset.csv
If no location is specified, the default locations in the else statement are chosen, which are set to load each script's
output into the next by default.
"""

DATASET_PATH = Path(sys.argv[1] if len(sys.argv) > 1 else '/Users/robinschijf/Desktop/data/')
RESULT_PATH = Path('./intermediate_datafiles/')
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter2_result.csv'

# Set a granularity (the discrete step size of our time series data). We'll use a course-grained granularity of one
# instance per minute, and a fine-grained one with four instances per second.
GRANULARITIES = [60000, 250]

# We can call Path.mkdir(exist_ok=True) to make any required directories if they don't already exist.
[path.mkdir(exist_ok=True, parents=True) for path in [DATASET_PATH, RESULT_PATH]]


[None, None]

In [4]:

datasets = []
for milliseconds_per_instance in GRANULARITIES:
    print(f'Creating numerical datasets from files in {DATASET_PATH} using granularity {milliseconds_per_instance}.')

    # Create an initial dataset object with the base directory for our data and a granularity
    dataset = CreateDataset(DATASET_PATH, milliseconds_per_instance)

    
    dataset.add_numerical_dataset('data_accel_phone.csv', 'Timestamps', ['x','y','z'], 'avg', 'acc_phone_')
    dataset.add_numerical_dataset('data_accel_watch.csv', 'Timestamps', ['x','y','z'], 'avg', 'acc_watch_')

    # We add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
    # and aggregate the values per timestep by averaging the values
    dataset.add_numerical_dataset('data_gyro_phone.csv', 'Timestamps', ['x','y','z'], 'avg', 'gyr_phone_')
    dataset.add_numerical_dataset('data_gyro_watch.csv', 'Timestamps', ['x','y','z'], 'avg', 'gyr_watch_')


    # Get the resulting pandas data table
    dataset = dataset.data_table

Creating numerical datasets from files in -f using granularity 60000.
Reading data from data_accel_phone.csv


TypeError: unsupported operand type(s) for /: 'str' and 'int'