# Imports

In [1]:
import pandas as pd
import numpy as np

# for file-management
import pickle as pkl
from pathlib import Path
import os.path

# Participant Info

In [2]:
# Verbose progress printing for the preprocessing loop (off by default).
output = False

# Participant IDs 1 through 20; each ID is part of the raw CSV file name.
participants = [pid for pid in range(1, 21)]

# Device identifiers as they appear in the raw CSV file names.
smartphones = "S3Mini S4 N5X N6".split()

# Step added to the resampling clock between two consecutive samples
# (expressed in the same unit as the `time` column of the recordings).
event_time_diff = 3

# When enabled, touch coordinates are divided by the screen width/height.
normalize_screen = True

# Sensor streams recorded per participant and device.
sensors = ["acc", "gyro", "ori", "grav", "mag", "rot"]

# Task logs recorded per participant and device.
tasks = ["points", "fitts"]

# Every file stem that has to be loaded: all sensor streams, then all tasks.
file_names = [stem for group in (sensors, tasks) for stem in group]

In [3]:
%%time
# Preprocessing pipeline: for every participant/device pair
#   1. load the raw sensor and task CSV files,
#   2. cut every stream into one interval per row of the "points" log
#      (interval k covers fitts.time[k] < t <= points.time[k]),
#   3. resample all sensors of an interval onto a common clock that advances
#      by `event_time_diff`, holding the last seen value of each sensor,
#   4. pickle [points array, list of per-interval sensor matrices].

# Screen resolution in pixels per device, used to normalize touch coordinates.
SCREEN_PIXELS = {
    "N5X": {"width": 1080, "height": 1920},
    "S3Mini": {"width": 480, "height": 800},
    "S4": {"width": 1080, "height": 1920},
    "N6": {"width": 1440, "height": 2560},
}

for PID in participants:
    for smartphone in smartphones:
        print("processing - participant:", str(PID), "smartphone:", smartphone)

        # set screen resolution; an unknown device now fails with a KeyError
        # instead of silently reusing the resolution of the previous device
        pixels = SCREEN_PIXELS[smartphone]

        # Read Files
        raw_data = dict()
        missing_files = []

        if output: print("read files")
        for file in file_names:
            file_path = Path.home() / "data" / "raw" / "fapra_imu-{}-{}-{}-0.csv".format(PID, file, smartphone)
            if not file_path.is_file():
                print(str(file_path) + " not found")
                missing_files.append(file)
                continue
            # `sep` must be passed by keyword: pandas >= 2.0 rejects it positionally
            raw_data[file] = pd.read_csv(str(file_path), sep=";")

        # every later step needs all streams, so an incomplete recording is
        # skipped here instead of failing with a KeyError further down
        if missing_files:
            print("skipping - participant:", str(PID), "smartphone:", smartphone,
                  "missing:", ", ".join(missing_files))
            continue

        # split by time into separate lists
        if output: print("sort to specific interval")
        # (start, end] bounds of every interval; computed once for all streams
        interval_bounds = [
            (int(raw_data["fitts"]["time"][k]), end)
            for k, end in enumerate(raw_data["points"]["time"])
        ]
        time_filtered_data = dict()
        for name in file_names:
            frame = raw_data[name]
            # keep only the rows whose timestamp falls into each interval
            time_filtered_data[name] = [
                frame.loc[(frame["time"] > start) & (frame["time"] <= end)]
                for start, end in interval_bounds
            ]
        # list to dataframe
        time_filtered_data["points"] = pd.concat(time_filtered_data["points"])

        # scale screen
        if normalize_screen:
            points_frame = time_filtered_data["points"]
            for column, extent in (("x-press", "width"), ("x-circle", "width"),
                                   ("y-press", "height"), ("y-circle", "height")):
                points_frame[column] = points_frame[column].div(pixels[extent])

        # filter unique timestamps in EVERY interval of every sensor
        # (enumerating the points DataFrame would iterate over its column
        # labels and therefore only touch the first few intervals)
        for sensor in sensors:
            time_filtered_data[sensor] = [
                chunk.drop_duplicates(subset="time", keep="last")
                for chunk in time_filtered_data[sensor]
            ]

        # Create Array
        result_interval = []
        if output: print("create-interval", end=' ')
        for k, point in time_filtered_data["points"].iterrows():
            if output: print(k, end=' ', flush=True)
            # `k` is the index label of the point row and is used below as the
            # interval number; take the end time from that same row rather
            # than from a positional lookup, which can refer to another row
            end_time = point["time"]

            one_interval = []
            interval = dict()
            # timestamps of each sensor as plain arrays for cheap lookups
            times = dict()
            # keeps index of each sensor
            position = dict()
            # keeps value of last sensor event
            last_values = dict()

            # start at the latest "first timestamp" of all sensors, so that
            # every sensor already has a value at the first sample
            current_time = -1
            for sensor in sensors:
                interval[sensor] = time_filtered_data[sensor][k]
                times[sensor] = interval[sensor]["time"].to_numpy()
                position[sensor] = 0
                # NOTE(review): raises IndexError if a sensor has no event in
                # this interval - same as before, confirm this cannot happen
                last_values[sensor] = interval[sensor].iloc[0]
                if times[sensor][0] > current_time:
                    current_time = times[sensor][0]

            # iterate as long as time has not reached max
            while current_time <= end_time:
                # advance every sensor to its last event at or before
                # current_time (a sensor may have several updates per step)
                advanced = True
                while advanced:
                    advanced = False
                    for sensor in sensors:
                        following = position[sensor] + 1
                        if following >= len(times[sensor]):
                            continue
                        if times[sensor][following] <= current_time:
                            position[sensor] = following
                            last_values[sensor] = interval[sensor].iloc[following]
                            advanced = True
                one_interval.append(last_values.copy())
                current_time = current_time + event_time_diff
            result_interval.append(one_interval)

        # make numpy arrays (without dicts)
        if output: print()
        if output: print("make numpy-array interval", end=' ')
        final_intervals = []
        for k, interval in enumerate(result_interval):
            if output: print(k, end=' ', flush=True)
            current_interval = []
            for sample in interval:
                # concatenate the values of all sensors, dropping the first
                # column of each row (the timestamp)
                event = [value for row in sample.values() for value in row.iloc[1:]]
                current_interval.append(np.asarray(event))
            # make list to array, transpose and make 2d matrix
            final_intervals.append(np.array(np.asarray(current_interval).transpose()))

        if output: print()
        if output: print("make numpy-array points")
        final_points = time_filtered_data["points"][['x-press','y-press','x-circle','y-circle']].values
        final_result = [final_points, final_intervals]

        # save dump pickles (context manager makes sure the file is flushed and closed)
        data_path = Path.home() / "data" / "pickles" / "fapra_imu-processed-{}-{}.pkl".format(PID, smartphone)
        with open(str(data_path), "wb") as pickle_file:
            pkl.dump(final_result, pickle_file)

processing - participant: 1 smartphone: S3Mini
processing - participant: 1 smartphone: S4
processing - participant: 1 smartphone: N5X
processing - participant: 1 smartphone: N6
processing - participant: 2 smartphone: S3Mini
processing - participant: 2 smartphone: S4
processing - participant: 2 smartphone: N5X
processing - participant: 2 smartphone: N6
processing - participant: 3 smartphone: S3Mini
processing - participant: 3 smartphone: S4
processing - participant: 3 smartphone: N5X
processing - participant: 3 smartphone: N6
processing - participant: 4 smartphone: S3Mini
processing - participant: 4 smartphone: S4
processing - participant: 4 smartphone: N5X
processing - participant: 4 smartphone: N6
processing - participant: 5 smartphone: S3Mini
processing - participant: 5 smartphone: S4


KeyboardInterrupt: 

In [4]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# Imports

In [1]:
import numpy as np

# for file-management
import h5py
import pickle as pkl
from pathlib import Path

In [2]:
# Print the participant IDs while they are being processed.
output = True

# All participant IDs (1 through 20).
participants = [pid for pid in range(1, 21)]

# Device identifiers as used in the pickle and HDF5 file names.
smartphones = "S3Mini S4 N5X N6".split()

# Participants whose data goes into the "train" phase (IDs 1-14) ...
train = list(range(1, 15))
# ... and into the "test" phase (IDs 15-20).
test = list(range(15, 21))

# Number of consecutive samples per window; one HDF5 file is written per size.
window_sizes = [25, 50]

# hdf-small

In [3]:
%%time
# hdf-small export: for every window size and device, write one HDF5 file that
# holds, per phase ("train"/"test"), ONE window per interval - the last
# `window_size` samples before the end of the interval - together with the
# first two point columns as label.
for window_size in window_sizes:
    for smartphone in smartphones:
        hdf_path = str(Path.home()) + "/data/hdf-small/" + smartphone + "-win" + str(window_size) + ".hdf"
        # the context manager closes the file even when a participant fails
        with h5py.File(hdf_path, "w") as hf:
            for phase in ["train", "test"]:
                segments = list()
                labels = list()
                print("creating hdf5 - window_size:", window_size, "smartphone:", smartphone, "phase:", phase, end=' ')
                if output: print("participant:", end=' ')
                # compare strings by value; `is` only worked through interning
                current_participants = train if phase == "train" else test
                for pid in current_participants:
                    if output: print(pid, end=' ', flush=True)

                    pickle_path = str(Path.home()) + "/data/pickles/fapra_imu-processed-" + str(pid) + "-" + smartphone + ".pkl"
                    # NOTE: unpickling executes arbitrary code - only load
                    # files produced by the preprocessing step
                    with open(pickle_path, "rb") as pickle_file:
                        data = pkl.load(pickle_file)
                    # assumes the layout [points array, list of interval
                    # matrices] - TODO confirm against the preprocessing step
                    points = data[0]
                    intervals = data[1]
                    # the first two point columns are used as label
                    pressed = points[:,:2]

                    # iterate over intervals
                    for i, c in zip(intervals, pressed):
                        # one row per sample, one column per sensor value
                        samples = i.T
                        if len(samples) < window_size:
                            # a shorter window cannot be stacked with the others
                            raise ValueError(
                                "interval of participant {} has {} samples, fewer than window_size {}".format(
                                    pid, len(samples), window_size))
                        labels.append(np.array([c]))
                        # last `window_size` samples of the interval
                        segments.append(np.array(samples[-window_size:]))

                # make numpy arrays of it
                labels = np.concatenate(labels, axis=0)
                segments = np.array(segments)

                # add a trailing axis of length 1
                segments = segments.reshape(segments.shape[0], segments.shape[1], segments.shape[2], 1)

                hf.create_dataset(phase + "/labels", data=labels)
                hf.create_dataset(phase + "/sensors", data=segments)
                print()

creating hdf5 - window_size: 25 smartphone: S3Mini phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 25 smartphone: S3Mini phase: test participant: 15 16 17 18 19 20 
creating hdf5 - window_size: 25 smartphone: S4 phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 25 smartphone: S4 phase: test participant: 15 16 17 18 19 20 
creating hdf5 - window_size: 25 smartphone: N5X phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 25 smartphone: N5X phase: test participant: 15 16 17 18 19 20 
creating hdf5 - window_size: 25 smartphone: N6 phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 25 smartphone: N6 phase: test participant: 15 16 17 18 19 20 
creating hdf5 - window_size: 50 smartphone: S3Mini phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 50 smartphone: S3Mini phase: test participant: 15 16 17 18

# hdf-normal

In [5]:
%%time
# hdf-normal export: for every window size and device, write one HDF5 file that
# holds, per phase ("train"/"test"), ALL sliding windows of `window_size`
# consecutive samples of every interval; each window is labelled with the
# first two point columns of its interval.
for window_size in window_sizes:
    for smartphone in smartphones:
        hdf_path = str(Path.home()) + "/data/hdf/" + smartphone + "-win" + str(window_size) + ".hdf"
        # the context manager closes the file even when a participant fails
        with h5py.File(hdf_path, "w") as hf:
            for phase in ["train", "test"]:
                segments = list()
                labels = list()
                groups = list()
                print("creating hdf5 - window_size:", window_size, "smartphone:", smartphone, "phase:", phase, end=' ')
                if output: print("participant:", end=' ')
                # compare strings by value; `is` only worked through interning
                current_participants = train if phase == "train" else test
                for pid in current_participants:
                    if output: print(pid, end=' ', flush=True)

                    pickle_path = str(Path.home()) + "/data/pickles/fapra_imu-processed-" + str(pid) + "-" + smartphone + ".pkl"
                    # NOTE: unpickling executes arbitrary code - only load
                    # files produced by the preprocessing step
                    with open(pickle_path, "rb") as pickle_file:
                        data = pkl.load(pickle_file)
                    # assumes the layout [points array, list of interval
                    # matrices] - TODO confirm against the preprocessing step
                    points = data[0]
                    intervals = data[1]
                    # the first two point columns are used as label
                    pressed = points[:,:2]

                    # iterate over intervals
                    for i, c in zip(intervals, pressed):
                        # one row per sample, one column per sensor value
                        samples = i.T
                        samples_per_interval = len(samples) - (window_size - 1)
                        if samples_per_interval < 1:
                            # no complete window fits into this interval
                            raise ValueError(
                                "interval of participant {} has {} samples, fewer than window_size {}".format(
                                    pid, len(samples), window_size))
                        # one identical label and one group id per window
                        labels.append(np.tile(c, (samples_per_interval, 1)))
                        groups.extend([pid] * samples_per_interval)
                        # all windows of `window_size` consecutive samples
                        segments.extend(
                            np.array(samples[k:(k + window_size)])
                            for k in range(samples_per_interval)
                        )

                # make numpy arrays of it
                # concatenate the per-interval arrays directly: wrapping the
                # differently sized arrays in np.array() first fails on
                # NumPy >= 1.24 (ragged input)
                labels = np.concatenate(labels, axis=0)
                segments = np.array(segments)
                # NOTE(review): `groups` (participant id per window) is built
                # but never written to the file - confirm whether it should be
                groups = np.array(groups, dtype=np.int8)

                # add a trailing axis of length 1
                segments = segments.reshape(segments.shape[0], segments.shape[1], segments.shape[2], 1)

                hf.create_dataset(phase + "/labels", data=labels)
                hf.create_dataset(phase + "/sensors", data=segments)
                print()

creating hdf5 - window_size: 25 smartphone: S3Mini phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 25 smartphone: S3Mini phase: test participant: 15 16 17 18 19 20 
creating hdf5 - window_size: 25 smartphone: S4 phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 25 smartphone: S4 phase: test participant: 15 16 17 18 19 20 
creating hdf5 - window_size: 25 smartphone: N5X phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 25 smartphone: N5X phase: test participant: 15 16 17 18 19 20 
creating hdf5 - window_size: 25 smartphone: N6 phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 25 smartphone: N6 phase: test participant: 15 16 17 18 19 20 
creating hdf5 - window_size: 50 smartphone: S3Mini phase: train participant: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 
creating hdf5 - window_size: 50 smartphone: S3Mini phase: test participant: 15 16 17 18