# Feature Engineering

Extract 30 features from 6 raw data columns.

## imports

In [None]:
import os
import pandas as pd
import sys
import json
import math
import numpy as np
import statistics as stat

## Prepare Data

In [None]:
task_name = 'ZL_trace'
# Options: ZL_trace, ZL_predict, PL_trace, PL_predict

# Load the raw recording for the selected task. The path alone is not enough:
# every later cell indexes `df` by column, so it must be a DataFrame.
df = pd.read_csv(f'Datasets/Full_DS/{task_name}.csv')

# Single-row frame that collects one column per extracted feature.
output = pd.DataFrame()

In [None]:
# Wacom sitting
df['ClientX'] = (df['ClientX'] * (3840 / 2000))
df['ClientY'] = (df['ClientY'] * (2160 / 1200))

df['Pressure'] = df['Pressure'] * 8192

In [None]:
#Global Features Sitting
timeSpent = max(df['timestamp']) - min(df['timestamp'])

df['TiltX'] = df['TiltX'] + 90
df['TiltY'] = df['TiltY'] + 90

## Peaks & Valleys Functions

In [None]:
def find_peaks(series):
    """Return the positions of strict local maxima in ``series``.

    A position counts as a peak when its value is strictly greater than both
    neighbours, so the first and last positions are never reported and
    plateaus are ignored. ``series`` must support ``.iloc`` positional access.
    """
    upper = len(series) - 1
    return [
        pos
        for pos in range(1, upper)
        if series.iloc[pos - 1] < series.iloc[pos] > series.iloc[pos + 1]
    ]

def find_valleys(series):
    """Return the positions of strict local minima in ``series``.

    A position counts as a valley when its value is strictly smaller than both
    neighbours, so the first and last positions are never reported and
    plateaus are ignored. ``series`` must support ``.iloc`` positional access.
    """
    upper = len(series) - 1
    return [
        pos
        for pos in range(1, upper)
        if series.iloc[pos - 1] > series.iloc[pos] < series.iloc[pos + 1]
    ]

## Feature 1, 2

In [None]:
# Features 1 and 2: extent of the trace along each axis (max minus min).
x_coords = df['ClientX']
Width = max(x_coords) - min(x_coords)
output.at[0, 'Width'] = Width

y_coords = df['ClientY']
Height = max(y_coords) - min(y_coords)
output.at[0, 'Height'] = Height

## Feature 3

In [None]:
# Feature 3: total path length, i.e. the sum of straight-line distances
# between consecutive samples.
# Rows are looked up by index label, so this relies on df carrying the
# default 0..n-1 index.
euclidean_distances = [
    math.sqrt(
        (df['ClientX'][i] - df['ClientX'][i - 1]) ** 2
        + (df['ClientY'][i] - df['ClientY'][i - 1]) ** 2
    )
    for i in range(1, len(df))
]

Length = sum(euclidean_distances)
output.at[0, 'Length'] = Length

## Features 4, 5, 6

In [None]:
def calculate_velocities(exper):
    """Compute the speed between each pair of consecutive samples.

    Parameters
    ----------
    exper : DataFrame-like
        Must provide 'ClientX', 'ClientY' and 'timestamp' columns.

    Returns
    -------
    list
        One entry per consecutive pair (``len(exper) - 1`` entries): the
        Euclidean displacement divided by the elapsed time, or 0 when the
        elapsed time is not positive.
    """
    # Work on positional arrays so the result does not depend on the frame's
    # index labels (a filtered or re-indexed frame would otherwise raise
    # KeyError on label lookups such as exper['ClientX'][i]).
    xs = np.asarray(exper['ClientX'])
    ys = np.asarray(exper['ClientY'])
    ts = np.asarray(exper['timestamp'])

    velocities = []
    for i in range(1, len(xs)):
        dx = xs[i] - xs[i - 1]
        dy = ys[i] - ys[i - 1]
        displacement = math.sqrt((dx ** 2) + (dy ** 2))

        delta_time = ts[i] - ts[i - 1]

        if delta_time > 0:
            velocities.append(displacement / delta_time)
        else:
            # Duplicate or out-of-order timestamps: speed is undefined, use 0.
            velocities.append(0)

    return velocities

In [None]:
velocities = pd.Series(calculate_velocities(df))

# Feature 4: mean speed over all consecutive sample pairs.
Velocity = velocities.mean()

# Features 5 and 6: largest speed and smallest strictly positive speed, each
# multiplied by 3.3.
# NOTE(review): the meaning of the 3.3 factor is not evident here — confirm.
positive_velocities = [vel for vel in velocities if vel > 0]
P_max_V = max(velocities) * 3.3
P_min_V = min(positive_velocities) * 3.3

output.at[0, 'Velocity'] = Velocity

output.at[0, 'P_max_V'] = P_max_V
output.at[0, 'P_min_V'] = P_min_V

## Features 7, 8, 9, 10

In [None]:
def calculate_accelerations(exper):
    """Compute the acceleration magnitude between consecutive samples.

    Parameters
    ----------
    exper : DataFrame-like
        Must provide 'ClientX', 'ClientY' and 'timestamp' columns.

    Returns
    -------
    list
        One entry per consecutive pair (``len(exper) - 1`` entries): the
        absolute change in speed since the last valid step divided by the
        elapsed time, or 0 when the elapsed time is not positive. The speed
        before the first step is taken as 0.
    """
    # Work on positional arrays so the result does not depend on the frame's
    # index labels (label lookups such as exper['ClientX'][i] raise KeyError
    # on a filtered or re-indexed frame).
    xs = np.asarray(exper['ClientX'])
    ys = np.asarray(exper['ClientY'])
    ts = np.asarray(exper['timestamp'])

    accelerations = []
    previous_velocity = 0

    for i in range(1, len(xs)):
        delta_time = ts[i] - ts[i - 1]

        if delta_time > 0:
            dx = xs[i] - xs[i - 1]
            dy = ys[i] - ys[i - 1]
            velocity = math.sqrt((dx ** 2) + (dy ** 2)) / delta_time

            accelerations.append(abs(velocity - previous_velocity) / delta_time)
            previous_velocity = velocity
        else:
            # Non-positive time step: record 0 and keep the last valid speed
            # as the reference for the next step.
            accelerations.append(0)

    return accelerations

In [None]:
accelerations = pd.Series(calculate_accelerations(df))

# Features 7 and 8: largest acceleration and smallest strictly positive
# acceleration, each multiplied by 3.3.
# NOTE(review): the meaning of the 3.3 factor is not evident here — confirm.
positive_accelerations = [abs(acc) for acc in accelerations if acc > 0]
P_max_A = max(accelerations) * 3.3
P_min_A = min(positive_accelerations) * 3.3

output.at[0, 'P_max_A'] = P_max_A
output.at[0, 'P_min_A'] = P_min_A

In [None]:
# Features 9 and 10: mean of each (shifted) tilt column.
GA_mean_H = df['TiltX'].mean()
output.at[0, 'GA_mean_H'] = GA_mean_H

GA_mean_V = df['TiltY'].mean()
output.at[0, 'GA_mean_V'] = GA_mean_V

## Features 11, 12, 13, 14

In [None]:
tilt_x = df['TiltX']
tilt_y = df['TiltY']
pressure = df['Pressure']

# Feature 11: standard deviation of TiltX
GA_SD_H = tilt_x.std()
output.at[0, 'GA_SD_H'] = GA_SD_H

# Feature 12: standard deviation of TiltY
GA_SD_V = tilt_y.std()
output.at[0, 'GA_SD_V'] = GA_SD_V

# Feature 13: mean pressure
PressureMean = pressure.mean()
output.at[0, 'PressureMean'] = PressureMean

# Feature 14: sample standard deviation of pressure
PressureSD = stat.stdev(pressure)
output.at[0, 'PressureSD'] = PressureSD

## Features 15, 16, 17, 18, 19, 20

In [None]:
df['Pressure_diff'] = df['Pressure'].diff()
df['Time_diff'] = df['timestamp'].diff()

# Pressure change per unit time for every sample. Rows with no elapsed time
# (and the first row, which has no predecessor) are left as NaN.
# The rate is kept for falling pressure as well as rising pressure; filtering
# on Pressure_diff > 0 here would leave Negative_change_rate permanently 0.
pressure_rate = df['Pressure_diff'] / df['Time_diff']
df['Pressure_change_rate'] = pressure_rate.where(df['Time_diff'] != 0)

# Split the rate by sign; rows that do not match the sign (or are NaN) get 0.
df['Positive_change_rate'] = df['Pressure_change_rate'].where(
    df['Pressure_change_rate'] > 0, 0
)
df['Negative_change_rate'] = df['Pressure_change_rate'].where(
    df['Pressure_change_rate'] < 0, 0
)

positive_changes = df['Positive_change_rate'].dropna()
negative_changes = df['Negative_change_rate'].dropna()

In [None]:
# Feature 15: mean of positive changes
PCAvgPos = stat.mean(positive_changes)
output.at[0, 'PCAvgPos'] = PCAvgPos

# Feature 16: standard deviation of positive changes
PCSDPos = stat.stdev(positive_changes)
output.at[0, 'PCSDPos'] = PCSDPos

# Feature 17: largest positive change
PCMax = max(positive_changes)
output.at[0, 'PCMax'] = PCMax

In [None]:
# Negative pressure change rates, built the same way as the positive ones:
# the per-sample rate where pressure fell over a non-zero time step, and 0
# for every other row. Computed directly from the diff columns so these
# features no longer reuse the positive-change series.
pressure_rate = df['Pressure_diff'] / df['Time_diff']
is_falling = (pressure_rate < 0) & (df['Time_diff'] != 0)
negative_changes = pressure_rate.where(is_falling, 0)

PCAvgNeg = stat.mean(negative_changes)  # Feature 18: Mean of negative changes
PCSDNeg = stat.stdev(negative_changes)  # Feature 19: Standard deviation of negative changes
PCMin = min(negative_changes)  # Feature 20: Most negative change

output.at[0, 'PCAvgNeg'] = PCAvgNeg
output.at[0, 'PCSDNeg'] = PCSDNeg
output.at[0, 'PCMin'] = PCMin

## Feature 21

In [None]:
def calculate_angle(x1, y1, x2, y2, x3, y3):
    """Return the angle in degrees at vertex (x2, y2).

    The angle is measured between the vectors from (x2, y2) to (x1, y1) and
    from (x2, y2) to (x3, y3). Returns 0 when either vector has zero length.
    """
    to_first = np.array([x1 - x2, y1 - y2])
    to_third = np.array([x3 - x2, y3 - y2])

    norm_product = np.linalg.norm(to_first) * np.linalg.norm(to_third)
    if norm_product == 0:
        # Degenerate case: at least two of the points coincide.
        return 0

    cos_theta = np.dot(to_first, to_third) / norm_product
    # Clip so rounding error cannot push the cosine outside arccos' domain.
    return np.degrees(np.arccos(np.clip(cos_theta, -1.0, 1.0)))

In [None]:
def compute_error_feature(row1, row2, row3):
    """Flag whether three consecutive samples form an angle wider than 90 degrees.

    The (TiltX, TiltY) values of the three rows are treated as points, and the
    angle is measured at the middle row.

    Returns a tuple ``(is_error, timestamp)``: ``is_error`` is True when the
    angle exceeds 90 degrees, and ``timestamp`` is the middle row's timestamp.
    """
    angle_at_middle = calculate_angle(
        row1['TiltX'], row1['TiltY'],
        row2['TiltX'], row2['TiltY'],
        row3['TiltX'], row3['TiltY'],
    )

    right_angle = 90
    deviation = right_angle - angle_at_middle
    is_error = deviation < 0

    return is_error, row2['timestamp']

In [None]:
# Feature 21: number of non-overlapping sample triples flagged as an error.
Error = 0

for start in range(0, len(df) - 2, 3):
    first, middle, last = (df.iloc[start + offset] for offset in range(3))
    flagged, _ = compute_error_feature(first, middle, last)
    Error += int(flagged)

output.at[0, 'Error'] = Error

## Feature 22

In [None]:
# Append a trailing 0 so the last real sample has a right-hand neighbour and
# can be examined by find_valleys.
# NOTE(review): the resulting feature is named PeakpresMean but is built from
# valleys, not peaks — confirm which one is intended.
press = pd.concat([df['Pressure'], pd.Series([0])], ignore_index=True)

valleys = find_valleys(press)
Pressure_valleys = df['Pressure'].iloc[valleys]

In [None]:
# Feature 22: mean pressure at the detected valleys, or 0 when there are none.
PeakpresMean = 0 if Pressure_valleys.empty else Pressure_valleys.mean()

output.at[0, 'PeakpresMean'] = PeakpresMean

## Feature 23

In [None]:
# Collect the middle-row timestamp of every non-overlapping triple that
# compute_error_feature flags.
error_timestamps = []

for start in range(0, len(df) - 2, 3):
    first, middle, last = (df.iloc[start + offset] for offset in range(3))
    flagged, stamp = compute_error_feature(first, middle, last)

    if flagged:
        error_timestamps.append(stamp)

# Feature 23: mean offset of the flagged timestamps from the earliest flagged
# one, or 0 when nothing was flagged.
if not error_timestamps:
    output.at[0, 'ErrorStopTime'] = 0
else:
    base_timestamp = min(error_timestamps)
    error_timestamps_ms = [t - base_timestamp for t in error_timestamps]
    mean_timestamp = np.mean(error_timestamps_ms)
    output.at[0, 'ErrorStopTime'] = mean_timestamp

## Feature 24

In [None]:
# Direction of the (TiltX, TiltY) vector in degrees.
df['TiltAngle'] = np.degrees(np.arctan2(df['TiltY'], df['TiltX']))

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form warns on recent pandas and
# leaves df unchanged once Copy-on-Write is enabled.
df['TiltAngle'] = df['TiltAngle'].fillna(0)

anglepeaks = find_peaks(df['TiltAngle'])
anglevalleys = find_valleys(df['TiltAngle'])

# Mean angle at the local extrema; 0 when no extremum of that kind exists.
mean_angle_at_peaks = df['TiltAngle'].iloc[anglepeaks].mean() if anglepeaks else 0
mean_angle_at_valleys = df['TiltAngle'].iloc[anglevalleys].mean() if anglevalleys else 0

# Feature 24: average of the peak mean and the valley mean.
AngleMean = np.mean([mean_angle_at_peaks, mean_angle_at_valleys])

output.at[0, 'AngleMean'] = AngleMean

## Feature 25

In [None]:
# Feature 25: variance of the tilt angle. This must read the TiltAngle column
# computed for Feature 24; taking the variance of Pressure would not be an
# angle feature.
AngleVar = df['TiltAngle'].var()
output.at[0, 'AngleVar'] = AngleVar

## Features 26, 27

In [None]:
# Find the pair of raw columns with the strongest absolute correlation.
raw_columns = ['ClientX', 'ClientY', 'TiltX', 'TiltY', 'Pressure', 'timestamp']
raw_data = df.filter(raw_columns)
correlation_matrix = raw_data.corr()

sorted_correlations = correlation_matrix.abs().unstack().sort_values(ascending=False)
# Entries equal to 1 (each column against itself) are dropped by the `< 1`
# filter before the top pair is taken.
below_one = sorted_correlations[sorted_correlations < 1]
top_pairs = below_one.head(1).index

col1, col2 = top_pairs[0]
x = np.array(df[col1])
y = np.array(df[col2])

# Features 26 and 27: slope and intercept of the least-squares line of
# col2 against col1.
ReglineSlope, ReglineIntercept = np.polyfit(x, y, 1)

output.at[0, 'ReglineSlope'] = ReglineSlope
output.at[0, 'ReglineIntercept'] = ReglineIntercept

## Feature 28

In [None]:
# Local extrema of the speed series.
Vpeaks = find_peaks(velocities)
Vvalleys = find_valleys(velocities)

velocities_at_peaks = velocities.iloc[Vpeaks]
velocities_at_valleys = velocities.iloc[Vvalleys]

# Feature 28: total time divided by the number of speed peaks; falls back to
# the total time itself when there are no peaks or no elapsed time.
peak_count = len(velocities_at_peaks)
LoopCount = timeSpent if (peak_count == 0 or timeSpent == 0) else timeSpent / peak_count

output.at[0, 'LoopCount'] = LoopCount

## Feature 29

In [None]:
# Mean speed at the speed peaks and at the speed valleys; 0 when a set is empty.
mean_Velocity_at_peaks = 0 if velocities_at_peaks.empty else velocities_at_peaks.mean()
mean_Velocity_at_valleys = 0 if velocities_at_valleys.empty else velocities_at_valleys.mean()

# Feature 29: average of the two means above.
AngleSpeed = np.mean([mean_Velocity_at_peaks, mean_Velocity_at_valleys])

output.at[0, 'AngleSpeed'] = AngleSpeed

## Feature 30

In [None]:
# Feature 30: flagged-triple count per speed peak; 0 when either count is 0.
speed_peak_count = len(velocities_at_peaks)
ErrorRate = abs(Error / speed_peak_count) if (Error != 0 and speed_peak_count != 0) else 0

output.at[0, 'ErrorRate'] = ErrorRate

## Extra Features

Features 31 to 33

In [None]:
# Feature 31: mean of the non-zero StrokeDuration values.
has_duration = df['StrokeDuration'] != 0
StrokeDurationArray = df[has_duration]

Stroke_Duration_mean = stat.mean(StrokeDurationArray['StrokeDuration'])
output.at[0, 'Stroke_Duration_mean'] = Stroke_Duration_mean

In [None]:
# Feature 32: sum of the non-zero stroke durations.
On_Paper_Time = Stroke_Durations = sum(StrokeDurationArray['StrokeDuration'])
output.at[0, 'On_Paper_Time'] = On_Paper_Time

In [None]:
# Feature 33: the part of the total recording time not covered by strokes.
In_Air_Time = timeSpent - On_Paper_Time
output.at[0, 'In_Air_Time'] = In_Air_Time