# Cancellation Task Feature Extraction
This notebook extracts the following features for each participant and condition:
1. **Omissions** (normalized error per quadrant)
2. **Subjective epicenter** (mean x, y of all marks)
3. **First mark** (x, y of the first mark)
4. **Directional shifts** (sum of horizontal & vertical shifts)
5. **Time shifts** (average time between marks by direction)
6. **Smooth index** (deviation from optimized pathway)


In [102]:
import pandas as pd
import numpy as np
import os
from math import asin, sqrt, pi

In [103]:
# Load data
clicks = pd.read_csv(r'D:\01_Academic\Disseration\Post_EXP\Preprocessing\Star_cancellation\Cancellation_Low-Level_Feature_Extraction\cleaned_cancellation_clicks')
results = pd.read_csv(r'D:\01_Academic\Disseration\Post_EXP\Preprocessing\Star_cancellation\Cancellation_Low-Level_Feature_Extraction\cleaned_cancellation_results')
clicks.head(), results.head()

(   participant_id  participant_group  round_index  click_time  click_x  \
 0               1                  1            1       1.765    220.0   
 1               1                  1            2      29.529    826.0   
 2               1                  1            3       2.230    468.0   
 3               1                  1            3       3.022    346.0   
 4               1                  1            3       4.422    133.0   
 
    click_y  click_quadrant  was_target  target_quadrant  
 0    134.0               1           1                1  
 1    550.0               3           1                3  
 2     68.0               1           1                1  
 3    316.0               1           1                1  
 4    274.0               1           1                1  ,
    participant_id  participant_group  round_index  time_used  targets_total  \
 0               1                  1            1       30.0            112   
 1               1               

## 1. Compute Omissions per Quadrant
Normalize error percentage by arcsin(2 * sqrt(err%)) for each of the four quadrants.

In [104]:
om_list = []
for q in [1,2,3,4]:
    hits   = results[f"q{q}_hits"]
    misses = results[f"q{q}_misses"]
    total  = hits + misses

    # avoid division by zero
    err_pct = np.where(total>0, misses/total*100, np.nan)
    norm_err = np.arcsin(2 * np.sqrt(err_pct/100))

    dfq = pd.DataFrame({
        'participant_id': results['participant_id'],
        'condition':      results['round_index'],
        'quadrant':       f"Q{q}",
        'err_pct':        err_pct,
        'norm_err':       norm_err
    })
    om_list.append(dfq)

omissions = pd.concat(om_list, ignore_index=True)
omissions.head()

  norm_err = np.arcsin(2 * np.sqrt(err_pct/100))
  norm_err = np.arcsin(2 * np.sqrt(err_pct/100))
  norm_err = np.arcsin(2 * np.sqrt(err_pct/100))
  norm_err = np.arcsin(2 * np.sqrt(err_pct/100))


Unnamed: 0,participant_id,condition,quadrant,err_pct,norm_err
0,1,1,Q1,16.666667,0.955317
1,1,2,Q1,0.0,0.0
2,1,3,Q1,0.0,0.0
3,2,1,Q1,87.5,
4,2,2,Q1,61.111111,


In [105]:
# Assuming you already have:
# omissions with columns ['participant_id','condition','quadrant','err_pct','norm_err']

# 1) Show the error% per quadrant, per participant & condition
display( 
    omissions[['participant_id','condition','quadrant','err_pct']]
    .sort_values(['participant_id','condition','quadrant'])
    .head(12)  # just to preview a few rows
)

# 2) Pivot to a matrix: participants down, quadrants across, within each condition
err_by_quad = (
    omissions
    .pivot_table(
        index=['participant_id','condition'],
        columns='quadrant',
        values='err_pct'
    )
    .reset_index()
)

# Preview
err_by_quad.head()

# 3) (Optionally) get the group‐level mean error% by quadrant for each condition
mean_err = (
    omissions
    .groupby(['condition','quadrant'])['err_pct']
    .mean()
    .unstack()
)
mean_err


Unnamed: 0,participant_id,condition,quadrant,err_pct
0,1,1,Q1,16.666667
51,1,1,Q2,29.166667
102,1,1,Q3,81.578947
153,1,1,Q4,100.0
1,1,2,Q1,0.0
52,1,2,Q2,100.0
103,1,2,Q3,0.0
154,1,2,Q4,100.0
2,1,3,Q1,0.0
53,1,3,Q2,100.0


quadrant,Q1,Q2,Q3,Q4
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,50.635813,82.343137,49.787549,87.967914
2,72.575163,61.819147,61.709173,58.257919
3,49.089815,83.981128,46.27865,69.594628


In [106]:
# === Pivot omissions → one column per quadrant (fixed) ===
err_wide = (
    omissions
    .pivot_table(
        index=['participant_id','condition'],
        columns='quadrant',
        values='err_pct'
    )
    .reset_index()
    # only prefix the actual quadrant columns (Q1–Q4)
    .rename(columns={
        'Q1': 'err_pct_Q1',
        'Q2': 'err_pct_Q2',
        'Q3': 'err_pct_Q3',
        'Q4': 'err_pct_Q4'
    })
)

# Quick sanity check
print(err_wide.columns.tolist())
# ['participant_id', 'condition', 'err_pct_Q1', 'err_pct_Q2', 'err_pct_Q3', 'err_pct_Q4']

['participant_id', 'condition', 'err_pct_Q1', 'err_pct_Q2', 'err_pct_Q3', 'err_pct_Q4']


## 2. Subjective Epicenter
Average (x, y) coordinates of all marks per participant & condition.

In [107]:
clicks = clicks.rename(columns={
    'click_time':'timestamp',
    'click_x':'x',
    'click_y':'y'
})

In [108]:
epicenter = (
    clicks
    .groupby(['participant_id','round_index'])
    .agg(epicenter_x=('x','mean'),
         epicenter_y=('y','mean'))
    .reset_index()
)

## 3. First Mark Coordinates
Extract the (x, y) of the very first mark (by timestamp) in each participant & condition.

In [109]:
first_mark = (
    clicks
    .sort_values('timestamp')
    .groupby(['participant_id','round_index'])
    .first()
    .reset_index()[['participant_id','round_index','x','y']]
    .rename(columns={'x':'first_x','y':'first_y',
                     'round_index':'condition'})
)


## 4. Directional Shifts
Compute signed shifts between consecutive marks along x (horizontal) and y (vertical).

In [110]:
def compute_shifts(arr):
    diffs = np.diff(arr, prepend=arr[0])
    signs = np.where(diffs>0.3,  1,
             np.where(diffs< -0.3, -1, 0))
    return signs.sum()

shifts = (
    clicks
    .sort_values(['participant_id','round_index','timestamp'])
    .groupby(['participant_id','round_index'])
    .apply(lambda g: pd.Series({
        'hor_shifts': compute_shifts(g['x'].values),
        'ver_shifts': compute_shifts(g['y'].values)
    }))
    .reset_index()
    .rename(columns={'round_index':'condition'})
)

## 5. Time Shifts
Compute average time interval between consecutive marks per direction type.

In [111]:
clicks['dt'] = clicks.groupby(['participant_id','round_index'])['timestamp'].diff()
clicks['h_dir'] = np.where(clicks.groupby(['participant_id','round_index'])['x'].diff()>0,  1,
                  np.where(clicks.groupby(['participant_id','round_index'])['x'].diff()<0, -1, np.nan))
time_shifts = (
    clicks
    .dropna(subset=['dt','h_dir'])
    .groupby(['participant_id','round_index','h_dir'])['dt']
    .mean()
    .unstack(fill_value=np.nan)
    .reset_index()
    .rename(columns={1:'mean_dt_left_to_right', -1:'mean_dt_right_to_left',
                     'round_index':'condition'})
)

## 6. Smooth Index
Compute the standard deviation of 3-point sliding windows on x and y, then average.

In [112]:
def smooth_index(vals):
    if len(vals)<3:
        return np.nan
    stds = []
    for i in range(len(vals)-2):
        stds.append(np.std(vals[i:i+3]))
    return np.mean(stds)

smooth = (
    clicks
    .sort_values(['participant_id','round_index','timestamp'])
    .groupby(['participant_id','round_index'])[['x','y']]
    .apply(lambda df: np.mean([smooth_index(df['x'].values),
                               smooth_index(df['y'].values)]))
    .reset_index()
    .rename(columns={0:'smooth_index','round_index':'condition'})
)


## Combine All Features
Merge all computed feature tables into a single summary.

In [113]:
dfs = [
    err_wide,
    epicenter.rename(columns={'round_index':'condition'}),
    first_mark,
    shifts.rename(columns={'round_index':'condition'}),
    time_shifts.rename(columns={'round_index':'condition'}),
    smooth.rename(columns={'round_index':'condition'})
]

feature_summary = dfs[0]
for df_ in dfs[1:]:
    feature_summary = feature_summary.merge(
        df_, on=['participant_id','condition'], how='left'
    )

feature_summary.head()

Unnamed: 0,participant_id,condition,err_pct_Q1,err_pct_Q2,err_pct_Q3,err_pct_Q4,epicenter_x,epicenter_y,first_x,first_y,hor_shifts,ver_shifts,mean_dt_right_to_left,mean_dt_left_to_right,smooth_index
0,1,1,16.666667,29.166667,81.578947,100.0,901.363636,324.681818,220.0,134.0,6,3,-0.988522,2.4941,67.107726
1,1,2,0.0,100.0,0.0,100.0,467.98,558.0,227.0,219.0,3,6,-1.122346,0.198652,56.271532
2,1,3,0.0,100.0,0.0,87.096774,585.45098,580.764706,468.0,68.0,-3,12,-0.36732,0.430917,57.099812
3,2,1,87.5,96.0,21.212121,100.0,644.677419,746.225806,852.0,600.0,-4,8,1.872294,-0.974462,52.573895
4,2,2,61.111111,57.142857,84.615385,67.5,1081.444444,467.833333,771.0,609.0,5,-1,-1.54919,0.5205,49.998429


In [114]:
# Save summary
outpath = r'D:\01_Academic\Disseration\Post_EXP\Preprocessing\Star_cancellation\Cancellation_High_level_Feature_Extraction\Cancellation_High-level_Features'
feature_summary.to_csv(outpath, index=False)
print(f'Saved feature summary to {outpath}')

Saved feature summary to D:\01_Academic\Disseration\Post_EXP\Preprocessing\Star_cancellation\Cancellation_High_level_Feature_Extraction\Cancellation_High-level_Features
