# Data preprocessing

In [8]:
# import libraries
from scripts.utils import *

## 1. Clean raw data
- remove unnecessary header rows and columns
- remove incomplete samples and excluded participants (no consent, medical professionals, younger than 16)
- convert to correct data types
- save into new file

In [9]:
# Load the raw Qualtrics export and preview its first rows.
# The preview shows two extra header rows (question labels and ImportId metadata)
# at index 0 and 1; they are dropped in the cleaning cell below.
data_raw = pd.read_csv("../data/raw_data_qualtrics.csv")
data_raw.head()

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,TiA_Pro2,TiA_RC4,TiA_T2,TiA_RC5,TiA_UP4,TiA_F2,TiA_Pro3,TiA_RC6,id,stimulus_group
0,Start Date,End Date,Response Type,Progress,Duration (in seconds),Finished,Recorded Date,Response ID,Distribution Channel,User Language,...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,id,stimulus_group
1,"{""ImportId"":""startDate"",""timeZone"":""Europe/Ber...","{""ImportId"":""endDate"",""timeZone"":""Europe/Berlin""}","{""ImportId"":""status""}","{""ImportId"":""progress""}","{""ImportId"":""duration""}","{""ImportId"":""finished""}","{""ImportId"":""recordedDate"",""timeZone"":""Europe/...","{""ImportId"":""_recordId""}","{""ImportId"":""distributionChannel""}","{""ImportId"":""userLanguage""}",...,"{""ImportId"":""QID7_9""}","{""ImportId"":""QID7_10""}","{""ImportId"":""QID7_11""}","{""ImportId"":""QID7_12""}","{""ImportId"":""QID7_13""}","{""ImportId"":""QID7_14""}","{""ImportId"":""QID7_15""}","{""ImportId"":""QID7_16""}","{""ImportId"":""id""}","{""ImportId"":""stimulus_group""}"
2,2025-05-09 16:24:38,2025-05-09 16:28:40,0,100,241,1,2025-05-09 16:28:40,R_2wcQZNoEgb828YN,anonymous,NL,...,2,2,4,5,4,1,2,3,,uncertainty
3,2025-05-09 16:35:42,2025-05-09 16:43:56,0,100,493,1,2025-05-09 16:43:56,R_2xVbwQvHkSdNB3X,anonymous,EN-GB,...,4,4,4,5,4,1,2,2,,uncertainty
4,2025-05-09 16:40:41,2025-05-09 16:46:32,0,100,351,1,2025-05-09 16:46:32,R_8IgEAe8wOZIeQih,anonymous,DE,...,3,4,2,4,5,2,3,2,,control


In [10]:
# Drop the two extra header rows (index 0 and 1) together with the
# metadata and click-timer columns that are not needed downstream
unused_columns = [
    'StartDate', 'EndDate', 'Status', 'Progress', 'Duration (in seconds)',
    'RecordedDate', 'DistributionChannel', 'id',
    'delay_timer_First Click', 'delay_timer_Last Click', 'delay_timer_Click Count',
    'delay_timer_First Click.1', 'delay_timer_Last Click.1', 'delay_timer_Click Count.1',
]
data = data_raw.drop(index=[0, 1], columns=unused_columns)
print(f'{len(data.index)} samples in raw data.')

# Exclusion 1: unfinished responses ('Finished' is compared against the string '1')
is_finished = data['Finished'] == '1'
data = data[is_finished]
print(f'{len(data.index)} samples after removing incomplete responses.')

# Exclusion 2: participants who did not give consent
consent_value = get_value_for_label('consent', 'I agree to participate in the study')
data = data[data['consent'] == consent_value]
print(f'{len(data.index)} samples after removing no consent cases.')

# Exclusion 3: medical professionals
non_professional_value = get_value_for_label('medical_prof', 'No')
data = data[data['medical_prof'] == non_professional_value]
print(f'{len(data.index)} samples after removing medical professionals.')

# Exclusion 4: participants younger than 16 (age is parsed to a number for the comparison)
age_years = pd.to_numeric(data['age'])
data = data[age_years >= 16]
print(f'{len(data.index)} samples after removing underage participants.')

# The filter columns are no longer needed; index the frame by ResponseId
data = data.drop(columns=['consent', 'medical_prof', 'Finished']).set_index('ResponseId')

# Display the cleaned frame
data

347 samples in raw data.
282 samples after removing incomplete responses.
281 samples after removing no consent cases.
255 samples after removing medical professionals.
255 samples after removing underage participants.


Unnamed: 0_level_0,UserLanguage,gender,age,education,Q19,ATI_1,ATI_2,ATI_3,ATI_4,ATI_5,...,TiA_UP3,TiA_Pro2,TiA_RC4,TiA_T2,TiA_RC5,TiA_UP4,TiA_F2,TiA_Pro3,TiA_RC6,stimulus_group
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R_2wcQZNoEgb828YN,NL,2,24,6,4,3,4,2,4,5,...,4,2,2,4,5,4,1,2,3,uncertainty
R_2xVbwQvHkSdNB3X,EN-GB,1,24,6,5,5,5,2,4,5,...,4,4,4,4,5,4,1,2,2,uncertainty
R_8IgEAe8wOZIeQih,DE,2,22,6,3,1,3,2,2,2,...,3,3,4,2,4,5,2,3,2,control
R_2xVkPWT9CLOsV9Z,NL,2,22,7,4,4,3,4,3,3,...,4,3,5,2,5,4,1,3,4,control
R_2DNNnCk5wVZRMMi,NL,1,46,7,5,5,5,2,3,5,...,4,4,5,5,5,3,1,4,4,uncertainty
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R_8v3b0I5Vzhx1y83,NL,1,19,3,3,1,4,1,1,3,...,3,3,2,4,5,5,1,3,3,uncertainty
R_231qLn8Gu6n4zTN,EN-GB,2,26,6,3,2,1,4,3,2,...,4,3,4,4,3,3,3,3,3,control
R_2h0aWSGxqhp84Em,EN-GB,2,19,6,3,3,3,3,3,3,...,3,3,3,2,4,3,2,2,3,uncertainty
R_8bVEMRVGPdWr17K,NL,2,23,6,4,2,4,2,2,2,...,4,2,1,3,5,4,1,2,2,control


In [11]:
# Merge the two "delay_timer_Page Submit" columns (only one of two is filled in per group):
# missing entries are replaced by 0 before summing the two timers
timer_columns = ['delay_timer_Page Submit', 'delay_timer_Page Submit.1']
first_timer, second_timer = [pd.to_numeric(data[name].fillna(0)) for name in timer_columns]

# Replace the two source columns by a single float column `page_submit`
data = data.drop(columns=timer_columns).assign(
    page_submit=(first_timer + second_timer).astype(float)
)

data

Unnamed: 0_level_0,UserLanguage,gender,age,education,Q19,ATI_1,ATI_2,ATI_3,ATI_4,ATI_5,...,TiA_Pro2,TiA_RC4,TiA_T2,TiA_RC5,TiA_UP4,TiA_F2,TiA_Pro3,TiA_RC6,stimulus_group,page_submit
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R_2wcQZNoEgb828YN,NL,2,24,6,4,3,4,2,4,5,...,2,2,4,5,4,1,2,3,uncertainty,41.777
R_2xVbwQvHkSdNB3X,EN-GB,1,24,6,5,5,5,2,4,5,...,4,4,4,5,4,1,2,2,uncertainty,68.266
R_8IgEAe8wOZIeQih,DE,2,22,6,3,1,3,2,2,2,...,3,4,2,4,5,2,3,2,control,40.024
R_2xVkPWT9CLOsV9Z,NL,2,22,7,4,4,3,4,3,3,...,3,5,2,5,4,1,3,4,control,32.696
R_2DNNnCk5wVZRMMi,NL,1,46,7,5,5,5,2,3,5,...,4,5,5,5,3,1,4,4,uncertainty,82.790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R_8v3b0I5Vzhx1y83,NL,1,19,3,3,1,4,1,1,3,...,3,2,4,5,5,1,3,3,uncertainty,46.038
R_231qLn8Gu6n4zTN,EN-GB,2,26,6,3,2,1,4,3,2,...,3,4,4,3,3,3,3,3,control,184.260
R_2h0aWSGxqhp84Em,EN-GB,2,19,6,3,3,3,3,3,3,...,3,3,2,4,3,2,2,3,uncertainty,31.266
R_8bVEMRVGPdWr17K,NL,2,23,6,4,2,4,2,2,2,...,2,1,3,5,4,1,2,2,control,31.400


In [12]:
# Explicit dtype conversion, declared once as a column -> dtype mapping

# Text columns: Qualtrics language code and the free-text manipulation check
text_columns = ['UserLanguage', 'manip_check2']

# Integer columns: demographics, AI-experience question and all scale items (likert 5)
demographic_columns = ['gender', 'age', 'education']
ai_experience_columns = ['Q19']
ati_columns = [f'ATI_{i}' for i in range(1, 10)]
hcsds_items = ['HCSDS_C1', 'HCSDS_V1', 'HCSDS_C2', 'HCSDS_C3', 'HCSDS_V2',
               'HCSDS_C4', 'HCSDS_V3', 'HCSDS_V4', 'HCSDS_V5']
manip_check_columns = [f'manip_check1_{i}' for i in range(1, 5)]
tia_items = ['TiA_RC1', 'TiA_F1', 'TiA_Pro1', 'TiA_RC2', 'TiA_UP2', 'TiA_T1',
             'TiA_RC3', 'TiA_UP3', 'TiA_Pro2', 'TiA_RC4', 'TiA_T2', 'TiA_RC5',
             'TiA_UP4', 'TiA_F2', 'TiA_Pro3', 'TiA_RC6']
integer_columns = (demographic_columns + ai_experience_columns + ati_columns
                   + hcsds_items + manip_check_columns + tia_items)

column_dtypes = {name: str for name in text_columns}
column_dtypes.update({name: int for name in integer_columns})
data = data.astype(column_dtypes)

# Experimental condition: swap the string labels for their coded values, then store as int
condition_codes = {
    label: get_value_for_label('stimulus_group', label)
    for label in ('control', 'uncertainty')
}
data['stimulus_group'] = data['stimulus_group'].replace(condition_codes).astype(int)

data

Unnamed: 0_level_0,UserLanguage,gender,age,education,Q19,ATI_1,ATI_2,ATI_3,ATI_4,ATI_5,...,TiA_Pro2,TiA_RC4,TiA_T2,TiA_RC5,TiA_UP4,TiA_F2,TiA_Pro3,TiA_RC6,stimulus_group,page_submit
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R_2wcQZNoEgb828YN,NL,2,24,6,4,3,4,2,4,5,...,2,2,4,5,4,1,2,3,1,41.777
R_2xVbwQvHkSdNB3X,EN-GB,1,24,6,5,5,5,2,4,5,...,4,4,4,5,4,1,2,2,1,68.266
R_8IgEAe8wOZIeQih,DE,2,22,6,3,1,3,2,2,2,...,3,4,2,4,5,2,3,2,0,40.024
R_2xVkPWT9CLOsV9Z,NL,2,22,7,4,4,3,4,3,3,...,3,5,2,5,4,1,3,4,0,32.696
R_2DNNnCk5wVZRMMi,NL,1,46,7,5,5,5,2,3,5,...,4,5,5,5,3,1,4,4,1,82.790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R_8v3b0I5Vzhx1y83,NL,1,19,3,3,1,4,1,1,3,...,3,2,4,5,5,1,3,3,1,46.038
R_231qLn8Gu6n4zTN,EN-GB,2,26,6,3,2,1,4,3,2,...,3,4,4,3,3,3,3,3,0,184.260
R_2h0aWSGxqhp84Em,EN-GB,2,19,6,3,3,3,3,3,3,...,3,3,2,4,3,2,2,3,1,31.266
R_8bVEMRVGPdWr17K,NL,2,23,6,4,2,4,2,2,2,...,2,1,3,5,4,1,2,2,0,31.400


In [13]:
# Save the cleaned, type-converted item-level data; the ResponseId index is written as well
data.to_csv('../data/data_clean.csv', header=True, index=True)

## 2. Compute Scales
Compute one score per participant for each (sub)scale: the mean of the scale's items, after reverse-coding inverted items.

### Affinity for Technology Interaction (ATI) scale
Franke et al., 2019

Single scale.

### Revised Health Care System Distrust Scale
Shea et al., 2008

> Interpretation inverted to serve as scale for *trust* instead of distrust!

2 subscales:
- c = competence
- v = values

### Trust in Automation scale (adapted)
Körber, 2019

5 subscales:
- rc = reliability/confidence (*capability-based trust*)
- up = understanding/predictability (*shared mental model*)
- f = familiarity (*familiarity*)
- pro = propensity to trust (*faith in technology*)
- t = trust in automation (*general trust*)

In [14]:
# Question metadata: one row per item, with its subscale and an inversion flag
questions = pd.read_csv("../data/questions.csv")

# Only items with a non-empty subscale belong to a scale
has_subscale = questions['subscale'].notna() & (questions['subscale'] != '')
scale_items = questions[has_subscale]

# One group of items per subscale
scale_groups = scale_items.groupby('subscale')

# Report the items of every subscale and which of them are inverted
print("Scale composition:")
for scale_name, members in scale_groups:
    member_names = members['item'].tolist()
    print(f"\n{scale_name}: {len(member_names)} items")
    print(f"  Items: {', '.join(member_names)}")
    inverted_names = members.loc[members['inverted'] == 1, 'item'].tolist()
    if inverted_names:
        print(f"  Inverted: {', '.join(inverted_names)}")


Scale composition:

ati: 9 items
  Items: ATI_1, ATI_2, ATI_3, ATI_4, ATI_5, ATI_6, ATI_7, ATI_8, ATI_9
  Inverted: ATI_3, ATI_6, ATI_8

hcsdc_v: 5 items
  Items: HCSDS_V1, HCSDS_V2, HCSDS_V3, HCSDS_V4, HCSDS_V5
  Inverted: HCSDS_V1, HCSDS_V2, HCSDS_V4, HCSDS_V5

hcsds_c: 4 items
  Items: HCSDS_C1, HCSDS_C2, HCSDS_C3, HCSDS_C4
  Inverted: HCSDS_C3

tia_f: 2 items
  Items: TiA_F1, TiA_F2

tia_pro: 3 items
  Items: TiA_Pro1, TiA_Pro2, TiA_Pro3
  Inverted: TiA_Pro1

tia_rc: 6 items
  Items: TiA_RC1, TiA_RC2, TiA_RC3, TiA_RC4, TiA_RC5, TiA_RC6
  Inverted: TiA_RC3, TiA_RC5

tia_t: 2 items
  Items: TiA_T1, TiA_T2

tia_up: 3 items
  Items: TiA_UP2, TiA_UP3, TiA_UP4
  Inverted: TiA_UP2, TiA_UP4


In [15]:
# Function to compute scale scores with automatic inversion handling
def compute_scale_scores(data_df, questions_df, max_likert=5):
    """
    Add one mean-score column per subscale to a copy of the response data.

    Items are assigned to subscales via ``questions_df`` (columns 'item',
    'subscale', 'inverted'). Rows without a subscale (missing or empty string)
    are ignored, as are items that are not columns of the data. Items flagged
    with ``inverted == 1`` are reverse-coded as ``(max_likert + 1) - value``
    before averaging. A line is printed for every subscale, either reporting
    how many items were averaged or warning that none were found.

    Args:
        data_df: DataFrame with individual item responses (left unmodified)
        questions_df: DataFrame with question metadata (from questions.csv)
        max_likert: Maximum value on Likert scale (for inversion calculation)

    Returns:
        Copy of ``data_df`` with one additional column per scored subscale
    """
    scored = data_df.copy()

    # Keep only metadata rows that are assigned to a subscale
    has_subscale = questions_df['subscale'].notna() & (questions_df['subscale'] != '')

    for scale_name, members in questions_df[has_subscale].groupby('subscale'):
        # Collect the (possibly reverse-coded) responses of every item present in the data
        item_scores = [
            (max_likert + 1) - scored[item_name] if inverted_flag == 1 else scored[item_name]
            for item_name, inverted_flag in zip(members['item'], members['inverted'])
            if item_name in scored.columns
        ]

        if not item_scores:
            print(f"Warning: No items found for scale '{scale_name}'")
            continue

        # Row-wise mean over the collected items is the scale score
        scored[scale_name] = pd.concat(item_scores, axis=1).mean(axis=1)
        print(f"Computed scale '{scale_name}' from {len(item_scores)} items")

    return scored


# Score every subscale for the cleaned responses
data_with_scales = compute_scale_scores(data, questions, max_likert=5)

# Columns that exist only after scoring are the newly added scale scores
is_new_column = ~data_with_scales.columns.isin(data.columns)
scale_columns = data_with_scales.columns[is_new_column].tolist()
print(f"\nNewly computed scales: {scale_columns}")
data_with_scales[scale_columns]

Computed scale 'ati' from 9 items
Computed scale 'hcsdc_v' from 5 items
Computed scale 'hcsds_c' from 4 items
Computed scale 'tia_f' from 2 items
Computed scale 'tia_pro' from 3 items
Computed scale 'tia_rc' from 6 items
Computed scale 'tia_t' from 2 items
Computed scale 'tia_up' from 3 items

Newly computed scales: ['ati', 'hcsdc_v', 'hcsds_c', 'tia_f', 'tia_pro', 'tia_rc', 'tia_t', 'tia_up']


Unnamed: 0_level_0,ati,hcsdc_v,hcsds_c,tia_f,tia_pro,tia_rc,tia_t,tia_up
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
R_2wcQZNoEgb828YN,3.777778,2.4,3.25,2.5,2.000000,3.000000,4.0,3.000000
R_2xVbwQvHkSdNB3X,4.444444,2.8,4.00,1.0,2.333333,2.500000,3.0,2.666667
R_8IgEAe8wOZIeQih,2.000000,3.4,3.75,2.5,2.666667,2.666667,2.0,2.000000
R_2xVkPWT9CLOsV9Z,3.333333,2.2,3.50,3.0,2.333333,3.333333,3.0,2.666667
R_2DNNnCk5wVZRMMi,4.222222,3.8,5.00,2.5,3.333333,4.166667,5.0,4.000000
...,...,...,...,...,...,...,...,...
R_8v3b0I5Vzhx1y83,2.222222,3.2,4.50,1.0,2.333333,3.166667,4.0,3.000000
R_231qLn8Gu6n4zTN,2.111111,2.4,3.00,3.5,2.333333,3.666667,4.0,3.666667
R_2h0aWSGxqhp84Em,2.888889,2.8,3.00,2.0,2.333333,3.333333,2.5,3.000000
R_8bVEMRVGPdWr17K,2.666667,2.8,2.50,1.0,1.666667,1.833333,2.5,2.666667


In [16]:
# Build the final frame: scale scores plus every column that is not an individual scale item
# (demographics, experimental condition, page_submit and manipulation checks are retained,
# because manipulation checks have no subscale in the question metadata)

# Items assigned to a subscale are the ones replaced by their scale score
belongs_to_scale = questions['subscale'].notna() & (questions['subscale'] != '')
scale_items_to_drop = questions.loc[belongs_to_scale, 'item'].tolist()

# Everything else is retained, in its current column order
is_kept = ~data_with_scales.columns.isin(scale_items_to_drop)
columns_to_keep = data_with_scales.columns[is_kept].tolist()

# Independent copy holding only the retained columns
data_scales = data_with_scales.loc[:, columns_to_keep].copy()

print(f"Original data shape: {data.shape}")
print(f"Final data shape (without individual items): {data_scales.shape}")

data_scales.to_csv('../data/data_scales.csv', header=True, index=True)

data_scales

Original data shape: (255, 46)
Final data shape (without individual items): (255, 20)


Unnamed: 0_level_0,UserLanguage,gender,age,education,Q19,manip_check1_1,manip_check1_2,manip_check1_3,manip_check1_4,manip_check2,stimulus_group,page_submit,ati,hcsdc_v,hcsds_c,tia_f,tia_pro,tia_rc,tia_t,tia_up
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
R_2wcQZNoEgb828YN,NL,2,24,6,4,2,4,3,2,90,1,41.777,3.777778,2.4,3.25,2.5,2.000000,3.000000,4.0,3.000000
R_2xVbwQvHkSdNB3X,EN-GB,1,24,6,5,2,4,4,3,Between 82 and 98%,1,68.266,4.444444,2.8,4.00,1.0,2.333333,2.500000,3.0,2.666667
R_8IgEAe8wOZIeQih,DE,2,22,6,3,2,4,4,2,90%,0,40.024,2.000000,3.4,3.75,2.5,2.666667,2.666667,2.0,2.000000
R_2xVkPWT9CLOsV9Z,NL,2,22,7,4,1,2,4,2,85,0,32.696,3.333333,2.2,3.50,3.0,2.333333,3.333333,3.0,2.666667
R_2DNNnCk5wVZRMMi,NL,1,46,7,5,1,3,4,3,behoorlijk betrouwbaar,1,82.790,4.222222,3.8,5.00,2.5,3.333333,4.166667,5.0,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R_8v3b0I5Vzhx1y83,NL,1,19,3,3,3,1,1,1,90% (+/- 8%),1,46.038,2.222222,3.2,4.50,1.0,2.333333,3.166667,4.0,3.000000
R_231qLn8Gu6n4zTN,EN-GB,2,26,6,3,1,5,2,4,90%,0,184.260,2.111111,2.4,3.00,3.5,2.333333,3.666667,4.0,3.666667
R_2h0aWSGxqhp84Em,EN-GB,2,19,6,3,3,3,5,4,75%,1,31.266,2.888889,2.8,3.00,2.0,2.333333,3.333333,2.5,3.000000
R_8bVEMRVGPdWr17K,NL,2,23,6,4,1,4,4,4,90%,0,31.400,2.666667,2.8,2.50,1.0,1.666667,1.833333,2.5,2.666667
