# Data preprocessing

In [1]:
# import libraries
from scripts.utils import *

## 1. Clean raw data
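- remove unnecessary header rows and columns
- remove incomplete samples
- remove excluded participants (no consent, medical professionals, under 16)
- convert columns to the correct data types
- save the result to a new file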

In [2]:
# Load the raw Qualtrics export and preview its first rows
raw_csv_path = "../data/raw_data_qualtrics.csv"
data_raw = pd.read_csv(raw_csv_path)
data_raw.head()

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,TiA_Pro2,TiA_RC4,TiA_T2,TiA_RC5,TiA_UP4,TiA_F2,TiA_Pro3,TiA_RC6,id,stimulus_group
0,Start Date,End Date,Response Type,Progress,Duration (in seconds),Finished,Recorded Date,Response ID,Distribution Channel,User Language,...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,The following statements are about your impres...,id,stimulus_group
1,"{""ImportId"":""startDate"",""timeZone"":""Europe/Ber...","{""ImportId"":""endDate"",""timeZone"":""Europe/Berlin""}","{""ImportId"":""status""}","{""ImportId"":""progress""}","{""ImportId"":""duration""}","{""ImportId"":""finished""}","{""ImportId"":""recordedDate"",""timeZone"":""Europe/...","{""ImportId"":""_recordId""}","{""ImportId"":""distributionChannel""}","{""ImportId"":""userLanguage""}",...,"{""ImportId"":""QID7_9""}","{""ImportId"":""QID7_10""}","{""ImportId"":""QID7_11""}","{""ImportId"":""QID7_12""}","{""ImportId"":""QID7_13""}","{""ImportId"":""QID7_14""}","{""ImportId"":""QID7_15""}","{""ImportId"":""QID7_16""}","{""ImportId"":""id""}","{""ImportId"":""stimulus_group""}"
2,2025-05-09 16:24:38,2025-05-09 16:28:40,0,100,241,1,2025-05-09 16:28:40,R_2wcQZNoEgb828YN,anonymous,NL,...,2,2,4,5,4,1,2,3,,uncertainty
3,2025-05-09 16:35:42,2025-05-09 16:43:56,0,100,493,1,2025-05-09 16:43:56,R_2xVbwQvHkSdNB3X,anonymous,EN-GB,...,4,4,4,5,4,1,2,2,,uncertainty
4,2025-05-09 16:40:41,2025-05-09 16:46:32,0,100,351,1,2025-05-09 16:46:32,R_8IgEAe8wOZIeQih,anonymous,DE,...,3,4,2,4,5,2,3,2,,control


In [3]:
# Qualtrics bookkeeping fields and timer click columns that are dropped up front
unused_columns = [
    'StartDate', 'EndDate', 'Status', 'Progress', 'Duration (in seconds)',
    'RecordedDate', 'DistributionChannel', 'id',
    'delay_timer_First Click', 'delay_timer_Last Click', 'delay_timer_Click Count',
    'delay_timer_First Click.1', 'delay_timer_Last Click.1', 'delay_timer_Click Count.1',
]

# Rows 0 and 1 are extra header rows (question text and ImportId metadata), not responses
data = data_raw.drop(index=[0, 1], columns=unused_columns)
print(f'{len(data.index)} samples in raw data.')

# Keep finished responses only ('Finished' is compared against the string '1')
is_finished = data['Finished'] == '1'
data = data[is_finished]
print(f'{len(data.index)} samples after removing incomplete responses.')

# Keep participants who agreed to take part
consent_value = get_value_for_label('consent', 'I agree to participate in the study')
data = data[data['consent'] == consent_value]
print(f'{len(data.index)} samples after removing no consent cases.')

# Keep participants who answered "No" to being a medical professional
non_professional_value = get_value_for_label('medical_prof', 'No')
data = data[data['medical_prof'] == non_professional_value]
print(f'{len(data.index)} samples after removing medical professionals.')

# Keep participants aged 16 or older
ages = pd.to_numeric(data['age'])
data = data[ages >= 16]
print(f'{len(data.index)} samples after removing underage participants.')

# The filter columns have served their purpose; index the remaining rows by response ID
data = data.drop(columns=['consent', 'medical_prof', 'Finished']).set_index('ResponseId')

# show result
data

347 samples in raw data.
282 samples after removing incomplete responses.
281 samples after removing no consent cases.
255 samples after removing medical professionals.
255 samples after removing underage participants.


Unnamed: 0_level_0,UserLanguage,gender,age,education,Q19,ATI_1,ATI_2,ATI_3,ATI_4,ATI_5,...,TiA_UP3,TiA_Pro2,TiA_RC4,TiA_T2,TiA_RC5,TiA_UP4,TiA_F2,TiA_Pro3,TiA_RC6,stimulus_group
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R_2wcQZNoEgb828YN,NL,2,24,6,4,3,4,2,4,5,...,4,2,2,4,5,4,1,2,3,uncertainty
R_2xVbwQvHkSdNB3X,EN-GB,1,24,6,5,5,5,2,4,5,...,4,4,4,4,5,4,1,2,2,uncertainty
R_8IgEAe8wOZIeQih,DE,2,22,6,3,1,3,2,2,2,...,3,3,4,2,4,5,2,3,2,control
R_2xVkPWT9CLOsV9Z,NL,2,22,7,4,4,3,4,3,3,...,4,3,5,2,5,4,1,3,4,control
R_2DNNnCk5wVZRMMi,NL,1,46,7,5,5,5,2,3,5,...,4,4,5,5,5,3,1,4,4,uncertainty
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R_8v3b0I5Vzhx1y83,NL,1,19,3,3,1,4,1,1,3,...,3,3,2,4,5,5,1,3,3,uncertainty
R_231qLn8Gu6n4zTN,EN-GB,2,26,6,3,2,1,4,3,2,...,4,3,4,4,3,3,3,3,3,control
R_2h0aWSGxqhp84Em,EN-GB,2,19,6,3,3,3,3,3,3,...,3,3,3,2,4,3,2,2,3,uncertainty
R_8bVEMRVGPdWr17K,NL,2,23,6,4,2,4,2,2,2,...,4,2,1,3,5,4,1,2,2,control


In [4]:
# Each stimulus group has its own "delay_timer_Page Submit" column, and only one of the
# two is filled in per participant. Missing entries are treated as 0 so that the sum
# equals the filled-in value (a row with neither column filled in ends up as 0.0).
timer_columns = ['delay_timer_Page Submit', 'delay_timer_Page Submit.1']
first_timer, second_timer = (
    pd.to_numeric(data[timer_column].fillna(0)) for timer_column in timer_columns
)

# Store the combined timer as float and discard the two source columns
data['page_submit'] = (first_timer + second_timer).astype(float)
data = data.drop(columns=timer_columns)

data

Unnamed: 0_level_0,UserLanguage,gender,age,education,Q19,ATI_1,ATI_2,ATI_3,ATI_4,ATI_5,...,TiA_Pro2,TiA_RC4,TiA_T2,TiA_RC5,TiA_UP4,TiA_F2,TiA_Pro3,TiA_RC6,stimulus_group,page_submit
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R_2wcQZNoEgb828YN,NL,2,24,6,4,3,4,2,4,5,...,2,2,4,5,4,1,2,3,uncertainty,41.777
R_2xVbwQvHkSdNB3X,EN-GB,1,24,6,5,5,5,2,4,5,...,4,4,4,5,4,1,2,2,uncertainty,68.266
R_8IgEAe8wOZIeQih,DE,2,22,6,3,1,3,2,2,2,...,3,4,2,4,5,2,3,2,control,40.024
R_2xVkPWT9CLOsV9Z,NL,2,22,7,4,4,3,4,3,3,...,3,5,2,5,4,1,3,4,control,32.696
R_2DNNnCk5wVZRMMi,NL,1,46,7,5,5,5,2,3,5,...,4,5,5,5,3,1,4,4,uncertainty,82.790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R_8v3b0I5Vzhx1y83,NL,1,19,3,3,1,4,1,1,3,...,3,2,4,5,5,1,3,3,uncertainty,46.038
R_231qLn8Gu6n4zTN,EN-GB,2,26,6,3,2,1,4,3,2,...,3,4,4,3,3,3,3,3,control,184.260
R_2h0aWSGxqhp84Em,EN-GB,2,19,6,3,3,3,3,3,3,...,3,3,2,4,3,2,2,3,uncertainty,31.266
R_8bVEMRVGPdWr17K,NL,2,23,6,4,2,4,2,2,2,...,2,1,3,5,4,1,2,2,control,31.400


In [6]:
# Cast every column to an explicit dtype, grouped by questionnaire section

# String columns: Qualtrics language code and the second manipulation check
for text_column in ('UserLanguage', 'manip_check2'):
    data[text_column] = data[text_column].astype(str)

# Integer-coded columns
demographic_items = ['gender', 'age', 'education']
ai_experience_items = ['Q19']
# ATI scale items (likert 5): ATI_1 .. ATI_9
ati_items = [f'ATI_{i}' for i in range(1, 10)]
# HCSDS scale items (likert 5)
hcsds_items = ['HCSDS_C1', 'HCSDS_V1', 'HCSDS_C2', 'HCSDS_C3', 'HCSDS_V2', 'HCSDS_C4', 'HCSDS_V3', 'HCSDS_V4', 'HCSDS_V5']
# First manipulation check: manip_check1_1 .. manip_check1_4
manip_check_items = [f'manip_check1_{i}' for i in range(1, 5)]
# Trust in Automation scale items (likert 5)
tia_items = ['TiA_RC1', 'TiA_F1', 'TiA_Pro1', 'TiA_RC2', 'TiA_UP2', 'TiA_T1',
             'TiA_RC3', 'TiA_UP3', 'TiA_Pro2', 'TiA_RC4', 'TiA_T2', 'TiA_RC5',
             'TiA_UP4', 'TiA_F2', 'TiA_Pro3', 'TiA_RC6']

integer_columns = (
    demographic_items
    + ai_experience_items
    + ati_items
    + hcsds_items
    + manip_check_items
    + tia_items
)
for int_column in integer_columns:
    data[int_column] = data[int_column].astype(int)

# Remap experimental condition from string labels to integer values
condition_codes = {
    'control': get_value_for_label('stimulus_group', 'control'),
    'uncertainty': get_value_for_label('stimulus_group', 'uncertainty'),
}
data['stimulus_group'] = data['stimulus_group'].replace(condition_codes).astype(int)

data

Unnamed: 0_level_0,UserLanguage,gender,age,education,Q19,ATI_1,ATI_2,ATI_3,ATI_4,ATI_5,...,TiA_Pro2,TiA_RC4,TiA_T2,TiA_RC5,TiA_UP4,TiA_F2,TiA_Pro3,TiA_RC6,stimulus_group,page_submit
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R_2wcQZNoEgb828YN,NL,2,24,6,4,3,4,2,4,5,...,2,2,4,5,4,1,2,3,1,41.777
R_2xVbwQvHkSdNB3X,EN-GB,1,24,6,5,5,5,2,4,5,...,4,4,4,5,4,1,2,2,1,68.266
R_8IgEAe8wOZIeQih,DE,2,22,6,3,1,3,2,2,2,...,3,4,2,4,5,2,3,2,0,40.024
R_2xVkPWT9CLOsV9Z,NL,2,22,7,4,4,3,4,3,3,...,3,5,2,5,4,1,3,4,0,32.696
R_2DNNnCk5wVZRMMi,NL,1,46,7,5,5,5,2,3,5,...,4,5,5,5,3,1,4,4,1,82.790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R_8v3b0I5Vzhx1y83,NL,1,19,3,3,1,4,1,1,3,...,3,2,4,5,5,1,3,3,1,46.038
R_231qLn8Gu6n4zTN,EN-GB,2,26,6,3,2,1,4,3,2,...,3,4,4,3,3,3,3,3,0,184.260
R_2h0aWSGxqhp84Em,EN-GB,2,19,6,3,3,3,3,3,3,...,3,3,2,4,3,2,2,3,1,31.266
R_8bVEMRVGPdWr17K,NL,2,23,6,4,2,4,2,2,2,...,2,1,3,5,4,1,2,2,0,31.400


In [7]:
# Write the cleaned data to a new CSV, keeping both the column header and the index
clean_csv_path = '../data/data_clean.csv'
data.to_csv(clean_csv_path, header=True, index=True)

## 2. Compute Scales
Compute each participant's score on each of the following scales.

### Affinity for Technology Interaction (ATI) scale
Franke et al., 2019

Single scale.

### Revised Health Care System Distrust Scale
Shea et al., 2008

> Note: the interpretation is inverted so that the scale measures *trust* instead of distrust!

2 subscales:
- c = competence
- v = values

### Trust in Automation scale (adapted)
Körber, 2019

5 subscales:
- rc = reliability/confidence (*capability-based trust*)
- up = understanding/predictability (*shared mental model*)
- f = familiarity (*familiarity*)
- pro = propensity to trust (*faith in technology*)
- t = trust in automation (*general trust*)