In [94]:
import ast
import pandas as pd
import matplotlib.pyplot as plt
import re

In [95]:
answer_regex = re.compile('\[([\-\+\.0-9e]+),\s*([\-\+\.0-9e]+)\]')
import math

def handle_valence (row):
    if row['WIDGET-TYPE'] == "emospace1" :
        return answer_regex.match(row["ANS"])[1]
    else:
        return 0
    

def handle_arousal (row):
    if row['WIDGET-TYPE'] == "emospace1" :
        return answer_regex.match(row["ANS"])[2]
    else:
        return 0
    
def handle_ans_time (row):
    if math.isnan(row['ANS-TIME']):
        return 100.0
    else:
        return row['ANS-TIME']

In [96]:
meta_data = pd.read_csv("BIRAFFE-metadata.csv", sep=";")
meta_data = meta_data.dropna(subset=['OPENNESS', 'CONSCIENTIOUSNESS', 'EXTRAVERSION', 'AGREEABLENESS', 'NEUROTICISM', 'NEO-FFI', 'PROCEDURE'])
meta_data.head()


Unnamed: 0,ID,AGE,SEX,PROCEDURE-BEGIN-TIMESTAMP,PROCEDURE-END-TIMESTAMP,BIOSIGS-BEGIN-TIMESTAMP,BIOSIGS-END-TIMESTAMP,OPENNESS,CONSCIENTIOUSNESS,EXTRAVERSION,AGREEABLENESS,NEUROTICISM,NEO-FFI,BIOSIGS,PROCEDURE,SPACE,FREUD,PHOTOS
0,1107,21.0,F,1555651000.0,1555655000.0,1555651000.0,1555655000.0,4.0,6.0,10.0,4.0,3.0,Y,Y,Y,Y,Y,Y
1,1153,22.0,F,1555672000.0,1555676000.0,1555672000.0,1555676000.0,5.0,3.0,8.0,3.0,8.0,Y,Y,Y,Y,Y,Y
2,1233,21.0,F,1555411000.0,1555415000.0,,,6.0,8.0,7.0,10.0,2.0,Y,,Y,Y,Y,Y
5,1318,21.0,M,1558009000.0,1558014000.0,1558009000.0,1558014000.0,3.0,6.0,5.0,5.0,2.0,Y,Y,Y,Y,Y,Y
6,1400,22.0,M,1554710000.0,1554715000.0,1554711000.0,1554715000.0,4.0,6.0,5.0,6.0,6.0,Y,Y,Y,Y,Y,Y


In [97]:
iads_data = pd.read_csv("IADS-database.csv", sep=";")
iads_data = iads_data.rename(columns={"Number": "IADS-ID"})
iads_data.head()



Unnamed: 0,Sound,IADS-ID,ValenceMean,ValenceSD,ArousalMean,ArousalSD
0,Cat,102,4.63,2.17,4.91,1.97
1,Panting,104,4.96,1.68,5.37,1.66
2,Puppy,105,2.88,2.14,6.4,2.13
3,Growl1,106,3.37,1.64,6.39,1.62
4,Dog,107,5.47,2.22,5.85,1.81


In [98]:
iaps_data = pd.read_csv("IAPS-database.csv", sep=";")
iaps_data = iaps_data.rename(columns={"IAPS": "IAPS-ID"})
iaps_data.astype({'IAPS-ID': 'int32'})
iaps_data.head()



Unnamed: 0,Description,IAPS-ID,ValenceMean,ValenceSD,ArousalMean,ArousalSD,set
0,Snake,1019.0,3.95,1.96,5.77,1.83,12
1,Snake,1022.0,4.26,2.04,6.02,1.97,12
2,Snake,1026.0,4.09,1.91,5.61,2.23,16
3,Snake,1030.0,4.3,2.35,5.46,2.43,1
4,Snake,1033.0,3.87,1.94,6.13,2.15,18


In [99]:

def preprocess_face_data(subject_id):
    face_data = pd.read_csv(f'BIRAFFE-photo/SUB{subject_id}-Face.csv', sep=";")
    face_data = face_data.dropna(subset=['IADS-ID', 'IAPS-ID'])
    face_data = face_data.dropna(subset=['ANGER', 'CONTEMPT', 'DISGUST', 'FEAR', 'HAPPINESS', 'NEUTRAL', 'SADNESS',	'SURPRISE'])
    
    merged = face_data.merge(iaps_data, how='left', on='IAPS-ID')
    merged = merged.merge(iads_data, how='left', on='IADS-ID', suffixes=('-IAPS', '_IADS'))
    
    grouped = merged.groupby(['IADS-ID', 'IAPS-ID']).mean()
    grouped = grouped.drop(columns=["GAME-TIMESTAMP", "PICTURE-TIMESTAMP", "set"])
    return grouped


In [100]:
def preprocess_answer_data(subject_id):
    answer_data = pd.read_csv(f'BIRAFFE-procedure/SUB{subject_id}-Procedure.csv', sep=";")
    answer_data = answer_data[["IADS-ID", "IAPS-ID", "WIDGET-TYPE", "ANS", "ANS-TIME"]]

    answer_data["EMO-ANS"] = 0

    answer_data.loc[answer_data['WIDGET-TYPE'] == "emoscale1", 'EMO-ANS'] = answer_data["ANS"]

    answer_data['VALENCE-ANS'] = answer_data.apply (lambda row: handle_valence(row), axis=1)
    answer_data['AROUSAL-ANS'] = answer_data.apply (lambda row: handle_arousal(row), axis=1)
    answer_data['ANS-TIME'] = answer_data.apply (lambda row: handle_ans_time(row), axis=1)
    
    answer_data = answer_data.drop(['ANS'], axis=1)

    return answer_data


In [103]:
for index, row in meta_data.iterrows():
    subject_id = row['ID']
    data = preprocess_face_data(subject_id)
    
    data['SUB-ID'] = subject_id
    data['OPENNESS'] = row['OPENNESS']
    data['CONSCIENTIOUSNESS'] = row['CONSCIENTIOUSNESS']
    data['EXTRAVERSION'] = row['EXTRAVERSION']
    data['AGREEABLENESS'] = row['AGREEABLENESS']
    data['NEUROTICISM'] = row['NEUROTICISM']
    
    answer_data = preprocess_answer_data(subject_id)
    data = pd.merge(data, answer_data, how='inner', on=['IADS-ID','IAPS-ID'])
    
    
    data = data.dropna()
    data.head()
    data.to_csv(f'preprocessed/DATA-{subject_id}.csv')
    

In [104]:
import glob

path = 'preprocessed'
all_files = glob.glob(path + "/DATA-*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

all_dfs = pd.concat(li, axis=0, ignore_index=True)
all_dfs.to_csv('preprocessed/ALL-DATA.csv')



In [105]:
all_dfs.head()

Unnamed: 0.1,Unnamed: 0,IADS-ID,IAPS-ID,ANGER,CONTEMPT,DISGUST,FEAR,HAPPINESS,NEUTRAL,SADNESS,...,OPENNESS,CONSCIENTIOUSNESS,EXTRAVERSION,AGREEABLENESS,NEUROTICISM,WIDGET-TYPE,ANS-TIME,EMO-ANS,VALENCE-ANS,AROUSAL-ANS
0,0,104.0,2810.0,0.161789,0.000684,0.0,0.0,0.0,0.834421,0.002947,...,3.0,6.0,5.0,5.0,2.0,emoscale1,1.184649,1.0,0.0,0.0
1,1,106.0,6213.0,0.044895,0.000105,0.0,0.0,0.0,0.936842,0.018105,...,3.0,6.0,5.0,5.0,2.0,emoscale1,1.71851,2.0,0.0,0.0
2,2,106.0,7405.0,0.029789,0.000211,0.0,0.0,0.0,0.964368,0.005474,...,3.0,6.0,5.0,5.0,2.0,emospace1,1.785309,0.0,0.483333,0.163721
3,3,109.0,3261.0,0.123,0.000316,0.0,0.0,0.0,0.873474,0.003263,...,3.0,6.0,5.0,5.0,2.0,emospace1,2.653014,0.0,-0.995714,0.180954
4,4,110.0,6213.0,0.013316,0.000263,0.0,0.0,0.0,0.982737,0.003474,...,3.0,6.0,5.0,5.0,2.0,emoscale1,0.033457,4.0,0.0,0.0
