# DATASET_BUILDING_FOR_SCENARIO_1
---
**10.05.2019**

## 1. Imports

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from datetime import datetime
import pickle
import ast
import time
import heartpy as hp
import matplotlib
import matplotlib.pyplot as plt
from biosppy import storage
from biosppy.signals import ecg
import neurokit as nk
%matplotlib inline



## 2. Settings

In [2]:
# BITALINO

BITALINO_COLUMNS_TO_USE = [5, 6]
BITALINO_COLUMN_NAMES = [
    "EKG", 
    "GSR"
]
BITALINO_SAMPLING_RATE_PER_SEC = 1000

# PROCEDURE

PROCEDURE_COLUMNS_TO_USE = [2, 3, 4, 5, 6, 7, 8, 9]
PROCEDURE_COLUMN_NAMES = [
    "CONDITION",
    "SPEC_CONDITION",
    "SOUND_NUM",
    "IMAGE_NUM",
    "WIDGET_TYPE",
    "WIDGET_RESPONSE",
    "RESPONSE_TIME",
    "TIMESTAMP"
]

# FILE MAP

FILE_MAP_NAME = "file_map"

## 3. Loading of general data

In [3]:
IAPS_data = pd.read_csv("./data/IAPS.csv",sep=";")
IADS2_data = pd.read_csv("./data/IADS2.csv",sep=";")
NEOFFI_data = pd.read_csv("./data/NEO-FFI.txt",sep="\t")

## 4. General data preprocessing

### 4.1. IPAS

In [4]:
def preprocess_IAPS(data):
# Convert to int and if it fails, delete the row
    data['IAPS'] = pd.to_numeric(data['IAPS'], errors='coerce')
    data = data.dropna(subset=['IAPS'])
    data['IAPS'] = data['IAPS'].astype('int')

    data['ValenceMean'] = data['ValenceMean'].apply(lambda x: x.replace(',','.')).astype(float)
    data['ValenceSD'] = data['ValenceSD'].apply(lambda x: x.replace(',','.')).astype(float)
    data['ArousalMean'] = data['ArousalMean'].apply(lambda x: x.replace(',','.')).astype(float)
    data['ArousalSD'] = data['ArousalSD'].apply(lambda x: x.replace(',','.')).astype(float)
    return data.drop(["Description", "set"], axis=1)

In [5]:
IPAS = preprocess_IAPS(IAPS_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus

In [6]:
IPAS.head()

Unnamed: 0,IAPS,ValenceMean,ValenceSD,ArousalMean,ArousalSD
0,1019,3.95,1.96,5.77,1.83
1,1022,4.26,2.04,6.02,1.97
2,1026,4.09,1.91,5.61,2.23
3,1030,4.3,2.35,5.46,2.43
4,1033,3.87,1.94,6.13,2.15


In [7]:
IPAS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1171 entries, 0 to 1193
Data columns (total 5 columns):
IAPS           1171 non-null int64
ValenceMean    1171 non-null float64
ValenceSD      1171 non-null float64
ArousalMean    1171 non-null float64
ArousalSD      1171 non-null float64
dtypes: float64(4), int64(1)
memory usage: 54.9 KB


### 4.2. IADS2

In [8]:
def preprocess_IADS2(data):
    return data.drop(["Sound"], axis=1)

In [9]:
IADS2 = preprocess_IADS2(IADS2_data)

In [10]:
IADS2.head()

Unnamed: 0,Number,ValenceMean,ValenceSD,ArousalMean,ArousalSD
0,102,4.63,2.17,4.91,1.97
1,104,4.96,1.68,5.37,1.66
2,105,2.88,2.14,6.4,2.13
3,106,3.37,1.64,6.39,1.62
4,107,5.47,2.22,5.85,1.81


In [11]:
IADS2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 5 columns):
Number         167 non-null int64
ValenceMean    167 non-null float64
ValenceSD      167 non-null float64
ArousalMean    167 non-null float64
ArousalSD      167 non-null float64
dtypes: float64(4), int64(1)
memory usage: 6.6 KB


### 4.3. Personality questionnaire results

In [12]:
def preprocess_NEOFFI(data):
    data["SEX"] = data["SEX"].apply(lambda x: x=="M").astype(int)
    return data

In [13]:
NEOFFI = preprocess_NEOFFI(NEOFFI_data)

In [14]:
NEOFFI.head()

Unnamed: 0,ID,AGE,SEX,OTW,SUM,NEU,UGD,EKST
0,1107,21,0,4,6,3,4,10
1,1153,22,0,5,3,8,3,8
2,1233,21,0,6,8,2,10,7
3,1400,22,1,4,6,6,6,5
4,1402,27,0,9,7,7,4,7


In [15]:
NEOFFI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 8 columns):
ID      79 non-null int64
AGE     79 non-null int64
SEX     79 non-null int64
OTW     79 non-null int64
SUM     79 non-null int64
NEU     79 non-null int64
UGD     79 non-null int64
EKST    79 non-null int64
dtypes: int64(8)
memory usage: 5.0 KB


### 5. Loading of raw data

### 5.1. Auxiliary functions

In [16]:
def load_data_bitalino(file_path):
    return pd.read_csv(
        file_path,sep="\t", 
        skiprows=3, 
        usecols=BITALINO_COLUMNS_TO_USE, 
        names=BITALINO_COLUMN_NAMES, 
        header=None
    )

In [17]:
def load_data_procedure(file_path):
    return pd.read_csv(
        file_path,sep="\t", 
        usecols=PROCEDURE_COLUMNS_TO_USE, 
        names=PROCEDURE_COLUMN_NAMES, 
        header=None
    )

In [18]:
def load_file_map(file_name):
    pickle_in = open(file_name,"rb")
    dictionary = pickle.load(pickle_in)
    return dictionary

In [19]:
def extract_start_time(file_path):
    metadata_str = None
    fp = open(file_path)
    
    # meta data are on the second line of the file
    for i, line in enumerate(fp):
        if i == 1:
            metadata_str = line
            fp.close()
            break
           
    # meta data starts with '# ' and ends with a new line sign
    metadata_str = metadata_str[2 : -1]
    # the meta data format is consistent with the Python dictionary
    metadata_dict = ast.literal_eval(metadata_str)
    
    # HACK: the acquired dictionary is nested but has only one key on the first level
    key_name = list(metadata_dict)[0]
    
    time_str = metadata_dict[key_name]['time']
    date_str = metadata_dict[key_name]['date']
    datetime_str = date_str + "," + time_str
    
    return time.mktime(datetime.strptime(datetime_str, "%Y-%m-%d,%H:%M:%S.%f").timetuple())

In [20]:
def spec_condition_helper(x):
    if x=='-':
        return(0)
    elif x=='0':
        return(1)
    elif x=='+':
        return(2)

### 5.2. Load file map

In [21]:
FILE_MAP = load_file_map(FILE_MAP_NAME)

### 5.3. Preproces raw data

In [22]:
def preprocess_procedure(data):
    # select only rows concerning widgets of emoscale1 type
    data = data[data["WIDGET_TYPE"] == "emoscale1"]
    
    # convert the response time to float and if the value is not correct then remove the row
    data['RESPONSE_TIME'] = pd.to_numeric(data['RESPONSE_TIME'], errors='coerce')
    data = data.dropna(subset=['RESPONSE_TIME'])
    data['RESPONSE_TIME'] = data['RESPONSE_TIME'].astype('float')
    
    # enforcing the appropriate type of columns
    data["CONSISTENT"] = data['CONDITION'].apply(lambda x: x=="con").astype(int)
    data["IMAGE_ATTRIBUTE"] = data['SPEC_CONDITION'].apply(lambda x: spec_condition_helper(x[1])).astype(int)
    data["SOUND_ATTRIBUTE"] = data['SPEC_CONDITION'].apply(lambda x: spec_condition_helper(x[3])).astype(int)
    data["WIDGET_RESPONSE"] = data["WIDGET_RESPONSE"].astype(float).astype(int)
    data["TIMESTAMP"] = data["TIMESTAMP"].astype(int)
    
    # deleting unnecessary columns
    data.drop(["CONDITION", "SPEC_CONDITION", "WIDGET_TYPE"], axis=1, inplace=True)
    return data

In [42]:
def preprocess_bitalino(data, start_time):
    out = ecg.ecg(signal=data["EKG"], sampling_rate=1000, show=False)
    hr = out['heart_rate']
    t_end = np.floor(out['heart_rate_ts'] * 1000)
    t_start = np.insert(t_end[:-1] + 1, 0, 0, axis=0)
    new_data = pd.DataFrame({'T_START': t_start, 'T_END': t_end, 'HR': hr}, columns=['T_START', 'T_END', 'HR'])
    new_data['T_START'] = new_data['T_START'].astype('int')
    new_data['T_END'] = new_data['T_END'].astype('int')
    new_data['GSR'] = new_data.apply(lambda row: data.iloc[int(row['T_START']):int(row['T_END'])]["GSR"].mean(), axis=1)
    new_data["TIMESTAMP"] = start_time + (new_data["T_START"] / 1000)
    return new_data

In [43]:
def build_dataset(file_paths):
    bitalino_data = load_data_bitalino(file_paths['bitalino'])
    procedure_data = load_data_procedure(file_paths['procedure'])
    bitalino_start_time = extract_start_time(file_paths['bitalino'])

    procedure_data = preprocess_procedure(procedure_data)
    bitalino_data = preprocess_bitalino(bitalino_data, bitalino_start_time)

    procedure_data["HR_BASE"] = procedure_data.apply(lambda row: bitalino_data[(bitalino_data["TIMESTAMP"] < row["TIMESTAMP"])].tail(2)["HR"].mean(), axis=1)
    procedure_data["HR_MAX"] = procedure_data.apply(lambda row: bitalino_data[(bitalino_data["TIMESTAMP"] > row["TIMESTAMP"]) & (bitalino_data["TIMESTAMP"] < row["TIMESTAMP"] + 12)]["HR"].max(), axis=1)
    procedure_data["HR_MEAN"] = procedure_data.apply(lambda row: bitalino_data[(bitalino_data["TIMESTAMP"] > row["TIMESTAMP"]) & (bitalino_data["TIMESTAMP"] < row["TIMESTAMP"] + 12)]["HR"].mean(), axis=1)
    procedure_data["HR_DELTA"] = procedure_data["HR_MEAN"] - procedure_data["HR_BASE"]
    procedure_data["GSR_BASE"] = procedure_data.apply(lambda row: bitalino_data[(bitalino_data["TIMESTAMP"] < row["TIMESTAMP"])].tail(2)["GSR"].mean(), axis=1)
    procedure_data["GSR_MAX"] = procedure_data.apply(lambda row: bitalino_data[(bitalino_data["TIMESTAMP"] > row["TIMESTAMP"]) & (bitalino_data["TIMESTAMP"] < row["TIMESTAMP"] + 12)]["GSR"].max(), axis=1)
    procedure_data["GSR_MEAN"] = procedure_data.apply(lambda row: bitalino_data[(bitalino_data["TIMESTAMP"] > row["TIMESTAMP"]) & (bitalino_data["TIMESTAMP"] < row["TIMESTAMP"] + 12)]["GSR"].mean(), axis=1)
    procedure_data["GSR_DELTA"] = procedure_data["GSR_MEAN"] - procedure_data["GSR_BASE"]
    return procedure_data

In [44]:
FILE_MAP.keys()

dict_keys(['9076', '8057', '6546', '6635', '9639', '9926', '3723', '3172', '2121', '4124', '3497', '9222', '1107', '6412', '4087', '5948', '3567', '1516', '3327', '7686', '9296', '8341', '8040', '2104', '2688', '3054', '9937', '5513', '6801', '8722', '4231', '4484', '9495', '8101', '4542', '2110', '1507', '3377', '8624', '7820', '3333', '1402', '2070', '4830', '7957', '9744', '2669', '8909', '2006', '6111', '2881', '4624', '6684', '9740', '6678', '7078', '7474', '5330', '6603', '3802', '5215', '1809', '5900', '3640', '4105', '5099', '7630', '9702', '2428', '7311', '8002', '2103', '5648', '9695', '9952', '1153', '2900', '8803', '5924', '7777', '3264', '1233', '5104', '1437', '1400', '8958', '3872', '9899', '7250', '8500', '7020'])

In [51]:
output = pd.DataFrame()
for key in FILE_MAP.keys():
    print(key)
    
    try:
        data = build_dataset(FILE_MAP[key])
        if output.empty:
            output = data
        else:
            output = pd.concat([output, data], ignore_index=True)
    except:
        print("Something went wrong for {}".format(key))

9076


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


8057
6546
6635
9639
9926
3723
3172
2121
4124
3497
9222
1107
6412
4087
5948
3567
1516
3327
7686
9296
8341
8040
2104
2688
Something went wrong for 2688
3054
9937
5513
6801
8722
4231
4484
9495
Something went wrong for 9495
8101
4542
2110
1507
3377
8624
7820
3333
1402
2070
4830
7957
9744
2669
8909
2006
6111
2881
4624
6684
9740
6678
7078
7474
5330
6603
3802
5215
1809
5900
3640
4105
5099
7630
9702
2428
7311
8002
2103
5648
9695
9952
1153
2900
8803
5924
7777
3264
1233
5104
1437
1400
8958
3872
9899
7250
8500
7020


In [52]:
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4272 entries, 0 to 4271
Data columns (total 16 columns):
SOUND_NUM          4272 non-null int64
IMAGE_NUM          4272 non-null int64
WIDGET_RESPONSE    4272 non-null int64
RESPONSE_TIME      4272 non-null float64
TIMESTAMP          4272 non-null int64
CONSISTENT         4272 non-null int64
IMAGE_ATTRIBUTE    4272 non-null int64
SOUND_ATTRIBUTE    4272 non-null int64
HR_BASE            3833 non-null float64
HR_MAX             3557 non-null float64
HR_MEAN            3557 non-null float64
HR_DELTA           3556 non-null float64
GSR_BASE           3833 non-null float64
GSR_MAX            3557 non-null float64
GSR_MEAN           3557 non-null float64
GSR_DELTA          3556 non-null float64
dtypes: float64(9), int64(7)
memory usage: 534.1 KB


In [59]:
output_clean = output.dropna()
output_clean = pd.merge(output_clean, IPAS, left_on='IMAGE_NUM', right_on='IAPS')

In [60]:
output_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3531 entries, 0 to 3530
Data columns (total 21 columns):
SOUND_NUM          3531 non-null int64
IMAGE_NUM          3531 non-null int64
WIDGET_RESPONSE    3531 non-null int64
RESPONSE_TIME      3531 non-null float64
TIMESTAMP          3531 non-null int64
CONSISTENT         3531 non-null int64
IMAGE_ATTRIBUTE    3531 non-null int64
SOUND_ATTRIBUTE    3531 non-null int64
HR_BASE            3531 non-null float64
HR_MAX             3531 non-null float64
HR_MEAN            3531 non-null float64
HR_DELTA           3531 non-null float64
GSR_BASE           3531 non-null float64
GSR_MAX            3531 non-null float64
GSR_MEAN           3531 non-null float64
GSR_DELTA          3531 non-null float64
IAPS               3531 non-null int64
ValenceMean        3531 non-null float64
ValenceSD          3531 non-null float64
ArousalMean        3531 non-null float64
ArousalSD          3531 non-null float64
dtypes: float64(13), int64(8)
memory usage: 606.

In [62]:
corr = output_clean.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,SOUND_NUM,IMAGE_NUM,WIDGET_RESPONSE,RESPONSE_TIME,TIMESTAMP,CONSISTENT,IMAGE_ATTRIBUTE,SOUND_ATTRIBUTE,HR_BASE,HR_MAX,HR_MEAN,HR_DELTA,GSR_BASE,GSR_MAX,GSR_MEAN,GSR_DELTA,IAPS,ValenceMean,ValenceSD,ArousalMean,ArousalSD
SOUND_NUM,1.0,0.14,0.061,-0.015,0.028,-0.025,-0.017,0.028,0.0041,0.017,0.018,0.018,0.023,0.023,0.023,0.0011,0.14,-0.0069,0.051,0.086,-0.077
IMAGE_NUM,0.14,1.0,0.22,0.022,-0.0088,-0.026,0.15,0.04,0.0022,0.00019,0.021,0.024,0.0013,0.0014,0.00039,-0.0097,1.0,0.2,0.1,-0.11,-0.085
WIDGET_RESPONSE,0.061,0.22,1.0,0.008,-0.051,0.14,0.55,0.15,-0.012,-0.016,0.015,0.038,0.0062,0.012,0.0093,0.031,0.22,0.6,0.39,-0.049,-0.33
RESPONSE_TIME,-0.015,0.022,0.008,1.0,0.025,-0.03,0.044,-0.026,-0.007,-0.027,-0.034,-0.034,-0.044,-0.037,-0.039,0.047,0.022,0.038,0.031,0.018,-0.031
TIMESTAMP,0.028,-0.0088,-0.051,0.025,1.0,0.0066,-0.029,0.0016,-0.043,0.031,-0.046,0.0028,-0.036,-0.045,-0.039,-0.037,-0.0088,-0.03,-0.0068,-0.0045,-0.005
CONSISTENT,-0.025,-0.026,0.14,-0.03,0.0066,1.0,0.047,-0.052,-0.00039,0.0029,0.0084,0.012,-0.012,-0.014,-0.014,-0.014,-0.026,0.037,0.13,-0.25,-0.071
IMAGE_ATTRIBUTE,-0.017,0.15,0.55,0.044,-0.029,0.047,1.0,-0.15,-0.015,-0.018,-0.01,0.0092,-0.004,-0.00048,-0.0017,0.024,0.15,0.97,0.49,-0.052,-0.31
SOUND_ATTRIBUTE,0.028,0.04,0.15,-0.026,0.0016,-0.052,-0.15,1.0,-0.022,-0.035,-0.035,-0.014,0.0063,0.0031,0.0031,-0.033,0.04,-0.15,-0.08,0.016,0.034
HR_BASE,0.0041,0.0022,-0.012,-0.007,-0.043,-0.00039,-0.015,-0.022,1.0,0.67,0.75,-0.48,0.13,0.14,0.14,0.038,0.0022,-0.018,-0.0055,-0.0039,0.021
HR_MAX,0.017,0.00019,-0.016,-0.027,0.031,0.0029,-0.018,-0.035,0.67,1.0,0.89,0.18,0.16,0.16,0.17,0.035,0.00019,-0.019,-0.021,0.016,0.029
