# DATASET_BUILDING_FOR_SCENARIO_1
---
**10.05.2019**

## 1. Imports

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
import pickle

## 2. Settings

In [29]:
# BITALINO

BITALINO_COLUMNS_TO_USE = [5, 6]
BITALINO_COLUMN_NAMES = [
    "EKG", 
    "GSR"
]
BITALINO_SAMPLING_RATE_PER_SEC = 1000

# PROCEDURE

PROCEDURE_COLUMNS_TO_USE = [2, 3, 4, 5, 6, 7, 8, 9]
PROCEDURE_COLUMN_NAMES = [
    "CONDITION",
    "SPEC_CONDITION",
    "SOUND_NUM",
    "IMAGE_NUM",
    "WIDGET_TYPE",
    "WIDGET_RESPONSE",
    "RESPONSE_TIME",
    "TIMESTAMP"
]

# FILE MAP

FILE_MAP_NAME = "file_map"

## 3. Loading of general data

In [12]:
IAPS_data = pd.read_csv("./data/IAPS.csv",sep=";")
IADS2_data = pd.read_csv("./data/IADS2.csv",sep=";")
NEOFFI_data = pd.read_csv("./data/NEO-FFI.txt",sep="\t")

## 4. General data preprocessing

### 4.1. IPAS

In [13]:
def preprocess_IAPS(data):
# Convert to int and if it fails, delete the row
    data['IAPS'] = pd.to_numeric(data['IAPS'], errors='coerce')
    data = data.dropna(subset=['IAPS'])
    data['IAPS'] = data['IAPS'].astype('int')

    data['ValenceMean'] = data['ValenceMean'].apply(lambda x: x.replace(',','.')).astype(float)
    data['ValenceSD'] = data['ValenceSD'].apply(lambda x: x.replace(',','.')).astype(float)
    data['ArousalMean'] = data['ArousalMean'].apply(lambda x: x.replace(',','.')).astype(float)
    data['ArousalSD'] = data['ArousalSD'].apply(lambda x: x.replace(',','.')).astype(float)
    return data.drop(["Description", "set"], axis=1)

In [14]:
IPAS = preprocess_IAPS(IAPS_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus

In [15]:
IPAS.head()

Unnamed: 0,IAPS,ValenceMean,ValenceSD,ArousalMean,ArousalSD
0,1019,3.95,1.96,5.77,1.83
1,1022,4.26,2.04,6.02,1.97
2,1026,4.09,1.91,5.61,2.23
3,1030,4.3,2.35,5.46,2.43
4,1033,3.87,1.94,6.13,2.15


In [16]:
IPAS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1171 entries, 0 to 1193
Data columns (total 5 columns):
IAPS           1171 non-null int64
ValenceMean    1171 non-null float64
ValenceSD      1171 non-null float64
ArousalMean    1171 non-null float64
ArousalSD      1171 non-null float64
dtypes: float64(4), int64(1)
memory usage: 54.9 KB


### 4.2. IADS2

In [17]:
def preprocess_IADS2(data):
    return data.drop(["Sound"], axis=1)

In [18]:
IADS2 = preprocess_IADS2(IADS2_data)

In [19]:
IADS2.head()

Unnamed: 0,Number,ValenceMean,ValenceSD,ArousalMean,ArousalSD
0,102,4.63,2.17,4.91,1.97
1,104,4.96,1.68,5.37,1.66
2,105,2.88,2.14,6.4,2.13
3,106,3.37,1.64,6.39,1.62
4,107,5.47,2.22,5.85,1.81


In [20]:
IADS2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 5 columns):
Number         167 non-null int64
ValenceMean    167 non-null float64
ValenceSD      167 non-null float64
ArousalMean    167 non-null float64
ArousalSD      167 non-null float64
dtypes: float64(4), int64(1)
memory usage: 6.6 KB


### 4.3. Personality questionnaire results

In [21]:
def preprocess_NEOFFI(data):
    data["SEX"] = data["SEX"].apply(lambda x: x=="M").astype(int)
    return data

In [22]:
NEOFFI = preprocess_NEOFFI(NEOFFI_data)

In [23]:
NEOFFI.head()

Unnamed: 0,ID,AGE,SEX,OTW,SUM,NEU,UGD,EKST
0,1107,21,0,4,6,3,4,10
1,1153,22,0,5,3,8,3,8
2,1233,21,0,6,8,2,10,7
3,1400,22,1,4,6,6,6,5
4,1402,27,0,9,7,7,4,7


In [24]:
NEOFFI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 8 columns):
ID      79 non-null int64
AGE     79 non-null int64
SEX     79 non-null int64
OTW     79 non-null int64
SUM     79 non-null int64
NEU     79 non-null int64
UGD     79 non-null int64
EKST    79 non-null int64
dtypes: int64(8)
memory usage: 5.0 KB


### 5. Loading of raw data

### 5.1. Auxiliary functions

In [25]:
def load_data_bitalino(file_path):
    return pd.read_csv(
        file_path,sep="\t", 
        skiprows=3, 
        usecols=BITALINO_COLUMNS_TO_USE, 
        names=BITALINO_COLUMN_NAMES, 
        header=None
    )

In [26]:
def load_data_procedure(file_path):
    return pd.read_csv(
        file_path,sep="\t", 
        usecols=PROCEDURE_COLUMNS_TO_USE, 
        names=PROCEDURE_COLUMN_NAMES, 
        header=None
    )

In [28]:
def load_file_map(file_name):
    pickle_in = open(file_name,"rb")
    dictionary = pickle.load(pickle_in)
    return dictionary

In [32]:
def extract_start_time(file_path):
    metadata_str = None
    fp = open(file_path)
    
    # meta data are on the second line of the file
    for i, line in enumerate(fp):
        if i == 1:
            metadata_str = line
            fp.close()
            break
           
    # meta data starts with '# ' and ends with a new line sign
    metadata_str = metadata_str[2 : -1]
    # the meta data format is consistent with the Python dictionary
    metadata_dict = ast.literal_eval(metadata_str)
    
    # HACK: the acquired dictionary is nested but has only one key on the first level
    key_name = list(metadata_dict)[0]
    
    time_str = metadata_dict[key_name]['time']
    date_str = metadata_dict[key_name]['date']
    datetime_str = date_str + "," + time_str
    
    return time.mktime(datetime.strptime(datetime_str, "%Y-%m-%d,%H:%M:%S.%f").timetuple())

### 5.2. Load file map

In [30]:
FILE_MAP = load_file_map(FILE_MAP_NAME)

In [31]:
pprint(FILE_MAP)

{'1107': {'bitalino': './data/raw_data/bitalino/1107_opensignals_prawestanowisko_2019-04-19_09-11-49.txt',
          'info': './data/raw_data/procedura/1107_2019_Apr_19_0712_info.txt',
          'procedure': './data/raw_data/procedura/1107_2019_Apr_19_0712.txt'},
 '1153': {'bitalino': './data/raw_data/bitalino/1153_opensignals_lewestanowisko_2019-04-19_13-03-03.txt',
          'info': './data/raw_data/procedura/1153_2019_Apr_19_1259_info.txt',
          'procedure': './data/raw_data/procedura/1153_2019_Apr_19_1259.txt'},
 '1233': {'bitalino': './data/raw_data/bitalino/1233_opensignals_prawestanowisko_2019-04-16_13-06-39.txt',
          'info': './data/raw_data/procedura/1233_2019_Apr_16_1240_info.txt',
          'procedure': './data/raw_data/procedura/1233_2019_Apr_16_1240.txt'},
 '1400': {'bitalino': './data/raw_data/bitalino/1400_opensignals_lewestanowisko_2019-04-08_10-12-56.txt',
          'info': './data/raw_data/procedura/1400_2019_Apr_08_1000_info.txt',
          'procedure': '.

### 5.3. Rreproces raw data