# DREAMER Dataset

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from scipy.io import loadmat
import heartpy as hp

In [12]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split

In [None]:
# Earlier, disabled version of preprocessing_and_feature: it processes baseline and
# stimulus ECG with neurokit (`nk`) and returns stimulus/baseline feature ratios
# averaged over the two ECG channels. A later cell redefines the same function name.
# NOTE(review): `nk` is not imported in the visible import cells; re-enabling this
# would presumably need `import neurokit2 as nk` -- confirm the package name.
# def preprocessing_and_feature(data):
#     data_ECG = {}
#     for participant in range(0,23):
#         for video in range(0,18):
#             # load raw baseline and stimuli data for left and right
#             basl_l=raw['DREAMER'][0,0]['Data'][0,participant]['ECG'][0,0]['baseline'][0,0][video,0][:,0]
#             stim_l=raw['DREAMER'][0,0]['Data'][0,participant]['ECG'][0,0]['stimuli'][0,0][video,0][:,0]
#             basl_r=raw['DREAMER'][0,0]['Data'][0,participant]['ECG'][0,0]['baseline'][0,0][video,0][:,1]
#             stim_r=raw['DREAMER'][0,0]['Data'][0,participant]['ECG'][0,0]['stimuli'][0,0][video,0][:,1]
#             # process with neurokit
#             ecg_signals_b_l, info_b_l = nk.ecg_process(basl_l,sampling_rate=256)
#             ecg_signals_s_l, info_s_l = nk.ecg_process(stim_l,sampling_rate=256)
#             ecg_signals_b_r, info_b_r = nk.ecg_process(basl_r,sampling_rate=256)
#             ecg_signals_s_r, info_s_r = nk.ecg_process(stim_r,sampling_rate=256)
#             # divide stimuli features by baseline features
#             # would be interesting to compare classification accuracy when we
#             # don't do this
#             features_ecg_l=nk.ecg_intervalrelated(ecg_signals_s_l)/nk.ecg_intervalrelated(ecg_signals_b_l)
#             features_ecg_r=nk.ecg_intervalrelated(ecg_signals_s_r)/nk.ecg_intervalrelated(ecg_signals_b_r)
#             # average left and right features
#             # would be interesting to compare classification accuracy when we
#             # rather include both left and right features
#             features_ecg=(features_ecg_l+features_ecg_r)/2
#             if not len(data_ECG):
#                 data_ECG=features_ecg
#             else:
#                 data_ECG=pd.concat([data_ECG,features_ecg],ignore_index=True)
#     return data_ECG

In [7]:
# Load the DREAMER MATLAB file; later cells index the result as raw['DREAMER'][0,0]['Data'].
raw = loadmat("data/DREAMER.mat")

In [31]:
def preprocessing_and_feature(data, n_participants=23, n_videos=18):
    """Collect the stimulus ECG (channel index 0) of every recording into one wide frame.

    Parameters
    ----------
    data : mapping
        The loaded DREAMER structure. It is indexed as
        ``data['DREAMER'][0,0]['Data'][0,participant]['ECG'][0,0]['stimuli'][0,0][video,0][:,0]``.
        The function now reads from this argument instead of the notebook-global
        ``raw``, so it works on whatever structure the caller passes in.
    n_participants : int, default 23
        Number of participants to read (indices ``0 .. n_participants-1``).
    n_videos : int, default 18
        Number of videos to read per participant (indices ``0 .. n_videos-1``).

    Returns
    -------
    pandas.DataFrame
        One column per recording, named ``P<participant>V<video>`` (1-based), in
        participant-major order. Recordings shorter than the longest one are
        padded with NaN at the end by the column-wise concatenation. An empty
        frame is returned when there is nothing to read.
    """
    columns = []
    for participant in range(n_participants):
        # The per-participant stimuli container is the same for every video, so
        # resolve the long index chain once per participant.
        stimuli = data['DREAMER'][0,0]['Data'][0,participant]['ECG'][0,0]['stimuli'][0,0]
        for video in range(n_videos):
            colName = "P" + str(participant+1) + "V" + str(video+1)
            columns.append(pd.Series(stimuli[video,0][:,0], name=colName))

    if not columns:
        # pd.concat raises on an empty list; keep the "empty frame" result instead.
        return pd.DataFrame()

    # Concatenate once at the end: growing a frame inside the loop re-copies
    # every previously added column on each iteration.
    return pd.concat(columns, axis=1)

In [32]:
# Wide frame of stimulus ECG, one column per participant/video ("P<p>V<v>").
ecg = preprocessing_and_feature(raw)

In [33]:
ecg.head()

Unnamed: 0,P1V1,P1V2,P1V3,P1V4,P1V5,P1V6,P1V7,P1V8,P1V9,P1V10,...,P23V9,P23V10,P23V11,P23V12,P23V13,P23V14,P23V15,P23V16,P23V17,P23V18
0,2046.0,2054.0,2018.0,2055.0,2080.0,2057.0,2072.0,2024,1912.0,2048.0,...,2037.0,2180.0,2074.0,2020.0,2018.0,2386.0,2086.0,2043.0,2076.0,2094.0
1,2042.0,2036.0,2022.0,2052.0,2038.0,2055.0,2077.0,2030,2080.0,2055.0,...,2037.0,2169.0,2107.0,2015.0,2064.0,2077.0,2071.0,2008.0,2120.0,2050.0
2,2039.0,2036.0,2025.0,2053.0,2043.0,2054.0,2079.0,2028,2077.0,2048.0,...,2046.0,2163.0,2098.0,2022.0,2064.0,2074.0,2063.0,2012.0,2130.0,2043.0
3,2039.0,2035.0,2027.0,2054.0,2044.0,2056.0,2074.0,2021,2072.0,2044.0,...,2032.0,2153.0,2093.0,2025.0,2064.0,2062.0,2060.0,2016.0,2134.0,2064.0
4,2041.0,2034.0,2026.0,2058.0,2045.0,2052.0,2065.0,2029,2069.0,2046.0,...,2038.0,2145.0,2083.0,2028.0,2061.0,2056.0,2052.0,2015.0,2142.0,2076.0


In [34]:
# Persist the ECG frame; the row index is written as the first CSV column (no index=False).
ecg.to_csv('data/DREAMER_ECG_Raw.csv')

In [53]:
def calculateBPMLib(seriesIn, colName, fs=256, window_seconds=30, step_seconds=1, verbose=True):
    """Estimate heart rate (BPM) over a sliding window of one ECG recording.

    Parameters
    ----------
    seriesIn : pandas.Series
        ECG samples. The number of non-missing values is counted and only that
        many leading samples are analysed, which removes NaN padding at the end
        of a column. The input is not modified.
    colName : str
        Label of the recording; only used for the progress print.
    fs : int, default 256
        Sampling rate in Hz, passed to ``hp.process``.
    window_seconds : float, default 30
        Window length. The default reproduces the previously hard-coded 7680
        samples (7680 / 256 Hz = 30 s; the old ``minuteHz`` name was misleading).
    step_seconds : float, default 1
        Hop between consecutive windows (previously hard-coded to 256 samples).
    verbose : bool, default True
        Print ``colName`` and the trimmed signal shape, as before. Pass False to
        keep the notebook output short.

    Returns
    -------
    list
        One ``'bpm'`` measure per window, or ``np.nan`` for a window on which
        HeartPy raises ``BadSignalWarning``. Empty when the trimmed signal is not
        longer than one window.

    Raises
    ------
    ValueError
        If the window or step works out to less than one sample.
    """
    window = int(window_seconds * fs)
    step = int(step_seconds * fs)
    if window <= 0 or step <= 0:
        raise ValueError("window_seconds * fs and step_seconds * fs must be at least one sample")

    # Trim by position rather than by index label so the result does not depend
    # on the Series having a default 0..n-1 index.
    n_valid = int(seriesIn.notna().sum())
    signal = seriesIn.iloc[:n_valid].to_numpy()

    if verbose:
        print(colName)
        print(signal.shape)

    bpmList = []
    # NOTE(review): `end` stops strictly before len(signal), so a window ending
    # exactly on the last sample is not evaluated. Kept as-is because downstream
    # cells assume the resulting row count (364); confirm whether this is intended.
    for end in range(window, len(signal), step):
        # Hand HeartPy its own copy so it can never alter neighbouring windows.
        currSlice = signal[end - window:end].copy()
        try:
            bpmList.append(hp.process(currSlice, fs)[1]['bpm'])
        except hp.exceptions.BadSignalWarning:
            bpmList.append(np.nan)

    return bpmList

In [78]:
# Sliding-window BPM series for every recording, assembled in a single pass.
# Building from a dict of Series aligns the columns on their row index, so
# shorter recordings are padded with NaN exactly as the column-wise concat did,
# without re-copying the growing frame on every iteration.
# dtype=float keeps a recording that yields no windows as a numeric (all-NaN) column.
hrDf = pd.DataFrame({
    col: pd.Series(calculateBPMLib(ecg[col], col), dtype=float)
    for col in ecg.columns
})

P1V1
(100864,)
(50944,)
P1V2
(100864,)
(33536,)
P1V3
(100864,)
(89088,)
P1V4
(100864,)
(42496,)
P1V5
(100864,)
(34816,)
P1V6
(100864,)
(48640,)
P1V7
(100864,)
(49152,)
P1V8
(100864,)
(100864,)
P1V9
(100864,)
(37120,)
P1V10
(100864,)
(17152,)


The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.


P1V11
(100864,)
(24576,)
P1V12
(100864,)
(46336,)
P1V13
(100864,)
(94208,)
P1V14
(100864,)
(43520,)
P1V15
(100864,)
(78848,)
P1V16
(100864,)
(49920,)
P1V17
(100864,)
(65536,)
P1V18
(100864,)
(47616,)
P2V1
(100864,)
(50944,)
P2V2
(100864,)
(33536,)
P2V3
(100864,)
(89088,)
P2V4
(100864,)
(42496,)
P2V5
(100864,)
(34816,)
P2V6
(100864,)
(48640,)
P2V7
(100864,)
(49152,)
P2V8
(100864,)
(100864,)
P2V9
(100864,)
(37120,)
P2V10
(100864,)
(17152,)
P2V11
(100864,)
(24576,)
P2V12
(100864,)
(46336,)
P2V13
(100864,)
(94208,)
P2V14
(100864,)
(43520,)
P2V15
(100864,)
(78848,)
P2V16
(100864,)
(49920,)
P2V17
(100864,)
(65536,)
P2V18
(100864,)
(47616,)
P3V1
(100864,)
(50944,)
P3V2
(100864,)
(33536,)
P3V3
(100864,)
(89088,)
P3V4
(100864,)
(42496,)
P3V5
(100864,)
(34816,)
P3V6
(100864,)
(48640,)
P3V7
(100864,)
(49152,)
P3V8
(100864,)
(100864,)
P3V9
(100864,)
(37120,)
P3V10
(100864,)
(17152,)
P3V11
(100864,)
(24576,)
P3V12
(100864,)
(46336,)
P3V13
(100864,)
(94208,)
P3V14
(100864,)
(43520,)
P3V15
(100864,)


A theoretically impossible result was found during the iteration
process for finding a smoothing spline with fp = s: s too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.


P4V9
(100864,)
(37120,)
P4V10
(100864,)
(17152,)
P4V11
(100864,)
(24576,)
P4V12
(100864,)
(46336,)
P4V13
(100864,)
(94208,)
P4V14
(100864,)
(43520,)
P4V15
(100864,)
(78848,)
P4V16
(100864,)
(49920,)
P4V17
(100864,)
(65536,)
P4V18
(100864,)
(47616,)
P5V1
(100864,)
(50944,)
P5V2
(100864,)
(33536,)
P5V3
(100864,)
(89088,)
P5V4
(100864,)
(42496,)
P5V5
(100864,)
(34816,)
P5V6
(100864,)
(48640,)
P5V7
(100864,)
(49152,)
P5V8
(100864,)
(100864,)
P5V9
(100864,)
(37120,)
P5V10
(100864,)
(17152,)
P5V11
(100864,)
(24576,)
P5V12
(100864,)
(46336,)
P5V13
(100864,)
(94208,)
P5V14
(100864,)
(43520,)
P5V15
(100864,)
(78848,)
P5V16
(100864,)
(49920,)
P5V17
(100864,)
(65536,)
P5V18
(100864,)
(47616,)
P6V1
(100864,)
(50944,)
P6V2
(100864,)
(33536,)
P6V3
(100864,)
(89088,)
P6V4
(100864,)
(42496,)
P6V5
(100864,)
(34816,)
P6V6
(100864,)
(48640,)
P6V7
(100864,)
(49152,)
P6V8
(100864,)
(100864,)
P6V9
(100864,)
(37120,)
P6V10
(100864,)
(17152,)
P6V11
(100864,)
(24576,)
P6V12
(100864,)
(46336,)
P6V13
(100864,)
(

  result = super(MaskedArray, self).mean(axis=axis,
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


P7V2
(100864,)
(33536,)
P7V3
(100864,)
(89088,)
P7V4
(100864,)
(42496,)
P7V5
(100864,)
(34816,)
P7V6
(100864,)
(48640,)
P7V7
(100864,)
(49152,)
P7V8
(100864,)
(100864,)
P7V9
(100864,)
(37120,)
P7V10
(100864,)
(17152,)
P7V11
(100864,)
(24576,)
P7V12
(100864,)
(46336,)
P7V13
(100864,)
(94208,)
P7V14
(100864,)
(43520,)
P7V15
(100864,)
(78848,)
P7V16
(100864,)
(49920,)
P7V17
(100864,)
(65536,)
P7V18
(100864,)
(47616,)
P8V1
(100864,)
(50944,)
P8V2
(100864,)
(33536,)
P8V3
(100864,)
(89088,)
P8V4
(100864,)
(42496,)
P8V5
(100864,)
(34816,)
P8V6
(100864,)
(48640,)
P8V7
(100864,)
(49152,)
P8V8
(100864,)
(100864,)
P8V9
(100864,)
(37120,)
P8V10
(100864,)
(17152,)
P8V11
(100864,)
(24576,)
P8V12
(100864,)
(46336,)
P8V13
(100864,)
(94208,)
P8V14
(100864,)
(43520,)
P8V15
(100864,)
(78848,)
P8V16
(100864,)
(49920,)
P8V17
(100864,)
(65536,)
P8V18
(100864,)
(47616,)
P9V1
(100864,)
(50944,)
P9V2
(100864,)
(33536,)
P9V3
(100864,)
(89088,)
P9V4
(100864,)
(42496,)
P9V5
(100864,)
(34816,)
P9V6
(100864,)
(4864

In [70]:
hrDf.shape

(364, 414)

In [79]:

# Drop every recording whose BPM column is mostly missing, and drop the same
# columns from the label frame so both stay aligned.
MAX_NAN_FRACTION = 0.7

# NOTE(review): `temp` was not defined in any visible cell. The number of dropped
# columns recorded below is consistent with it being the per-column NaN count of
# hrDf, so it is computed here explicitly -- confirm this matches the original intent.
temp = hrDf.isna().sum()

# len(hrDf) replaces the hard-coded row count (364, the hrDf.shape[0] shown above).
# Series.items() replaces iteritems(), which pandas 2.0 removed.
sparse_cols = [
    col for col, n_missing in temp.items()
    if n_missing / len(hrDf) > MAX_NAN_FRACTION
]

# Recomputing `temp` from the current hrDf makes this cell safe to re-run:
# already-dropped columns can no longer be requested a second time.
hrDf = hrDf.drop(columns=sparse_cols)
df_Participants_Data = df_Participants_Data.drop(columns=sparse_cols)

count = len(sparse_cols)
print(count)

93


In [80]:
df_Participants_Data.head()

Unnamed: 0,P1V1,P1V3,P1V4,P1V6,P1V7,P1V8,P1V9,P1V12,P1V13,P1V14,...,P23V7,P23V8,P23V9,P23V12,P23V13,P23V14,P23V15,P23V16,P23V17,P23V18
Valence,4.0,5.0,4.0,1.0,5.0,1.0,1.0,4.0,4.0,3.0,...,4.0,1.0,2.0,4.0,4.0,2.0,2.0,3.0,2.0,2.0
Arousal,3.0,4.0,3.0,2.0,4.0,2.0,2.0,3.0,3.0,1.0,...,2.0,5.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0,4.0
Dominance,2.0,4.0,2.0,5.0,4.0,4.0,3.0,4.0,2.0,2.0,...,1.0,5.0,2.0,2.0,3.0,2.0,2.0,2.0,4.0,2.0


In [90]:
# Save the (filtered) label frame. index=False omits the row labels, so the file does
# not record which row is Valence/Arousal/Dominance.
# NOTE(review): another cell writes this same path with the index included.
df_Participants_Data.to_csv('data/DREAMER_emotLabels.csv', index=False)

In [83]:
# Save the filtered BPM frame without the row index.
# NOTE(review): the cell that saves cleanedDf writes to this same path.
hrDf.to_csv('data/preprocessedDREAMER.csv', index=False)

In [None]:
# Count of non-missing BPM windows for each recording, in column order.
lenList = [len(hrDf[col].dropna()) for col in hrDf.columns]

In [76]:
def Participants_Data(raw, n_participants=23, n_videos=18, wide=True):
    """Extract the self-reported ratings (and per-recording metadata) from DREAMER.

    Parameters
    ----------
    raw : mapping
        The loaded DREAMER structure. Each participant entry is read from
        ``raw['DREAMER'][0,0]['Data'][0,participant]`` via its 'Age', 'Gender',
        'ScoreValence', 'ScoreArousal' and 'ScoreDominance' fields.
    n_participants : int, default 23
        Number of participants to read.
    n_videos : int, default 18
        Number of videos per participant; cannot exceed the 18 entries of the
        built-in video name / target emotion lists.
    wide : bool, default True
        True returns the wide rating frame (the behaviour of the plain
        ``Participants_Data(raw)`` call). False returns the long metadata frame
        that the function previously built internally and then discarded.

    Returns
    -------
    pandas.DataFrame
        ``wide=True``: rows 'Valence', 'Arousal', 'Dominance' and one column per
        recording named ``P<participant>V<video>`` (1-based). Values are stored as
        float64 rather than generic Python objects.
        ``wide=False``: one row per recording with columns 'Age', 'Gender',
        'Participant', 'Video', 'Video_Name', 'Target_Emotion', 'Valence',
        'Arousal', 'Dominance'.

    Raises
    ------
    ValueError
        If ``n_videos`` is larger than the number of known videos.
    """
    # Per-video constants, defined once instead of being rebuilt on every iteration.
    video_names = ['Searching for Bobby Fischer','D.O.A.', 'The Hangover', 'The Ring', '300',
                   'National Lampoon\'s VanWilder', 'Wall-E', 'Crash', 'My Girl', 'The Fly',
                   'Pride and Prejudice', 'Modern Times', 'Remember the Titans', 'Gentlemans Agreement',
                   'Psycho', 'The Bourne Identitiy', 'The Shawshank Redemption', 'The Departed']
    target_emotions = ['calmness', 'surprise', 'amusement', 'fear', 'excitement', 'disgust',
                       'happiness', 'anger', 'sadness', 'disgust', 'calmness', 'amusement',
                       'happiness', 'anger', 'fear', 'excitement', 'sadness', 'surprise']
    long_columns = ['Age','Gender','Participant','Video','Video_Name','Target_Emotion','Valence','Arousal','Dominance']
    rating_names = ['Valence', 'Arousal', 'Dominance']

    if n_videos > len(video_names):
        raise ValueError("n_videos exceeds the %d known DREAMER videos" % len(video_names))

    cols = []
    records = []
    for participant in range(n_participants):
        # Resolve the participant entry and its per-participant fields once.
        entry = raw['DREAMER'][0,0]['Data'][0,participant]
        age = entry['Age'][0][0][0]
        gender = entry['Gender'][0][0][0]
        valence = entry['ScoreValence'][0,0]
        arousal = entry['ScoreArousal'][0,0]
        dominance = entry['ScoreDominance'][0,0]
        for video in range(n_videos):
            cols.append("P" + str(participant+1) + "V" + str(video+1))
            records.append((
                age,
                gender,
                participant+1,
                video+1,
                video_names[video],
                target_emotions[video],
                float(valence[video,0]),
                float(arousal[video,0]),
                float(dominance[video,0]),
            ))

    # Long format: one row per (participant, video) recording.
    b = pd.DataFrame(records, columns=long_columns)
    if not wide:
        return b

    # Wide format: ratings as rows, recordings as columns, in the same order as `cols`.
    c = pd.DataFrame(b[rating_names].to_numpy(dtype=float).T, columns=cols, index=rating_names)
    return c

In [77]:
# Wide label frame: rows Valence/Arousal/Dominance, one column per "P<p>V<v>" recording.
df_Participants_Data = Participants_Data(raw)
df_Participants_Data.head()

Unnamed: 0,P1V1,P1V2,P1V3,P1V4,P1V5,P1V6,P1V7,P1V8,P1V9,P1V10,...,P23V9,P23V10,P23V11,P23V12,P23V13,P23V14,P23V15,P23V16,P23V17,P23V18
Valence,4.0,3.0,5.0,4.0,4.0,1.0,5.0,1.0,1.0,5.0,...,2.0,2.0,4.0,4.0,4.0,2.0,2.0,3.0,2.0,2.0
Arousal,3.0,3.0,4.0,3.0,4.0,2.0,4.0,2.0,2.0,3.0,...,2.0,5.0,1.0,2.0,3.0,2.0,2.0,3.0,2.0,4.0
Dominance,2.0,1.0,4.0,2.0,4.0,5.0,4.0,4.0,3.0,4.0,...,2.0,5.0,1.0,2.0,3.0,2.0,2.0,2.0,4.0,2.0


In [52]:
# Save the label frame with its row index (Valence/Arousal/Dominance) as the first column.
# NOTE(review): another cell writes this same path with index=False.
df_Participants_Data.to_csv('data/DREAMER_emotLabels.csv')

In [8]:
# NOTE(review): the output stored under this cell shows long-format columns (Age, Gender,
# ..., Dominance), but the Participants_Data defined in this notebook returns the wide
# ratings frame -- this output presumably came from an earlier definition; confirm.
df_Participants_Data = Participants_Data(raw)
df_Participants_Data.head()

Unnamed: 0,Age,Gender,Participant,Video,Video_Name,Target_Emotion,Valence,Arousal,Dominance
0,22,male,1,1,Searching for Bobby Fischer,calmness,4,3,2
1,22,male,1,2,D.O.A.,surprise,3,3,1
2,22,male,1,3,The Hangover,amusement,5,4,4
3,22,male,1,4,The Ring,fear,4,3,2
4,22,male,1,5,300,excitement,4,4,4


In [4]:
df_Features.head()

Unnamed: 0,ECG_Rate_Mean,HRV_RMSSD,HRV_MeanNN,HRV_SDNN,HRV_SDSD,HRV_CVNN,HRV_CVSD,HRV_MedianNN,HRV_MadNN,HRV_MCVNN,...,HRV_C2d,HRV_C2a,HRV_SD2d,HRV_SD2a,HRV_Cd,HRV_Ca,HRV_SDNNd,HRV_SDNNa,HRV_ApEn,HRV_SampEn
0,1.01935,0.937463,0.980715,0.942895,0.933189,0.961436,0.955897,0.98173,1.0,1.018617,...,1.006937,0.992211,0.947206,0.940288,1.008489,0.990622,0.946925,0.938547,1.438378,0.858731
1,0.901911,1.646394,1.108991,1.212197,1.641699,1.093063,1.484588,1.113345,1.104762,0.992243,...,1.024839,0.976554,1.209966,1.181118,1.020996,0.980174,1.228156,1.203344,1.174425,1.351139
2,1.032777,0.973727,0.967888,1.094067,0.968126,1.130365,1.006033,0.980392,1.111111,1.133333,...,0.778085,1.423316,0.974122,1.317528,0.802029,1.339746,0.976209,1.261751,2.239887,0.853926
3,0.952182,1.235442,1.049724,1.190338,1.236088,1.133953,1.176921,1.056122,1.142857,1.082126,...,0.861915,1.322482,1.128573,1.396073,0.856808,1.311056,1.126642,1.392277,1.317776,0.804541
4,1.063239,0.74778,0.942383,0.685394,0.745794,0.727298,0.793499,0.933333,0.786765,0.842962,...,1.035208,0.964293,0.751168,0.724984,1.074476,0.930645,0.766086,0.712975,1.371131,1.30124


In [22]:
# NOTE(review): `testList` is not defined in any visible cell; this fails on a fresh kernel.
len(testList)

414

In [None]:
# Build the feature frame directly; the previous empty-DataFrame assignment was
# overwritten on the very next line and has been removed.
# NOTE(review): the preprocessing_and_feature defined in this notebook returns raw
# ECG columns, whereas the df_Features.head() output shown elsewhere lists HRV
# features -- confirm which definition this cell is meant to run against.
df_Features = preprocessing_and_feature(raw)

In [32]:
# Keep only the feature columns that contain no missing value at all.
cleanedDf = df_Features.dropna(axis=1, how='any')
# NOTE(review): the cell that saves hrDf writes to this same path.
cleanedDf.to_csv("data/preprocessedDREAMER.csv", index=False)

(414,)

In [14]:
# Classification target: the emotion each video was meant to elicit.
# NOTE(review): this needs the long-format frame with a 'Target_Emotion' column; the
# Participants_Data defined in this notebook returns the wide ratings frame, which has
# no such column.
labels = df_Participants_Data['Target_Emotion']
labels.shape

(414,)

In [10]:
# Reload the saved feature table from disk.
# NOTE(review): two different cells write this path (hrDf and cleanedDf), so what is
# loaded here depends on which of them ran last.
cleanedDf = pd.read_csv("data/preprocessedDREAMER.csv")

In [13]:
# Five random splits with 30% of the rows held out for testing, seeded for repeatability.
# NOTE(review): rows are shuffled without regard to participant -- confirm that mixing
# one participant's recordings across train and test is acceptable here.
cvf = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

In [21]:
# Seeded 1000-tree random forest, evaluated on every split produced by `cvf`;
# `score` is the average of the per-split scores.
rfModel = RandomForestClassifier(n_estimators=1000, random_state=0)
fold_scores = cross_val_score(rfModel, cleanedDf, labels, cv=cvf)
score = np.mean(fold_scores)

In [22]:
score

0.1648

In [20]:
cleanedDf.shape

(414, 41)

# Labels

In [84]:
df_Participants_Data.head()

Unnamed: 0,P1V1,P1V3,P1V4,P1V6,P1V7,P1V8,P1V9,P1V12,P1V13,P1V14,...,P23V7,P23V8,P23V9,P23V12,P23V13,P23V14,P23V15,P23V16,P23V17,P23V18
Valence,4.0,5.0,4.0,1.0,5.0,1.0,1.0,4.0,4.0,3.0,...,4.0,1.0,2.0,4.0,4.0,2.0,2.0,3.0,2.0,2.0
Arousal,3.0,4.0,3.0,2.0,4.0,2.0,2.0,3.0,3.0,1.0,...,2.0,5.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0,4.0
Dominance,2.0,4.0,2.0,5.0,4.0,4.0,3.0,4.0,2.0,2.0,...,1.0,5.0,2.0,2.0,3.0,2.0,2.0,2.0,4.0,2.0


[4.0 3.0 2.0]
[5.0 4.0 4.0]
[4.0 3.0 2.0]
[1.0 2.0 5.0]
[5.0 4.0 4.0]
[1.0 2.0 4.0]
[1.0 2.0 3.0]
[4.0 3.0 4.0]
[4.0 3.0 2.0]
[3.0 1.0 2.0]
[2.0 5.0 5.0]
[3.0 2.0 2.0]
[1.0 2.0 5.0]
[3.0 5.0 5.0]
[3.0 2.0 2.0]
[5.0 5.0 4.0]
[1.0 5.0 4.0]
[2.0 3.0 3.0]
[4.0 2.0 2.0]
[2.0 5.0 4.0]
[2.0 3.0 3.0]
[4.0 3.0 2.0]
[5.0 4.0 4.0]
[2.0 3.0 3.0]
[1.0 5.0 5.0]
[2.0 5.0 5.0]
[1.0 3.0 5.0]
[3.0 5.0 4.0]
[3.0 4.0 2.0]
[5.0 5.0 4.0]
[2.0 5.0 5.0]
[3.0 3.0 3.0]
[5.0 3.0 3.0]
[1.0 4.0 4.0]
[1.0 2.0 3.0]
[5.0 3.0 3.0]
[4.0 4.0 5.0]
[3.0 2.0 2.0]
[4.0 2.0 3.0]
[4.0 2.0 1.0]
[2.0 2.0 4.0]
[3.0 3.0 2.0]
[3.0 3.0 1.0]
[5.0 4.0 4.0]
[1.0 5.0 4.0]
[4.0 4.0 3.0]
[5.0 3.0 4.0]
[1.0 4.0 4.0]
[1.0 3.0 3.0]
[3.0 2.0 1.0]
[3.0 2.0 2.0]
[2.0 2.0 2.0]
[1.0 2.0 2.0]
[3.0 1.0 1.0]
[1.0 3.0 4.0]
[2.0 3.0 3.0]
[4.0 2.0 2.0]
[5.0 4.0 4.0]
[1.0 4.0 4.0]
[3.0 3.0 4.0]
[4.0 3.0 3.0]
[1.0 4.0 4.0]
[1.0 2.0 2.0]
[4.0 1.0 2.0]
[4.0 4.0 4.0]
[3.0 2.0 2.0]
[3.0 3.0 3.0]
[3.0 4.0 3.0]
[1.0 4.0 4.0]
[2.0 4.0 4.0]
[3.0 3.0 3.0]
[4.0 4