# DREAMER Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from scipy.io import loadmat
import neurokit2 as nk

In [12]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split

In [2]:
def preprocessing_and_feature(data, n_participants=23, n_videos=18, sampling_rate=256):
    """Extract baseline-normalised ECG features for every participant/video trial.

    For each trial, both ECG channels (columns 0 and 1 of the recording, called
    "left" and "right" in this notebook) are processed with NeuroKit2 for the
    baseline segment and for the stimulus segment. The interval-related
    features of the stimulus are divided by those of the baseline, and the two
    channel results are averaged.

    Parameters
    ----------
    data : mapping
        The loaded DREAMER structure (the notebook passes the result of
        ``scipy.io.loadmat``). It is indexed as
        ``data['DREAMER'][0, 0]['Data'][0, participant]['ECG'][0, 0]``.
    n_participants : int, default 23
        Number of participants to iterate over.
    n_videos : int, default 18
        Number of videos (trials) per participant.
    sampling_rate : int, default 256
        Sampling rate handed to ``nk.ecg_process``.

    Returns
    -------
    pandas.DataFrame
        One row per trial in participant-major, video-minor order, with a
        fresh 0..n-1 index. An empty DataFrame is returned when there are no
        trials to process.
    """
    # Use the argument rather than a notebook-level global, so the function
    # works regardless of which cells have been executed.
    participants = data['DREAMER'][0, 0]['Data']

    trial_features = []
    for participant in range(n_participants):
        ecg = participants[0, participant]['ECG'][0, 0]
        baseline = ecg['baseline'][0, 0]
        stimuli = ecg['stimuli'][0, 0]
        for video in range(n_videos):
            channel_ratios = []
            for channel in (0, 1):
                base_signals, _ = nk.ecg_process(
                    baseline[video, 0][:, channel], sampling_rate=sampling_rate
                )
                stim_signals, _ = nk.ecg_process(
                    stimuli[video, 0][:, channel], sampling_rate=sampling_rate
                )
                # Divide stimulus features by baseline features.
                # It would be interesting to compare classification accuracy
                # without this normalisation.
                # NOTE(review): ecg_intervalrelated is called without a
                # sampling rate, as in the original code — confirm that the
                # library default matches the 256 Hz recordings.
                channel_ratios.append(
                    nk.ecg_intervalrelated(stim_signals)
                    / nk.ecg_intervalrelated(base_signals)
                )
            # Average the two channels. It would be interesting to compare
            # against keeping both channels as separate features.
            trial_features.append((channel_ratios[0] + channel_ratios[1]) / 2)

    if not trial_features:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(trial_features, ignore_index=True)

In [2]:
# Load the DREAMER MATLAB file; the dataset is read below through raw['DREAMER'].
raw = loadmat("data/DREAMER.mat")

In [None]:
# Baseline-normalised ECG features, one row per participant/video trial.
df_Features = preprocessing_and_feature(raw)

In [4]:
# Preview the first rows of the extracted feature table.
df_Features.head()

Unnamed: 0,ECG_Rate_Mean,HRV_RMSSD,HRV_MeanNN,HRV_SDNN,HRV_SDSD,HRV_CVNN,HRV_CVSD,HRV_MedianNN,HRV_MadNN,HRV_MCVNN,...,HRV_C2d,HRV_C2a,HRV_SD2d,HRV_SD2a,HRV_Cd,HRV_Ca,HRV_SDNNd,HRV_SDNNa,HRV_ApEn,HRV_SampEn
0,1.01935,0.937463,0.980715,0.942895,0.933189,0.961436,0.955897,0.98173,1.0,1.018617,...,1.006937,0.992211,0.947206,0.940288,1.008489,0.990622,0.946925,0.938547,1.438378,0.858731
1,0.901911,1.646394,1.108991,1.212197,1.641699,1.093063,1.484588,1.113345,1.104762,0.992243,...,1.024839,0.976554,1.209966,1.181118,1.020996,0.980174,1.228156,1.203344,1.174425,1.351139
2,1.032777,0.973727,0.967888,1.094067,0.968126,1.130365,1.006033,0.980392,1.111111,1.133333,...,0.778085,1.423316,0.974122,1.317528,0.802029,1.339746,0.976209,1.261751,2.239887,0.853926
3,0.952182,1.235442,1.049724,1.190338,1.236088,1.133953,1.176921,1.056122,1.142857,1.082126,...,0.861915,1.322482,1.128573,1.396073,0.856808,1.311056,1.126642,1.392277,1.317776,0.804541
4,1.063239,0.74778,0.942383,0.685394,0.745794,0.727298,0.793499,0.933333,0.786765,0.842962,...,1.035208,0.964293,0.751168,0.724984,1.074476,0.930645,0.766086,0.712975,1.371131,1.30124


In [32]:
# Keep only the feature columns that are free of NaN in every trial, then
# cache the cleaned table on disk (without the index column).
cleanedDf = df_Features.dropna(axis="columns", how="any")
cleanedDf.to_csv("data/preprocessedDREAMER.csv", index=False)

(414,)

In [7]:
def Participants_Data(raw):
    """Build a per-trial metadata table from the loaded DREAMER structure.

    Parameters
    ----------
    raw : mapping
        The loaded DREAMER structure, indexed as
        ``raw['DREAMER'][0, 0]['Data'][0, participant]``.

    Returns
    -------
    pandas.DataFrame
        23 * 18 rows in participant-major, video-minor order with the columns
        Age, Gender, Participant, Video, Video_Name, Target_Emotion, Valence,
        Arousal and Dominance. Participant and Video are 1-based.
    """
    n_participants, n_videos = 23, 18
    column_names = ['Age', 'Gender', 'Participant', 'Video', 'Video_Name',
                    'Target_Emotion', 'Valence', 'Arousal', 'Dominance']
    # Lookup tables indexed by the 0-based video number.
    video_titles = [
        'Searching for Bobby Fischer', 'D.O.A.', 'The Hangover', 'The Ring', '300',
        'National Lampoon\'s VanWilder', 'Wall-E', 'Crash', 'My Girl', 'The Fly',
        'Pride and Prejudice', 'Modern Times', 'Remember the Titans', 'Gentlemans Agreement',
        'Psycho', 'The Bourne Identitiy', 'The Shawshank Redemption', 'The Departed',
    ]
    target_emotions = [
        'calmness', 'surprise', 'amusement', 'fear', 'excitement', 'disgust',
        'happiness', 'anger', 'sadness', 'disgust', 'calmness', 'amusement',
        'happiness', 'anger', 'fear', 'excitement', 'sadness', 'surprise',
    ]

    subjects = raw['DREAMER'][0, 0]['Data']
    # Object array so strings and numbers can share one table.
    table = np.zeros((n_participants, n_videos, len(column_names)), dtype=object)
    for p_idx in range(n_participants):
        subject = subjects[0, p_idx]
        age = subject['Age'][0][0][0]
        gender = subject['Gender'][0][0][0]
        valence = subject['ScoreValence'][0, 0]
        arousal = subject['ScoreArousal'][0, 0]
        dominance = subject['ScoreDominance'][0, 0]
        for v_idx in range(n_videos):
            values = (
                age,
                gender,
                p_idx + 1,
                v_idx + 1,
                video_titles[v_idx],
                target_emotions[v_idx],
                valence[v_idx, 0].astype(float),
                arousal[v_idx, 0].astype(float),
                dominance[v_idx, 0].astype(float),
            )
            # Assign cell by cell so every value is stored as-is.
            for col, value in enumerate(values):
                table[p_idx, v_idx, col] = value

    flat = table.reshape((n_participants * n_videos, table.shape[2]))
    return pd.DataFrame(flat, columns=column_names)

In [8]:
# Per-trial metadata (participant, video, target emotion, self-reported scores).
df_Participants_Data = Participants_Data(raw)
df_Participants_Data.head()

Unnamed: 0,Age,Gender,Participant,Video,Video_Name,Target_Emotion,Valence,Arousal,Dominance
0,22,male,1,1,Searching for Bobby Fischer,calmness,4,3,2
1,22,male,1,2,D.O.A.,surprise,3,3,1
2,22,male,1,3,The Hangover,amusement,5,4,4
3,22,male,1,4,The Ring,fear,4,3,2
4,22,male,1,5,300,excitement,4,4,4


In [14]:
# Classification target: the Target_Emotion assigned to each trial's video.
labels = df_Participants_Data['Target_Emotion']
# Sanity check: there should be one label per trial.
labels.shape

(414,)

In [10]:
# Reload the cached feature table from disk (same path the cleaned features were written to).
cleanedDf = pd.read_csv("data/preprocessedDREAMER.csv")

In [13]:
# Five random splits with 30% of the trials held out each time; fixed seed for repeatability.
# NOTE(review): splits are drawn over trials, not participants — confirm that
# mixing one participant's trials across train and test is intended.
cvf = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

In [21]:
# Random forest with 1000 trees and a fixed seed, scored on each split of
# `cvf`; `score` is the mean of the per-split scores.
rfModel = RandomForestClassifier(n_estimators=1000, random_state=0)
split_scores = cross_val_score(rfModel, cleanedDf, labels, cv=cvf)
score = split_scores.mean()

In [22]:
# Mean cross-validation score of the random forest across the splits.
score

0.1648

In [20]:
# (rows, columns) of the cleaned feature table.
cleanedDf.shape

(414, 41)