In [1]:
import os
import random

import numpy as np
import pandas as pd
import scipy
import scipy.io as io

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from utils import *
from eeg_feature_extraction.eeg_utils import *

In [2]:
# Read the duplicate-sentences table and keep its `sentence` column as a plain Python list.
duplicate_sents_data = pd.read_csv('./task_materials/duplicate_sentences.csv')
duplicate_sents = [sentence for sentence in duplicate_sents_data['sentence'].values]

In [3]:
# Notebook configuration: everything a reader might want to tune lives in this cell.

# Number of top-ranked EEG features (per frequency band) kept for further analysis and modelling.
k = 10

# Eye-tracking (ET) feature for which the most important EEG features are extracted.
et_feature = 'FFD'

# Frequency-domain binning strategy: 'avg' for averaging or 'max' for max-pooling.
merge = 'avg'

# Fail fast on a mistyped configuration instead of failing (or silently misbehaving) further down.
if not isinstance(k, int) or k < 1:
    raise ValueError("k must be a positive integer, got {!r}".format(k))
if merge not in ('avg', 'max'):
    raise ValueError("merge must be one of 'avg' or 'max', got {!r}".format(merge))

In [4]:
# Draw random sentence indices to serve as the held-out test set (intended to be ~10% of the
# dataset, restricted to index ranges for which every participant has data).
# The seed is fixed and the draws happen in a fixed order (task2 first, then the three task3
# ranges), so the selection is reproducible.
# NOTE(review): np.random.randint samples with replacement, so an index can appear more than
# once within a draw -- confirm that duplicates are acceptable for the held-out set.
np.random.seed(42)

held_out_sents_task2 = sorted(np.random.randint(100, 300, 30))

# task3 indices are drawn from three separate index ranges: (low, high, number of draws).
held_out_sents_task3_first, held_out_sents_task3_second, held_out_sents_task3_third = (
    sorted(np.random.randint(low, high, n_draws))
    for low, high, n_draws in ((0, 170, 20), (230, 265, 10), (315, 355, 10))
)

# Append the second and third draws to the first list in place; the combined list is then
# exposed under the task3 name (both names refer to the same list object).
held_out_sents_task3_first += held_out_sents_task3_second + held_out_sents_task3_third
held_out_sents_task3 = held_out_sents_task3_first

# Kept for reference: how the held-out index files were written to disk.
#np.savetxt('eeg_feature_extraction' + '\\held_out_test_set\\' + 'held_out_sents_task2.txt', held_out_sents_task2)
#np.savetxt('eeg_feature_extraction' + '\\held_out_test_set\\' + 'held_out_sents_task3.txt', held_out_sents_task3)

In [5]:
# Load the previously saved held-out sentence indices from disk (one text file per task).
# The directory is built with os.path.join so the notebook also runs on non-Windows systems.
path = os.path.join(os.getcwd(), 'eeg_feature_extraction', 'held_out_test_set')

# os.listdir returns entries in arbitrary order, so sort the file paths to make the positional
# unpacking below deterministic.
# NOTE(review): assumes the directory holds exactly the two index files and that the task2 file
# sorts before the task3 file (e.g. held_out_sents_task2.txt / held_out_sents_task3.txt) -- confirm.
files = sorted(
    os.path.join(path, file)
    for file in os.listdir(path)
    if not file.endswith('.ipynb_checkpoints')
)

# Each file is read as an integer array and converted to a plain Python list.
held_out_sents = [np.loadtxt(file, dtype=int).tolist() for file in files]
held_out_sents_task2, held_out_sents_task3 = held_out_sents[0], held_out_sents[1]

In [6]:
# Alpha band: build one feature matrix per task, excluding the held-out sentences.
X_NR = stacked_freq_per_sbj(
    'task2', 'alpha', merge, et_feature,
    held_out_indices=held_out_sents_task2,
)
X_AR = stacked_freq_per_sbj(
    'task3', 'alpha', merge, et_feature,
    held_out_indices=held_out_sents_task3,
)

# Column-vector labels: every task2 row is labelled 0, every task3 row is labelled 1.
Y_NR = np.zeros((X_NR.shape[0], 1))
Y_AR = np.ones((X_AR.shape[0], 1))

# Stack both tasks row-wise, then shuffle features and labels together with a fixed seed.
X = np.vstack((X_NR, X_AR))
y = np.vstack((Y_NR, Y_AR))
X, y = shuffle(X, y, random_state=42)

# 80/20 train/test split with a fixed seed; labels are flattened to 1-D afterwards.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
y_train = y_train.ravel()
y_test = y_test.ravel()

In [7]:
# Fit the 'RandomForest' classifier on the alpha-band split.
# NOTE(review): clf_fit is defined outside this notebook; its return value is presumably one
# weight (importance) per feature -- confirm against its definition.
alpha_feature_weights = clf_fit(X_train, X_test, y_train, y_test, 'RandomForest')

# Indices of the k highest-weighted features, most important first.
top_alpha_features = np.argsort(alpha_feature_weights)[::-1][:k]

# os.path.join keeps the output path portable (a hard-coded '\\' only works on Windows);
# fmt='%d' stores the integer indices as integers instead of savetxt's default float notation.
np.savetxt(
    os.path.join('eeg_feature_extraction', 'alpha_features_' + et_feature + '.txt'),
    top_alpha_features,
    fmt='%d',
)

0.9326907096559748


In [8]:
# Theta band: build one feature matrix per task, excluding the held-out sentences.
X_NR = stacked_freq_per_sbj(
    'task2', 'theta', merge, et_feature,
    held_out_indices=held_out_sents_task2,
)
X_AR = stacked_freq_per_sbj(
    'task3', 'theta', merge, et_feature,
    held_out_indices=held_out_sents_task3,
)

# Column-vector labels: every task2 row is labelled 0, every task3 row is labelled 1.
Y_NR = np.zeros((X_NR.shape[0], 1))
Y_AR = np.ones((X_AR.shape[0], 1))

# Stack both tasks row-wise, then shuffle features and labels together with a fixed seed.
X = np.vstack((X_NR, X_AR))
y = np.vstack((Y_NR, Y_AR))
X, y = shuffle(X, y, random_state=42)

# 80/20 train/test split with a fixed seed; labels are flattened to 1-D afterwards.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
y_train = y_train.ravel()
y_test = y_test.ravel()

In [9]:
# Fit the 'RandomForest' classifier on the theta-band split.
# NOTE(review): clf_fit is defined outside this notebook; its return value is presumably one
# weight (importance) per feature -- confirm against its definition.
theta_feature_weights = clf_fit(X_train, X_test, y_train, y_test, 'RandomForest')

# Indices of the k highest-weighted features, most important first.
top_theta_features = np.argsort(theta_feature_weights)[::-1][:k]

# os.path.join keeps the output path portable (a hard-coded '\\' only works on Windows);
# fmt='%d' stores the integer indices as integers instead of savetxt's default float notation.
np.savetxt(
    os.path.join('eeg_feature_extraction', 'theta_features_' + et_feature + '.txt'),
    top_theta_features,
    fmt='%d',
)

0.9301423743836906


In [10]:
# Beta band: build one feature matrix per task, excluding the held-out sentences.
X_NR = stacked_freq_per_sbj(
    'task2', 'beta', merge, et_feature,
    held_out_indices=held_out_sents_task2,
)
X_AR = stacked_freq_per_sbj(
    'task3', 'beta', merge, et_feature,
    held_out_indices=held_out_sents_task3,
)

# Column-vector labels: every task2 row is labelled 0, every task3 row is labelled 1.
Y_NR = np.zeros((X_NR.shape[0], 1))
Y_AR = np.ones((X_AR.shape[0], 1))

# Stack both tasks row-wise, then shuffle features and labels together with a fixed seed.
X = np.vstack((X_NR, X_AR))
y = np.vstack((Y_NR, Y_AR))
X, y = shuffle(X, y, random_state=42)

# 80/20 train/test split with a fixed seed; labels are flattened to 1-D afterwards.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
y_train = y_train.ravel()
y_test = y_test.ravel()

In [11]:
# Fit the 'RandomForest' classifier on the beta-band split.
# NOTE(review): clf_fit is defined outside this notebook; its return value is presumably one
# weight (importance) per feature -- confirm against its definition.
beta_feature_weights = clf_fit(X_train, X_test, y_train, y_test, 'RandomForest')

# Indices of the k highest-weighted features, most important first.
top_beta_features = np.argsort(beta_feature_weights)[::-1][:k]

# os.path.join keeps the output path portable (a hard-coded '\\' only works on Windows);
# fmt='%d' stores the integer indices as integers instead of savetxt's default float notation.
np.savetxt(
    os.path.join('eeg_feature_extraction', 'beta_features_' + et_feature + '.txt'),
    top_beta_features,
    fmt='%d',
)

0.9680904105035733


In [12]:
# Gamma band: build one feature matrix per task, excluding the held-out sentences.
X_NR = stacked_freq_per_sbj(
    'task2', 'gamma', merge, et_feature,
    held_out_indices=held_out_sents_task2,
)
X_AR = stacked_freq_per_sbj(
    'task3', 'gamma', merge, et_feature,
    held_out_indices=held_out_sents_task3,
)

# Column-vector labels: every task2 row is labelled 0, every task3 row is labelled 1.
Y_NR = np.zeros((X_NR.shape[0], 1))
Y_AR = np.ones((X_AR.shape[0], 1))

# Stack both tasks row-wise, then shuffle features and labels together with a fixed seed.
X = np.vstack((X_NR, X_AR))
y = np.vstack((Y_NR, Y_AR))
X, y = shuffle(X, y, random_state=42)

# 80/20 train/test split with a fixed seed; labels are flattened to 1-D afterwards.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
y_train = y_train.ravel()
y_test = y_test.ravel()

In [13]:
# Fit the 'RandomForest' classifier on the gamma-band split.
# NOTE(review): clf_fit is defined outside this notebook; its return value is presumably one
# weight (importance) per feature -- confirm against its definition.
gamma_feature_weights = clf_fit(X_train, X_test, y_train, y_test, 'RandomForest')

# Indices of the k highest-weighted features, most important first.
top_gamma_features = np.argsort(gamma_feature_weights)[::-1][:k]

# os.path.join keeps the output path portable (a hard-coded '\\' only works on Windows);
# fmt='%d' stores the integer indices as integers instead of savetxt's default float notation.
np.savetxt(
    os.path.join('eeg_feature_extraction', 'gamma_features_' + et_feature + '.txt'),
    top_gamma_features,
    fmt='%d',
)

0.9861503517810648
