In [1]:
import json
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

# 1.Preparation

In [2]:
def get_dict(file_name):
  """Parse the JSON file at `file_name` and return the decoded object."""
  with open(file_name) as handle:
    # Returning inside the `with` still closes the file on exit.
    return json.load(handle)

In [3]:
# Subjects used to fit the models vs. the subject held out for evaluation.
# Data collected from P14 is discarded: that participant was sleeping.
train_subjects = [
    'P01', 'P04', 'P06', 'P07',
    'P09', 'P11', 'P12',
]
test_subjects = ['P13']

Event IDs: [  11   12   13     21   22   23     31   32   33    41   42
   43    111  112  113 121  122  123    131  132  133 
  141  142  143   211  212  213   221  222  223   231  232
  233  241  242  243 ]

Each subject has 36 events, and each event holds a series of data points (3175 values per event in the files loaded below).

In [4]:
# Load the nested {subject: {event ID: values}} dictionaries for both splits.
train_dict, test_dict = (
    get_dict(path) for path in ('train_3175.json', 'test_3175.json')
)

# Subject keys available in each split.
for split in (train_dict, test_dict):
    print(split.keys())

# 'P01' is a subject key: how many events does that subject have?
p01_events = train_dict['P01']
print(len(p01_events))
# '112' is an event key: how many values are stored for that event?
print(len(p01_events['112']))

dict_keys(['P01', 'P04', 'P06', 'P07', 'P09', 'P11', 'P12'])
dict_keys(['P13', 'P14'])
36
3175


# 2. Predict whether the music which subjects are listening to have lyrics

###### Subjects were listening to different types of music. In this task, our model predicts whether the music a subject was listening to has lyrics or not. The event ID indicates whether a piece of music has lyrics: take the number formed by the digits before the last digit of the event ID (e.g. 11 for event ID 112, 2 for event ID 21). According to the dataset, if that number is less than 5, the music includes lyrics. If not, the music does not include lyrics.

###### We label 1 if music has lyrics, 0 if music does not have lyrics

### 2.1 Get training data

In [23]:
# Collect one feature row and one binary "has lyrics" label per training event.
X_train, y_train = [], []
for subject in train_subjects:
    for event_id, signal in train_dict[subject].items():
        X_train.append(signal)
        # The digits before the last one, read as a number: below 5 means lyrics.
        y_train.append(1 if int(event_id[:-1]) < 5 else 0)
X_train = np.array(X_train)
y_train = np.array(y_train)


In [24]:
# Dimensions first (features, then labels) ...
for arr in (X_train, y_train):
    print(arr.shape)

# ... followed by the raw contents in the same order.
for arr in (X_train, y_train):
    print(arr)

(252, 3175)
(252,)
[[-1.69119553e-06 -1.70719092e-06 -1.70571593e-06 ... -2.82413139e-06
  -2.99672046e-06 -3.12866468e-06]
 [ 2.51033489e-07  8.28635578e-08 -5.07200770e-08 ...  1.46581121e-06
   1.40723283e-06  1.34682366e-06]
 [ 2.32050990e-07  2.68842057e-07  2.88497187e-07 ... -2.18505816e-07
  -3.29256592e-07 -4.77714800e-07]
 ...
 [-4.31661990e-07 -5.58612147e-07 -6.61924365e-07 ...  1.04634445e-06
   1.01671609e-06  8.77562638e-07]
 [-3.99281651e-08  1.19493492e-07  1.76883307e-07 ...  1.70887086e-06
   1.73587248e-06  1.74346539e-06]
 [-5.51766142e-07 -5.23061655e-07 -4.25232392e-07 ... -1.94006106e-07
  -1.97847205e-07 -1.63716152e-07]]
[1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

### 2.2 Train model

In [25]:
# Fit linear discriminant analysis on the full training matrix;
# the fitted estimator is kept in `clf` for the evaluation cells below.
lda_model = LinearDiscriminantAnalysis()
clf = lda_model.fit(X_train, y_train)

### 2.3 Cross validation

In [26]:
# 5-fold cross validation.
# cross_val_score clones the estimator and refits the clone on each training
# fold, so fitting `clf` again before passing it in was redundant work.
# NOTE(review): folds are drawn over individual events, so events from one
# subject can end up in both the fitting and the validation folds. Consider
# GroupKFold keyed by subject for a subject-independent estimate.
score_lsqrs = cross_val_score(clf, X_train, y_train, cv=5)

print("cross validation accuracy: %f" % np.mean(score_lsqrs))

cross validation accuracy: 0.610980


### 2.4 Check accuracy on train set

In [9]:
# Accuracy of the fitted classifier on the data it was trained on.
# All events are stacked into one (n_events, n_features) matrix so the model
# is queried once instead of once per event.
rows = [
    train_dict[subject][event_id]
    for subject in train_subjects
    for event_id in train_dict[subject]
]
samples = np.array(rows).reshape(len(rows), -1)
pred = clf.predict(samples).astype(int)
# Rows follow the same subject/event order used to build y_train,
# so predictions and labels line up element-wise.
print("Model's accuracy on train set is",np.sum(pred==y_train)/len(pred))

Model's accuracy on train set is 0.9404761904761905


### 2.5 Apply model to test set

In [10]:
# Accuracy of the fitted classifier on the held-out subject(s).
# Enumerate (subject, event ID) pairs once so features and labels share one order.
event_keys = [
    (subject, event_id)
    for subject in test_subjects
    for event_id in test_dict[subject]
]
# Stack every event into one matrix and predict in a single batched call
# instead of calling predict once per event.
samples = np.array([test_dict[subject][event_id] for subject, event_id in event_keys])
samples = samples.reshape(len(event_keys), -1)
pred = clf.predict(samples).astype(int)
# Label 1 (lyrics) when the digits before the last one, read as a number, are below 5.
y_test = np.array([1 if int(event_id[:-1]) < 5 else 0 for _, event_id in event_keys])
print("Model's accuracy on test set is",np.sum(pred==y_test)/len(pred))

Model's accuracy on test set is 0.6111111111111112


### 2.6 Try another machine learning model

In [11]:
!pip install lightgbm
import lightgbm as lgb



In [12]:
# Replace the classifier with a LightGBM model using the library defaults.
clf = lgb.LGBMClassifier().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
clf

LGBMClassifier()

### 2.7 Check accuracy on train set

In [13]:
# Accuracy of the LightGBM classifier on the data it was trained on.
# Events are stacked into one (n_events, n_features) matrix so the model is
# queried once instead of once per event.
rows = [
    train_dict[subject][event_id]
    for subject in train_subjects
    for event_id in train_dict[subject]
]
samples = np.array(rows).reshape(len(rows), -1)
pred = clf.predict(samples).astype(int)
# Rows follow the same subject/event order used to build y_train.
print("LightGBM Model's accuracy on train set is",np.sum(pred==y_train)/len(pred))

LightGBM Model's accuracy on train set is 1.0


### 2.8 Check accuracy on test set

In [14]:
# Accuracy of the LightGBM classifier on the held-out subject(s).
# Enumerate (subject, event ID) pairs once so features and labels share one order.
event_keys = [
    (subject, event_id)
    for subject in test_subjects
    for event_id in test_dict[subject]
]
# One batched predict call replaces the per-event predict loop.
samples = np.array([test_dict[subject][event_id] for subject, event_id in event_keys])
samples = samples.reshape(len(event_keys), -1)
pred = clf.predict(samples).astype(int)
# Label 1 (lyrics) when the digits before the last one, read as a number, are below 5.
y_test = np.array([1 if int(event_id[:-1]) < 5 else 0 for _, event_id in event_keys])
print("LightGBM Model's accuracy on test set is",np.sum(pred==y_test)/len(pred))

LightGBM Model's accuracy on test set is 0.6388888888888888


###### LightGBM scores slightly higher than LDA on the test set (0.639 vs. 0.611, i.e. one more correct prediction out of 36 test events), so we decided to mainly use LightGBM to implement the following predictive tasks.

# 3. Predict whether stimuli have cue clicks

###### Stimuli in this dataset were presented to the participants in several conditions while EEG was recorded. This task predicts whether a stimulus was presented with cue clicks or not. The last digit of the event ID indicates the stimulus condition. If this digit is less than 3, the stimulus was presented with cue clicks. If not, the stimulus was presented without cue clicks.
###### We label 1 if the stimulus was presented with cue clicks and 0 if the stimulus was presented without cue clicks.

### 3.1 Get training data

In [15]:
# Rebuild the training set with binary "presented with cue clicks" labels.
X_train, y_train = [], []
for subject in train_subjects:
    for event_id, signal in train_dict[subject].items():
        X_train.append(signal)
        # The last digit is the condition; below 3 means cue clicks (1), else 0.
        with_cue = int(event_id[-1]) < 3
        y_train.append(int(with_cue))
X_train = np.array(X_train)
y_train = np.array(y_train)

### 3.2 Train model

In [16]:
# Fit a fresh default LightGBM classifier on the cue-click labels.
clf = lgb.LGBMClassifier().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
clf

LGBMClassifier()

### 3.3 Check accuracy on train set

In [17]:
# Accuracy of the cue-click classifier on the data it was trained on.
# Events are stacked into one (n_events, n_features) matrix so the model is
# queried once instead of once per event.
rows = [
    train_dict[subject][event_id]
    for subject in train_subjects
    for event_id in train_dict[subject]
]
samples = np.array(rows).reshape(len(rows), -1)
pred = clf.predict(samples).astype(int)
# Rows follow the same subject/event order used to build y_train.
print("Model's accuracy on train set is",np.sum(pred==y_train)/len(pred))

Model's accuracy on train set is 1.0


### 3.4 Check accuracy on test set

In [18]:
# Accuracy of the cue-click classifier on the held-out subject(s).
# Enumerate (subject, event ID) pairs once so features and labels share one order.
event_keys = [
    (subject, event_id)
    for subject in test_subjects
    for event_id in test_dict[subject]
]
# One batched predict call replaces the per-event predict loop.
samples = np.array([test_dict[subject][event_id] for subject, event_id in event_keys])
samples = samples.reshape(len(event_keys), -1)
pred = clf.predict(samples).astype(int)
# Label 1 (with cue clicks) when the last digit of the event ID is below 3.
y_test = np.array([1 if int(event_id[-1]) < 3 else 0 for _, event_id in event_keys])
print("Model's accuracy on test set is", np.sum(pred==y_test)/len(pred))

Model's accuracy on test set is 0.8055555555555556


# 4. Predict stimuli's conditions

###### More specifically, the stimuli were presented in these four conditions:
1. Stimulus perception with cue clicks

2. Stimulus imagination with cue clicks

3. Stimulus imagination without cue clicks

4. Stimulus imagination without cue clicks, with additional feedback from participants after each trial

We only include the first three conditions in our data since the fourth condition is affected by feedback after trials.

### 4.1 Get data for training

In [19]:
# Rebuild the training set with the condition (last digit of the event ID)
# as a multi-class label.
event_items = [
    (event_id, signal)
    for subject in train_subjects
    for event_id, signal in train_dict[subject].items()
]
X_train = np.array([signal for _, signal in event_items])
y_train = np.array([int(event_id[-1]) for event_id, _ in event_items])

### 4.2 Train model

In [20]:
# Fit a fresh default LightGBM classifier on the condition labels.
clf = lgb.LGBMClassifier().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
clf

LGBMClassifier()

### 4.3 Check accuracy on train set

In [21]:
# Accuracy of the condition classifier on the data it was trained on.
# Events are stacked into one (n_events, n_features) matrix so the model is
# queried once instead of once per event.
rows = [
    train_dict[subject][event_id]
    for subject in train_subjects
    for event_id in train_dict[subject]
]
samples = np.array(rows).reshape(len(rows), -1)
pred = clf.predict(samples).astype(int)
# Rows follow the same subject/event order used to build y_train.
print("Model's accuracy on train set is",np.sum(pred==y_train)/len(pred))

Model's accuracy on train set is 1.0


### 4.4 Check accuracy on test set

In [22]:
# Accuracy of the condition classifier on the held-out subject(s).
# Enumerate (subject, event ID) pairs once so features and labels share one order.
event_keys = [
    (subject, event_id)
    for subject in test_subjects
    for event_id in test_dict[subject]
]
# One batched predict call replaces the per-event predict loop.
samples = np.array([test_dict[subject][event_id] for subject, event_id in event_keys])
samples = samples.reshape(len(event_keys), -1)
pred = clf.predict(samples).astype(int)
# The true condition is the last digit of the event ID.
y_test = np.array([int(event_id[-1]) for _, event_id in event_keys])
print("Model's accuracy on test set is",np.sum(pred==y_test)/len(pred))

Model's accuracy on test set is 0.6388888888888888


# Conclusion
The overall performance of our predictive models is not as good as expected. Accuracy on the train set is much higher than accuracy on the test set, so our models probably overfit. One reason might be that we did not properly encode the EEG data as model input. The size of the dataset might also be a problem: we only have data from nine subjects. In order to build a better model, we need data from more subjects.