## Overview
In this notebook we fetch the data from MongoDB into an EMR cluster and develop several machine learning models for accurately predicting the event type from the EEG signals.
We compare the performance of the models in terms of both area under the ROC curve (AUC) and the time taken to train each model and classify the held-out data.
The raw EEG signals have been pre-processed and stored in MongoDB along with their corresponding event data. The pre-processing step is part of another notebook.


### Connect to MongoDB

In [None]:
import os
import time

from pyspark.sql import SparkSession
from pyspark.ml.feature import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [1]:
# Ask PySpark to pull in the MongoDB Spark connector package by exporting
# the submit arguments through the environment before the session is built.
pyspark_submit_args = (
    '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'
)
os.environ.update({"PYSPARK_SUBMIT_ARGS": pyspark_submit_args})

In [2]:
# Create (or reuse) the Spark session. The input URI tells the MongoDB
# connector where to read from; the remaining options size the executor,
# driver and off-heap memory.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://34.219.77.22/msds697.eeg")
    .config("spark.executor.memory", "22g")
    .config("spark.driver.memory", "10g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "3g")
    .getOrCreate()
)

### Loading data from MongoDB

In [None]:
# Load the source configured on the session (spark.mongodb.input.uri)
# through the MongoDB connector's DataFrame reader.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

In [None]:
# Assemble the feature vector and give the event columns readable names.
target_cols = [str(idx) for idx in range(6)]  # columns '0'..'5': the 6 events
feature_cols = [str(idx) for idx in range(6, 26)]  # columns '6'..'25': model inputs

# Event names, in the same order as target_cols.
event_names = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase',
               'LiftOff', 'Replace', 'BothReleased']

va = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled = va.transform(df)
# Pair each numeric target column with its event name.
event_columns = [df[col].alias(name) for col, name in zip(target_cols, event_names)]
lpoints = assembled.select("features", *event_columns)

In [7]:
# Split the data 80/20 into training and validation sets and cache both,
# because each model cell reads them once per event label.
# A fixed seed keeps the split -- and therefore the reported AUCs --
# repeatable across runs.
# NOTE(review): exact repeatability also assumes the rows arrive from the
# source in a stable order -- confirm for the MongoDB read.
SPLIT_SEED = 42
splits = lpoints.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
eeg_train = splits[0].cache()
eeg_valid = splits[1].cache()

### Apply Spark ML models to classify events

### Logistic regression

In [19]:
# Events to classify; each one is treated as a separate binary target.
labels = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff', 'Replace', 'BothReleased']

# Fit one logistic regression model per event and score it on the
# validation set. The timer covers training, prediction and AUC evaluation
# for all events together.
start = time.time()
for label in labels:
    lr = LogisticRegression(regParam=0.01, maxIter=100, fitIntercept=True, labelCol=label)
    lrmodel = lr.fit(eeg_train.select('features', label))
    validpredicts = lrmodel.transform(eeg_valid.select('features', label))
    # Name the metric explicitly (it is also the evaluator's default).
    bceval = BinaryClassificationEvaluator(labelCol=label, metricName="areaUnderROC")
    auc = bceval.evaluate(validpredicts)
    print('area Under ROC ' + label + " : " + str(auc))
# Only the total is reported, so measure the elapsed time once, after the loop.
duration = time.time() - start
print('Time taken for logistic regression ' + str(duration) + 's')

area Under ROC HandStart : 0.523186244053
area Under ROC FirstDigitTouch : 0.564556766985
area Under ROC BothStartLoadPhase : 0.561506486301
area Under ROC LiftOff : 0.577604279174
area Under ROC Replace : 0.595241467798
area Under ROC BothReleased : 0.579037383844
Time taken for logistic regression 150.112603903s


### Random Forest

In [22]:
# Events to classify; each one is treated as a separate binary target.
labels = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff', 'Replace', 'BothReleased']

# Fit one random forest model per event and score it on the validation set.
# The timer covers training, prediction and AUC evaluation for all events
# together.
start = time.time()
for label in labels:
    rf = RandomForestClassifier(maxDepth=10, labelCol=label)
    rfmodel = rf.fit(eeg_train.select('features', label))
    validpredicts = rfmodel.transform(eeg_valid.select('features', label))
    # Name the metric explicitly (it is also the evaluator's default).
    bceval = BinaryClassificationEvaluator(labelCol=label, metricName="areaUnderROC")
    auc = bceval.evaluate(validpredicts)
    print('area Under ROC ' + label + " : " + str(auc))
# Only the total is reported, so measure the elapsed time once, after the loop.
duration = time.time() - start
print('Time taken for Random Forest ' + str(duration) + 's')

area Under ROC HandStart : 0.782961545581
area Under ROC FirstDigitTouch : 0.771348510298
area Under ROC BothStartLoadPhase : 0.767280614945
area Under ROC LiftOff : 0.750555491872
area Under ROC Replace : 0.747684223274
area Under ROC BothReleased : 0.772492734803
Time taken for Random Forest 726.342959881s


### Linear SVC

In [10]:
# Events to classify; each one is treated as a separate binary target.
labels = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff', 'Replace', 'BothReleased']

# Fit one linear support vector classifier per event and score it on the
# validation set. The timer covers training, prediction and AUC evaluation
# for all events together.
start = time.time()
for label in labels:
    svc = LinearSVC(labelCol=label)
    svcmodel = svc.fit(eeg_train.select('features', label))
    validpredicts = svcmodel.transform(eeg_valid.select('features', label))
    # Name the metric explicitly (it is also the evaluator's default).
    bceval = BinaryClassificationEvaluator(labelCol=label, metricName="areaUnderROC")
    auc = bceval.evaluate(validpredicts)
    print('areaUnderROC ' + label + " : " + str(auc))
# Only the total is reported, so measure the elapsed time once, after the loop.
duration = time.time() - start
print('Time taken for Linear SVC  ' + str(duration) + 's')

areaUnderROC HandStart : 0.510207631536
areaUnderROC FirstDigitTouch : 0.535821460209
areaUnderROC BothStartLoadPhase : 0.52314056731
areaUnderROC LiftOff : 0.525072114898
areaUnderROC Replace : 0.554215976997
areaUnderROC BothReleased : 0.511668626108
Time taken for Linear SVC  4049.53015184s


### Gradient Boosting Trees

In [15]:
# Events to classify; each one is treated as a separate binary target.
labels = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff', 'Replace', 'BothReleased']

# Fit one gradient-boosted tree model per event and score it on the
# validation set. The timer covers training, prediction and AUC evaluation
# for all events together.
start = time.time()
for label in labels:
    gbt = GBTClassifier(maxIter=10, maxDepth=10, labelCol=label)
    gbtmodel = gbt.fit(eeg_train.select('features', label))
    validpredicts = gbtmodel.transform(eeg_valid.select('features', label))
    # Name the metric explicitly (it is also the evaluator's default).
    bceval = BinaryClassificationEvaluator(labelCol=label, metricName="areaUnderROC")
    auc = bceval.evaluate(validpredicts)
    print('areaUnderROC ' + label + " : " + str(auc))
# Only the total is reported, so measure the elapsed time once, after the loop.
duration = time.time() - start
print('Time taken for Gradient Boosted Tree  ' + str(duration) + 's')


areaUnderROC HandStart : 0.856550745915
areaUnderROC FirstDigitTouch : 0.827641404973
areaUnderROC BothStartLoadPhase : 0.820571549804
areaUnderROC LiftOff : 0.822932543696
areaUnderROC Replace : 0.79483539414
areaUnderROC BothReleased : 0.853228062129
Time taken for Gradient Boosted Tree  939.55162406s
