In [2]:
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.feature import Imputer
from pyspark.sql import DataFrameReader
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.sql.window import Window
import numpy
import pyeeg

In [3]:
import os
pyspark_submit_args = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [4]:
sc = SparkSession\
    .builder\
    .appName("myEEGSession")\
    .config("spark.mongodb.input.uri", "mongodb://54.188.74.0/eeg.eeg_features") \
    .config('spark.jars.packages','org.mongodb.spark:mongo-spark-connector_2.11:2.4.0')\
    .getOrCreate()

In [5]:
ss = SparkSession.builder.getOrCreate()

In [6]:
logger = sc._jvm.org.apache.log4j
logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL)

## Load the Data

In [7]:
# non-rqa features
df = sc.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://54.188.74.0/eeg.eeg_features").load()

In [8]:
df.count(), df.distinct().count()

(376, 376)

In [9]:
#vrqa features
df_rqa = sc.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://54.188.74.0/test.eeg_features_rqa").load()

In [10]:
df_rqa.count(), df_rqa.distinct().count()

(376, 376)

### Joining Tables

In [11]:
labels = df.select('label').distinct().rdd.keys().collect()

In [12]:
master_channel_list = ["Fp1","Fp2","F7","F3","Fz","F4","F8","T7","C3","Cz",
                       "C4","T8","P7","P3","Pz","P4","P8","O1","O2"]

In [13]:
# Split both rqa and non-rqa tables by channel type ie labels
df_list = [df.filter(col("label")== l) for l in labels]
df_list_rqa = [df_rqa.filter(col("label")== l) for l in labels]

In [14]:
columns_all = df.columns
columns_all_rqa = df_rqa.columns

In [15]:
# add column name + channel name to all columns except these
columns_except = ['_id','f_labels','participant_group','participant_id','startdate','file_duration',
                  'label',  'sample_rate','signals_in_file','unique_id']
columns_except_rqa = ['_id', 'f_labels',  'participant_group', 'participant_id','startdate','file_duration',
                      'label', 'sample_rate', 'signals_in_file', 'unique_id']

In [16]:
# columns names to be changed
columns_rename = [c for c in columns_all if c not in columns_except]
columns_rename_rqa = [c for c in columns_all_rqa if c not in columns_except_rqa]

In [17]:
len(df_list), len(df_list_rqa)

(8, 8)

### Rename columns

In [18]:
for i in range(len(labels)):
    for c in df_list[i].columns:
        if c in columns_rename:
            df_list[i] = df_list[i].withColumnRenamed(c, c+"_"+labels[i])

In [19]:
for i in range(len(labels)):
    for c in df_list_rqa[i].columns:
        if c in columns_rename_rqa:
            df_list_rqa[i] = df_list_rqa[i].withColumnRenamed(c, c+"_"+labels[i])

In [20]:
# Create a base table that has unique keys , to join with df and df_rqa
temp = df.select(*['participant_group','participant_id', 'startdate','file_duration']).distinct().orderBy(*['participant_group','participant_id', 'startdate','file_duration'])

In [21]:
# Remove redundant columns
for i in range(len(df_list)):
    df_list[i] = df_list[i].drop(*['_id','f_labels','label','sample_rate','signals_in_file','unique_id','file_duration'])

for i in range(len(df_list_rqa)):
    df_list_rqa[i] = df_list_rqa[i].drop(*['unique_id' , '_id', 'f_labels', 'label','sample_rate','signals_in_file','file_duration'])

In [22]:
# Join every dataframe in the df_list to the temp
for i in range(len(df_list)):
    temp= temp.join(df_list[i],['participant_group','participant_id', 'startdate'])
for i in range(len(df_list_rqa)):    
    temp= temp.join(df_list_rqa[i],['participant_group','participant_id', 'startdate'])    

In [24]:
#Final count of files
temp.count()

47

In [26]:
df = temp
del temp
del df_list
del df_list_rqa
del df_rqa

### Sort the data base on the start and filter

In [27]:
window = Window.\
              partitionBy('participant_id','participant_group').\
              orderBy(df['startdate'].desc())
df = df.withColumn("rank_start_dt",F.dense_rank().over(window))

In [28]:
# Filter for the latest file
df = df.filter("rank_start_dt == 1")

In [29]:
df.count()

36

In [30]:
# df_meta contains metadata about the patient
df_meta = sc.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://54.188.74.0/eeg.eeg_metadata").load()

In [31]:
df_meta = df_meta.drop('Delivery_type',
 'Gender',
 'Gestational_Age',
 'Maternal_age',
 'Multiple_births','Relative_size',
 'Weight_gms',
 '_id',
 'num_recording')

In [32]:
# Join with df_meta to get the output variable Prematurity_Level
features = df.join(df_meta, on = ['participant_group','participant_id'] , how='inner')
features = features.drop('participant_group','participant_ID','startdate','file_duration','rank_start_dt')

In [33]:
features.count()

34

In [37]:
# Join the tables
features = df.join(df_meta, on = ['participant_group','participant_id'] , how='inner')

In [38]:
del df
del df_meta

## Feature Engineering

In [34]:
from collections import Counter
Counter([d[1] for d in features.dtypes])

Counter({'double': 937, 'struct<lyap_A7:string,lyap_D7:string>': 8, 'int': 72})

In [35]:
[f[0] for f in features.dtypes if f[1] in ('double','int', 'string')]

['dfa_A0_O1',
 'dfa_A7_O1',
 'dfa_D1_O1',
 'dfa_D2_O1',
 'dfa_D3_O1',
 'dfa_D4_O1',
 'dfa_D5_O1',
 'dfa_D6_O1',
 'dfa_D7_O1',
 'hurst_exponent_A0_O1',
 'hurst_exponent_A7_O1',
 'hurst_exponent_D1_O1',
 'hurst_exponent_D2_O1',
 'hurst_exponent_D3_O1',
 'hurst_exponent_D4_O1',
 'hurst_exponent_D5_O1',
 'hurst_exponent_D6_O1',
 'hurst_exponent_D7_O1',
 'lyap0_A0_O1',
 'lyap0_A7_O1',
 'lyap0_D1_O1',
 'lyap0_D2_O1',
 'lyap0_D3_O1',
 'lyap0_D4_O1',
 'lyap0_D5_O1',
 'lyap0_D6_O1',
 'lyap0_D7_O1',
 'lyap1_A0_O1',
 'lyap1_A7_O1',
 'lyap1_D1_O1',
 'lyap1_D2_O1',
 'lyap1_D3_O1',
 'lyap1_D4_O1',
 'lyap1_D5_O1',
 'lyap1_D6_O1',
 'lyap1_D7_O1',
 'lyap2_A0_O1',
 'lyap2_A7_O1',
 'lyap2_D1_O1',
 'lyap2_D2_O1',
 'lyap2_D3_O1',
 'lyap2_D4_O1',
 'lyap2_D5_O1',
 'lyap2_D6_O1',
 'lyap2_D7_O1',
 'power_A0_O1',
 'power_A7_O1',
 'power_D1_O1',
 'power_D2_O1',
 'power_D3_O1',
 'power_D4_O1',
 'power_D5_O1',
 'power_D6_O1',
 'power_D7_O1',
 'sample_entropy_A0_O1',
 'sample_entropy_A7_O1',
 'sample_entropy_D1_O1'

In [46]:
features = features.select(*[f[0] for f in features.dtypes if f[1] in ('double','int', 'string')])

In [47]:
# Get the int/ str features
feat_str = [d[0] for d in features.dtypes if d[1] == 'string']
feat_dbl = [d[0] for d in features.dtypes if d[1] in ['double','int']]
feat_all = []
feat_all.extend(feat_dbl)
feat_all.extend(feat_str)

In [48]:
len(feat_str), len(feat_all)

(0, 1009)

### Remove nulls

In [49]:
nulls_df = features.select([((count(when(isnan(c)| col(c).isNull(), c)))*100.0/count(c)).alias(c)  for c in feat_all])

#### Drop columns with more than 10% nulls

In [53]:
# Keep columns with less than 10% nulls remove rest
null_threshold = 10.0
def multicontd(x):
    if x < null_threshold and x != 0:
        return True

In [54]:
features.columns

['dfa_A0_O1',
 'dfa_A7_O1',
 'dfa_D1_O1',
 'dfa_D2_O1',
 'dfa_D3_O1',
 'dfa_D4_O1',
 'dfa_D5_O1',
 'dfa_D6_O1',
 'dfa_D7_O1',
 'hurst_exponent_A0_O1',
 'hurst_exponent_A7_O1',
 'hurst_exponent_D1_O1',
 'hurst_exponent_D2_O1',
 'hurst_exponent_D3_O1',
 'hurst_exponent_D4_O1',
 'hurst_exponent_D5_O1',
 'hurst_exponent_D6_O1',
 'hurst_exponent_D7_O1',
 'lyap0_A0_O1',
 'lyap0_A7_O1',
 'lyap0_D1_O1',
 'lyap0_D2_O1',
 'lyap0_D3_O1',
 'lyap0_D4_O1',
 'lyap0_D5_O1',
 'lyap0_D6_O1',
 'lyap0_D7_O1',
 'lyap1_A0_O1',
 'lyap1_A7_O1',
 'lyap1_D1_O1',
 'lyap1_D2_O1',
 'lyap1_D3_O1',
 'lyap1_D4_O1',
 'lyap1_D5_O1',
 'lyap1_D6_O1',
 'lyap1_D7_O1',
 'lyap2_A0_O1',
 'lyap2_A7_O1',
 'lyap2_D1_O1',
 'lyap2_D2_O1',
 'lyap2_D3_O1',
 'lyap2_D4_O1',
 'lyap2_D5_O1',
 'lyap2_D6_O1',
 'lyap2_D7_O1',
 'power_A0_O1',
 'power_A7_O1',
 'power_D1_O1',
 'power_D2_O1',
 'power_D3_O1',
 'power_D4_O1',
 'power_D5_O1',
 'power_D6_O1',
 'power_D7_O1',
 'sample_entropy_A0_O1',
 'sample_entropy_A7_O1',
 'sample_entropy_D1_O1'

In [None]:
null_cols = nulls_df.columns
null_counts = list(nulls_df.rdd.map(tuple).collect()[0])
null_cols_imp = [null_cols[i] for i,n in enumerate(null_counts) if multicontd(n)]

In [None]:
# Num of columns to impute
len(null_cols_imp)

In [None]:
null_cols_drop = [null_cols[i] for i,n in enumerate(null_counts) if n >= null_threshold]
features = features.drop(*null_cols_drop)

In [None]:
len(null_cols_drop), len(features.columns)

In [None]:
# columns to impute and their new names
del nulls_df
null_cols_imp_db = [f for f in null_cols_imp if f in feat_dbl]
null_cols_impNew = [col+'_imp' for col in null_cols_imp_db]

In [None]:
len(null_cols_impNew)

#### Impute columns

In [None]:
imputer = Imputer(inputCols= null_cols_imp_db , outputCols=null_cols_impNew,strategy='median')
features = imputer.fit(features).transform(features).drop(*null_cols_imp_db)

In [44]:
### Remove columns with infinity values
features = features.filter(~col('participant_id').isin(['A6-1-1', 'B16-1-2', 'B9-1-2', 'B9-2-2', 'B2-1-1', 'B14-1-1']))

In [None]:
import numpy as np
pd_df=features.toPandas()
list(pd_df['participant_id'][np.sum((pd_df == np.inf), axis =1 ) > 0])

#### Remove colums with populated with zero

In [56]:
zero_df = features.select([((count(when(features[c].between(0,0), c)))*100.0/count(c)).alias(c)  for c in features.columns if c in feat_dbl])

In [57]:
zero_threshold = 20.0
zero_cols = zero_df.columns
zero_counts = list(zero_df.rdd.map(tuple).collect()[0])
zero_cols_drop = [zero_cols[i] for i,n in enumerate(zero_counts) if n > zero_threshold]

In [58]:
del zero_df

In [59]:
features = features.drop(*zero_cols_drop)

In [None]:
# null_cols_imp = [f for f in null_cols_imp if f in feat_str]
# null_cols_impNew = [col+'_imp' for col in null_cols_imp]

#### Remove error logs and patient information from features

In [None]:
# Drop patient info
features = features.drop(*['participant_group', 'participant_id', 'startdate'])

In [None]:
temp_cols = features.columns
temp_cols.remove('Prematurity_Level')
temp_cols

In [None]:
# Drop error logs
errorcols = [f for f in temp_cols if f.startswith('error_nonrqa')]
features=features.drop(*errorcols)
temp_cols = [f for f in temp_cols if f not in errorcols]

In [None]:
# Set features df to df with features+label
features = features.select(*temp_cols,'Prematurity_Level')
from pyspark.sql.types import StringType
features = features.withColumn("Prematurity_Level", features["Prematurity_Level"].cast(StringType()))
features = features.withColumnRenamed('Prematurity_Level', "label")

#### Dummize label

In [None]:
def indexStringColumns(df, cols):
    #variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c+"-num")
        sm = si.fit(newdf)
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf

features = features.withColumn("label", features["label"].cast(StringType()))
features = indexStringColumns(features, ['label'])

In [None]:
features = features.drop('file_duration')

In [None]:
# from pyspark.sql.types import DoubleType
# from pyspark.sql.functions import col, lit, udf, when
# import numpy as np
# replace_infs_udf = udf(
#     lambda x, v: float(v) if x and np.isinf(x) else x, DoubleType()
# )

# for c in features.columns:
#     features.withColumn(c, replace_infs_udf(col(c),lit(features.approxQuantile(c, [0.5], 0.25)[0]))).show()

## Assembler

In [None]:
va = VectorAssembler(outputCol="features", inputCols=features.columns[0:-1],) #except the last col.
df = va.transform(features).select("features", "label")

In [64]:
len(features.columns)

598

In [61]:
# va.transform(features).setHandleInvalid("skip").show()

In [62]:
[f for f in features.columns if f == 'file_duration']

['file_duration']

## Split &  Cache the data

In [97]:
train, test = df.randomSplit([0.8,0.2])

In [98]:
train = train.cache()
test = test.cache()

## ML Models

### Random Forest

In [73]:
# TRAIN
from pyspark.ml.classification import RandomForestClassifier
import time
init_time = time.time()
rf = RandomForestClassifier(maxDepth=30)
fit_time = time.time()
rfmodel = rf.fit(train)
trans_time = time.time()
rfpredicts = rfmodel.transform(test)
finish_time = time.time()
elapsed_time = finish_time - init_time
print('Time taken to fit and evaluate model %.3f seconds' % elapsed_time)

Time taken to fit and evaluate model 105.985 seconds


In [88]:
# EVALUATE
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
eval_start = time.time()
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(rfpredicts)
eval_end = time.time()
print("Accuracy = %0.4f" % (accuracy))
eval_elapse = eval_end - eval_start
print('Time taken evaluate model %.3f seconds' % eval_elapse )

Accuracy = 0.6667
Time taken evaluate model 103.828 seconds


#### Feature importance

In [84]:
# feature importance
feat_imp = list(rfmodel.featureImportances.toArray())
feat_name = list(features.columns)[0:-1]

In [87]:
from collections import defaultdict
d = defaultdict()
for i in range(len(feat_imp)):
    d[feat_name[i]] = feat_imp[i]
    

In [100]:
sorted_by_value = sorted(d.items(), key=lambda kv: kv[1], reverse = True)

In [112]:
[f[0] for f in sorted_by_value]

['lyap1_D1_Fp1',
 'lyap0_D1_C4',
 'lyap2_D3_T7',
 'sample_entropy_A7_O2',
 'lyap2_D4_C3',
 'hurst_exponent_D2_C4_imp',
 'sample_entropy_A7_O1',
 'lyap1_A0_O2',
 'lyap0_D5_C3',
 'sample_entropy_D5_O1',
 'lyap1_A0_Fp1',
 'power_D1_T7',
 'lyap2_A7_O2_imp',
 'lyap2_D3_Fp2',
 'lyap1_D2_C3',
 'power_D4_Fp1',
 'lyap2_D4_T8',
 'dfa_D4_C3_imp',
 'lyap0_D7_T7_imp',
 'sample_entropy_D7_C3',
 'lyap2_A7_T8_imp',
 'lyap2_A7_T7_imp',
 'power_D6_T7',
 'power_D6_T8',
 'power_A0_C3',
 'dfa_D6_O2_imp',
 'lyap1_D1_O2',
 'lyap2_D6_C3',
 'hurst_exponent_D6_T8_imp',
 'lyap1_A0_Fp2',
 'dfa_D1_C3_imp',
 'power_D2_O2',
 'lyap1_D2_Fp2',
 'lyap1_D1_O1',
 'hurst_exponent_A0_O2_imp',
 'dfa_D5_Fp1',
 'lyap2_D7_Fp1_imp',
 'hurst_exponent_A0_T7',
 'lyap0_D6_T8',
 'lyap1_A7_Fp1_imp',
 'power_D6_Fp2',
 'hurst_exponent_A7_Fp2',
 'dfa_D2_Fp1',
 'lyap1_D5_Fp1',
 'lyap2_D4_Fp1',
 'lyap0_A7_T7_imp',
 'lyap1_D7_Fp1_imp',
 'lyap1_D4_Fp1',
 'power_D3_C4',
 'power_A0_O2',
 'lyap0_D1_T7',
 'sample_entropy_A7_T8',
 'dfa_D3_C3_imp'

#### Per-class performace

In [108]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# Compute raw scores on the test set
predictionAndLabels = rfpredicts.select('prediction', 'label').rdd.map(lambda row: [str(c) for c in row]).map(lambda x: (float(x[0]), float(x[1])))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Statistics by class
labels = train.rdd.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    try:
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    except:
        continue
# Weighted stats
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

Summary Stats
Precision = 0.6666666666666666
Recall = 0.6666666666666666
F1 Score = 0.6666666666666666
Class 0.0 precision = 0.6666666666666666
Class 0.0 recall = 1.0
Class 0.0 F1 Measure = 0.8
Class 2.0 precision = 0.0
Class 2.0 recall = 0.0
Class 2.0 F1 Measure = 0.0
Weighted recall = 0.6666666666666666
Weighted precision = 0.4444444444444444
Weighted F(1) Score = 0.5333333333333333
Weighted F(0.5) Score = 0.4761904761904761
Weighted false positive rate = 0.6666666666666666


### Decision Tree

In [109]:
# TRAIN
from pyspark.ml.classification import DecisionTreeClassifier
dt_init_time = time.time()
dt = DecisionTreeClassifier(maxDepth=20, maxBins= 32, minInstancesPerNode=1, minInfoGain = 0)
dt_fit_time = time.time()
dtmodel = dt.fit(train)
dt_trans_time = time.time()
dtpredicts = dtmodel.transform(test)
dt_finish_time = time.time()
dt_elapsed_time = dt_finish_time - dt_init_time
print('Time taken to fit and evaluate model %.3f seconds' % dt_elapsed_time)

Time taken to fit and evaluate model 10.762 seconds


In [110]:
# EVALUATE
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
dt_eval_start = time.time()
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dtpredicts)
dt_eval_end = time.time()
print("Accuracy = %0.4f" % (accuracy))
dt_eval_elapse = dt_eval_end - dt_eval_start
print('Time taken evaluate model %.3f seconds' % dt_eval_elapse )

Accuracy = 0.5000
Time taken evaluate model 98.514 seconds


#### Per-class performace

In [111]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# Compute raw scores on the test set
predictionAndLabels = dtpredicts.select('prediction', 'label').rdd.map(lambda row: [str(c) for c in row]).map(lambda x: (float(x[0]), float(x[1])))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Statistics by class
labels = train.rdd.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    try:
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    except:
        continue

# Weighted stats
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

Summary Stats
Precision = 0.5
Recall = 0.5
F1 Score = 0.5
Class 0.0 precision = 0.4
Class 0.0 recall = 1.0
Class 0.0 F1 Measure = 0.5714285714285715
Class 1.0 precision = 1.0
Class 1.0 recall = 0.3333333333333333
Class 1.0 F1 Measure = 0.5
Class 2.0 precision = 0.0
Class 2.0 recall = 0.0
Class 2.0 F1 Measure = 0.0
Weighted recall = 0.5
Weighted precision = 0.6333333333333333
Weighted F(1) Score = 0.44047619047619047
Weighted F(0.5) Score = 0.5086580086580087
Weighted false positive rate = 0.25


### One vs All logistic regression

In [115]:
# TRAIN
from pyspark.ml.classification import LogisticRegression, OneVsRest

dt_init_time = time.time()
# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
dt = OneVsRest(classifier=lr)
dt_fit_time = time.time()
dtmodel = dt.fit(train)
dt_trans_time = time.time()
dtpredicts = dtmodel.transform(test)
dt_finish_time = time.time()
dt_elapsed_time = dt_finish_time - dt_init_time
print('Time taken to fit and evaluate model %.3f seconds' % dt_elapsed_time)

Time taken to fit and evaluate model 438.119 seconds


In [116]:
# EVALUATE
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
dt_eval_start = time.time()
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dtpredicts)
dt_eval_end = time.time()
print("Accuracy = %0.4f" % (accuracy))
dt_eval_elapse = dt_eval_end - dt_eval_start
print('Time taken evaluate model %.3f seconds' % dt_eval_elapse )

Accuracy = 0.3333
Time taken evaluate model 10.750 seconds


In [None]:
# eval_start = time.time()
# evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# accuracy = evaluator.evaluate(rfpredicts)
# eval_end = time.time()
# print("Accuracy = %0.4f" % (accuracy))
# eval_elapse = eval_end - eval_start
# print('Time taken evaluate model %.3f seconds' % eval_elapse )

In [None]:
# from pyspark.ml.feature import PCA
# from pyspark.ml import Pipeline


# pca = PCA(k=100, inputCol="features", outputCol="pca")
# init_time = time.time()
# rf = RandomForestClassifier(maxDepth=30)
# pipeline = Pipeline (stages=[va, pca, rf])
# fit_time = time.time()
# rfmodel = pipeline.fit(train)
# trans_time = time.time()
# rfpredicts = rfmodel.transform(test)
# finish_time = time.time()
# elapsed_time = finish_time - init_time
# print('Time taken to fit and evaluate model %.3f seconds' % elapsed_time)

In [59]:
sc.stop()