In [45]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StructField, StructType
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.classification import DecisionTreeClassifier,NaiveBayes,RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
schema = StructType((
  StructField("label", LongType(), True),
  StructField("features", VectorUDT(), True)))
featuredData=spark.read.schema(schema).json("featuredData")
training,test=featuredData.randomSplit([0.8,0.2])

In [46]:
startTime=time.time()
DTEstimator=DecisionTreeClassifier(labelCol="label",\
                                   featuresCol="features",\
                                   impurity='gini', \
                                   maxDepth=5, \
                                   maxBins=5)
DTTransformer=DTEstimator.fit(training)
TrainPredictions=DTTransformer.transform(training)
TestPredictions=DTTransformer.transform(test)

In [47]:
DTEvaluator=MulticlassClassificationEvaluator(labelCol="label",\
                                              predictionCol="prediction",\
                                              metricName="accuracy")

TrainAccuracy=DTEvaluator.evaluate(TrainPredictions)
TestAccuracy=DTEvaluator.evaluate(TestPredictions)
ExecTime=time.time()-startTime
print("Decision Tree: \nTraining accracy: %s \nTest accuacy: %s \nRunning time: %s seconds" % (TrainAccuracy,TestAccuracy,ExecTime))

Decision Tree: 
Training accracy: 0.8934010152284264 
Test accuacy: 0.6949152542372882 
Running time: 1.0747079849243164 seconds


In [53]:
startTime=time.time()
NBEstimator=NaiveBayes(smoothing=1.0,\
                       modelType="multinomial")
NBTransformer=NBEstimator.fit(training)
TrainPredictions=NBTransformer.transform(training)
TestPredictions=NBTransformer.transform(test)

In [54]:
NBEvaluator=MulticlassClassificationEvaluator(labelCol="label",\
                                              predictionCol="prediction",\
                                              metricName="accuracy")

TrainAccuracy=NBEvaluator.evaluate(TrainPredictions)
TestAccuracy=NBEvaluator.evaluate(TestPredictions)
ExecTime=time.time()-startTime
print("Naive Bayes: \nTraining accracy: %s \nTest accuacy: %s \nRunning time: %s seconds" % (TrainAccuracy,TestAccuracy,ExecTime))

Naive Bayes: 
Training accracy: 0.7817258883248731 
Test accuacy: 0.6949152542372882 
Running time: 0.6066391468048096 seconds


In [55]:
startTime=time.time()
RFEstimator=RandomForestClassifier(numTrees=50,\
                                   featureSubsetStrategy="auto", \
                                   impurity='gini', \
                                   maxDepth=25, \
                                   maxBins=5)
RFTransformer=RFEstimator.fit(training)
TrainPredictions=RFTransformer.transform(training)
TestPredictions=RFTransformer.transform(test)

In [56]:
RFEvaluator=MulticlassClassificationEvaluator(labelCol="label",\
                                              predictionCol="prediction",\
                                              metricName="accuracy")

TrainAccuracy=RFEvaluator.evaluate(TrainPredictions)
TestAccuracy=RFEvaluator.evaluate(TestPredictions)
ExecTime=time.time()-startTime
print("Random Forest: \nTraining accracy: %s \nTest accuacy: %s \nRunning time: %s seconds" % (TrainAccuracy,TestAccuracy,ExecTime))

Random Forest: 
Training accracy: 1.0 
Test accuacy: 0.7627118644067796 
Running time: 1.5300071239471436 seconds
