# Decision Tree

In [0]:
# import libraries
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
# Get (or reuse) the Spark session used by the rest of this notebook.
spark = SparkSession.builder.appName("MLAlgo").getOrCreate()

# Read the iris CSV (first row is the header, schema inference enabled) and
# rename its five columns: four measurements followed by the class label.
df = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/purvajainpj123@gmail.com/iris_dataset_new-1.csv",
    header=True,
    inferSchema=True,
).toDF("sep_len", "sep_wid", "pet_len", "pet_wid", "label")

# Preview the first five rows.
df.show(5)

+-------+-------+-------+-------+------+
|sep_len|sep_wid|pet_len|pet_wid| label|
+-------+-------+-------+-------+------+
|    5.1|    3.5|    1.4|    0.2|setosa|
|    4.9|    3.0|    1.4|    0.2|setosa|
|    4.7|    3.2|    1.3|    0.2|setosa|
|    4.6|    3.1|    1.5|    0.2|setosa|
|    5.0|    3.6|    1.4|    0.2|setosa|
+-------+-------+-------+-------+------+
only showing top 5 rows



### Data Processing

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# Transformer: combine the four measurement columns into a single vector
# column named "features", leaving the existing columns in place.
vector_assembler = VectorAssembler(
    outputCol="features",
    inputCols=["sep_len", "sep_wid", "pet_len", "pet_wid"],
)
df_temp = vector_assembler.transform(df)
df_temp.show(3)

+-------+-------+-------+-------+------+-----------------+
|sep_len|sep_wid|pet_len|pet_wid| label|         features|
+-------+-------+-------+-------+------+-----------------+
|    5.1|    3.5|    1.4|    0.2|setosa|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|    0.2|setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|setosa|[4.7,3.2,1.3,0.2]|
+-------+-------+-------+-------+------+-----------------+
only showing top 3 rows



In [0]:
# The measurements are now carried by "features", so remove the assembler's
# own input columns; only "label" and "features" remain.
df = df_temp.drop(*vector_assembler.getInputCols())
df.show(3)

+------+-----------------+
| label|         features|
+------+-----------------+
|setosa|[5.1,3.5,1.4,0.2]|
|setosa|[4.9,3.0,1.4,0.2]|
|setosa|[4.7,3.2,1.3,0.2]|
+------+-----------------+
only showing top 3 rows



In [0]:
from pyspark.ml.feature import StringIndexer
# Estimator: fitting on the "label" column yields a model whose transform
# adds the numeric "labelIndex" column alongside the original string label.
l_indexer = StringIndexer(outputCol="labelIndex", inputCol="label")
l_indexer_model = l_indexer.fit(df)
df = l_indexer_model.transform(df)
df.show(3)

+------+-----------------+----------+
| label|         features|labelIndex|
+------+-----------------+----------+
|setosa|[5.1,3.5,1.4,0.2]|       0.0|
|setosa|[4.9,3.0,1.4,0.2]|       0.0|
|setosa|[4.7,3.2,1.3,0.2]|       0.0|
+------+-----------------+----------+
only showing top 3 rows



In [0]:
# data splitting: roughly 70% of the rows for training, 30% for testing.
# A fixed seed is passed so the split is repeatable; without one, every run
# of the notebook draws a different split and the accuracy figures reported
# further down cannot be reproduced.
(training, testing) = df.randomSplit([0.7, 0.3], seed=42)

### Train a Decision Tree classifier using the pyspark.ml.classification library.

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier
# Configure the decision tree to read the "features" vector column and the
# numeric "labelIndex" target, then fit it on the training split.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="labelIndex",
)
model = dt.fit(training)

In [0]:
# Score the held-out testing split with the fitted model, then preview the
# predicted class index next to the true one for the first five rows.
predictions = model.transform(testing)
predictions.select(["prediction", "labelIndex"]).show(5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



### Accuracy and test error using the pyspark.ml.evaluation library.

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Compare the "prediction" column with "labelIndex" on the scored test rows
# using the accuracy metric; the test error is reported as its complement.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g " % test_error)
print("Accuracy = %g " % accuracy)

Test Error = 0.0444444 
Accuracy = 0.955556 


# Random Forest Classifier

In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
# Configure a 10-tree random forest on the same feature/label columns and fit
# it on the training split.
# Note: this rebinds `model`, replacing the decision tree fitted earlier.
rf = RandomForestClassifier(
    numTrees=10,
    featuresCol="features",
    labelCol="labelIndex",
)
model = rf.fit(training)

In [0]:
# Score the testing split with the current `model` and preview the predicted
# class index beside the true one.
# Note: this rebinds `predictions`, replacing the earlier scored frame.
predictions = model.transform(testing)
predictions.select(["prediction", "labelIndex"]).show(5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [0]:
# Evaluate the classifier: accuracy of "prediction" against "labelIndex" on
# the scored test rows, with the test error reported as its complement.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)
print("Accuracy = %g " % accuracy)

Test Error = 0.0444444
Accuracy = 0.955556 
