# Spark - Decision Tree, Random Forest and GBT Classifier

**Imports**

In [1]:
import findspark
findspark.init('/home/sedat/spark-3.3.2-bin-hadoop3')
from pyspark.sql import SparkSession
from pyspark.ml import pipeline
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

**Start spark session and read csv**

In [16]:
spark = SparkSession.builder.appName('my_tree').getOrCreate()

In [17]:
df = spark.read.format('libsvm').load('sample_libsvm_data_dt.txt')

23/03/27 14:58:11 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [4]:
df.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [5]:
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



**Split data**

In [6]:
train_df, test_df = df.randomSplit([0.7, 0.3])

In [7]:
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
gbt = GBTClassifier()

**Model**

In [15]:
dtc_model = dtc.fit(train_df)
rfc_model = rfc.fit(train_df)
gbt_model = gbt.fit(train_df)

**Predictions**

In [9]:
dtc_preds = dtc_model.transform(test_df)
rfc_preds = rfc_model.transform(test_df)
gbt_preds = gbt_model.transform(test_df)

**Model Evaluation**

In [10]:
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [11]:
# decision tree accuracy
acc_eval.evaluate(dtc_preds)

0.9230769230769231

In [12]:
# random forest accuracy
acc_eval.evaluate(rfc_preds)

0.9615384615384616

In [14]:
# gbt accuracy
acc_eval.evaluate(gbt_preds)

0.9230769230769231