<a href="https://colab.research.google.com/github/Ricardo-Jaramillo/PySpark/blob/main/08_DecisionTrees_%26_RandomForests_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Now let's dive into some Tree Methods with Spark
We'll see some random forest and decision tree methods, and then we'll evaluate them with some useful evaluation methods.

## Install pyspark and download the data

In [1]:
# Install pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=7b3217a3c129cd749400e46197d5f23b65fb38091c5c275cffb5a6b029567af8
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [6]:
# Download the necessary files
!wget https://raw.githubusercontent.com/Ricardo-Jaramillo/PySpark/main/datasets/DecisionTress/sample_libsvm_data.txt

--2023-10-04 14:20:48--  https://raw.githubusercontent.com/Ricardo-Jaramillo/PySpark/main/datasets/DecisionTress/sample_libsvm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_libsvm_data.txt’


2023-10-04 14:20:48 (12.0 MB/s) - ‘sample_libsvm_data.txt’ saved [104736/104736]



## Import libraries and read in the data

In [22]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [8]:
# Build (or reuse, if one is already running) the SparkSession named 'mytree'
session_builder = SparkSession.builder.appName('mytree')
spark = session_builder.getOrCreate()

In [11]:
# Load the downloaded file through Spark's 'libsvm' data source
libsvm_reader = spark.read.format('libsvm')
data = libsvm_reader.load('sample_libsvm_data.txt')

In [13]:
# Preview the first 20 rows, truncating long cell values (the defaults of show())
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



## Create Tree Classifiers. Fit and predict with split data

In [16]:
# Split the data into roughly 70% training / 30% test rows.
# A fixed seed keeps the split repeatable between runs; without it, randomSplit
# picks a new random seed on every call and all downstream metrics change.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [17]:
# Create the three tree-based estimators. They rely on the default column names
# ('label' and 'features'), which match the columns of the loaded data.
# Each estimator gets an explicit seed so that training is repeatable between runs.
dtc = DecisionTreeClassifier(seed=42)
rfc = RandomForestClassifier(numTrees=100, seed=42)  # larger forest than the default of 20 trees
gbt = GBTClassifier(seed=42)

In [18]:
# Train each estimator on the training split: decision tree, random forest, GBT
dtc_model, rfc_model, gbt_model = (
    dtc.fit(train_data),
    rfc.fit(train_data),
    gbt.fit(train_data),
)

In [19]:
# Score the held-out test split with every fitted model (DT, RF, GBT in that order)
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [21]:
# Preview the first 5 scored rows of each model's output (DT, then RF, then GBT)
for scored in (dtc_preds, rfc_preds, gbt_preds):
    scored.show(5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,124...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[122,123,124...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[123,124,125...|   [98.0,2.0]|[0.98,0.0

## Evaluate the model performance

In [23]:
# Accuracy evaluator; the label/prediction columns are spelled out but are the
# evaluator's defaults, matching the columns produced by the models above
acc_eval = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [30]:
# Compute test-set accuracy for each model's predictions (DT, RF, GBT in that order)
dtc_acc, rfc_acc, gbt_acc = (
    acc_eval.evaluate(scored)
    for scored in (dtc_preds, rfc_preds, gbt_preds)
)

# Report the three accuracies, one per line
print(
    f'DTC Accuracy: {dtc_acc}',
    f'RFC Accuracy: {rfc_acc}',
    f'GBT Accuracy: {gbt_acc}',
    sep='\n',
)

DTC Accuracy: 0.9090909090909091
RFC Accuracy: 0.9393939393939394
GBT Accuracy: 0.9090909090909091


In [32]:
# Optional: uncomment the line below to display the featureImportances attribute
# of the fitted random forest model.
# rfc_model.featureImportances