<a href="https://colab.research.google.com/github/Ricardo-Jaramillo/PySpark/blob/main/09_DecisionTrees_%26_RandomForests_University_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code a case use example of Decision Tree methods

## Install pyspark and download the data

In [1]:
# Install pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=b7882b0c320acf23df890aa676723793c4f14b197debfc30ff2e73b1b0a829bf
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [8]:
# Download the necessary files
!wget https://raw.githubusercontent.com/Ricardo-Jaramillo/PySpark/main/datasets/DecisionTress/College.csv

--2023-10-04 15:29:31--  https://raw.githubusercontent.com/Ricardo-Jaramillo/PySpark/main/datasets/DecisionTress/College.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75708 (74K) [text/plain]
Saving to: ‘College.csv’


2023-10-04 15:29:31 (3.45 MB/s) - ‘College.csv’ saved [75708/75708]



## Import libraries and read in the data

In [35]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

In [10]:
# Create a spark session
spark = SparkSession.builder.appName('tree_example').getOrCreate()

In [14]:
# Read in the data
data = spark.read.csv('College.csv', header=True, inferSchema=True)

In [15]:
# Print Schema
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [16]:
# Show the data
data.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [17]:
data.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

## Assemble data into a features columns
We'll need to index the lable `Private` with the StringIndexer method

In [19]:
# Printo ut column names
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [22]:
# Create assembler object
assembler = VectorAssembler(inputCols=['Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad', 'P_Undergrad',
                                       'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio', 'perc_alumni',
                                       'Expend', 'Grad_Rate'],
                            outputCol='features')

In [25]:
# Create 'features' column through assembler oject
output = assembler.transform(data)

In [26]:
# Create indexer object for 'Private' feature
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [27]:
# Fit and transform data with indexer
output_fixed = indexer.fit(output).transform(output)

In [29]:
# Show the output data
output_fixed.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|PrivateIndex|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|         0.0|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30

In [30]:
# Save the features and labels columns
final_data = output_fixed.select('features', 'PrivateIndex')

In [31]:
# Finally, split the data into train and test
train_data, test_data = final_data.randomSplit([0.7, 0.3])

## Train and Fit Decision Tree models

In [44]:
# Create classifier objects
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

In [45]:
# Fit models
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [46]:
# Make predictions
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

## Evaluate

### Evaluate on Binary Classifier

In [47]:
# Create evaluator object
my_binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [48]:
# Print evaluation results
print(f'DTC: {my_binary_eval.evaluate(dtc_preds)}')
print(f'RFC: {my_binary_eval.evaluate(rfc_preds)}')
print(f'GBT: {my_binary_eval.evaluate(gbt_preds)}')

DTC: 0.8405797101449275
RFC: 0.9783682232957595
GBT: 0.9574342458400428


### Evaluate on Multiclass Classifier

In [50]:
# Create evaluator object
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex', metricName='accuracy')

In [52]:
# Print evaluation results
print(f'DTC: {acc_eval.evaluate(dtc_preds)}')
print(f'RFC: {acc_eval.evaluate(rfc_preds)}')
print(f'GBT: {acc_eval.evaluate(gbt_preds)}')

DTC: 0.9117647058823529
RFC: 0.9313725490196079
GBT: 0.9068627450980392
