# Árboles de Decisión - Bosques Aleatorios: Universidades

* Un árbol de decisión único
* Un bosque aleatorio
* Un clasificador de árboles impulsado por gradiente

Conjunto de datos de universidades para intentar clasificarlas como privadas o públicas

In [1]:
import warnings
warnings.filterwarnings("ignore")

## Creación Spark

In [2]:
import os, subprocess

java8_home = "/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home"

os.environ["JAVA_HOME"] = java8_home
os.environ["PATH"] = os.path.join(java8_home, "bin") + os.pathsep + os.environ.get("PATH","")

os.environ["HADOOP_USER_NAME"] = os.environ.get("USER", "tomas")

print("JAVA_HOME fijado a:", os.environ["JAVA_HOME"])
try:
    print("which java (kernel):", subprocess.check_output(["which","java"]).decode().strip())
    print("java -version (kernel):")
    print(subprocess.check_output(["java","-version"], stderr=subprocess.STDOUT).decode())
except Exception as e:
    print("Error llamando a java desde kernel:", e)

JAVA_HOME fijado a: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home
which java (kernel): /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/bin/java
java -version (kernel):
openjdk version "1.8.0_292"
OpenJDK Runtime Environment (AdoptOpenJDK)(build 1.8.0_292-b10)
OpenJDK 64-Bit Server VM (AdoptOpenJDK)(build 25.292-b10, mixed mode)



In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('treecode').getOrCreate()

25/09/13 22:13:30 WARN Utils: Your hostname, MacBook-Air-de-Tomas-3.local resolves to a loopback address: 127.0.0.1; using 192.168.1.4 instead (on interface en0)
25/09/13 22:13:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/09/13 22:13:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/13 22:13:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/09/13 22:13:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/09/13 22:13:33 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/09/13 22:13:33 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


## Importación de Datos

In [4]:
data = spark.read.csv('../PySparkCourse/MLData/College.csv',inferSchema=True,header=True)

                                                                                

In [5]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [6]:
data.head(10)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60),
 Row(School='Adelphi University', Private='Yes', Apps=2186, Accept=1924, Enroll=512, Top10perc=16, Top25perc=29, F_Undergrad=2683, P_Undergrad=1227, Outstate=12280, Room_Board=6450, Books=750, Personal=1500, PhD=29, Terminal=30, S_F_Ratio=12.2, perc_alumni=16, Expend=10527, Grad_Rate=56),
 Row(School='Adrian College', Private='Yes', Apps=1428, Accept=1097, Enroll=336, Top10perc=22, Top25perc=50, F_Undergrad=1036, P_Undergrad=99, Outstate=11250, Room_Board=3750, Books=400, Personal=1165, PhD=53, Terminal=66, S_F_Ratio=12.9, perc_alumni=30, Expend=8735, Grad_Rate=54),
 Row(School='Agnes Scott College', Private='Yes', Apps=417, Accept=349, Enroll=137, Top10perc=60, Top25perc=89, F_Undergrad=510, P

## Transformación de Datos

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [9]:
assembler = VectorAssembler(inputCols=['Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad', 'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'], outputCol="features")

In [10]:
output = assembler.transform(data)

### Variable Objetivo (Y): Private

In [11]:
from pyspark.ml.feature import StringIndexer

In [12]:
indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
output_fixed = indexer.fit(output).transform(output)

                                                                                

### Conjunto de Entrenamiento

In [13]:
final_data = output_fixed.select("features",'PrivateIndex')

In [14]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

## Modelos

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

### Árboles de decisión

In [17]:
model_1 = DecisionTreeClassifier(labelCol='PrivateIndex',featuresCol='features')
model_1_train = model_1.fit(train_data)
model_1_pred = model_1_train.transform(test_data)

                                                                                

### Bosques aleatorios

In [18]:
model_2 = RandomForestClassifier(labelCol='PrivateIndex',featuresCol='features')
model_2_train = model_2.fit(train_data)
model_2_pred = model_2_train.transform(test_data)

                                                                                

### Árboles de decisión Potenciados por gradientes

In [19]:
model_3 = GBTClassifier(labelCol='PrivateIndex',featuresCol='features')
model_3_train = model_3.fit(train_data)
model_3_pred = model_3_train.transform(test_data)

## Evaluación de Modelos

In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Accuracy

In [23]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="PrivateIndex", predictionCol="prediction", metricName="accuracy")

In [24]:
dtc_acc = acc_evaluator.evaluate(model_1_pred)
rfc_acc = acc_evaluator.evaluate(model_2_pred)
gbt_acc = acc_evaluator.evaluate(model_3_pred)

25/09/13 22:41:57 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
25/09/13 22:41:57 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [25]:
print('-'*80)
print('A single decision tree had an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*80)
print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
print('-'*80)
print('A ensemble using GBT had an accuracy of: {0:2.2f}%'.format(gbt_acc*100))

--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 91.93%
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 95.07%
--------------------------------------------------------------------------------
A ensemble using GBT had an accuracy of: 92.83%


### f1

In [26]:
f1_evaluator = MulticlassClassificationEvaluator(labelCol="PrivateIndex", predictionCol="prediction", metricName="f1")

In [27]:
dtc_f1 = f1_evaluator.evaluate(model_1_pred)
rfc_f1 = f1_evaluator.evaluate(model_2_pred)
gbt_f1 = f1_evaluator.evaluate(model_3_pred)

In [28]:
print('-'*80)
print('A single decision tree had an F1 score of: {0:2.2f}%'.format(dtc_f1*100))
print('-'*80)
print('A random forest ensemble had an F1 score of: {0:2.2f}%'.format(rfc_f1*100))
print('-'*80)
print('A ensemble using GBT had an F1 score of: {0:2.2f}%'.format(gbt_f1*100))

--------------------------------------------------------------------------------
A single decision tree had an F1 score of: 91.98%
--------------------------------------------------------------------------------
A random forest ensemble had an F1 score of: 95.01%
--------------------------------------------------------------------------------
A ensemble using GBT had an F1 score of: 92.92%


### Matriz de Confusión

In [33]:
confusion_matrix = (
    model_1_pred.groupBy("PrivateIndex")
    .pivot("prediction")
    .count()
    .na.fill(0)
    .orderBy("PrivateIndex")
)

print('-'*80)
print('Confusion Matrix for Decision Tree Classifier')
print('-'*80)
confusion_matrix.show()

                                                                                

--------------------------------------------------------------------------------
Confusion Matrix for Decision Tree Classifier
--------------------------------------------------------------------------------




+------------+---+---+
|PrivateIndex|0.0|1.0|
+------------+---+---+
|         0.0|162| 10|
|         1.0|  8| 43|
+------------+---+---+



                                                                                

In [34]:
confusion_matrix = (
    model_2_pred.groupBy("PrivateIndex")
    .pivot("prediction")
    .count()
    .na.fill(0)
    .orderBy("PrivateIndex")
)

print('-'*80)
print('Confusion Matrix for Random Forest Classifier')
print('-'*80)
confusion_matrix.show()

                                                                                

--------------------------------------------------------------------------------
Confusion Matrix for Random Forest Classifier
--------------------------------------------------------------------------------




+------------+---+---+
|PrivateIndex|0.0|1.0|
+------------+---+---+
|         0.0|168|  4|
|         1.0|  7| 44|
+------------+---+---+



                                                                                

In [35]:
confusion_matrix = (
    model_3_pred.groupBy("PrivateIndex")
    .pivot("prediction")
    .count()
    .na.fill(0)
    .orderBy("PrivateIndex")
)

print('-'*80)
print('Confusion Matrix for Gradient Boosted Tree Classifier')
print('-'*80)
confusion_matrix.show()

                                                                                

--------------------------------------------------------------------------------
Confusion Matrix for Gradient Boosted Tree Classifier
--------------------------------------------------------------------------------




+------------+---+---+
|PrivateIndex|0.0|1.0|
+------------+---+---+
|         0.0|162| 10|
|         1.0|  6| 45|
+------------+---+---+



                                                                                