In [1]:
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('mytrees').getOrCreate()

In [2]:
from pyspark.ml import Pipeline

In [3]:
from pyspark.ml.classification import (DecisionTreeClassifier, RandomForestClassifier, GBTClassifier)

In [4]:
data=spark.read.format("libsvm").load("sample_libsvm_data.txt")

In [5]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [6]:
#Split the data into train and test data
train,test=data.randomSplit([0.7, 0.3])

In [7]:
#Adding the number of trees in Random forest trees not necessarily improve the accuracy
dtc=DecisionTreeClassifier()
rfc=RandomForestClassifier(numTrees=100)
gbt=GBTClassifier() 

In [8]:
# Fit the model
dtc_model=dtc.fit(train)
rfc_model=rfc.fit(train)
gbt_model=gbt.fit(train)

In [9]:
#Make the predictions 
dtc_pred=dtc_model.transform(test)
rfc_pred=rfc_model.transform(test)
gbt_pred=gbt_model.transform(test)

In [10]:
dtc_pred.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[100,101,102...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [26.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

In [11]:
# Lets evaluate accuracy of the model 
#even if the model is binary, binaryClassClassificatonEvaluator and multiclassClassificationEvaluator will both work 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
Accuracy=MulticlassClassificationEvaluator()

In [13]:
Accuracy.evaluate(gbt_pred)

0.8723019454726771

In [14]:
Accuracy.evaluate(rfc_pred)

1.0

In [15]:
Accuracy.evaluate(dtc_pred)

0.8723019454726771

In [16]:
# To check the importance of each feature, use method featureimportances on the model
# The higher the value of each feature, more important is the feature
rfc_model.featureImportances

SparseVector(692, {178: 0.0008, 182: 0.001, 184: 0.0008, 204: 0.0022, 210: 0.0004, 215: 0.0005, 216: 0.0017, 233: 0.0014, 239: 0.0009, 240: 0.0006, 244: 0.0025, 245: 0.0029, 263: 0.0069, 272: 0.0093, 273: 0.016, 289: 0.0059, 290: 0.0093, 291: 0.0138, 292: 0.0005, 295: 0.0012, 300: 0.0176, 314: 0.0011, 315: 0.0007, 316: 0.0013, 317: 0.0078, 322: 0.0029, 323: 0.0063, 324: 0.0019, 325: 0.0011, 327: 0.0003, 328: 0.0321, 329: 0.0147, 330: 0.0006, 331: 0.0006, 346: 0.0033, 350: 0.0176, 351: 0.0133, 352: 0.0028, 353: 0.0015, 355: 0.0028, 357: 0.0081, 358: 0.0074, 359: 0.0007, 369: 0.0019, 377: 0.0071, 378: 0.0118, 379: 0.0307, 381: 0.0017, 382: 0.0036, 385: 0.0184, 397: 0.0012, 399: 0.0087, 401: 0.0084, 402: 0.0008, 405: 0.0109, 406: 0.0252, 407: 0.0529, 409: 0.0007, 413: 0.0062, 415: 0.0012, 425: 0.0051, 427: 0.002, 429: 0.007, 433: 0.0261, 434: 0.0205, 435: 0.0481, 436: 0.0019, 440: 0.0012, 453: 0.0007, 454: 0.0073, 455: 0.0117, 456: 0.0082, 461: 0.0177, 462: 0.0204, 463: 0.035, 464: 0.0014

In [17]:
# Decision trees, random forest and gradient booster trees comparison on a dataset

In [18]:
data=spark.read.csv("College.csv", inferSchema=True, header=True)

In [19]:
data.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [20]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [21]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [22]:
#Lets convert the data in the required formart of labels and features using VectorAssembler

In [23]:
from pyspark.ml.feature import VectorAssembler

In [24]:
# We need all the features except School name and Private columns
assembler=VectorAssembler(inputCols=[ 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], outputCol='features')

In [25]:
# Transform the data 
output=assembler.transform(data)
output.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)



In [26]:
#Above in the schema of the original datset, the variable 'Private' is string which is label for our dataset so need to convert into integer
from pyspark.ml.feature import StringIndexer

In [27]:
indexer=StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [28]:
# Now add this new integer 'PrivateIndex' label 
output_new=indexer.fit(output).transform(output)

In [29]:
output_new.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [31]:
# from the above variables we only need features and PrivateIndex
finaldata=output_new.select('features', 'PrivateIndex')

In [32]:
#Divide the data into train and test datasets
train,test= finaldata.randomSplit([0.7,0.3])

In [33]:
# in the arguments, we had to give the name of the label and feature as they are differnt names other than by default names:
# 'features' and ' labels'
dtc=DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc=RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features')
gbt=GBTClassifier(labelCol='PrivateIndex', featuresCol='features') 

In [37]:
dtc_model=dtc.fit(train)
rfc_model=rfc.fit(train)
gbt_model=gbt.fit(train)

In [40]:
dtc_pred=dtc_model.transform(test)
rfc_pred=rfc_model.transform(test)
gbt_pred=gbt_model.transform(test)

In [41]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [44]:
# mention the name of label as the name is different from default name 'label'
accuracy=BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [46]:
accuracy.evaluate(rfc_pred)
#area under the curve is 98.4%
# by default 20 random sample are taken for  building decision trees. If we increase the number of trees in the RandomForestClassfier
# the accuracy may get improve

0.9840654608096471

In [48]:
accuracy.evaluate(dtc_pred)

0.9342700258397932

In [54]:
#Gradeient booster tree doenst have default rawpredictiosn colum unlike DT and RF
gbt_pred.printSchema()
accuracy2=BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='PrivateIndex')

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [53]:
accuracy2.evaluate(rfc_pred)

0.9240956072351422

In [55]:
#From all the results above, we find that Gradient booster tree gave the hightest accuracy followed by Random forest which is 
#very close. But decision tree had lowest because it is based on only 1 decision tree compared to Random Forest where multiple 
#decision trees predict the value and later averages out to give final predicetd value. Thus, RF gave higher accuracy than DT

In [67]:
# To check the precision , recall and other metric, BinaryClassevaluator can directly help to evaluate but MulticlassClassifierEvaluator can.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [65]:
acc=MulticlassClassificationEvaluator(labelCol='PrivateIndex', metricName="accuracy")

In [66]:
acc.evaluate(rfc_pred)

0.9424778761061947