In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to one machine; fall back to the original local
# install path when the variable is unset or empty.
# findspark.init() has to run before any pyspark import below.
findspark.init(os.environ.get('SPARK_HOME') or '/home/asif/spark-2.1.0-bin-hadoop2.7')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName('TreeFamily').getOrCreate()

In [2]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Tree family example with college data

In [3]:
# Load the college dataset: the first CSV row supplies the column names and
# Spark infers each column's type from the data.
collegeData = spark.read.csv(
    "College.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Show the column names and the types produced by inferSchema.
collegeData.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F.Undergrad: integer (nullable = true)
 |-- P.Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room.Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S.F.Ratio: double (nullable = true)
 |-- perc.alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad.Rate: integer (nullable = true)



In [5]:
# Peek at the first row to sanity-check the parsed values.
collegeData.head(1)

[Row(_c0='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F.Undergrad=2885, P.Undergrad=537, Outstate=7440, Room.Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S.F.Ratio=18.1, perc.alumni=12, Expend=7041, Grad.Rate=60)]

In [6]:
# List the raw column names; several contain '.', e.g. 'F.Undergrad'.
collegeData.columns

['_c0',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F.Undergrad',
 'P.Undergrad',
 'Outstate',
 'Room.Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S.F.Ratio',
 'perc.alumni',
 'Expend',
 'Grad.Rate']

In [7]:
# Check the label's cardinality: build the distinct 'Private' values once,
# then report how many there are and display them.
private_values = collegeData.select('Private').distinct()
print('num of unique values = ', private_values.count())
private_values.show()

num of unique values =  2
+-------+
|Private|
+-------+
|     No|
|    Yes|
+-------+



In [8]:
# Rename a single dotted column by hand: 'F.Undergrad' -> 'F_Undergrad'.
collegeData = collegeData.withColumnRenamed("F.Undergrad", "F_Undergrad")

In [9]:
# Rename a single dotted column by hand: 'P.Undergrad' -> 'P_Undergrad'.
collegeData = collegeData.withColumnRenamed("P.Undergrad", "P_Undergrad")

## Spark raises errors when a column name contains '.' or ' ', so we have to change such names. Renaming columns one at a time with withColumnRenamed() is tedious when many columns are affected; the toDF() method lets us rename all of them in a single call.

In [10]:
import re

# Replace every run of dots and/or whitespace in a column name with a single
# underscore, then rebuild the DataFrame with the cleaned names in one call.
sanitized_names = [re.sub(r'[\.\s]+', '_', name) for name in collegeData.columns]
collegeData = collegeData.toDF(*sanitized_names)
collegeData.columns

['_c0',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [11]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the feature vector. '_c0' (the college name) and
# 'Private' (the label) are deliberately left out.
feature_columns = [
    'Apps',
    'Accept',
    'Enroll',
    'Top10perc',
    'Top25perc',
    'F_Undergrad',
    'P_Undergrad',
    'Outstate',
    'Room_Board',
    'Books',
    'Personal',
    'PhD',
    'Terminal',
    'S_F_Ratio',
    'perc_alumni',
    'Expend',
    'Grad_Rate',
]

# Combine the listed columns into one vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [12]:
# Add the assembled 'features' vector column to the DataFrame.
output = assembler.transform(collegeData)

## Spark MLlib can't work with strings directly, so we have to convert the "Yes"/"No" values of the 'Private' column to 0 and 1. We can use StringIndexer for this.
#### https://spark.apache.org/docs/2.1.0/ml-features.html#stringindexer

In [13]:
from pyspark.ml.feature import StringIndexer
# Map the string label column 'Private' to a numeric column 'PrivateInt'.
# NOTE(review): per the Spark docs, StringIndexer orders labels by frequency
# (most frequent label -> 0.0), so which of 'Yes'/'No' becomes 0.0 depends on
# the data -- confirm before interpreting predictions.
indexer = StringIndexer(inputCol='Private', outputCol='PrivateInt')
# To index several columns, use one StringIndexer per column inside a Pipeline.

In [14]:
# Learn the label-to-index mapping from the data, then apply it to add the
# 'PrivateInt' column.
indexer_model = indexer.fit(output)
output_fix = indexer_model.transform(output)

In [15]:
# Confirm that the 'features' (vector) and 'PrivateInt' (double) columns exist.
output_fix.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateInt: double (nullable = true)



In [16]:
# Keep only what the models need: the assembled feature vector and the
# numeric label.
final_data = output_fix.select('features', 'PrivateInt')

# 70/30 train/test split. The seed is fixed so the same rows land in each
# split on every run; without it the reported metrics change on each re-run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [17]:
# NOTE(review): RandomForestRegressor is not used in any visible cell, and
# Pipeline is already imported in the first cell; both imports are kept in
# case later cells rely on them.
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

# Three tree-based classifiers trained on the same features and label.
# The random forest and GBT get an explicit seed so that repeated runs on the
# same training data build the same models.
dct = DecisionTreeClassifier(labelCol="PrivateInt", featuresCol="features")
rfc = RandomForestClassifier(labelCol="PrivateInt", featuresCol="features",
                             numTrees=100, seed=42)
gbt = GBTClassifier(labelCol="PrivateInt", featuresCol="features",
                    maxIter=100, maxDepth=20, seed=42)

# Fit each model on the training split.
dct_model = dct.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

# Score the held-out test split with each fitted model.
dct_preds = dct_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [18]:
# Per-feature importance scores from the fitted GBT model; vector positions
# correspond to the order of the assembler's inputCols.
gbt_model.featureImportances

SparseVector(17, {0: 0.0417, 1: 0.0188, 2: 0.0, 3: 0.0152, 4: 0.007, 5: 0.472, 6: 0.0539, 7: 0.2899, 8: 0.0168, 9: 0.0026, 10: 0.0016, 11: 0.0336, 12: 0.0, 13: 0.0213, 14: 0.0152, 15: 0.0005, 16: 0.0098})

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# One evaluator serves both models: only the label column is overridden,
# every other setting keeps its default.
dct_rfc_eval = BinaryClassificationEvaluator(labelCol='PrivateInt')

dct_auc = dct_rfc_eval.evaluate(dct_preds)
rfc_auc = dct_rfc_eval.evaluate(rfc_preds)

print('DCT area under the curve=', dct_auc)
print('RFC area under the curve=', rfc_auc)

DCT area under the curve= 0.9271802325581396
RFC area under the curve= 0.9700581395348821


In [20]:
# Evaluate the GBT predictions using the hard 0/1 'prediction' column as the
# score column.
# NOTE(review): presumably this is done because the GBT model in this Spark
# version does not emit a 'rawPrediction' column -- confirm. An AUC computed
# from hard labels is not directly comparable with AUCs computed from raw
# scores.
gbt_eval = BinaryClassificationEvaluator(
    labelCol='PrivateInt',
    rawPredictionCol='prediction',
)
gbt_auc = gbt_eval.evaluate(gbt_preds)
print('gbt area under the curve=', gbt_auc)

gbt area under the curve= 0.8709302325581396


In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator compared against the 'PrivateInt' label; it is reused
# for the other models' predictions as well.
acc_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='PrivateInt',
)

# Random forest accuracy on the test split.
rfc_acc = acc_eval.evaluate(rfc_preds)
rfc_acc

0.9246031746031746

In [22]:
# Decision tree accuracy on the test split.
dct_acc = acc_eval.evaluate(dct_preds)
dct_acc

0.9047619047619048

In [23]:
# Gradient-boosted trees accuracy on the test split.
gbt_acc = acc_eval.evaluate(gbt_preds)
gbt_acc

0.8968253968253969