In [1]:
from pyspark.sql import SparkSession
# Get (or create) the SparkSession for the first example, using the app name 'mytree'.
spark = (
    SparkSession.builder
    .appName('mytree')
    .getOrCreate()
)

In [5]:
from pyspark.ml import pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.regression import GBTRegressor, RandomForestRegressor

In [6]:
# Read the sample file with the 'libsvm' reader; the result has `label` and `features` columns.
data = (
    spark.read
    .format('libsvm')
    .load('sample_libsvm_data.txt')
)

In [8]:
# Preview the first 20 rows, with long cell values truncated (the defaults, spelled out).
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [9]:
# 70/30 train/test split. A fixed seed makes the split -- and therefore every
# metric computed below -- repeatable across kernel restarts. The outputs
# recorded in this notebook came from an unseeded run, so they will differ
# slightly after re-execution.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [10]:
# Single decision tree; the column names are the library defaults, which match
# the `label` / `features` columns of the libsvm DataFrame.
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='label')

In [13]:
# Random forest of 100 trees, reading the default `features` / `label` columns.
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    numTrees=100,
)

In [12]:
# Gradient-boosted trees with default hyper-parameters and default column names.
gbt = GBTClassifier(featuresCol='features', labelCol='label')

In [14]:
# Fit the three estimators on the same training split, in dtc, rfc, gbt order.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [15]:
# Apply each fitted model to the test split; every result keeps the input
# columns and adds the model's prediction columns.
dtc_pre, rfc_pre, gbt_pre = (
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [16]:
# Inspect the random-forest predictions (first 20 rows, truncated cells -- the defaults).
rfc_pre.show(n=20, truncate=True)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[122,123,124...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[124,125,126...|   [96.0,4.0]|[0.96,0.04]|       0.0|
|  0.0|(692,[124,125,126...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [94.0,6.0]|[0.94,0.06]|       0.0|
|  0.0|(692,[126,127,128...|   [92.0,8.0]|[0.92,0.08]|       0.0|
|  0.0|(692,[126,127,128...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[126,127,128...|   [92.0,8.0]|[0.92,0.08]|       0.0|
|  0.0|(692,[127,128,129...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[127,128,129...|   [94.0,6.0]|[0.94,0.06]|       0.0|
|  0.0|(692,[150,151,152...|  [82.0,18.0]|[0.82,0.18]|       0.0|
|  0.0|(692,[152,153,154...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(69

In [17]:
# Inspect the gradient-boosted-tree predictions (first 20 rows, truncated cells -- the defaults).
gbt_pre.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[-0.6472244504525...|[0.21510074015259...|       1.0|
|  0.0|(692,[122,123,124...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.17677494257156...|[0.91321597802429...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.10438400601822...|[0.90103412270871...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
# Accuracy evaluator comparing the default `label` column against `prediction`.
acc_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [21]:
# Print a tag, then let the score of the decision-tree predictions render as
# the cell's output value.
print("dtc")
acc_eval.evaluate(dataset=dtc_pre)

dtc


0.9615384615384616

In [22]:
# Print a tag, then let the score of the random-forest predictions render as
# the cell's output value.
print("rfc")
acc_eval.evaluate(dataset=rfc_pre)

rfc


1.0

In [23]:
# Print a tag, then let the score of the gradient-boosted-tree predictions
# render as the cell's output value.
print("gbt")
acc_eval.evaluate(dataset=gbt_pre)

gbt


0.9615384615384616

In [24]:
# Display the fitted random forest's per-feature importance vector; it is
# rendered as a SparseVector indexed by feature position.
rfc_model.featureImportances

SparseVector(692, {99: 0.0005, 101: 0.0014, 119: 0.0005, 121: 0.0011, 129: 0.0001, 131: 0.0007, 149: 0.0004, 150: 0.0005, 153: 0.0006, 155: 0.0007, 158: 0.0005, 174: 0.0011, 184: 0.0007, 207: 0.0022, 212: 0.0009, 216: 0.0002, 233: 0.0005, 234: 0.0024, 235: 0.0067, 236: 0.006, 237: 0.0009, 240: 0.0004, 242: 0.0007, 243: 0.0015, 244: 0.0201, 245: 0.0042, 260: 0.0018, 261: 0.0085, 264: 0.0011, 266: 0.0006, 271: 0.0011, 272: 0.0439, 273: 0.0149, 274: 0.0017, 290: 0.0203, 291: 0.0016, 293: 0.0009, 299: 0.0015, 300: 0.0111, 301: 0.0136, 302: 0.0013, 313: 0.0018, 315: 0.0054, 317: 0.0028, 323: 0.0011, 324: 0.0013, 326: 0.0006, 327: 0.001, 328: 0.0038, 329: 0.0135, 330: 0.0202, 332: 0.0011, 341: 0.0018, 342: 0.0019, 344: 0.0051, 345: 0.0072, 346: 0.001, 350: 0.0082, 351: 0.0449, 352: 0.0004, 355: 0.002, 356: 0.0168, 357: 0.0084, 358: 0.0067, 359: 0.0029, 369: 0.0009, 372: 0.0066, 373: 0.001, 374: 0.0007, 377: 0.0295, 378: 0.0414, 379: 0.0278, 380: 0.0068, 381: 0.0011, 382: 0.0034, 384: 0.0071,

# Another example

In [25]:
# Session for the second example.
# NOTE(review): getOrCreate() may return the session already created earlier in
# this notebook instead of starting a new one -- confirm whether the
# 'university' app name actually takes effect.
spark = (
    SparkSession.builder
    .appName('university')
    .getOrCreate()
)

In [27]:
# Load the college CSV: the first row supplies column names and column types
# are inferred from the data.
data = spark.read.csv(
    'College.csv',
    inferSchema=True,
    header=True,
)

In [28]:
# Print the inferred schema to confirm which columns are strings
# (School, Private) and which are numeric.
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [29]:
# Look at a single example record (returned as a one-element list of Row objects).
data.head(n=1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [30]:
from pyspark.ml.feature import VectorAssembler

In [31]:
# List the column names; handy for copy-pasting into the VectorAssembler input list.
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [32]:
# Pack every numeric column (everything except the string columns 'School' and
# 'Private') into a single vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Apps', 'Accept', 'Enroll',
        'Top10perc', 'Top25perc',
        'F_Undergrad', 'P_Undergrad',
        'Outstate', 'Room_Board', 'Books', 'Personal',
        'PhD', 'Terminal',
        'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate',
    ],
)

In [33]:
# Append the assembled 'features' vector column to the raw college data.
output = assembler.transform(dataset=data)

In [34]:
from pyspark.ml.feature import StringIndexer

In [36]:
# Encode the string 'Private' column as a numeric label column 'PrivateIndex'.
indexer = StringIndexer(
    outputCol='PrivateIndex',
    inputCol='Private',
)

In [40]:
# Learn the string-to-index mapping from `output`, then apply it to the same
# DataFrame to add the 'PrivateIndex' column.
output_fixed = (
    indexer
    .fit(output)
    .transform(output)
)

In [43]:
# Check that the 'features' vector and 'PrivateIndex' double columns were added.
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [47]:
# Compare the original string value with its numeric index, side by side.
output_fixed.select('Private', 'PrivateIndex').show()

+-------+------------+
|Private|PrivateIndex|
+-------+------------+
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|    Yes|         0.0|
|     No|         1.0|
+-------+------------+
only showing top 20 rows



In [48]:
# Keep only what the classifiers need: the feature vector and the numeric label.
final_data = output_fixed.select('features', 'PrivateIndex')

In [49]:
# 70/30 train/test split. A fixed seed makes the split -- and therefore the AUC
# and accuracy figures below -- repeatable across kernel restarts. The outputs
# recorded in this notebook came from an unseeded run, so they will differ
# slightly after re-execution.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [50]:
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier, GBTClassifier

In [51]:
from pyspark.ml import pipeline

In [52]:
# Three tree-based classifiers, all reading the 'features' vector and
# predicting the 'PrivateIndex' label; other hyper-parameters stay at defaults.
dtc = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='PrivateIndex',
)
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='PrivateIndex',
)
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='PrivateIndex',
)

In [54]:
# Fit the three estimators on the same training split, in dtc, rfc, gbt order.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [55]:
# Apply each fitted model to the test split to obtain its prediction DataFrame.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [62]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [57]:
# Binary evaluator against the 'PrivateIndex' label. The raw-prediction column
# and the area-under-ROC metric are the library defaults, spelled out here.
my_binary_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='PrivateIndex',
    metricName='areaUnderROC',
)

In [59]:
# Print each model's tag followed by its score from the binary evaluator.
for tag, preds in (("dtc", dtc_preds), ("rfc", rfc_preds), ("gbt", gbt_preds)):
    print(tag)
    print(my_binary_eval.evaluate(preds))

dtc
0.9547779824022917
rfc
0.9832719459791284
gbt
0.9458256599140576


In [60]:
# Inspect which prediction columns the gradient-boosted-tree model added.
gbt_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [61]:
# Inspect which prediction columns the random-forest model added, for
# comparison with the GBT schema.
rfc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [64]:
# Accuracy evaluator comparing 'PrivateIndex' against the default 'prediction' column.
acc_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='PrivateIndex',
    metricName='accuracy',
)

In [66]:
# Print each model's tag followed by its accuracy on the test predictions.
for tag, preds in (("dtc", dtc_preds), ("rfc", rfc_preds), ("gbt", gbt_preds)):
    print(tag)
    print(acc_eval.evaluate(preds))

dtc
0.8936170212765957
rfc
0.9361702127659575
gbt
0.9148936170212766
