In [9]:
from pyspark.sql import SparkSession
import numpy as np
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName('Tree').getOrCreate()

In [5]:
data = spark.read.format('libsvm').load('/home/sai/ex/ML/tree/sample_libsvm_data.txt')

In [6]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [7]:
#Split data
tr_data, ts_data = data.randomSplit([0.8,0.2])

In [10]:
#DecisionTRee classifier
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(numTrees=80)
gb = GBTClassifier()

In [11]:
model_dt = dt.fit(tr_data)
model_rf = rf.fit(tr_data)
model_gb = gb.fit(tr_data)

In [12]:
pred_dt = model_dt.transform(ts_data)
pred_rf = model_rf.transform(ts_data)
pred_gb = model_gb.transform(ts_data)

In [13]:
pred_dt.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[122,123,124...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[129,130,131...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[151,152,153...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[154,155,156...|   [0.0,48.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[154,155,156...|   [0.0,48.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[234,235,237...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[119,120,121...|   [0.0,48.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[123,124,125...|   [0.0,48.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[124,125,126...|   [0.0,48.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

In [14]:
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [15]:
acc_eval.evaluate(pred_dt)

0.8888888888888888

In [16]:
acc_eval.evaluate(pred_rf)

1.0

In [17]:
acc_eval.evaluate(pred_gb)

0.8888888888888888

In [18]:
#feature importance
model_rf.featureImportances

SparseVector(692, {99: 0.0008, 102: 0.0002, 152: 0.0001, 154: 0.0003, 155: 0.0007, 183: 0.0005, 184: 0.0016, 207: 0.0024, 208: 0.0159, 211: 0.0008, 214: 0.0004, 234: 0.0006, 235: 0.0029, 243: 0.0014, 244: 0.008, 263: 0.0012, 266: 0.001, 268: 0.0012, 273: 0.0018, 290: 0.0093, 299: 0.0007, 302: 0.0006, 314: 0.0033, 320: 0.0012, 322: 0.007, 323: 0.011, 327: 0.0006, 328: 0.0198, 331: 0.0005, 342: 0.0028, 343: 0.0006, 344: 0.0014, 345: 0.0166, 346: 0.0026, 350: 0.0177, 351: 0.025, 353: 0.0032, 355: 0.003, 357: 0.0088, 358: 0.0005, 373: 0.0114, 377: 0.0005, 378: 0.0362, 379: 0.0318, 380: 0.001, 385: 0.0101, 386: 0.0006, 387: 0.0032, 402: 0.0007, 403: 0.0006, 405: 0.0319, 406: 0.0625, 408: 0.0073, 409: 0.0012, 412: 0.0094, 425: 0.0043, 428: 0.0011, 429: 0.0109, 430: 0.0006, 432: 0.0012, 433: 0.0241, 434: 0.06, 440: 0.0002, 443: 0.0007, 453: 0.0031, 455: 0.0234, 456: 0.0206, 460: 0.003, 461: 0.0161, 462: 0.0616, 463: 0.035, 469: 0.009, 482: 0.0089, 483: 0.0417, 485: 0.0027, 488: 0.0077, 489: 0

# EX2

In [20]:
data = spark.read.csv('/home/sai/ex/ML/tree/College.csv', inferSchema=True, header=True)

In [21]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [22]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [23]:
assemble = VectorAssembler(inputCols= ['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], 
outputCol='features')

In [26]:
assemble_col = assemble.transform(data)

In [28]:
#Get output
indexer = StringIndexer(inputCol='Private',outputCol='PrivateIndex')
int_df = indexer.fit(assemble_col).transform(assemble_col)

In [29]:
int_df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = true)



In [30]:
input_df = int_df.select('features', 'PrivateIndex')

In [31]:
tr_data, ts_data = input_df.randomSplit([0.8,0.2])

In [45]:
dt = DecisionTreeClassifier(featuresCol='features', labelCol='PrivateIndex')
rf = RandomForestClassifier(numTrees=180,featuresCol='features', labelCol='PrivateIndex')
gb = GBTClassifier(featuresCol='features', labelCol='PrivateIndex')

In [46]:
model_dt = dt.fit(tr_data)
model_rf = rf.fit(tr_data)
model_gb = gb.fit(tr_data)

In [47]:
pred_dt = model_dt.transform(ts_data)
pred_rf = model_rf.transform(ts_data)
pred_gb = model_gb.transform(ts_data)

In [48]:
pred_gb.show()

+--------------------+------------+----------+
|            features|PrivateIndex|prediction|
+--------------------+------------+----------+
|[81.0,72.0,51.0,3...|         0.0|       1.0|
|[245.0,208.0,125....|         0.0|       0.0|
|[280.0,143.0,79.0...|         0.0|       0.0|
|[285.0,280.0,208....|         1.0|       1.0|
|[346.0,274.0,146....|         0.0|       0.0|
|[368.0,317.0,159....|         0.0|       0.0|
|[368.0,344.0,212....|         1.0|       0.0|
|[404.0,400.0,169....|         0.0|       0.0|
|[443.0,330.0,151....|         0.0|       0.0|
|[452.0,331.0,269....|         0.0|       0.0|
|[461.0,381.0,235....|         1.0|       0.0|
|[465.0,361.0,176....|         0.0|       0.0|
|[477.0,417.0,204....|         0.0|       0.0|
|[478.0,327.0,117....|         0.0|       0.0|
|[480.0,405.0,380....|         1.0|       0.0|
|[484.0,384.0,177....|         0.0|       0.0|
|[499.0,441.0,199....|         0.0|       0.0|
|[500.0,336.0,156....|         0.0|       0.0|
|[510.0,485.0

In [49]:
eval_metric = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [50]:
print('DT Accuracy', eval_metric.evaluate(pred_dt))
print('RF Accuracy', eval_metric.evaluate(pred_rf))

DT Accuracy 0.9306575433911882
RF Accuracy 0.9758010680907878


In [51]:
#Since gb does not have raw_prediction, we go for prediction
eval_metric_gb = BinaryClassificationEvaluator(labelCol='PrivateIndex', rawPredictionCol='prediction')

In [52]:
print('RF Accuracy', eval_metric_gb.evaluate(pred_gb))

RF Accuracy 0.8869325767690255


In [58]:
eval_met = MulticlassClassificationEvaluator(labelCol='PrivateIndex',
                                             metricName='accuracy')

In [59]:
print('DT Accuracy', eval_met.evaluate(pred_dt))
print('RF Accuracy', eval_met.evaluate(pred_rf))

DT Accuracy 0.8834355828220859
RF Accuracy 0.8834355828220859


# EX3

In [61]:
data = spark.read.csv('/home/sai/ex/ML/tree/dog_food.csv', inferSchema=True, header=True)

In [62]:
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [63]:
data.head(1)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)]

In [64]:
assemble = VectorAssembler(inputCols= ['A','B','C','D'], outputCol='features')

In [65]:
int_df = assemble.transform(data)

In [66]:
input_df = int_df.select('features', 'Spoiled')

In [67]:
rf = RandomForestClassifier(featuresCol='features', 
                            labelCol='Spoiled')

In [68]:
model_rf = rf.fit(input_df)

In [69]:
model_rf.featureImportances

SparseVector(4, {0: 0.0187, 1: 0.021, 2: 0.9386, 3: 0.0218})

In [70]:
#THis tells that feature 'C' is important for the prediction