In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get (or create) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("decisio_tree")
    .getOrCreate()
)

In [3]:
# Load the College CSV, taking column names from the header row and
# letting Spark infer each column's type.
data = spark.read.csv(
    "/FileStore/tables/College.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Compute summary statistics for the loaded DataFrame and print them as a table.
data.describe().show()

In [5]:
# Peek at the first row to sanity-check the parsed values.
data.head(1)

In [6]:
# Print the column names and the types produced by inferSchema=True at load time.
data.printSchema()

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# List the column names; the VectorAssembler input columns below are chosen from these.
data.columns

In [9]:
# Assemble the selected input columns into a single vector column named
# 'features', which is the column the classifiers below read from.
assembler = VectorAssembler(
    inputCols=[
        'Accept', 'Enroll',
        'Top10perc', 'Top25perc',
        'F_Undergrad', 'P_Undergrad',
        'Outstate', 'Room_Board', 'Books', 'Personal',
        'PhD', 'Terminal',
        'S_F_Ratio', 'perc_alumni',
        'Expend', 'Grad_Rate',
    ],
    outputCol='features',
)

In [10]:
# Append the assembled 'features' vector column to the loaded data.
output = assembler.transform(data)

In [11]:
from pyspark.ml.feature import StringIndexer

In [12]:
# Encode the string column "Private" as a numeric index column, 'PrivateIndex',
# which is used as the label column further down.
indexer = StringIndexer(
    inputCol="Private",
    outputCol='PrivateIndex',
)

In [13]:
# Fit the indexer on the assembled data, then apply it to that same data
# so each row gains its 'PrivateIndex' value.
output_fixed = (
    indexer
    .fit(output)
    .transform(output)
)

In [14]:
# Inspect one row to confirm the 'features' and 'PrivateIndex' columns are present.
output_fixed.head(1)

In [15]:
# Keep only the two columns the models need: the feature vector and the label.
final_data = output_fixed.select(
    'features',
    'PrivateIndex',
)

In [16]:
# Split the rows roughly 70/30 into training and test sets.
# The explicit seed fixes the random assignment, so a Restart & Run All
# reproduces the same split and the metrics below stay comparable between runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [17]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier

In [18]:
from pyspark.ml import Pipeline

In [19]:
# Three tree-based classifiers, all trained on the same label/feature columns
# and otherwise left at their default hyperparameters.
# An explicit seed keeps the randomized parts of training (for example the
# random forest's row/feature sampling) identical from run to run instead of
# depending on the library's default seed.
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)

In [20]:
# Train each classifier on the training split, in the order
# decision tree, random forest, gradient-boosted trees.
dtc_model, rfc_model, gbt_model = [
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
]

In [21]:
# Score the held-out test split with each fitted model.
dtc_preds, rfc_preds, gbt_preds = [
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
]

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [23]:
# Binary-classification evaluator against the 'PrivateIndex' label; the
# raw-prediction column and metric are left at the library defaults.
my_binary_eval = BinaryClassificationEvaluator(
    labelCol='PrivateIndex',
)

In [24]:
# Report the decision tree's score: label on one line, metric on the next.
print("DTC", my_binary_eval.evaluate(dtc_preds), sep="\n")

In [25]:
# Report the random forest's score: label on one line, metric on the next.
print("RFC", my_binary_eval.evaluate(rfc_preds), sep="\n")

In [26]:
# Evaluator used for the gradient-boosted-trees predictions.
# Score from the continuous 'rawPrediction' column rather than the hard 0/1
# 'prediction' column: thresholded labels collapse the ROC curve to a single
# operating point, which makes the GBT figure incomparable with the DTC/RFC
# figures computed from raw scores.
# NOTE(review): assumes the cluster's Spark version has GBT models emit
# 'rawPrediction' (believed to be 2.2+) - confirm before relying on it.
my_binary_eval2 = BinaryClassificationEvaluator(
    labelCol='PrivateIndex',
    rawPredictionCol='rawPrediction',
)

In [27]:
# Report the gradient-boosted trees' score using the second evaluator:
# label on one line, metric on the next.
print("GBT", my_binary_eval2.evaluate(gbt_preds), sep="\n")

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [29]:
# Evaluator that reports plain accuracy against the 'PrivateIndex' label.
acc_eval = MulticlassClassificationEvaluator(
    labelCol='PrivateIndex',
    metricName='accuracy',
)

In [30]:
# Accuracy of the random forest's predictions on the test split.
rfc_acc = acc_eval.evaluate(rfc_preds)

In [31]:
# Display the random forest's accuracy computed in the previous cell.
rfc_acc