## Importing the libraries

In [1]:
# findspark.init() is called before importing pyspark so that the pyspark
# package can be located (findspark adds the local Spark installation to
# sys.path); keep this ordering.
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session named 'College_Study', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('College_Study')
    .getOrCreate()
)

## Preparing the data

### Importing the data

In [3]:
# Load the CSV; the first line supplies the column names and Spark infers
# each column's type from the data.
df = spark.read.csv(
    'College.csv',
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types.
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
# Print every field of the first row, one value per line.
first_row = df.head(1)[0]
for value in first_row:
    print(value)

Abilene Christian University
Yes
1660
1232
721
23
52
2885
537
7440
3300
450
2200
70
78
18.1
12
7041
60


In [5]:
# Count the rows for each value of 'Private' to see the class balance.
private_counts = df.groupBy('Private').count()
private_counts.show()

+-------+-----+
|Private|count|
+-------+-----+
|     No|  212|
|    Yes|  565|
+-------+-----+



In [6]:
# Display the column names in order; the feature selection below slices this
# list positionally (df.columns[2:]), so the order matters.
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

### Encoding and Assembling the data

In [7]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Every column after the first two ('School', 'Private') is used as a feature.
feature_cols = df.columns[2:]

# Pack the feature columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
output = assembler.transform(df)

# Encode the string column 'Private' as a numeric 'label' column.
# NOTE(review): StringIndexer orders labels by descending frequency by default,
# so with the class counts shown above 'Yes' should map to 0.0 and 'No' to 1.0
# -- confirm via the fitted model's `labels` attribute.
indexer = StringIndexer(inputCol='Private', outputCol='label')
output_fixed = indexer.fit(output).transform(output)

# Keep only the two columns the classifiers consume.
data = output_fixed.select('features', 'label')

### Splitting the data

In [8]:
# 70/30 train/test split. A fixed seed is passed so that re-running the
# notebook on the same input yields the same split; without it the split, and
# every metric computed from it, changes on each run.
# NOTE: the outputs recorded further down came from an unseeded run and will
# differ after re-execution.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Peek at the first five training rows (feature vector and label).
train_data.show(n=5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[81.0,72.0,51.0,3...|  0.0|
|[150.0,130.0,88.0...|  0.0|
|[152.0,128.0,75.0...|  0.0|
|[174.0,146.0,88.0...|  0.0|
|[191.0,165.0,63.0...|  0.0|
+--------------------+-----+
only showing top 5 rows



## Fitting and predicting with Tree Models

### Fitting the models

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier

# Fit three tree-based classifiers on the training split, using the default
# 'features' / 'label' column names. Each estimator gets an explicit seed so
# the fitted models do not vary between runs of the notebook.
dtc = DecisionTreeClassifier(seed=42).fit(train_data)
rfc = RandomForestClassifier(numTrees=100, seed=42).fit(train_data)
gbt = GBTClassifier(seed=42).fit(train_data)

### Predicting with test data

In [11]:
# Score the held-out test split with each fitted model, in the order
# decision tree, random forest, gradient-boosted trees.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data) for fitted in (dtc, rfc, gbt)
)

### Evaluating the performance

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator as BCE

# Evaluator with default settings; the notebook reports its result as the
# area under the ROC curve.
my_eval = BCE()

print('Area Under ROC:\n')

# Evaluate the three prediction frames in the order DTC, RFC, GBT.
auc_scores = [my_eval.evaluate(preds) for preds in (dtc_preds, rfc_preds, gbt_preds)]
print('DTC: {:.3f}, RFC: {:.3f}, GBT: {:.3f}'.format(*auc_scores))

Area Under ROC:

DTC: 0.960, RFC: 0.981, GBT: 0.958


In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator as MCE

# Evaluator configured to report the 'f1' metric on the prediction frames.
my_eval_2 = MCE(metricName='f1')

print('F1 Score:\n')

# Evaluate the three prediction frames in the order DTC, RFC, GBT.
f1_scores = [my_eval_2.evaluate(preds) for preds in (dtc_preds, rfc_preds, gbt_preds)]
print('DTC: {:.3f}, RFC: {:.3f}, GBT: {:.3f}'.format(*f1_scores))

F1 Score:

DTC: 0.916, RFC: 0.931, GBT: 0.916
