In [58]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, FMClassifier, RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import count_distinct
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
spark = SparkSession.builder.appName("tree").getOrCreate()
spark

In [3]:
df = spark.read.csv('College.csv',inferSchema=True,header=True)
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
df.count()

777

In [5]:
df.head()

Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)

In [6]:
df.describe().show()

+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+------------------+
|summary|              School|Private|              Apps|            Accept|          Enroll|         Top10perc|         Top25perc|      F_Undergrad|      P_Undergrad|          Outstate|        Room_Board|             Books|          Personal|               PhD|          Terminal|         S_F_Ratio|       perc_alumni|          Expend|         Grad_Rate|
+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------

In [7]:
df.select(count_distinct('School')).show()

+----------------------+
|count(DISTINCT School)|
+----------------------+
|                   777|
+----------------------+



In [8]:
df.select('School').distinct().collect()

[Row(School='Colorado College'),
 Row(School='Fresno Pacific College'),
 Row(School='Mount Marty College'),
 Row(School='University of Oklahoma'),
 Row(School='Widener University'),
 Row(School='Bethune Cookman College'),
 Row(School='Marquette University'),
 Row(School='New York University'),
 Row(School='Norwich University'),
 Row(School='SUNY College  at Oswego'),
 Row(School='University of Nebraska at Lincoln'),
 Row(School='Lindenwood College'),
 Row(School='Auburn University-Main Campus'),
 Row(School='Butler University'),
 Row(School='Carroll College'),
 Row(School='Smith College'),
 Row(School='University of California at Berkeley'),
 Row(School='Adelphi University'),
 Row(School='Blackburn College'),
 Row(School='Fordham University'),
 Row(School='Taylor University'),
 Row(School='Texas Southern University'),
 Row(School='University of Maine at Presque Isle'),
 Row(School='Stetson University'),
 Row(School='Tennessee Wesleyan College'),
 Row(School='Virginia Wesleyan College')

In [9]:
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [10]:
assembler = VectorAssembler(inputCols=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'],outputCol='features')
assembler

VectorAssembler_39f858fd4d96

In [11]:
output = assembler.transform(df)
output.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|[2186.0,1924

In [12]:
output.groupBy('Private').count().show()

+-------+-----+
|Private|count|
+-------+-----+
|     No|  212|
|    Yes|  565|
+-------+-----+



In [13]:
indexer = StringIndexer(inputCol='Private',outputCol='PrivateIndex')
indexer

StringIndexer_a9bef6c1720b

In [14]:
output = indexer.fit(output).transform(output)
output.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+------------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|            features|PrivateIndex|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+------------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|         0.0|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30

In [15]:
final_df = output.select('features','PrivateIndex')
final_df.show(5)

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|[1660.0,1232.0,72...|         0.0|
|[2186.0,1924.0,51...|         0.0|
|[1428.0,1097.0,33...|         0.0|
|[417.0,349.0,137....|         0.0|
|[193.0,146.0,55.0...|         0.0|
+--------------------+------------+
only showing top 5 rows



In [16]:
train_df, test_df = final_df.randomSplit([0.75,0.25])

In [17]:
train_df.count(), test_df.count()

(590, 187)

In [18]:
train_df.show(5)

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|[81.0,72.0,51.0,3...|         0.0|
|[100.0,90.0,35.0,...|         0.0|
|[141.0,118.0,55.0...|         0.0|
|[150.0,130.0,88.0...|         0.0|
|[152.0,128.0,75.0...|         0.0|
+--------------------+------------+
only showing top 5 rows



In [19]:
test_df.show(5)

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|[174.0,146.0,88.0...|         0.0|
|[193.0,146.0,55.0...|         0.0|
|[202.0,184.0,122....|         0.0|
|[222.0,185.0,91.0...|         0.0|
|[233.0,233.0,153....|         1.0|
+--------------------+------------+
only showing top 5 rows



In [20]:
dtc = DecisionTreeClassifier(featuresCol='features',labelCol='PrivateIndex')
rfc = RandomForestClassifier(featuresCol='features',labelCol='PrivateIndex')
gbc = GBTClassifier(featuresCol='features',labelCol='PrivateIndex')
fmc = FMClassifier(featuresCol='features',labelCol='PrivateIndex')

In [21]:
dtc_model = dtc.fit(train_df)
rfc_model = rfc.fit(train_df)
gbc_model = gbc.fit(train_df)
fm_model = fmc.fit(train_df)

In [22]:
dtc_preds = dtc_model.transform(test_df)
rfc_preds = rfc_model.transform(test_df)
gbc_preds = gbc_model.transform(test_df)
fm_preds = fm_model.transform(test_df)

In [23]:
auc_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')
print("AUC of DT:",auc_eval.evaluate(dtc_preds))
print("AUC of RF:",auc_eval.evaluate(rfc_preds))
print("AUC of GB:",auc_eval.evaluate(gbc_preds))
print("AUC of FM:",auc_eval.evaluate(fm_preds))

AUC of DT: 0.8507598784194529
AUC of RF: 0.9840425531914893
AUC of GB: 0.9556990881458963
AUC of FM: 0.9609422492401214


In [24]:
pr_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex',metricName='areaUnderPR')
print("Area under PR curve of DT:",pr_eval.evaluate(dtc_preds))
print("Area under PR curve of RF:",pr_eval.evaluate(rfc_preds))
print("Area under PR curve of GB:",pr_eval.evaluate(gbc_preds))
print("Area under PR curve of FM:",pr_eval.evaluate(fm_preds))

Area under PR curve of DT: 0.8043557176742537
Area under PR curve of RF: 0.9550830265426585
Area under PR curve of GB: 0.9198691352401103
Area under PR curve of FM: 0.8544879577315782


In [25]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',metricName='accuracy')
print("DT Accuracy:",acc_eval.evaluate(dtc_preds))
print("RF Accuracy:",acc_eval.evaluate(rfc_preds))
print("GB Accuracy:",acc_eval.evaluate(gbc_preds))
print("FM Accuracy:",acc_eval.evaluate(fm_preds))

DT Accuracy: 0.9144385026737968
RF Accuracy: 0.9197860962566845
GB Accuracy: 0.9251336898395722
FM Accuracy: 0.93048128342246


In [26]:
f1_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',metricName='f1')
print("DT F1 Score:",f1_eval.evaluate(dtc_preds))
print("RF F1 Score:",f1_eval.evaluate(rfc_preds))
print("GB F1 Score:",f1_eval.evaluate(gbc_preds))
print("FM F1 Score:",f1_eval.evaluate(fm_preds))

DT F1 Score: 0.9124191316704686
RF F1 Score: 0.9182365274479236
GB F1 Score: 0.9239972215945704
FM F1 Score: 0.931179859978191


In [27]:
precision_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',metricName='weightedPrecision')
print("DT Precision Score:",precision_eval.evaluate(dtc_preds))
print("RF Precision Score:",precision_eval.evaluate(rfc_preds))
print("GB Precision Score:",precision_eval.evaluate(gbc_preds))
print("FM Precision Score:",precision_eval.evaluate(fm_preds))

DT Precision Score: 0.9129430346387561
RF Precision Score: 0.9184470026255015
GB Precision Score: 0.923983335406044
FM Precision Score: 0.9324626253952144


In [28]:
recall_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',metricName='weightedRecall')
print("DT Recall Score:",recall_eval.evaluate(dtc_preds))
print("RF Recall Score:",recall_eval.evaluate(rfc_preds))
print("GB Recall Score:",recall_eval.evaluate(gbc_preds))
print("FM Recall Score:",recall_eval.evaluate(fm_preds))

DT Recall Score: 0.9144385026737968
RF Recall Score: 0.9197860962566845
GB Recall Score: 0.9251336898395722
FM Recall Score: 0.93048128342246


In [29]:
logloss_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',metricName='logLoss')
print("DT Log Loss Score:",logloss_eval.evaluate(dtc_preds))
print("RF Log Loss Score:",logloss_eval.evaluate(rfc_preds))
print("GB Log Loss Score:",logloss_eval.evaluate(gbc_preds))
print("FM Log Loss Score:",logloss_eval.evaluate(fm_preds))

DT Log Loss Score: 1.598167883571452
RF Log Loss Score: 0.16477105437949827
GB Log Loss Score: 0.231188364605086
FM Log Loss Score: 2.4010914071328275


In [30]:
param_grid = ParamGridBuilder().addGrid(dtc.maxDepth,[5,10,15]).addGrid(dtc.impurity,['entropy','gini']).addGrid(dtc.minInstancesPerNode,[1,5,10]).build()

dt_cross_eval = CrossValidator(estimator=dtc,
                           estimatorParamMaps=param_grid,
                           evaluator=acc_eval,
                           numFolds=3)
dt_cross_eval

CrossValidator_597bee8fb940

In [31]:
dt_cv = dt_cross_eval.fit(train_df)
dt_cv

CrossValidatorModel_a0d64997c016

In [35]:
optimized_dt_model = dt_cv.bestModel
optimized_dt_model

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_07e3660b1e33, depth=4, numNodes=11, numClasses=2, numFeatures=17

In [36]:
optimized_dt_model.getMaxDepth()

5

In [38]:
optimized_dt_model.getImpurity()

'gini'

In [39]:
optimized_dt_model.getMinInstancesPerNode()

10

In [40]:
optimized_dt_preds = optimized_dt_model.transform(test_df)
acc_eval.evaluate(optimized_dt_preds)

0.9144385026737968

In [42]:
param_grid = ParamGridBuilder().addGrid(rfc.numTrees,[20,100,150,200]).addGrid(rfc.impurity,['gini','entropy']).addGrid(rfc.featureSubsetStrategy,['sqrt','log2','onethird','all','auto']).build()

rf_cross_eval = CrossValidator(estimator=rfc,
                              estimatorParamMaps=param_grid,
                              evaluator=acc_eval,
                              numFolds=3)
rf_cross_eval

CrossValidator_4964c04509d5

In [44]:
rf_cv = rf_cross_eval.fit(train_df)
rf_cv

CrossValidatorModel_6bd1d6faf6ed

In [45]:
optimized_rf_model = rf_cv.bestModel
optimized_rf_model

RandomForestClassificationModel: uid=RandomForestClassifier_76209eb8ad04, numTrees=150, numClasses=2, numFeatures=17

In [47]:
optimized_rf_model.getNumTrees

150

In [48]:
optimized_rf_model.getImpurity()

'gini'

In [49]:
optimized_rf_model.getFeatureSubsetStrategy()

'onethird'

In [50]:
optimized_rf_preds = optimized_rf_model.transform(test_df)
acc_eval.evaluate(optimized_rf_preds)

0.9411764705882353

In [57]:
optimized_rf_model.save('private_college_classifier.pkl')

In [59]:
loaded_model = RandomForestClassificationModel.load('private_college_classifier.pkl')
loaded_model

RandomForestClassificationModel: uid=RandomForestClassifier_76209eb8ad04, numTrees=150, numClasses=2, numFeatures=17