In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.osuosl.org/spark/spark-2.3.2/spark-2.3.2-bin-hadoop2.7.tgz
!tar xf spark-2.3.2-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.2-bin-hadoop2.7"

In [0]:

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()


In [7]:
from google.colab import files
files.upload()

Saving College.csv to College (1).csv


{u'College.csv': ",Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate\r\nAbilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60\r\nAdelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56\r\nAdrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54\r\nAgnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59\r\nAlaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15\r\nAlbertson College,Yes,587,479,158,38,62,678,41,13500,3335,500,675,67,73,9.4,11,9727,55\r\nAlbertus Magnus College,Yes,353,340,103,17,45,416,230,13290,5720,500,1500,90,93,11.5,26,8861,63\r\nAlbion College,Yes,1899,1720,489,37,68,1594,32,13868,4826,450,850,89,100,13.7,37,11487,73\r\nAlbright College,Yes,1038,839,227,30,63,973,306,1

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('tree').getOrCreate()
df = spark.read.csv('College.csv', inferSchema=True, header=True)
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F.Undergrad: integer (nullable = true)
 |-- P.Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room.Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S.F.Ratio: double (nullable = true)
 |-- perc.alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad.Rate: integer (nullable = true)



In [9]:
import pandas as pd

pd.DataFrame(df.take(5), columns = df.columns).transpose()

Unnamed: 0,0,1,2,3,4
_c0,Abilene Christian University,Adelphi University,Adrian College,Agnes Scott College,Alaska Pacific University
Private,Yes,Yes,Yes,Yes,Yes
Apps,1660,2186,1428,417,193
Accept,1232,1924,1097,349,146
Enroll,721,512,336,137,55
Top10perc,23,16,22,60,16
Top25perc,52,29,50,89,44
F.Undergrad,2885,2683,1036,510,249
P.Undergrad,537,1227,99,63,869
Outstate,7440,12280,11250,12960,7560


In [10]:
df.columns

['_c0',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F.Undergrad',
 'P.Undergrad',
 'Outstate',
 'Room.Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S.F.Ratio',
 'perc.alumni',
 'Expend',
 'Grad.Rate']

In [0]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['Apps','Accept','Enroll','Top10perc','Top25perc','F.Undergrad','P.Undergrad','Outstate','Room.Board','Books','Personal','PhD','Terminal','S.F.Ratio','perc.alumni','Expend','Grad.Rate'], outputCol = 'features')

In [0]:
output = assembler.transform(df)

In [0]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol = 'Private', outputCol = 'PrivateIndex')
outputFixed = indexer.fit(output).transform(output)
outputFixed.printSchema()

In [0]:
final_df = outputFixed.select('features', 'PrivateIndex')
final_df.show(3)

In [0]:
train, test = final_df.randomSplit([0.7, 0.3])

In [0]:
from pyspark.ml.classification import (DecisionTreeClassifier, RandomForestClassifier, 
                                      GBTClassifier)
from pyspark.ml import Pipeline

dt = DecisionTreeClassifier(labelCol = 'PrivateIndex', featuresCol = 'features')
rf = RandomForestClassifier(labelCol = 'PrivateIndex', featuresCol = 'features')
gb = GBTClassifier(labelCol = 'PrivateIndex', featuresCol = 'features')

In [0]:
dt_model = dt.fit(train)
rf_model = rf.fit(train)
gb_model = gb.fit(train)

In [0]:
dt_predictions = dt_model.transform(test)
rf_predictions = rf_model.transform(test)
gb_predictions = gb_model.transform(test)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

binary_evaluator = BinaryClassificationEvaluator(labelCol = 'PrivateIndex')

print('Decision Tree:', binary_evaluator.evaluate(dt_predictions))

In [0]:
print('Random Forest:' , binary_evaluator.evaluate(rf_predictions))

In [0]:
print('Gradient-boosted Trees:', binary_evaluator.evaluate(gb_predictions))

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'PrivateIndex', metricName = 'accuracy')
print('Decision Tree Accu:', multi_evaluator.evaluate(dt_predictions))

In [0]:
multi_evaluator.getMetricName()

In [0]:
print('Random Forest Accu:', multi_evaluator.evaluate(rf_predictions))

In [0]:
print('Gradient-boosted Trees Accu:', multi_evaluator.evaluate(gb_predictions))