In [0]:
# Sean Wendlandt
# 5/15/23
# Lab 7

In [0]:
from pyspark.sql.types import StructType, StructField, LongType, StringType
import pyspark.sql.functions as f
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, Bucketizer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Create DF schema
heartSchema = StructType( \
                          [StructField('id', LongType(), True), \
                           StructField('age', LongType(), True), \
                           StructField('sex', StringType(), True), \
                           StructField('chol', LongType(), True), \
                           StructField('pred', StringType(), True)
                          ])

In [0]:
heartTrainPath = "dbfs:///FileStore/tables/hearttraining__1_.csv"
heartTestPath = "dbfs:///FileStore/tables/hearttesting__1_.csv"

heartTrain = spark.read.format("csv").option("header", True).schema(heartSchema).option("ignoreLeadingWhiteSpace", True).option("mode", "dropMalformed").load(heartTrainPath)
heartTest = spark.read.format("csv").option("header", True).schema(heartSchema).option("ignoreLeadingWhiteSpace", True).option("mode", "dropMalformed").load(heartTestPath)

In [0]:
heartTrain.show(5)

+---+---+------+----+----+
| id|age|   sex|chol|pred|
+---+---+------+----+----+
|  0| 63|  male| 233|  no|
|  1| 67|  male| 286| yes|
|  2| 67|  male| 229| yes|
|  3| 37|  male| 250|  no|
|  4| 41|female| 204|  no|
+---+---+------+----+----+
only showing top 5 rows



In [0]:
heartTest.show(5)

+---+---+------+----+----+
| id|age|   sex|chol|pred|
+---+---+------+----+----+
|  0| 45|  male| 260|  no|
|  1| 34|  male| 182|  no|
|  2| 57|female| 303|  no|
|  3| 71|female| 265|  no|
|  4| 49|  male| 188|  no|
+---+---+------+----+----+
only showing top 5 rows



In [0]:
# Break age up
ageSplits = [-float("inf"), 40, 50, 60, 70, float("inf")]
ageBucket = Bucketizer(splits = ageSplits, inputCol = "age", outputCol = "ageBucket")

#  sex and pred into numbers
sexIndex = StringIndexer(inputCol = "sex", outputCol = "sexIndex")
predIndex = StringIndexer(inputCol = "pred", outputCol = "label")

In [0]:
# Create vectors for Logistic Regression
vectAssem = VectorAssembler(inputCols = ['ageBucket', 'sexIndex', 'chol'], outputCol = 'features')

In [0]:
# Create the Logistic Regression
lr = LogisticRegression(maxIter = 10, regParam = 0.01)

In [0]:
# build pipe
myStages = [ageBucket, sexIndex, predIndex, vectAssem, lr]
p = Pipeline(stages = myStages)

In [0]:
# Fit 
pModel = p.fit(heartTrain)

In [0]:
trainPredictions = pModel.transform(heartTrain)
trainPredictions.select(f.col("id"), f.col("label"), f.col("probability"), f.col("prediction")).show(20, False)
trainPredictions.show()

+---+-----+----------------------------------------+----------+
|id |label|probability                             |prediction|
+---+-----+----------------------------------------+----------+
|0  |0.0  |[0.2900998290975521,0.709900170902448]  |1.0       |
|1  |1.0  |[0.26945104974553674,0.7305489502544633]|1.0       |
|2  |1.0  |[0.29169570753051055,0.7083042924694894]|1.0       |
|3  |0.0  |[0.8157837695460767,0.18421623045392332]|0.0       |
|4  |0.0  |[0.9267011729340465,0.07329882706595348]|0.0       |
|5  |0.0  |[0.4761609080293803,0.5238390919706197] |1.0       |
|6  |1.0  |[0.690563407035725,0.30943659296427495] |0.0       |
|7  |0.0  |[0.8087162546158394,0.1912837453841606] |0.0       |
|8  |1.0  |[0.28180687357742945,0.7181931264225705]|1.0       |
|9  |1.0  |[0.49210026031201703,0.507899739687983] |1.0       |
|10 |0.0  |[0.4974185535932548,0.5025814464067452] |1.0       |
|11 |0.0  |[0.8260287536603729,0.17397124633962713]|0.0       |
|12 |1.0  |[0.4665222044078181,0.5334777

In [0]:
testPredictions = pModel.transform(heartTest)
testPredictions.select(f.col("id"), f.col("label"), f.col("probability"), f.col("prediction")).show(20, False)
testPredictions.show()

+---+-----+----------------------------------------+----------+
|id |label|probability                             |prediction|
+---+-----+----------------------------------------+----------+
|0  |0.0  |[0.6600292446882523,0.3399707553117477] |0.0       |
|1  |0.0  |[0.8347346706754634,0.16526532932453664]|0.0       |
|2  |0.0  |[0.82351300473095,0.17648699526904998]  |0.0       |
|3  |0.0  |[0.5008210148318487,0.4991789851681513] |0.0       |
|4  |0.0  |[0.6905482670360443,0.3094517329639557] |0.0       |
|5  |0.0  |[0.44111895396780615,0.5588810460321938]|1.0       |
|6  |0.0  |[0.5046714780368633,0.4953285219631367] |0.0       |
|7  |0.0  |[0.4795390246638692,0.5204609753361308] |1.0       |
|8  |0.0  |[0.2794640892028315,0.7205359107971685] |1.0       |
|9  |0.0  |[0.8246243812239777,0.17537561877602226]|0.0       |
|10 |0.0  |[0.6742174450106633,0.3257825549893367] |0.0       |
|11 |0.0  |[0.4698932789880572,0.5301067210119428] |1.0       |
|12 |0.0  |[0.500319763938094,0.49968023

In [0]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
print("Portion correctly predicted:",evaluator.evaluate(trainPredictions))

Portion correctly predicted: 0.7431660546715627
