In [0]:
# COMP4334 A2
# Sean Wendlandt

from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, FloatType
import pyspark.sql.functions as f

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, Bucketizer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Define the schema for the diabetes dataset
diabetesSchema = StructType( \
                [StructField('Pregnancies', IntegerType(), True), \
                StructField('Glucose', IntegerType(), True), \
                StructField('BloodPressure', IntegerType(), True), \
                StructField('SkinThickness', IntegerType(), True), \
                StructField('Insulin', IntegerType(), True), \
                StructField('BMI', FloatType(), True), \
                StructField('DiabetesPedigreeFunction', FloatType(), True), \
                StructField('Age', IntegerType(), True), \
                StructField('Outcome', IntegerType(), True) \
                ])



In [0]:
# Load in the full dataset
diabPath = "dbfs:////FileStore/tables/diabetes-1.csv"
diabetes = spark.read.format("csv").option("header", True).schema(diabetesSchema).option("ignoreLeadingWhiteSpace", True).option("mode", "dropMalformed").load(diabPath)


diabetes.show(10)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [0]:
# split the data into train and test
splits = diabetes.randomSplit(weights = [0.7, 0.3], seed = 2023)

In [0]:
# Get the Training and Testing data sets
diabetesTrain = splits[0]
diabetesTest = splits[1]

In [0]:
# Repartition test data and save to separate folder
print(diabetesTest.rdd.getNumPartitions())
diabetesTest = diabetesTest.repartition(50).persist()
print(diabetesTest.rdd.getNumPartitions())

# Save newly partitioned data to FileStore
dbutils.fs.rm("FileStore/tables/diabetesTestData/", True)
diabetesTest.write.format("csv").option("header", True).save("FileStore/tables/diabetesTestData/")

1
50


In [0]:
# Define the feature columns and label column
featureCols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
labelCol = 'Outcome'

# Create the feature vector assembler
assembler = VectorAssembler(inputCols=featureCols, outputCol='features')

# Create the label indexer
labelIndexer = StringIndexer(inputCol=labelCol, outputCol='label')

# Create the classifiers
# lr = LogisticRegression(maxIter = 10, regParam = 0.01)
lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

In [0]:
# Build the pipelines
lrPipeline = Pipeline(stages=[assembler, labelIndexer, lr])
dtPipeline = Pipeline(stages=[assembler, labelIndexer, dt])
rfPipeline = Pipeline(stages=[assembler, labelIndexer, rf])

In [0]:
# Fit the models
lrModel = lrPipeline.fit(diabetesTrain)
# Make predictions on the train set
lrTrainPredictions = lrModel.transform(diabetesTrain)

In [0]:
dtModel = dtPipeline.fit(diabetesTrain)
dtTrainPredictions = dtModel.transform(diabetesTrain)

In [0]:
rfModel = rfPipeline.fit(diabetesTrain)
rfTrainPredictions = rfModel.transform(diabetesTrain)

In [0]:
# Evaluate the models
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
lrTrainAccuracy = evaluator.evaluate(lrTrainPredictions)
dtTrainAccuracy = evaluator.evaluate(dtTrainPredictions)
rfTrainAccuracy = evaluator.evaluate(rfTrainPredictions)
print(f"Logistic Regression: {evaluator.evaluate(lrTrainPredictions)}")
print(f"Decision Tree: {evaluator.evaluate(dtTrainPredictions)}")
print(f"Random Forrest: {evaluator.evaluate(rfTrainPredictions)}")

Logistic Regression: 0.7835249042145593
Decision Tree: 0.8371647509578544
Random Forrest: 0.8524904214559387


In [0]:

def getConfusionMatrix(model, name = "Model Results"):
    results = model.select(['prediction', 'label'])
    predictionAndLabels = results.rdd
    metrics = MulticlassMetrics(predictionAndLabels)

    cm = metrics.confusionMatrix().toArray()
    precision = (cm[0][0])/(cm[0][0]+cm[1][0])
    recall = (cm[0][0])/(cm[0][0]+cm[0][1])
    f1 = (2*recall*precision)/(recall+precision)
    print(f"{name}: Precision - {precision}, Recall - {recall}, F1 - {f1}")

In [0]:
getConfusionMatrix(lrTrainPredictions, "Logistic Regression")
getConfusionMatrix(dtTrainPredictions, "Decision Tree")
getConfusionMatrix(rfTrainPredictions, "Random Forest")

Logistic Regression: Precision - 0.7957559681697612, Recall - 0.8928571428571429, F1 - 0.8415147265077139
Decision Tree: Precision - 0.8311345646437994, Recall - 0.9375, F1 - 0.8811188811188811
Random Forest: Precision - 0.8398950131233596, Recall - 0.9523809523809523, F1 - 0.8926080892608089


In [0]:
# Setup the stream
sourceStream = spark.readStream.format("csv").option("header", True).schema(diabetesSchema).option("maxFilesPerTrigger", 1).load("dbfs:///FileStore/tables/diabetesTestData/")

In [0]:
# Load Stream
diabetesStream = rfModel.transform(sourceStream).select(f.col("label"), f.col("probability"), f.col("prediction"))
display(diabetesStream)

label,probability,prediction
1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.18477201856205053, 0.8152279814379495))",1.0
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7677447097540054, 0.23225529024599453))",0.0
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7197280828496351, 0.28027191715036504))",0.0
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.4610777358527467, 0.5389222641472533))",1.0
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9438220394188035, 0.056177960581196625))",0.0
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8433163390816951, 0.15668366091830493))",0.0
1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.5111836142246857, 0.4888163857753144))",0.0
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6366820566400484, 0.3633179433599517))",0.0
0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.5299674735375577, 0.4700325264624422))",0.0
1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.14224922220946692, 0.857750777790533))",1.0
