In [None]:
# Installing required packages
!pip install pyspark
!pip install findspark



In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,SQLContext

In [None]:
# Create the SparkSession for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("ML_Classifications")
    .getOrCreate()
)



In [None]:
# Handle on the session's SparkContext (stopped in the last cell), plus a
# SQLContext built on top of that same context.
sc = spark.sparkContext
sqlContext = SQLContext(sparkContext=sc)



In [None]:
# Load the dataset; the first row holds column names and column types are inferred.
df = spark.read.options(header='true', inferSchema=True).csv('diabetes.csv')

In [None]:
# Preview the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [None]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [None]:
# Summary statistics per column, transposed so that each column becomes a row.
df.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Pregnancies,768,3.8450520833333335,3.36957806269887,0,17
Glucose,768,120.89453125,31.97261819513622,0,199
BloodPressure,768,69.10546875,19.355807170644777,0,122
SkinThickness,768,20.536458333333332,15.952217567727642,0,99
Insulin,768,79.79947916666667,115.24400235133803,0,846
BMI,768,31.992578124999977,7.884160320375441,0.0,67.1
DiabetesPedigreeFunction,768,0.4718763020833327,0.331328595012775,0.078,2.42
Age,768,33.240885416666664,11.760231540678689,21,81
Outcome,768,0.3489583333333333,0.476951377242799,0,1


In [None]:
# Class balance: number of rows per Outcome value, smallest class first.
(
    df.groupBy('Outcome')
    .count()
    .orderBy('count')
    .show()
)

+-------+-----+
|Outcome|count|
+-------+-----+
|      1|  268|
|      0|  500|
+-------+-----+



In [None]:
# Random 80/20 split; the fixed seed keeps the split reproducible across runs.
trainingData, testData = df.randomSplit(weights=[0.8, 0.2], seed=13234)

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [None]:
# Every column except the label is a feature.
# The previous slice `df.columns[:--1]` parsed as `df.columns[:1]` (double
# negation), so only the first column was ever fed to the model.
featureColumns = [c for c in df.columns if c != "Outcome"]

In [None]:
# Pack the selected feature columns into a single vector column.
assembler = VectorAssembler(
    inputCols=featureColumns,
    outputCol="features",
)

# Rescale each feature to unit standard deviation, without centering on the mean.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=False,
    withStd=True,
)

In [None]:
# Logistic regression on the scaled feature vector, predicting the Outcome column.
lr = LogisticRegression(
    labelCol="Outcome",
    featuresCol="scaledFeatures",
)

In [None]:
# Chain the steps in execution order: assemble features -> scale -> classify.
stages = [assembler, scaler, lr]
pipeline = Pipeline(stages=stages)

In [None]:
pipeline = pipeline.fit(trainingData)

In [None]:
predictions = pipeline.transform(testData)

In [None]:
# Preview the first 20 scored rows, with long cell values truncated.
predictions.show(n=20, truncate=True)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------+--------------+--------------------+--------------------+----------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|features|scaledFeatures|       rawPrediction|         probability|prediction|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------+--------------+--------------------+--------------------+----------+
|          0|     91|           80|            0|      0|32.4|                   0.601| 27|      0|   [0.0]|         [0.0]|[1.20813782694454...|[0.76996929314526...|       0.0|
|          0|     95|           85|           25|     36|37.4|                   0.247| 24|      1|   [0.0]|         [0.0]|[1.20813782694454...|[0.76996929314526...|       0.0|
|          0|     98|           82|           15|     84|25.2|                   0.299| 22|      0|   [0.0]|       

In [None]:
def evaluate(result, labelCol="Outcome", predictionCol="prediction", metricLabel=0.0):
    """Print a fixed set of classification metrics for a DataFrame of predictions.

    Args:
        result: DataFrame containing at least `predictionCol` and `labelCol`.
        labelCol: name of the ground-truth label column.
        predictionCol: name of the predicted-label column.
        metricLabel: class for which "precisionByLabel" and "recallByLabel"
            are computed. The default 0.0 matches the evaluator's own default,
            so those two numbers describe class 0 unless another label
            (e.g. 1.0) is passed.

    Returns:
        None. Each metric is printed as "<metric>: <value>".
    """
    predictionAndLabels = result.select(predictionCol, labelCol)
    metrics = ["f1", "precisionByLabel", "recallByLabel",
               "weightedPrecision", "weightedRecall", "accuracy"]
    # A single evaluator is reused; only the metric name changes per iteration.
    evaluator = MulticlassClassificationEvaluator(labelCol=labelCol,
                                                  predictionCol=predictionCol,
                                                  metricLabel=metricLabel)
    for m in metrics:
        evaluator.setMetricName(m)
        print(str(m) + ": " + str(evaluator.evaluate(predictionAndLabels)))

In [None]:
# Report the classification metrics for the test-set predictions.
evaluate(result=predictions)

f1: 0.6440978825916064
precisionByLabel: 0.7251908396946565
recallByLabel: 0.8796296296296297
weightedPrecision: 0.6384455239417073
weightedRecall: 0.6818181818181819
accuracy: 0.6818181818181818


In [None]:
# Persist predicted vs. actual labels as CSV with a header row.
# mode="overwrite" keeps this cell re-runnable: the default save mode raises an
# error when the "predictions" output path already exists.
# "csv" is Spark's built-in CSV data source.
predictions.select("prediction", "Outcome").write.save(path="predictions",
                                                     format="csv",
                                                     header='true',
                                                     mode="overwrite")

In [None]:
# Shut down the session's SparkContext now that the notebook is finished.
spark.sparkContext.stop()