# Diabetes Prediction Machine Learning Model


### Loading the Dataset into the Databricks File System (DBFS)

In [0]:
 %sh
 # Recreate the lab folder from scratch so the cell can be re-run safely:
 # -f keeps rm quiet when the folder does not exist yet (first run),
 # -p keeps mkdir from failing if the folder is already there.
 rm -rf /dbfs/ml_lab
 mkdir -p /dbfs/ml_lab
 # Download the diabetes dataset into the lab folder
 wget -O /dbfs/ml_lab/diabetes.csv https://raw.githubusercontent.com/kuljotSB/DatabricksUdemyCourse/refs/heads/main/MachineLearningModel/diabetes.csv

### Exploring and Cleaning the data


In [0]:
# Read the downloaded CSV, treating its first line as the column header
df = spark.read.csv("/ml_lab/diabetes.csv", header=True)
display(df)

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
   
# Target type for each column, listed in the order the columns should appear
columnTypes = {
    "Pregnancies": "int",
    "Glucose": "int",
    "BloodPressure": "int",
    "SkinThickness": "int",
    "Insulin": "int",
    "BMI": "float",
    "DiabetesPedigreeFunction": "float",
    "Age": "int",
    "Outcome": "int",
}

# Drop rows containing nulls, then cast every column to its numeric type
data = df.na.drop().select(
    *[col(name).cast(dtype) for name, dtype in columnTypes.items()]
)
display(data)

### Splitting the data

In [0]:
# Split 70% / 30% into training and test sets.
# A fixed seed keeps the split (and every metric computed later) reproducible across runs.
splits = data.randomSplit([0.7, 0.3], seed=42)
train = splits[0]
test = splits[1]
print ("Training Rows:", train.count(), " Testing Rows:", test.count())

### Performing Feature Engineering

#### Normalizing/scaling our features

In [0]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

# All numeric predictor columns. "Age" is included so that this manual walkthrough
# uses the same feature set as the pipeline version later in the notebook.
numericFeatures = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]

# Combine the individual numeric columns into a single vector column
numericColVector = VectorAssembler(inputCols=numericFeatures, outputCol = "numericFeatures")
vectorizedData = numericColVector.transform(train)

# Fit the min/max scaler on the training vectors and rescale them
minMax = MinMaxScaler(inputCol= numericColVector.getOutputCol(), outputCol="normalizedFeatures")
scaledData = minMax.fit(vectorizedData).transform(vectorizedData)

# Show raw and normalized vectors side by side
compareNumerics = scaledData.select("numericFeatures", "normalizedFeatures")
display(compareNumerics)

### Preparing the features and the labels

In [0]:
# Keep only what the estimator needs: the scaled vector as "features"
# and the outcome column as "label"
preppedData = scaledData.select(
    col("normalizedFeatures").alias("features"),
    col("Outcome").alias("label"),
)
display(preppedData)

### Train a Machine Learning Model

In [0]:
from pyspark.ml.classification import LogisticRegression

# Configure a logistic regression estimator and fit it on the prepared training data
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.3,
    maxIter=10,
)
model = lr.fit(preppedData)
print("Model trained!")

### Testing the prepared model

In [0]:
# Prepare the test data

vectorizedTestData = numericColVector.transform(test)
# Scale the test vectors with min/max statistics learned from the TRAINING vectors.
# Fitting the scaler on the test set would map test features onto a different scale
# than the one the model was trained on.
scaledTestData = minMax.fit(vectorizedData).transform(vectorizedTestData)
preppedTestData = scaledTestData.select(col("normalizedFeatures").alias("features"), col("Outcome").alias("label"))

# Get predictions
prediction = model.transform(preppedTestData)
predicted = prediction.select("features", "probability", col("prediction").astype("Int"), col("label").alias("trueLabel"))
display(predicted)

#### Evaluating Our Model

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
   
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")


def scoreMetric(metricName, classLabel=None):
    """Evaluate `prediction` with the named metric.

    When `classLabel` is given, the metric is computed for that class only
    by overriding the evaluator's `metricLabel` param for this call.
    """
    overrides = {evaluator.metricName: metricName}
    if classLabel is not None:
        overrides[evaluator.metricLabel] = classLabel
    return evaluator.evaluate(prediction, overrides)


# Simple accuracy
accuracy = scoreMetric("accuracy")
print("Accuracy:", accuracy)

# Individual class metrics
labels = [0,1]
print("\nIndividual class metrics:")
for label in sorted(labels):
    print("Class %s" % (label))

    precision = scoreMetric("precisionByLabel", label)
    print("\tPrecision:", precision)

    recall = scoreMetric("recallByLabel", label)
    print("\tRecall:", recall)

    f1 = scoreMetric("fMeasureByLabel", label)
    print("\tF1 Score:", f1)

# Weighted (overall) metrics
overallPrecision = scoreMetric("weightedPrecision")
print("Overall Precision:", overallPrecision)
overallRecall = scoreMetric("weightedRecall")
print("Overall Recall:", overallRecall)
overallF1 = scoreMetric("weightedFMeasure")
print("Overall F1 Score:", overallF1)

### Using a Pipeline for Encapsulation


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
   

# Numeric columns that feed the model
numFeatures = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

# Stage 1: assemble the numeric columns into one vector column
numVector = VectorAssembler(outputCol="numericFeatures", inputCols=numFeatures)
# Stage 2: min/max-scale that vector
numScaler = MinMaxScaler(outputCol="normalizedFeatures", inputCol=numVector.getOutputCol())
# Stage 3: wrap the scaled vector into the final "Features" column
featureVector = VectorAssembler(outputCol="Features", inputCols=["normalizedFeatures"])
# Stage 4: logistic regression predicting "Outcome" from "Features"
algo = LogisticRegression(featuresCol="Features", labelCol="Outcome", regParam=0.3, maxIter=10)

# Chain the stages in order, then fit the whole pipeline on the training split
pipeline = Pipeline(stages=[numVector, numScaler, featureVector, algo])
model = pipeline.fit(train)
print("Model trained!")

#### Use the pipeline to generate predictions

In [0]:
# Run the fitted pipeline over the held-out rows; it applies every stage in order
prediction = model.transform(test)

# Keep the feature vector, class probabilities, integer prediction and the true outcome
predicted = prediction.select(
    "Features",
    "probability",
    col("prediction").cast("Int"),
    col("Outcome").alias("trueLabel"),
)
display(predicted)

### Saving the Model

In [0]:
# Persist the fitted pipeline. overwrite() replaces any copy saved by an earlier run,
# so re-running the notebook does not fail because the path already exists.
model.write().overwrite().save("/models/diabetes.model")

#### Locally Inferencing our Saved Model

In [0]:
from pyspark.ml.pipeline import PipelineModel

# Reload the fitted pipeline from storage
persistedModel = PipelineModel.load("/models/diabetes.model")

# A single unseen patient record, keyed by the feature columns used by the pipeline
newPatient = {
    "Pregnancies": 8,
    "Glucose": 85,
    "BloodPressure": 65,
    "SkinThickness": 29,
    "Insulin": 0,
    "BMI": 26.6,
    "DiabetesPedigreeFunction": 0.672,
    "Age": 34,
}
newData = spark.createDataFrame([newPatient])

# Score the record and show its inputs next to the predicted outcome
predictions = persistedModel.transform(newData)
outputColumns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    col("prediction").alias("PredictedOutcome"),
]
display(predictions.select(*outputColumns))