In [19]:
from google.colab import files

# Ask the user to upload the dataset through the Colab file picker.
uploaded = files.upload()

# Colab stores a re-uploaded file under a fresh name (e.g. "diabetes (1).csv")
# when a file with the same name already exists, while the rest of this
# notebook reads "diabetes.csv". Write each payload back to its own name so
# the most recent upload is the one that gets analysed.
# NOTE(review): assumes the returned dict maps original file name -> bytes;
# confirm against the Colab `files.upload` documentation.
for uploaded_name, uploaded_bytes in uploaded.items():
    with open(uploaded_name, "wb") as out_file:
        out_file.write(uploaded_bytes)

Saving diabetes.csv to diabetes (1).csv


In [20]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running locally on a single worker thread.
spark = (
    SparkSession.builder
    .appName("customer_data")
    .master("local[1]")
    .getOrCreate()
)


In [28]:
# Load the CSV: the first row supplies column names and column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("diabetes.csv")
)

In [29]:
# Preview the first rows of the loaded data (show()'s defaults, spelled out).
df.show(n=20, truncate=True)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [30]:
# Count how many distinct rows appear more than once.
# Nothing is removed here; this cell only measures duplication.
from pyspark.sql.functions import count, col
from pyspark.sql.window import Window

duplicate_count = (
    df.groupBy(*df.columns)
    .count()
    .where(col("count") > 1)
    .count()
)


In [31]:
# Display the number of duplicated row groups computed above.
duplicate_count

0

In [33]:
# Count null values per column and collect the result as {column name: null count}.
# Spark's `sum` is imported under an alias so it does not shadow Python's
# builtin `sum` for the rest of the notebook.
# NOTE(review): this only counts real nulls; the zeros visible in df.show()
# for columns such as Insulin and SkinThickness may encode missing
# measurements — confirm with the data source.
from pyspark.sql.functions import col, sum as spark_sum

null_counts = (
    df.select(
        [spark_sum(col(c).isNull().cast("integer")).alias(c) for c in df.columns]
    )
    .collect()[0]
    .asDict()
)

In [34]:
# Display the per-column null counts computed above.
null_counts

{'Pregnancies': 0,
 'Glucose': 0,
 'BloodPressure': 0,
 'SkinThickness': 0,
 'Insulin': 0,
 'BMI': 0,
 'DiabetesPedigreeFunction': 0,
 'Age': 0,
 'Outcome': 0}

In [43]:
# TODO: decide how to check a single column for outliers.
# Print the inferred schema first to see each column's type.
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [47]:
# Split the data 80/20 into training and test sets; the fixed seed keeps the
# split reproducible across runs.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Keep these as column *names* (a list of str and a str) rather than as
# DataFrames, which is how the modelling code consumes them.
target_col = "Outcome"
feature_cols = [c for c in df.columns if c != target_col]

In [49]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Step 1: Define feature and target columns
feature_cols = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'
]
target_col = 'Outcome'


def _prepare_split(split_df):
    """Return a split ready for the assembler: complete rows, target named 'label'.

    The cell overwrites ``train_data``/``test_data`` with their prepared
    versions, so this helper must also accept a split that an earlier run of
    the cell already prepared. Without that, a re-run fails because the target
    column no longer exists and the 'features' column already does.

    Args:
        split_df: Either a raw split (still containing ``target_col``) or a
            split already processed by a previous run of this cell.

    Returns:
        A DataFrame with no nulls in the feature/target columns, the target
        column renamed to 'label', and no 'features' column.
    """
    if target_col in split_df.columns:
        # Raw split: drop incomplete rows, then rename the target to the
        # MLlib-conventional 'label'.
        split_df = (
            split_df.na.drop(subset=feature_cols + [target_col])
            .withColumnRenamed(target_col, "label")
        )
    # Remove a stale assembled column from a previous run. DataFrame.drop is a
    # no-op when the column is absent, so this is harmless on the first run.
    return split_df.drop("features")


# Step 2: Drop rows with nulls and rename the target column to 'label'
train_data = _prepare_split(train_data)
test_data = _prepare_split(test_data)

# Step 3: Create the stages
# Assemble the feature columns into a single 'features' vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Logistic regression is fitted separately in Step 5; it is not a pipeline stage.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# The pipeline holds only the assembler.
pipeline = Pipeline(stages=[assembler])

# Step 4: Apply the pipeline to add the 'features' column to both splits
pipeline_model = pipeline.fit(train_data)
train_data = pipeline_model.transform(train_data)
test_data = pipeline_model.transform(test_data)

# Step 5: Train the model
model = lr.fit(train_data)

# Step 6: Make predictions on the held-out split
predictions = model.transform(test_data)

# Step 7: Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")

# Show sample predictions
predictions.select("features", "label", "prediction", "probability").show(5, truncate=False)

Test Accuracy: 0.7805
+------------------------------------------+-----+----------+-----------------------------------------+
|features                                  |label|prediction|probability                              |
+------------------------------------------+-----+----------+-----------------------------------------+
|(8,[1,5,6,7],[73.0,21.1,0.342,25.0])      |0    |0.0       |[0.9552777667567867,0.04472223324321334] |
|[0.0,84.0,82.0,31.0,125.0,38.2,0.233,23.0]|0    |0.0       |[0.9352055279438387,0.06479447205616129] |
|[0.0,91.0,68.0,32.0,210.0,39.9,0.381,25.0]|0    |0.0       |[0.8781035415696186,0.12189645843038144] |
|(8,[1,6,7],[94.0,0.256,25.0])             |0    |0.0       |[0.9875831522007745,0.012416847799225472]|
|[0.0,98.0,82.0,15.0,84.0,25.2,0.299,22.0] |0    |0.0       |[0.963600042732535,0.03639995726746503]  |
+------------------------------------------+-----+----------+-----------------------------------------+
only showing top 5 rows

