In [2]:
import pandas as pd
import numpy as np

In [24]:
# Initialize the SparkSession
from pyspark.sql import SparkSession

# getOrCreate() hands back the session for this notebook, labelled "Diabetes".
spark = (
    SparkSession.builder
    .appName("Diabetes")
    .getOrCreate()
)

# Load the raw dataset: the first row is treated as the header and column
# types are inferred from the data rather than read as plain strings.
df = spark.read.csv("diabetes_dataset.csv", inferSchema=True, header=True)

In [25]:
# Check null
# Spark's aggregate `sum` is imported under an alias so that it does not shadow
# Python's builtin `sum` for every cell that runs after this one.
from pyspark.sql.functions import col, sum as spark_sum

# One aggregate per column: isNull() gives a boolean, cast to int (1 = null,
# 0 = present), and summing it yields the number of nulls in that column.
null_counts = [
    spark_sum(col(c).isNull().cast("int")).alias(c)
    for c in df.columns
]

# Single-row result: each column holds its own null count.
df.select(null_counts).show()

+---+------+---------+---------------+------------+-----------------+--------------+----------------------------+----------------------------------+----------+-------------------+-------------------------+-----------------------+--------------------+----------------------+---+------------------+-----------+------------+----------+-----------------+---------------+---------------+-------------+---------------+--------------------+-------------+-----+-------------------+--------------+------------------+
|age|gender|ethnicity|education_level|income_level|employment_status|smoking_status|alcohol_consumption_per_week|physical_activity_minutes_per_week|diet_score|sleep_hours_per_day|screen_time_hours_per_day|family_history_diabetes|hypertension_history|cardiovascular_history|bmi|waist_to_hip_ratio|systolic_bp|diastolic_bp|heart_rate|cholesterol_total|hdl_cholesterol|ldl_cholesterol|triglycerides|glucose_fasting|glucose_postprandial|insulin_level|hba1c|diabetes_risk_score|diabetes_stage|diag

In [26]:
# Check duplicates
# Reports the total row count, the number of rows that are distinct across all
# columns, and - when the two differ - a sample of the duplicated rows.
total = df.count()
distinct_all = df.distinct().count()
duplicates_all = total - distinct_all

print(f"Total rows: {total}")
print(f"Distinct rows: {distinct_all}")
print(f"Duplicate rows: {duplicates_all}")

if duplicates_all <= 0:
    print("No duplicates found.")
else:
    from pyspark.sql.functions import col

    # Group by every column and keep only groups that occur more than once
    # (may be expensive on large datasets), most frequent first.
    group_sizes = df.groupBy(df.columns).count()
    repeated = group_sizes.filter(col("count") > 1)
    dup_groups = repeated.orderBy(col("count").desc())
    print("Duplicate groups:")
    dup_groups.show(10, truncate=False)

Total rows: 100000
Distinct rows: 100000
Duplicate rows: 0
No duplicates found.


In [27]:
# Drop unnecessary columns
# Names of the columns removed from the working DataFrame. The group comments
# are only a reading aid based on the column names; order is not significant
# for the drop itself.
cols_to_drop = [
    # background
    "ethnicity", "education_level",
    # lifestyle
    "smoking_status", "alcohol_consumption_per_week",
    "sleep_hours_per_day", "screen_time_hours_per_day",
    # medical history
    "hypertension_history", "cardiovascular_history",
    # body measurements / vitals
    "waist_to_hip_ratio", "diastolic_bp", "heart_rate",
    # lab values
    "cholesterol_total", "ldl_cholesterol",
    "glucose_postprandial", "insulin_level",
    # socio-economic
    "income_level", "employment_status",
    # diagnosis columns
    "diabetes_stage", "diagnosed_diabetes",
]

# Rebind `df` to the frame without those columns.
df = df.drop(*cols_to_drop)

In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Encode gender as an integer flag: "Male" -> 1, "Female" -> 0.
#
# The encoding is applied only while the column is still a string. Once this
# cell has run, `gender` is an integer column; running the same mapping again
# would compare integers against "Male"/"Female", match nothing, and replace
# every value with null. The type guard makes the cell safe to re-run.
if dict(df.dtypes).get("gender") == "string":
    df = df.withColumn(
        "gender",
        when(col("gender") == "Male", 1)
        .when(col("gender") == "Female", 0)
        .otherwise(None)   # any other value is left as null
    )

# Quick visual check of the encoded column and the resulting schema.
df.select("gender").show(5)
df.printSchema()


+------+
|gender|
+------+
|     1|
|     0|
|     1|
|     0|
|     1|
+------+
only showing top 5 rows
root
 |-- age: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- physical_activity_minutes_per_week: integer (nullable = true)
 |-- diet_score: double (nullable = true)
 |-- family_history_diabetes: integer (nullable = true)
 |-- bmi: double (nullable = true)
 |-- systolic_bp: integer (nullable = true)
 |-- hdl_cholesterol: integer (nullable = true)
 |-- triglycerides: integer (nullable = true)
 |-- glucose_fasting: integer (nullable = true)
 |-- hba1c: double (nullable = true)
 |-- diabetes_risk_score: double (nullable = true)



In [30]:
# Preview the current DataFrame: first 20 rows, long cell values truncated
# (these are the defaults of show(), spelled out here).
df.show(n=20, truncate=True)

+---+------+----------------------------------+----------+-----------------------+----+-----------+---------------+-------------+---------------+-----+-------------------+
|age|gender|physical_activity_minutes_per_week|diet_score|family_history_diabetes| bmi|systolic_bp|hdl_cholesterol|triglycerides|glucose_fasting|hba1c|diabetes_risk_score|
+---+------+----------------------------------+----------+-----------------------+----+-----------+---------------+-------------+---------------+-----+-------------------+
| 58|     1|                               215|       5.7|                      0|30.5|        134|             41|          145|            136| 8.18|               29.6|
| 48|     0|                               143|       6.7|                      0|23.1|        129|             55|           30|             93| 5.63|               23.0|
| 60|     1|                                57|       6.4|                      1|22.2|        115|             66|           36|           

In [31]:
# Export cleaned DataFrame as single CSV file
import glob
import os
import shutil

out_dir = "output/cleaned_data_tmp"   # temporary directory Spark writes into
dest = "output/cleaned_data.csv"      # final single-file destination

# Coalesce to a single partition and write out; Spark writes a directory of
# part files, not a single file, so the result is picked up from out_dir below.
df.coalesce(1).write.csv(out_dir, header=True, mode="overwrite")

# Find the single part file Spark produced and move it
part_files = glob.glob(os.path.join(out_dir, "part-*.csv"))
if part_files:
    part_file = part_files[0]
    shutil.move(part_file, dest)

    # Remove the whole temporary directory in one call. Deleting entries found
    # via glob("*") is not enough: that pattern skips dot-files, and Spark's
    # local-filesystem writer typically leaves hidden checksum files
    # (e.g. ".part-*.csv.crc", "._SUCCESS.crc") that would keep the directory
    # non-empty and make a following os.rmdir() fail.
    shutil.rmtree(out_dir, ignore_errors=True)
    if os.path.isdir(out_dir):
        # Cleanup stays best-effort, but a failure is reported, not hidden.
        print("Warning: could not remove temporary directory", out_dir)

    print("Written:", dest)
else:
    print("No part file found in", out_dir)

Written: output/cleaned_data.csv
