In [1]:
from pyspark.sql import SparkSession
import os

# Configure the session in named steps, then create it (or reuse a running one).
# The "spark.jars" entry points at a PostgreSQL JDBC driver jar on the local filesystem.
session_builder = SparkSession.builder.appName("S3AvroAnalytics")
session_builder = session_builder.config("spark.jars", "/drivers/postgresql-42.5.0.jar")
spark = session_builder.getOrCreate()

In [2]:
from pyspark.sql import functions as F, Window

In [3]:
# Load every JSON record found under the S3 topic prefix into a DataFrame.
input_path = "s3a://mlops-proj-bucket/topics/csv-data-topic/"
json_reader = spark.read.format("json")
df = json_reader.load(input_path)

In [6]:
# Print the schema of the loaded data, then preview the first five rows
# without truncating long cell values.
df.printSchema()
df.show(n=5, truncate=False)

root
 |-- Residence_type: string (nullable = true)
 |-- age: string (nullable = true)
 |-- avg_glucose_level: string (nullable = true)
 |-- bmi: string (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- heart_disease: string (nullable = true)
 |-- hypertension: string (nullable = true)
 |-- id: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- partition: integer (nullable = true)

+--------------+---+-----------------+----+------------+------+-------------+------------+-----+---------------+------+-------------+---------+
|Residence_type|age|avg_glucose_level|bmi |ever_married|gender|heart_disease|hypertension|id   |smoking_status |stroke|work_type    |partition|
+--------------+---+-----------------+----+------------+------+-------------+------------+-----+---------------+------+-------------+---------+
|Urban         |64 |74.1 

In [7]:
# List the DataFrame's column names; as the cell's last expression the list is
# rendered as the cell output.
df.columns

['Residence_type',
 'age',
 'avg_glucose_level',
 'bmi',
 'ever_married',
 'gender',
 'heart_disease',
 'hypertension',
 'id',
 'smoking_status',
 'stroke',
 'work_type',
 'partition']

In [9]:
# Expose the DataFrame to Spark SQL as a temp view. The SQL cells in this
# notebook refer to the view by this literal name.
view_name = "stroke_data"
df.createOrReplaceTempView(view_name)

In [10]:
# Basic dataset overview: record count, distinct patient ids, and mean
# age / glucose / BMI over the whole view.
#
# bmi is a string column that uses the sentinel 'N/A' for missing values.
# The sentinel is mapped to NULL explicitly (same CASE guard as the BMI
# analysis cells) instead of relying on CAST('N/A' AS DOUBLE) silently
# yielding NULL: with spark.sql.ansi.enabled=true that cast raises an error.
# AVG skips NULLs, so the result is unchanged when ANSI mode is off.
print("=== DATASET OVERVIEW ===")
spark.sql("""
SELECT
    COUNT(*) as total_records,
    COUNT(DISTINCT id) as unique_patients,
    AVG(CAST(age AS DOUBLE)) as avg_age,
    AVG(CAST(avg_glucose_level AS DOUBLE)) as avg_glucose,
    AVG(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END) as avg_bmi
FROM stroke_data
""").show()

=== DATASET OVERVIEW ===
+-------------+---------------+-----------------+------------------+-----------------+
|total_records|unique_patients|          avg_age|       avg_glucose|          avg_bmi|
+-------------+---------------+-----------------+------------------+-----------------+
|         5100|           5100|43.21215686274512|106.16418431372549|28.89275510204084|
+-------------+---------------+-----------------+------------------+-----------------+



In [11]:
# Class balance: number of rows per stroke label and each label's share of
# all rows (the denominator is a scalar subquery over the full view).
stroke_distribution_query = """
SELECT 
    stroke,
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM stroke_data), 2) as percentage
FROM stroke_data
GROUP BY stroke
ORDER BY stroke
"""
print("=== STROKE DISTRIBUTION ===")
stroke_distribution = spark.sql(stroke_distribution_query)
stroke_distribution.show()

=== STROKE DISTRIBUTION ===
+------+-----+----------+
|stroke|count|percentage|
+------+-----+----------+
|     0| 4852|     95.14|
|     1|  248|      4.86|
+------+-----+----------+



In [12]:
# Per-gender totals, stroke case counts and stroke rate (percent), highest
# rate first. stroke is stored as a string, hence the INT cast before summing.
stroke_by_gender_query = """
SELECT 
    gender,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY gender
ORDER BY stroke_rate_percent DESC
"""
print("=== STROKE RATE BY GENDER ===")
stroke_by_gender = spark.sql(stroke_by_gender_query)
stroke_by_gender.show()

=== STROKE RATE BY GENDER ===
+------+-----+------------+-------------------+
|gender|total|stroke_cases|stroke_rate_percent|
+------+-----+------------+-------------------+
|  Male| 2113|         107|               5.06|
|Female| 2986|         141|               4.72|
| Other|    1|           0|               0.00|
+------+-----+------------+-------------------+



In [13]:
# Bucket patients into four age bands and report the stroke rate per band.
# The age string is cast to INT before bucketing; the CASE has no ELSE, so a
# row matching none of the branches would fall into a NULL age_group.
age_group_query = """
SELECT 
    CASE 
        WHEN CAST(age AS INT) < 30 THEN 'Under 30'
        WHEN CAST(age AS INT) BETWEEN 30 AND 45 THEN '30-45'
        WHEN CAST(age AS INT) BETWEEN 46 AND 60 THEN '46-60'
        WHEN CAST(age AS INT) > 60 THEN 'Over 60'
    END as age_group,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY age_group
ORDER BY stroke_rate_percent DESC
"""
print("=== AGE DISTRIBUTION AND STROKE RISK ===")
age_group_risk = spark.sql(age_group_query)
age_group_risk.show()

=== AGE DISTRIBUTION AND STROKE RISK ===
+---------+-----+------------+-------------------+
|age_group|total|stroke_cases|stroke_rate_percent|
+---------+-----+------------+-------------------+
|  Over 60| 1301|         176|              13.53|
|    46-60| 1186|          59|               4.97|
|    30-45| 1100|          11|               1.00|
| Under 30| 1513|           2|               0.13|
+---------+-----+------------+-------------------+



In [14]:
# Stroke rate for every combination of the hypertension and heart_disease
# flags, sorted from the highest rate to the lowest.
comorbidity_query = """
SELECT 
    hypertension,
    heart_disease,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY hypertension, heart_disease
ORDER BY stroke_rate_percent DESC
"""
print("=== HYPERTENSION & HEART DISEASE IMPACT ===")
comorbidity_impact = spark.sql(comorbidity_query)
comorbidity_impact.show()

=== HYPERTENSION & HEART DISEASE IMPACT ===
+------------+-------------+-----+------------+-------------------+
|hypertension|heart_disease|total|stroke_cases|stroke_rate_percent|
+------------+-------------+-----+------------+-------------------+
|           1|            1|   64|          13|              20.31|
|           0|            1|  212|          33|              15.57|
|           1|            0|  432|          53|              12.27|
|           0|            0| 4392|         149|               3.39|
+------------+-------------+-----+------------+-------------------+



In [15]:
# Totals, stroke cases and stroke rate (percent) per smoking_status value,
# highest rate first.
smoking_query = """
SELECT 
    smoking_status,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY smoking_status
ORDER BY stroke_rate_percent DESC
"""
print("=== SMOKING STATUS ANALYSIS ===")
smoking_analysis = spark.sql(smoking_query)
smoking_analysis.show()

=== SMOKING STATUS ANALYSIS ===
+---------------+-----+------------+-------------------+
| smoking_status|total|stroke_cases|stroke_rate_percent|
+---------------+-----+------------+-------------------+
|formerly smoked|  884|          69|               7.81|
|         smokes|  789|          42|               5.32|
|   never smoked| 1887|          90|               4.77|
|        Unknown| 1540|          47|               3.05|
+---------------+-----+------------+-------------------+



In [16]:
# Totals, stroke cases and stroke rate (percent) per work_type value,
# highest rate first.
work_type_query = """
SELECT 
    work_type,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY work_type
ORDER BY stroke_rate_percent DESC
"""
print("=== WORK TYPE AND STROKE RISK ===")
work_type_risk = spark.sql(work_type_query)
work_type_risk.show()

=== WORK TYPE AND STROKE RISK ===
+-------------+-----+------------+-------------------+
|    work_type|total|stroke_cases|stroke_rate_percent|
+-------------+-----+------------+-------------------+
|Self-employed|  816|          65|               7.97|
|      Private| 2920|         148|               5.07|
|     Govt_job|  656|          33|               5.03|
|     children|  686|           2|               0.29|
| Never_worked|   22|           0|               0.00|
+-------------+-----+------------+-------------------+



In [17]:
# Mean, min, max and standard deviation of avg_glucose_level for each stroke
# label; the string column is cast to DOUBLE and results are rounded to 2 dp.
glucose_query = """
SELECT 
    stroke,
    ROUND(AVG(CAST(avg_glucose_level AS DOUBLE)), 2) as avg_glucose,
    ROUND(MIN(CAST(avg_glucose_level AS DOUBLE)), 2) as min_glucose,
    ROUND(MAX(CAST(avg_glucose_level AS DOUBLE)), 2) as max_glucose,
    ROUND(STDDEV(CAST(avg_glucose_level AS DOUBLE)), 2) as std_glucose
FROM stroke_data
GROUP BY stroke
"""
print("=== GLUCOSE LEVEL ANALYSIS ===")
glucose_stats = spark.sql(glucose_query)
glucose_stats.show()

=== GLUCOSE LEVEL ANALYSIS ===
+------+-----------+-----------+-----------+-----------+
|stroke|avg_glucose|min_glucose|max_glucose|std_glucose|
+------+-----------+-----------+-----------+-----------+
|     0|     104.84|      55.12|     267.76|       43.9|
|     1|     132.16|      56.11|     271.74|      61.74|
+------+-----------+-----------+-----------+-----------+



In [18]:
# Mean, min and max BMI for each stroke label. Rows whose bmi is the 'N/A'
# sentinel are turned into NULL by the CASE so the aggregates ignore them.
bmi_query = """
SELECT 
    stroke,
    ROUND(AVG(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as avg_bmi,
    ROUND(MIN(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as min_bmi,
    ROUND(MAX(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as max_bmi
FROM stroke_data
GROUP BY stroke
"""
print("=== BMI ANALYSIS ===")
bmi_stats = spark.sql(bmi_query)
bmi_stats.show()

=== BMI ANALYSIS ===
+------+-------+-------+-------+
|stroke|avg_bmi|min_bmi|max_bmi|
+------+-------+-------+-------+
|     0|  28.82|   10.3|   97.6|
|     1|  30.44|   16.9|   56.6|
+------+-------+-------+-------+



In [19]:
# Compare Residence_type groups: size, stroke cases, stroke rate (percent)
# and mean age. No ORDER BY is applied, so row order is whatever Spark returns.
residence_query = """
SELECT 
    Residence_type,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent,
    ROUND(AVG(CAST(age AS DOUBLE)), 2) as avg_age
FROM stroke_data
GROUP BY Residence_type
"""
print("=== RESIDENCE TYPE COMPARISON ===")
residence_comparison = spark.sql(residence_query)
residence_comparison.show()

=== RESIDENCE TYPE COMPARISON ===
+--------------+-----+------------+-------------------+-------+
|Residence_type|total|stroke_cases|stroke_rate_percent|avg_age|
+--------------+-----+------------+-------------------+-------+
|         Urban| 2591|         134|               5.17|  43.52|
|         Rural| 2509|         114|               4.54|  42.89|
+--------------+-----+------------+-------------------+-------+



In [20]:
# Stroke rate per (hypertension, heart_disease, smoking_status) combination.
# Combinations with 10 or fewer rows are dropped by the HAVING clause, and
# only the ten highest-rate combinations are shown.
multi_factor_query = """
SELECT 
    hypertension,
    heart_disease,
    smoking_status,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY hypertension, heart_disease, smoking_status
HAVING COUNT(*) > 10  -- Only show groups with sufficient data
ORDER BY stroke_rate_percent DESC
LIMIT 10
"""
print("=== MULTI-FACTOR RISK ANALYSIS ===")
multi_factor_risk = spark.sql(multi_factor_query)
multi_factor_risk.show()

=== MULTI-FACTOR RISK ANALYSIS ===
+------------+-------------+---------------+-----+------------+-------------------+
|hypertension|heart_disease| smoking_status|total|stroke_cases|stroke_rate_percent|
+------------+-------------+---------------+-----+------------+-------------------+
|           1|            1|         smokes|   15|           5|              33.33|
|           0|            1|         smokes|   46|          10|              21.74|
|           1|            1|formerly smoked|   21|           4|              19.05|
|           0|            1|        Unknown|   43|           8|              18.60|
|           1|            1|   never smoked|   23|           4|              17.39|
|           1|            0|formerly smoked|   99|          15|              15.15|
|           1|            0|   never smoked|  207|          28|              13.53|
|           0|            1|formerly smoked|   56|           7|              12.50|
|           0|            1|   never smok

In [21]:
# Count how many rows carry the 'N/A' BMI sentinel and what share of all rows
# that is. Only the literal 'N/A' is counted; NULL bmi values are not.
bmi_quality_query = """
SELECT 
    COUNT(*) as total_records,
    SUM(CASE WHEN bmi = 'N/A' THEN 1 ELSE 0 END) as missing_bmi,
    ROUND(SUM(CASE WHEN bmi = 'N/A' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as missing_bmi_percent
FROM stroke_data
"""
print("=== BMI DATA QUALITY ===")
bmi_quality = spark.sql(bmi_quality_query)
bmi_quality.show()

=== BMI DATA QUALITY ===
+-------------+-----------+-------------------+
|total_records|missing_bmi|missing_bmi_percent|
+-------------+-----------+-------------------+
|         5100|        200|               3.92|
+-------------+-----------+-------------------+



In [22]:
# Correlation (Spark SQL CORR) between age and avg_glucose_level, computed
# separately for each stroke label and rounded to three decimals.
age_glucose_query = """
SELECT 
    stroke,
    ROUND(CORR(CAST(age AS DOUBLE), CAST(avg_glucose_level AS DOUBLE)), 3) as age_glucose_correlation
FROM stroke_data
GROUP BY stroke
"""
print("=== AGE vs GLUCOSE LEVEL ===")
age_glucose_corr = spark.sql(age_glucose_query)
age_glucose_corr.show()

=== AGE vs GLUCOSE LEVEL ===
+------+-----------------------+
|stroke|age_glucose_correlation|
+------+-----------------------+
|     0|                  0.223|
|     1|                   0.11|
+------+-----------------------+



In [23]:
# Size of the "high-risk" cohort as defined by the WHERE clause (age above 60,
# hypertension flag set, average glucose above 200) and its share of all rows.
high_risk_query = """
SELECT 
    COUNT(*) as high_risk_count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM stroke_data), 2) as percentage
FROM stroke_data
WHERE 
    CAST(age AS INT) > 60 
    AND CAST(hypertension AS INT) = 1 
    AND CAST(avg_glucose_level AS DOUBLE) > 200
"""
print("=== HIGH-RISK PROFILES ===")
high_risk_profiles = spark.sql(high_risk_query)
high_risk_profiles.show()

=== HIGH-RISK PROFILES ===
+---------------+----------+
|high_risk_count|percentage|
+---------------+----------+
|             66|      1.29|
+---------------+----------+



In [24]:
# One summary row (min, max, mean, standard deviation) per numeric column,
# stacked with UNION ALL. For bmi the 'N/A' sentinel is mapped to NULL first
# so it does not take part in the aggregates.
numeric_summary_query = """
SELECT 
    'age' as column_name,
    ROUND(MIN(CAST(age AS DOUBLE)), 2) as min_value,
    ROUND(MAX(CAST(age AS DOUBLE)), 2) as max_value,
    ROUND(AVG(CAST(age AS DOUBLE)), 2) as avg_value,
    ROUND(STDDEV(CAST(age AS DOUBLE)), 2) as std_value
FROM stroke_data
UNION ALL
SELECT 
    'avg_glucose_level' as column_name,
    ROUND(MIN(CAST(avg_glucose_level AS DOUBLE)), 2) as min_value,
    ROUND(MAX(CAST(avg_glucose_level AS DOUBLE)), 2) as max_value,
    ROUND(AVG(CAST(avg_glucose_level AS DOUBLE)), 2) as avg_value,
    ROUND(STDDEV(CAST(avg_glucose_level AS DOUBLE)), 2) as std_value
FROM stroke_data
UNION ALL
SELECT 
    'bmi' as column_name,
    ROUND(MIN(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as min_value,
    ROUND(MAX(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as max_value,
    ROUND(AVG(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as avg_value,
    ROUND(STDDEV(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as std_value
FROM stroke_data
"""
print("=== NUMERICAL COLUMNS SUMMARY ===")
numeric_summary = spark.sql(numeric_summary_query)
numeric_summary.show()

=== NUMERICAL COLUMNS SUMMARY ===
+-----------------+---------+---------+---------+---------+
|      column_name|min_value|max_value|avg_value|std_value|
+-----------------+---------+---------+---------+---------+
|              age|     0.08|     82.0|    43.21|    22.61|
|avg_glucose_level|    55.12|   271.74|   106.16|    45.31|
|              bmi|     10.3|     97.6|    28.89|     7.85|
+-----------------+---------+---------+---------+---------+



In [25]:
# Patient risk ranking: show the ten highest-scoring patients.
#
# The heuristic score is
#     age * 0.3 + avg_glucose_level * 0.01 + hypertension * 20 + heart_disease * 25
# and is computed once in the `scored` CTE so the displayed score and the
# RANK() window cannot drift apart. Rows with the 'N/A' BMI sentinel are
# excluded, as before.
#
# The final ordering uses risk_rank (based on the unrounded score), not the
# rounded risk_score: two patients can share a rounded score while having
# different ranks, and sorting on the rounded value let LIMIT 10 pick or order
# rows inconsistently with the rank. id breaks exact ties so the output is
# stable between runs.
#
# NOTE: the window has no PARTITION BY, so Spark ranks all rows in a single
# partition; acceptable for a dataset of this size.
print("=== PATIENT RISK RANKING ===")
spark.sql("""
WITH scored AS (
    SELECT
        id,
        age,
        avg_glucose_level,
        bmi,
        hypertension,
        heart_disease,
        stroke,
        CAST(age AS DOUBLE) * 0.3 +
        CAST(avg_glucose_level AS DOUBLE) * 0.01 +
        CAST(hypertension AS INT) * 20 +
        CAST(heart_disease AS INT) * 25 AS raw_risk_score
    FROM stroke_data
    WHERE bmi != 'N/A'
)
SELECT
    id,
    age,
    avg_glucose_level,
    bmi,
    hypertension,
    heart_disease,
    stroke,
    ROUND(raw_risk_score, 2) as risk_score,
    RANK() OVER (ORDER BY raw_risk_score DESC) as risk_rank
FROM scored
ORDER BY risk_rank, id
LIMIT 10
""").show()

=== PATIENT RISK RANKING ===
+-----+---+-----------------+----+------------+-------------+------+----------+---------+
|   id|age|avg_glucose_level| bmi|hypertension|heart_disease|stroke|risk_score|risk_rank|
+-----+---+-----------------+----+------------+-------------+------+----------+---------+
|20463| 81|           250.89|28.1|           1|            1|     1|     71.81|        1|
|67895| 82|           215.94|27.9|           1|            1|     1|     71.76|        2|
|65955| 81|           220.64|  30|           1|            1|     0|     71.51|        3|
|63836| 81|           217.94|24.1|           1|            1|     0|     71.48|        4|
|68627| 80|           175.29|31.5|           1|            1|     1|     70.75|        5|
|62791| 79|           205.23|  22|           1|            1|     0|     70.75|        6|
|28333| 79|           200.28|  30|           1|            1|     0|      70.7|        7|
|54353| 78|           227.16|41.7|           1|            1|     0|   

In [27]:
# Pivot-style summary: stroke rate (percent) per gender, split at age 50 using
# conditional aggregation.
#
# The second bucket is defined by age >= 50, so it is labelled
# age_50_plus_stroke_rate (the previous alias "over_50" wrongly suggested that
# 50-year-olds were excluded). NULLIF turns an empty bucket into NULL instead
# of dividing by zero. Rows are ordered by gender for a stable display.
print("=== PIVOT: STROKE RATE BY AGE GROUP AND GENDER ===")
spark.sql("""
SELECT
    gender,
    ROUND(
        SUM(CASE WHEN CAST(age AS INT) < 50 AND stroke = '1' THEN 1 ELSE 0 END) * 100.0 /
        NULLIF(SUM(CASE WHEN CAST(age AS INT) < 50 THEN 1 ELSE 0 END), 0),
        2
    ) as under_50_stroke_rate,
    ROUND(
        SUM(CASE WHEN CAST(age AS INT) >= 50 AND stroke = '1' THEN 1 ELSE 0 END) * 100.0 /
        NULLIF(SUM(CASE WHEN CAST(age AS INT) >= 50 THEN 1 ELSE 0 END), 0),
        2
    ) as age_50_plus_stroke_rate
FROM stroke_data
GROUP BY gender
ORDER BY gender
""").show()

=== PIVOT: STROKE RATE BY AGE GROUP AND GENDER ===
+------+--------------------+-------------------+
|gender|under_50_stroke_rate|over_50_stroke_rate|
+------+--------------------+-------------------+
|Female|                0.82|              10.00|
|  Male|                0.51|              10.80|
| Other|                0.00|               NULL|
+------+--------------------+-------------------+

