In [1]:
from pyspark.sql import SparkSession
import os

# Build (or reuse) the Spark session named "S3AvroAnalytics"; the PostgreSQL
# JDBC driver jar is passed through the spark.jars setting.
spark = SparkSession.builder.appName("S3AvroAnalytics").config(
    "spark.jars", "/drivers/postgresql-42.5.0.jar"
).getOrCreate()

In [2]:
from pyspark.sql import functions as F, Window

In [3]:
# Load every JSON record found under the S3 topic prefix into one DataFrame
input_path = "s3a://mlops-proj-bucket/topics/csv-data-topic/"
df = spark.read.json(input_path)

In [4]:
df.printSchema()
df.show(5, truncate=False)

root
 |-- Residence_type: string (nullable = true)
 |-- age: string (nullable = true)
 |-- avg_glucose_level: string (nullable = true)
 |-- bmi: string (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- heart_disease: string (nullable = true)
 |-- hypertension: string (nullable = true)
 |-- id: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- partition: integer (nullable = true)

+--------------+---+-----------------+----+------------+------+-------------+------------+-----+---------------+------+-------------+---------+
|Residence_type|age|avg_glucose_level|bmi |ever_married|gender|heart_disease|hypertension|id   |smoking_status |stroke|work_type    |partition|
+--------------+---+-----------------+----+------------+------+-------------+------------+-----+---------------+------+-------------+---------+
|Urban         |64 |74.1 

In [5]:
df.columns

['Residence_type',
 'age',
 'avg_glucose_level',
 'bmi',
 'ever_married',
 'gender',
 'heart_disease',
 'hypertension',
 'id',
 'smoking_status',
 'stroke',
 'work_type',
 'partition']

In [6]:
# Register df as the temporary view `stroke_data` so the spark.sql(...) cells
# in this notebook can reference it by name in their FROM clauses.
df.createOrReplaceTempView("stroke_data")

In [35]:
# Basic dataset overview: total rows, distinct patient ids, and the mean of
# age, avg_glucose_level and bmi (all stored as strings, hence the casts).
# bmi uses the literal string 'N/A' for missing readings, so it is mapped to
# NULL explicitly -- the same guard the other BMI queries in this notebook use --
# instead of relying on a lenient CAST. AVG skips NULLs, so the value is the
# same as before, and the query no longer fails when spark.sql.ansi.enabled
# is turned on.
dataset_overview = spark.sql("""
SELECT 
    COUNT(*) as total_records,
    COUNT(DISTINCT id) as unique_patients,
    AVG(CAST(age AS DOUBLE)) as avg_age,
    AVG(CAST(avg_glucose_level AS DOUBLE)) as avg_glucose,
    AVG(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END) as avg_bmi
FROM stroke_data
""")

In [33]:
dataset_overview.show(10, truncate=False)

+-------------+---------------+-----------------+------------------+-----------------+
|total_records|unique_patients|avg_age          |avg_glucose       |avg_bmi          |
+-------------+---------------+-----------------+------------------+-----------------+
|5100         |5100           |43.21215686274512|106.16418431372549|28.89275510204084|
+-------------+---------------+-----------------+------------------+-----------------+



In [36]:
# Stroke distribution: number of rows per stroke value and that count as a
# percentage of all rows (the total comes from a scalar subquery), rounded to
# two decimals and ordered by the stroke value.
stroke_distribution=spark.sql("""
SELECT 
    stroke,
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM stroke_data), 2) as percentage
FROM stroke_data
GROUP BY stroke
ORDER BY stroke
""")

In [37]:
stroke_distribution.show()

+------+-----+----------+
|stroke|count|percentage|
+------+-----+----------+
|     0| 4852|     95.14|
|     1|  248|      4.86|
+------+-----+----------+



In [38]:
# Stroke rate by gender: rows per gender, stroke cases (stroke cast to INT and
# summed), and cases as a percentage of the group's rows, highest rate first.
gender_analysis = spark.sql("""
SELECT 
    gender,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY gender
ORDER BY stroke_rate_percent DESC
""")

In [39]:
gender_analysis.show()

+------+-----+------------+-------------------+
|gender|total|stroke_cases|stroke_rate_percent|
+------+-----+------------+-------------------+
|  Male| 2113|         107|               5.06|
|Female| 2986|         141|               4.72|
| Other|    1|           0|               0.00|
+------+-----+------------+-------------------+



In [40]:
# Age distribution and stroke risk: bucket each row by age, then report rows,
# stroke cases and stroke rate (%) per bucket, highest rate first.
# age is a string column that contains fractional values (the minimum is 0.08),
# so it is compared as DOUBLE with half-open upper bounds rather than cast to
# INT. The first matching WHEN wins, so the buckets are [..,30), [30,46),
# [46,61) and [61,..) -- identical to truncating the age to an integer first,
# but without depending on the lenient string-to-INT cast (which raises under
# spark.sql.ansi.enabled) and with no gap between 45 and 46 or 60 and 61.
age_group_analysis = spark.sql("""
SELECT 
    CASE 
        WHEN CAST(age AS DOUBLE) < 30 THEN 'Under 30'
        WHEN CAST(age AS DOUBLE) < 46 THEN '30-45'
        WHEN CAST(age AS DOUBLE) < 61 THEN '46-60'
        WHEN CAST(age AS DOUBLE) >= 61 THEN 'Over 60'
    END as age_group,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY age_group
ORDER BY stroke_rate_percent DESC
""")

In [41]:
age_group_analysis.show()

+---------+-----+------------+-------------------+
|age_group|total|stroke_cases|stroke_rate_percent|
+---------+-----+------------+-------------------+
|  Over 60| 1301|         176|              13.53|
|    46-60| 1186|          59|               4.97|
|    30-45| 1100|          11|               1.00|
| Under 30| 1513|           2|               0.13|
+---------+-----+------------+-------------------+



In [42]:
# Hypertension and heart disease impact: one row per (hypertension,
# heart_disease) combination with its row count, stroke cases and stroke
# rate (%), highest rate first.
hypertension_heart_disease_analysis = spark.sql("""
SELECT 
    hypertension,
    heart_disease,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY hypertension, heart_disease
ORDER BY stroke_rate_percent DESC
""")

In [43]:
hypertension_heart_disease_analysis.show()

+------------+-------------+-----+------------+-------------------+
|hypertension|heart_disease|total|stroke_cases|stroke_rate_percent|
+------------+-------------+-----+------------+-------------------+
|           1|            1|   64|          13|              20.31|
|           0|            1|  212|          33|              15.57|
|           1|            0|  432|          53|              12.27|
|           0|            0| 4392|         149|               3.39|
+------------+-------------+-----+------------+-------------------+



In [44]:
# Smoking status analysis: rows, stroke cases and stroke rate (%) for each
# smoking_status value, highest rate first.
smoking_analysis = spark.sql("""
SELECT 
    smoking_status,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY smoking_status
ORDER BY stroke_rate_percent DESC
""")

In [45]:
smoking_analysis.show()

+---------------+-----+------------+-------------------+
| smoking_status|total|stroke_cases|stroke_rate_percent|
+---------------+-----+------------+-------------------+
|formerly smoked|  884|          69|               7.81|
|         smokes|  789|          42|               5.32|
|   never smoked| 1887|          90|               4.77|
|        Unknown| 1540|          47|               3.05|
+---------------+-----+------------+-------------------+



In [46]:
# Work type and stroke risk: rows, stroke cases and stroke rate (%) for each
# work_type value, highest rate first.
work_type_analysis = spark.sql("""
SELECT 
    work_type,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY work_type
ORDER BY stroke_rate_percent DESC
""")

In [47]:
work_type_analysis.show()

+-------------+-----+------------+-------------------+
|    work_type|total|stroke_cases|stroke_rate_percent|
+-------------+-----+------------+-------------------+
|Self-employed|  816|          65|               7.97|
|      Private| 2920|         148|               5.07|
|     Govt_job|  656|          33|               5.03|
|     children|  686|           2|               0.29|
| Never_worked|   22|           0|               0.00|
+-------------+-----+------------+-------------------+



In [48]:
# Glucose level analysis by stroke status: mean, minimum, maximum and standard
# deviation of avg_glucose_level (cast from string to DOUBLE) for each stroke
# value, each rounded to two decimals.
glucose_analysis = spark.sql("""
SELECT 
    stroke,
    ROUND(AVG(CAST(avg_glucose_level AS DOUBLE)), 2) as avg_glucose,
    ROUND(MIN(CAST(avg_glucose_level AS DOUBLE)), 2) as min_glucose,
    ROUND(MAX(CAST(avg_glucose_level AS DOUBLE)), 2) as max_glucose,
    ROUND(STDDEV(CAST(avg_glucose_level AS DOUBLE)), 2) as std_glucose
FROM stroke_data
GROUP BY stroke
""")

In [49]:
glucose_analysis.show()

+------+-----------+-----------+-----------+-----------+
|stroke|avg_glucose|min_glucose|max_glucose|std_glucose|
+------+-----------+-----------+-----------+-----------+
|     0|     104.84|      55.12|     267.76|       43.9|
|     1|     132.16|      56.11|     271.74|      61.74|
+------+-----------+-----------+-----------+-----------+



In [50]:
# BMI analysis by stroke status: mean, minimum and maximum bmi for each stroke
# value. Rows whose bmi is the string 'N/A' are turned into NULL by the CASE
# expression, so they do not contribute to any of the aggregates.
bmi_analysis = spark.sql("""
SELECT 
    stroke,
    ROUND(AVG(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as avg_bmi,
    ROUND(MIN(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as min_bmi,
    ROUND(MAX(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as max_bmi
FROM stroke_data
GROUP BY stroke
""")

In [51]:
bmi_analysis.show()

+------+-------+-------+-------+
|stroke|avg_bmi|min_bmi|max_bmi|
+------+-------+-------+-------+
|     0|  28.82|   10.3|   97.6|
|     1|  30.44|   16.9|   56.6|
+------+-------+-------+-------+



In [52]:
# Residence type comparison: rows, stroke cases, stroke rate (%) and mean age
# for each Residence_type value. No ORDER BY is applied, so row order in the
# result is whatever Spark returns.
residence_analysis = spark.sql("""
SELECT 
    Residence_type,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent,
    ROUND(AVG(CAST(age AS DOUBLE)), 2) as avg_age
FROM stroke_data
GROUP BY Residence_type
""")

In [53]:
residence_analysis.show()

+--------------+-----+------------+-------------------+-------+
|Residence_type|total|stroke_cases|stroke_rate_percent|avg_age|
+--------------+-----+------------+-------------------+-------+
|         Urban| 2591|         134|               5.17|  43.52|
|         Rural| 2509|         114|               4.54|  42.89|
+--------------+-----+------------+-------------------+-------+



In [54]:
# Multi-factor risk analysis: stroke rate (%) for every combination of
# hypertension, heart_disease and smoking_status. Combinations with 10 or
# fewer rows are dropped by the HAVING clause, and only the 10 combinations
# with the highest stroke rate are kept.
multi_factor_risk = spark.sql("""
SELECT 
    hypertension,
    heart_disease,
    smoking_status,
    COUNT(*) as total,
    SUM(CAST(stroke AS INT)) as stroke_cases,
    ROUND(SUM(CAST(stroke AS INT)) * 100.0 / COUNT(*), 2) as stroke_rate_percent
FROM stroke_data
GROUP BY hypertension, heart_disease, smoking_status
HAVING COUNT(*) > 10  -- Only show groups with sufficient data
ORDER BY stroke_rate_percent DESC
LIMIT 10
""")

In [55]:
multi_factor_risk.show()

+------------+-------------+---------------+-----+------------+-------------------+
|hypertension|heart_disease| smoking_status|total|stroke_cases|stroke_rate_percent|
+------------+-------------+---------------+-----+------------+-------------------+
|           1|            1|         smokes|   15|           5|              33.33|
|           0|            1|         smokes|   46|          10|              21.74|
|           1|            1|formerly smoked|   21|           4|              19.05|
|           0|            1|        Unknown|   43|           8|              18.60|
|           1|            1|   never smoked|   23|           4|              17.39|
|           1|            0|formerly smoked|   99|          15|              15.15|
|           1|            0|   never smoked|  207|          28|              13.53|
|           0|            1|formerly smoked|   56|           7|              12.50|
|           0|            1|   never smoked|   67|           8|             

In [57]:
# Data quality check for BMI: total rows, how many of them carry the string
# 'N/A' in bmi, and that count as a percentage of all rows (two decimals).
data_quality_metrics = spark.sql("""
SELECT 
    COUNT(*) as total_records,
    SUM(CASE WHEN bmi = 'N/A' THEN 1 ELSE 0 END) as missing_bmi,
    ROUND(SUM(CASE WHEN bmi = 'N/A' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as missing_bmi_percent
FROM stroke_data
""")

In [58]:
data_quality_metrics.show()

+-------------+-----------+-------------------+
|total_records|missing_bmi|missing_bmi_percent|
+-------------+-----------+-------------------+
|         5100|        200|               3.92|
+-------------+-----------+-------------------+



In [59]:
# Age vs glucose level correlation by stroke: Spark SQL CORR between age and
# avg_glucose_level (both cast to DOUBLE), computed separately for each stroke
# value and rounded to three decimals.
correlation_analysis = spark.sql("""
SELECT 
    stroke,
    ROUND(CORR(CAST(age AS DOUBLE), CAST(avg_glucose_level AS DOUBLE)), 3) as age_glucose_correlation
FROM stroke_data
GROUP BY stroke
""")

In [60]:
correlation_analysis.show()

+------+-----------------------+
|stroke|age_glucose_correlation|
+------+-----------------------+
|     0|                  0.223|
|     1|                   0.11|
+------+-----------------------+



In [61]:
# High-risk profile identification: number of rows that satisfy all three
# filters (age above 60, hypertension equal to 1, avg_glucose_level above 200)
# and that number as a percentage of all rows in stroke_data.
# NOTE(review): age is compared after CAST(... AS INT) while other cells cast
# it to DOUBLE; the column holds fractional values, so confirm that the integer
# cast is the intended comparison here.
high_risk_profiles = spark.sql("""
SELECT 
    COUNT(*) as high_risk_count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM stroke_data), 2) as percentage
FROM stroke_data
WHERE 
    CAST(age AS INT) > 60 
    AND CAST(hypertension AS INT) = 1 
    AND CAST(avg_glucose_level AS DOUBLE) > 200
""")

In [62]:
high_risk_profiles.show()

+---------------+----------+
|high_risk_count|percentage|
+---------------+----------+
|             66|      1.29|
+---------------+----------+



In [63]:
# Summary statistics for numerical columns: three single-row aggregates over
# stroke_data (age, avg_glucose_level, bmi) stacked with UNION ALL, giving one
# row per column with its min, max, mean and standard deviation rounded to two
# decimals. For bmi, the string 'N/A' is mapped to NULL so those rows are
# excluded from the statistics.
numerical_summary = spark.sql("""
SELECT 
    'age' as column_name,
    ROUND(MIN(CAST(age AS DOUBLE)), 2) as min_value,
    ROUND(MAX(CAST(age AS DOUBLE)), 2) as max_value,
    ROUND(AVG(CAST(age AS DOUBLE)), 2) as avg_value,
    ROUND(STDDEV(CAST(age AS DOUBLE)), 2) as std_value
FROM stroke_data
UNION ALL
SELECT 
    'avg_glucose_level' as column_name,
    ROUND(MIN(CAST(avg_glucose_level AS DOUBLE)), 2) as min_value,
    ROUND(MAX(CAST(avg_glucose_level AS DOUBLE)), 2) as max_value,
    ROUND(AVG(CAST(avg_glucose_level AS DOUBLE)), 2) as avg_value,
    ROUND(STDDEV(CAST(avg_glucose_level AS DOUBLE)), 2) as std_value
FROM stroke_data
UNION ALL
SELECT 
    'bmi' as column_name,
    ROUND(MIN(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as min_value,
    ROUND(MAX(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as max_value,
    ROUND(AVG(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as avg_value,
    ROUND(STDDEV(CASE WHEN bmi != 'N/A' THEN CAST(bmi AS DOUBLE) ELSE NULL END), 2) as std_value
FROM stroke_data
""")

In [64]:
numerical_summary.show()

+-----------------+---------+---------+---------+---------+
|      column_name|min_value|max_value|avg_value|std_value|
+-----------------+---------+---------+---------+---------+
|              age|     0.08|     82.0|    43.21|    22.61|
|avg_glucose_level|    55.12|   271.74|   106.16|    45.31|
|              bmi|     10.3|     97.6|    28.89|     7.85|
+-----------------+---------+---------+---------+---------+



In [65]:
# Window functions for patient ranking: score every row that has a BMI reading
# (bmi != 'N/A') with a fixed linear formula, rank the rows by that score, and
# keep the 10 best-ranked patients.
#   raw_score = age * 0.3 + avg_glucose_level * 0.01
#               + hypertension * 20 + heart_disease * 25
# The formula is evaluated once in the `scored` CTE so the displayed risk_score
# (rounded to two decimals) and risk_rank can never drift apart. The final sort
# uses risk_rank (computed on the unrounded score) with id as a tie-breaker:
# sorting on the rounded score let rows that only tie after rounding appear in
# an order that contradicts risk_rank and made the LIMIT 10 cut nondeterministic.
patient_risk_scores = spark.sql("""
WITH scored AS (
    SELECT
        id,
        age,
        avg_glucose_level,
        bmi,
        hypertension,
        heart_disease,
        stroke,
        CAST(age AS DOUBLE) * 0.3 +
        CAST(avg_glucose_level AS DOUBLE) * 0.01 +
        CAST(hypertension AS INT) * 20 +
        CAST(heart_disease AS INT) * 25 AS raw_score
    FROM stroke_data
    WHERE bmi != 'N/A'
)
SELECT
    id,
    age,
    avg_glucose_level,
    bmi,
    hypertension,
    heart_disease,
    stroke,
    ROUND(raw_score, 2) AS risk_score,
    RANK() OVER (ORDER BY raw_score DESC) AS risk_rank
FROM scored
ORDER BY risk_rank, id
LIMIT 10
""")

In [66]:
patient_risk_scores.show()

+-----+---+-----------------+----+------------+-------------+------+----------+---------+
|   id|age|avg_glucose_level| bmi|hypertension|heart_disease|stroke|risk_score|risk_rank|
+-----+---+-----------------+----+------------+-------------+------+----------+---------+
|20463| 81|           250.89|28.1|           1|            1|     1|     71.81|        1|
|67895| 82|           215.94|27.9|           1|            1|     1|     71.76|        2|
|65955| 81|           220.64|  30|           1|            1|     0|     71.51|        3|
|63836| 81|           217.94|24.1|           1|            1|     0|     71.48|        4|
|68627| 80|           175.29|31.5|           1|            1|     1|     70.75|        5|
|62791| 79|           205.23|  22|           1|            1|     0|     70.75|        6|
|28333| 79|           200.28|  30|           1|            1|     0|      70.7|        7|
|54353| 78|           227.16|41.7|           1|            1|     0|     70.67|        8|
|19271| 82

In [67]:
# Pivot table - stroke rate by age band and gender: for each gender, the
# percentage of rows with stroke = '1' among rows with age below 50
# (under_50_stroke_rate) and among rows with age 50 or above
# (over_50_stroke_rate). NULLIF turns an empty band's denominator into NULL,
# so a gender with no rows in a band yields NULL instead of dividing by zero.
age_gender_pivot = spark.sql("""
SELECT 
    gender,
    ROUND(
        SUM(CASE WHEN CAST(age AS INT) < 50 AND stroke = '1' THEN 1 ELSE 0 END) * 100.0 / 
        NULLIF(SUM(CASE WHEN CAST(age AS INT) < 50 THEN 1 ELSE 0 END), 0), 
        2
    ) as under_50_stroke_rate,
    ROUND(
        SUM(CASE WHEN CAST(age AS INT) >= 50 AND stroke = '1' THEN 1 ELSE 0 END) * 100.0 / 
        NULLIF(SUM(CASE WHEN CAST(age AS INT) >= 50 THEN 1 ELSE 0 END), 0), 
        2
    ) as over_50_stroke_rate
FROM stroke_data
GROUP BY gender
""")

In [68]:
age_gender_pivot.show()

+------+--------------------+-------------------+
|gender|under_50_stroke_rate|over_50_stroke_rate|
+------+--------------------+-------------------+
|Female|                0.82|              10.00|
|  Male|                0.51|              10.80|
| Other|                0.00|               NULL|
+------+--------------------+-------------------+



In [24]:
# PostgreSQL connection configuration.
# Every setting can be overridden through an environment variable so that real
# hosts and credentials do not have to be edited into the notebook; the
# literals are only fallbacks that keep the notebook behaving as before when
# the variables are not set.
# NOTE(review): the fallback password is still stored in the notebook -- rotate
# it and remove the default once POSTGRES_PASSWORD is provided by the
# environment.
postgres_host = os.environ.get("POSTGRES_HOST", "44.202.107.137")
postgres_port = os.environ.get("POSTGRES_PORT", "5432")
postgres_db = os.environ.get("POSTGRES_DB", "stroke_data_exploration")
postgres_user = os.environ.get("POSTGRES_USER", "analytics_user")
postgres_password = os.environ.get("POSTGRES_PASSWORD", "analytics_pass")

In [25]:
# JDBC connection URL plus the properties mapping (user, password, driver class)
# derived from the postgres_* settings.
postgres_url = f"jdbc:postgresql://{postgres_host}:{postgres_port}/{postgres_db}"
postgres_properties = dict(
    user=postgres_user,
    password=postgres_password,
    driver="org.postgresql.Driver",
)

In [27]:
def write_spark_df_to_postgresql(spark_df, table_name, mode="replace"):
    """
    Write a Spark DataFrame to a PostgreSQL table via pandas and SQLAlchemy.

    The DataFrame is converted with ``toPandas()`` and written with
    ``DataFrame.to_sql``. Connection settings are read from the notebook-level
    ``postgres_user``, ``postgres_password``, ``postgres_host``,
    ``postgres_port`` and ``postgres_db`` variables.

    Args:
        spark_df: Spark DataFrame to persist.
        table_name: Name of the destination table.
        mode: Forwarded to pandas as ``if_exists``; 'replace' or 'append'.

    Raises:
        Exception: Any failure is reported with ``print`` and then re-raised.
    """
    engine = None
    try:
        # Convert Spark DataFrame to Pandas
        pandas_df = spark_df.toPandas()

        # Create SQLAlchemy engine. User and password are percent-encoded so
        # that characters such as '@', ':' or '/' cannot corrupt the URL.
        from urllib.parse import quote_plus
        from sqlalchemy import create_engine
        engine = create_engine(
            f'postgresql://{quote_plus(postgres_user)}:{quote_plus(postgres_password)}'
            f'@{postgres_host}:{postgres_port}/{postgres_db}'
        )

        # Write Pandas DataFrame to PostgreSQL
        pandas_df.to_sql(table_name, engine, if_exists=mode, index=False)

        print(f"Successfully wrote {pandas_df.shape[0]} rows to {table_name}")
    except Exception as e:
        print(f"Error writing to {table_name}: {str(e)}")
        raise
    finally:
        # A fresh engine (and connection pool) is created on every call;
        # dispose of it so repeated calls do not accumulate open connections.
        if engine is not None:
            engine.dispose()

In [30]:
write_spark_df_to_postgresql(df, "stroke_raw_data", mode="replace")

Successfully wrote 5100 rows to stroke_raw_data


In [34]:
write_spark_df_to_postgresql(dataset_overview, "dataset_overview", mode="replace")

Successfully wrote 1 rows to dataset_overview


In [69]:
write_spark_df_to_postgresql(stroke_distribution, "stroke_distribution", mode="replace")

Successfully wrote 2 rows to stroke_distribution


In [70]:
write_spark_df_to_postgresql(gender_analysis, "gender_analysis", mode="replace")

Successfully wrote 3 rows to gender_analysis


In [71]:
write_spark_df_to_postgresql(age_group_analysis, "age_group_analysis", mode="replace")

Successfully wrote 4 rows to age_group_analysis


In [72]:
write_spark_df_to_postgresql(hypertension_heart_disease_analysis, "hypertension_heart_disease_analysis", mode="replace")

Successfully wrote 4 rows to hypertension_heart_disease_analysis


In [73]:
write_spark_df_to_postgresql(smoking_analysis, "smoking_analysis", mode="replace")

Successfully wrote 4 rows to smoking_analysis


In [74]:
write_spark_df_to_postgresql(work_type_analysis, "work_type_analysis", mode="replace")

Successfully wrote 5 rows to work_type_analysis


In [75]:
write_spark_df_to_postgresql(glucose_analysis, "glucose_analysis", mode="replace")

Successfully wrote 2 rows to glucose_analysis


In [76]:
write_spark_df_to_postgresql(bmi_analysis, "bmi_analysis", mode="replace")

Successfully wrote 2 rows to bmi_analysis


In [77]:
write_spark_df_to_postgresql(residence_analysis, "residence_analysis", mode="replace")

Successfully wrote 2 rows to residence_analysis


In [78]:
write_spark_df_to_postgresql(multi_factor_risk, "multi_factor_risk", mode="replace")

Successfully wrote 10 rows to multi_factor_risk


In [79]:
write_spark_df_to_postgresql(data_quality_metrics, "data_quality_metrics", mode="replace")

Successfully wrote 1 rows to data_quality_metrics


In [80]:
write_spark_df_to_postgresql(correlation_analysis, "correlation_analysis", mode="replace")

Successfully wrote 2 rows to correlation_analysis


In [81]:
write_spark_df_to_postgresql(high_risk_profiles, "high_risk_profiles", mode="replace")

Successfully wrote 1 rows to high_risk_profiles


In [82]:
write_spark_df_to_postgresql(numerical_summary, "numerical_summary", mode="replace")

Successfully wrote 3 rows to numerical_summary


In [None]:
write_spark_df_to_postgresql(patient_risk_scores, "patient_risk_scores", mode="replace")
# age_gender_pivot is the only analysis result in this notebook without a
# matching write; persist it as well so the exported tables are complete.
write_spark_df_to_postgresql(age_gender_pivot, "age_gender_pivot", mode="replace")