In [None]:

import pandas as pd
import random
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Module-level Faker instance; generate_patients() calls fake.uuid4() on it for Patient_ID values
fake = Faker()

In [None]:
# Generate synthetic patient data
def generate_patients(n=10000, seed=None):
    """Build a pandas DataFrame of ``n`` synthetic patient records.

    Each record holds a Faker-generated ``Patient_ID`` plus randomly drawn
    ``Age`` (int, 18-90), ``BP`` (int, 90-180), ``Sugar_Level`` (70-200),
    ``Cholesterol`` (100-300) and ``Haemoglobin`` (9-18). The three float
    fields are rounded to one decimal place.

    Args:
        n: Number of rows to generate. Values <= 0 yield an empty frame that
            still carries the full column schema.
        seed: Optional seed for reproducible output. When given, the numeric
            fields are drawn from a private ``random.Random(seed)`` and the
            module-level ``fake`` instance is re-seeded with
            ``fake.seed_instance(seed)`` (a side effect on that shared
            instance). When ``None`` (the default), the global ``random``
            state and the current state of ``fake`` are used.

    Returns:
        pandas.DataFrame with the columns Patient_ID, Age, BP, Sugar_Level,
        Cholesterol, Haemoglobin, in that order.
    """
    columns = ["Patient_ID", "Age", "BP", "Sugar_Level", "Cholesterol", "Haemoglobin"]

    if seed is None:
        # Keep drawing from the global generator so callers that seeded
        # `random` themselves see the same sequence as before.
        rng = random
    else:
        rng = random.Random(seed)
        fake.seed_instance(seed)

    data = [
        {
            "Patient_ID": fake.uuid4(),
            "Age": rng.randint(18, 90),
            "BP": rng.randint(90, 180),
            "Sugar_Level": round(rng.uniform(70, 200), 1),
            "Cholesterol": round(rng.uniform(100, 300), 1),
            "Haemoglobin": round(rng.uniform(9, 18), 1),
        }
        for _ in range(n)
    ]
    # Explicit columns keep the schema (and the CSV header) when `data` is empty.
    return pd.DataFrame(data, columns=columns)

# Build the synthetic patient table with the default row count and write it to
# patients.csv (relative path); index=False keeps the pandas row index out of the file.
df = generate_patients()
df.to_csv("patients.csv", index=False)



In [None]:
# Obtain (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("HealthMonitoring")
    .getOrCreate()
)

# Read the generated CSV into Spark: first row is the header, column types are inferred
spark_df = spark.read.options(header=True, inferSchema=True).csv("patients.csv")

# Column-wise means of the four measurements, one aliased output column each
stats_df = spark_df.agg(
    *(
        avg(col(source)).alias(alias)
        for source, alias in (
            ("BP", "Avg_BP"),
            ("Sugar_Level", "Avg_Sugar"),
            ("Cholesterol", "Avg_Cholesterol"),
            ("Haemoglobin", "Avg_Haemoglobin"),
        )
    )
)



In [None]:
# Print the aggregated averages to the cell output
stats_df.show()

# Convert the aggregated result to a pandas DataFrame and write it to
# health_statistics.csv (relative path), without the pandas row index
stats_df.toPandas().to_csv("health_statistics.csv", index=False)

print("Data processing complete. Statistics saved.")
