In [0]:
# Databricks Notebook
# ---------------------------------------------------------
# Notebook 01: Ingest Raw Source Files into Bronze Layer
# ---------------------------------------------------------

from pyspark.sql.functions import current_timestamp, input_file_name
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below. getOrCreate() returns the
# already-active session when one exists and only builds a new one otherwise
# (presumably the session the Databricks notebook already provides -- confirm).
spark = SparkSession.builder.getOrCreate()

# -----------------------------
# 1. File Paths
# -----------------------------
# Every location used by this notebook lives under a single DBFS project root,
# so relocating the project only requires changing this one value.
project_root = "dbfs:/FileStore/Mini_Project"

# Raw CSV inputs.
patient_src_path = f"{project_root}/Source_data/Patient_Source.csv"
insurance_src_path = f"{project_root}/Source_data/Insurance_Source.csv"

# Bronze-layer output directories (one per source).
bronze_patient_path = f"{project_root}/bronze/patient_bronze"
bronze_insurance_path = f"{project_root}/bronze/insurance_bronze"


In [0]:
from pyspark.sql.types import *

# Explicit column layout for the patient CSV, passed as `schema=` when the file
# is read. Arguments to add() are: column name, data type, nullable flag.
# NOTE(review): Spark may relax nullable=False to nullable for file-based reads;
# confirm with printSchema() before relying on these flags.
patient_schema = (
    StructType()
    .add("patient_id", StringType(), False)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("address", StringType(), True)
    .add("phone_number", StringType(), True)
    .add("bill_amount", DoubleType(), True)
    .add("insurance_provider", StringType(), True)
    .add("policy_id", StringType(), False)
)


In [0]:
from pyspark.sql.functions import current_timestamp, input_file_name

# -----------------------------
# 2. Read Raw Patient File
# -----------------------------
# The source location comes from `patient_src_path` (defined in the File Paths
# section) instead of a second hard-coded copy of the same string, so the path
# is maintained in one place -- the same way the insurance read uses
# `insurance_src_path`.
df_patient_raw = (
    spark.read.csv(
        patient_src_path,
        schema=patient_schema,   # explicit schema: no type inference pass
        header=True              # first CSV line holds column names
    )
    # Audit columns: when the row was loaded and which file it came from.
    .withColumn("ingest_time", current_timestamp())
    .withColumn("source_file", input_file_name())
)


In [0]:
# Show the column names and types of the patient DataFrame as read from the
# CSV, including the two audit columns (ingest_time, source_file) added on read.
print("Patient_RAW Schema:")
df_patient_raw.printSchema()

In [0]:
%sql
-- Remove any existing table definitions for both bronze tables so the
-- CREATE TABLE IF NOT EXISTS cells further down register them afresh.
-- The directories at the table locations are cleared separately with
-- dbutils.fs.rm before each write.
DROP TABLE IF EXISTS patient_bronze;
DROP TABLE IF EXISTS insurance_bronze;


In [0]:
# Discard rows that have no patient_id. The name is rebound, so the write step
# below only sees rows that carry a key.
df_patient_raw = df_patient_raw.dropna(subset=["patient_id"])


In [0]:
# -----------------------------
# 3. Write Patient Data to Bronze (Delta)
# -----------------------------
# Recursively delete whatever is currently stored at the target directory ...
dbutils.fs.rm(bronze_patient_path, True)

# ... then persist the filtered patient rows there in Delta format.
(
    df_patient_raw.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_patient_path)
)


In [0]:

# Register the Delta files written to the patient bronze directory as the table
# `patient_bronze`. LOCATION is taken from `bronze_patient_path` so the table
# always points at the directory the write step used, rather than at a
# separately maintained copy of the path.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS patient_bronze
    USING DELTA
    LOCATION '{bronze_patient_path}'
""")

# Printed whether the table was newly created or already existed (IF NOT EXISTS).
print("Patient Bronze Table Created Successfully.")

In [0]:
%sql
-- Enforce that every row of patient_bronze has a patient_id. Rows with a null
-- patient_id are dropped from the DataFrame before it is written.
ALTER TABLE patient_bronze
ALTER COLUMN patient_id SET NOT NULL

In [0]:
%sql
-- Both audit columns added at ingest (source_file, ingest_time) are required.
-- insurance_bronze constrains both of them; patient_bronze previously only
-- constrained source_file, so ingest_time is added here for parity.
ALTER TABLE patient_bronze
ALTER COLUMN source_file SET NOT NULL;

ALTER TABLE patient_bronze
ALTER COLUMN ingest_time SET NOT NULL

In [0]:
# -----------------------------
# 4. Read Raw Insurance File
# -----------------------------
# No explicit schema is supplied for this file: the first line provides the
# column names and the column types are inferred from the data.
df_insurance_raw = (
    spark.read.csv(
        insurance_src_path,
        header=True,
        inferSchema=True,
    )
    # Audit columns: when the row was loaded and which file it came from.
    .withColumn("ingest_time", current_timestamp())
    .withColumn("source_file", input_file_name())
)

# Show the inferred column types together with the two audit columns.
print("Insurance_RAW Schema:")
df_insurance_raw.printSchema()

In [0]:
# Discard rows that have no policy_id. The name is rebound, so the write step
# below only sees rows that carry a key.
df_insurance_raw = df_insurance_raw.dropna(subset=["policy_id"])

In [0]:
# -----------------------------
# 5. Write Insurance Data to Bronze (Delta)
# -----------------------------
# Recursively delete whatever is currently stored at the target directory ...
dbutils.fs.rm(bronze_insurance_path, True)

# ... then persist the filtered insurance rows there in Delta format.
(
    df_insurance_raw.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_insurance_path)
)


In [0]:
# Register the Delta files written to the insurance bronze directory as the
# table `insurance_bronze`. LOCATION is taken from `bronze_insurance_path` so
# the table always points at the directory the write step used, rather than at
# a separately maintained copy of the path.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS insurance_bronze
    USING DELTA
    LOCATION '{bronze_insurance_path}'
""")

# Printed whether the table was newly created or already existed (IF NOT EXISTS).
print("Insurance Bronze Table Created Successfully.")

In [0]:
%sql
-- Enforce that every row of insurance_bronze has a policy_id. Rows with a null
-- policy_id are dropped from the DataFrame before it is written.
ALTER TABLE insurance_bronze
ALTER COLUMN policy_id SET NOT NULL;

In [0]:
%sql
-- Require the ingest_time audit column, which is filled with
-- current_timestamp() when the CSV is read.
ALTER TABLE insurance_bronze
ALTER COLUMN ingest_time SET NOT NULL

In [0]:
%sql
-- Require the source_file audit column, which is filled with
-- input_file_name() when the CSV is read.
ALTER TABLE insurance_bronze
ALTER COLUMN source_file SET NOT NULL

In [0]:
# Preview the registered patient_bronze table; show() prints the first rows of
# every column.
# NOTE(review): the output includes name, address and phone_number values --
# consider clearing this cell's output before sharing the notebook.
spark.table("patient_bronze").show()

In [0]:
# Preview the registered insurance_bronze table; show() prints the first rows
# of every column.
spark.table("insurance_bronze").show()

In [0]:
# Print the insurance DataFrame schema once more. This is the same set of
# columns already printed right after the CSV read; the null-row filter applied
# since then does not change the schema.
# NOTE(review): this inspects the in-memory DataFrame, not the insurance_bronze
# table, so the NOT NULL constraints set via ALTER TABLE are presumably not
# reflected here -- confirm whether the table schema was intended instead.
df_insurance_raw.printSchema()
