In [0]:
# Databricks Notebook
# ---------------------------------------------------------
# Notebook 01: Ingest Raw Source Files into Bronze Layer
# ---------------------------------------------------------

from pyspark.sql.functions import current_timestamp, input_file_name
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below: getOrCreate() returns the
# active session when one exists and builds a new one otherwise.
# NOTE(review): Databricks notebooks normally pre-define `spark`, in which case
# this line simply rebinds the same session — confirm for the target runtime.
spark = SparkSession.builder.getOrCreate()

# -----------------------------
# 1. File Paths
# -----------------------------
# Patient feed: source CSV and the directory its bronze Delta data is saved to.
patient_src_path = "dbfs:/FileStore/Mini_Project/Source_data/Patient_Source.csv"
bronze_patient_path = "dbfs:/FileStore/Mini_Project/bronze/patient_bronze"

# Insurance feed: source CSV and the directory its bronze Delta data is saved to.
insurance_src_path = "dbfs:/FileStore/Mini_Project/Source_data/Insurance_Source.csv"
bronze_insurance_path = "dbfs:/FileStore/Mini_Project/bronze/insurance_bronze"


In [0]:
# -----------------------------
# 2. Read Raw Patient File
# -----------------------------
# Configure the reader: first CSV row supplies the column names and Spark
# infers each column's type from the data.
patient_reader = spark.read.options(header=True, inferSchema=True)
patient_csv = patient_reader.csv(patient_src_path)

# Attach the audit columns: load timestamp and originating file name.
df_patient_raw = (
    patient_csv
    .withColumn("ingest_time", current_timestamp())
    .withColumn("source_file", input_file_name())
)

print("Patient_RAW Schema:")
df_patient_raw.printSchema()

In [0]:
from pyspark.sql import functions as F
import re

def clean_col(name: str) -> str:
    """Return a sanitised, lower-case version of a column name.

    Leading and trailing whitespace is stripped first; then every remaining
    space, comma, semicolon, brace, parenthesis, newline, tab, or equals sign
    is replaced by a single underscore (one underscore per character, so runs
    are not collapsed).

    Args:
        name: The original column header.

    Returns:
        The cleaned column name in lower case.
    """
    return re.sub(r"[ ,;{}()\n\t=]", "_", name.strip()).lower()

# Rebuild both raw DataFrames with sanitised column names.
#
# The previous version re-read each CSV with only the header option, which
# discarded the inferred column types and the ingest_time / source_file audit
# columns before the patient data was written to bronze. Both sources now go
# through one helper so they are read, audited, and renamed the same way.
def _read_source_csv(path: str):
    """Read a headered CSV, add audit columns, and sanitise its column names.

    Args:
        path: Location of the CSV file to load.

    Returns:
        A DataFrame with inferred column types, an ``ingest_time`` column
        (current timestamp), a ``source_file`` column (input file name), and
        every column name passed through ``clean_col``.
    """
    raw = (
        spark.read
             .option("header", True)
             .option("inferSchema", True)
             .csv(path)
             .withColumn("ingest_time", current_timestamp())
             .withColumn("source_file", input_file_name())
    )
    # toDF renames positionally, so the column order of `raw` is preserved.
    return raw.toDF(*[clean_col(c) for c in raw.columns])


df_patient_raw = _read_source_csv(patient_src_path)
df_insurance_raw = _read_source_csv(insurance_src_path)


In [0]:
%sql
-- Drop both bronze table definitions (if present) so the later
-- CREATE TABLE IF NOT EXISTS statements register them again on this run.
DROP TABLE IF EXISTS patient_bronze;
DROP TABLE IF EXISTS insurance_bronze;


In [0]:
# Recursively remove any previous bronze output before the write cells run.
# The directories are taken from the path variables defined in the File Paths
# section (instead of repeating the literals) so this cleanup always targets
# the same locations that the Delta writes save to.
for bronze_dir in (bronze_patient_path, bronze_insurance_path):
    dbutils.fs.rm(bronze_dir, recurse=True)

In [0]:
# -----------------------------
# 3. Write Patient Data to Bronze (Delta)
# -----------------------------
# Save the patient DataFrame as Delta files, replacing anything at the target.
df_patient_raw.write.mode("overwrite").format("delta").save(bronze_patient_path)

# Register the table over the directory just written. LOCATION is interpolated
# from bronze_patient_path so the table cannot point somewhere other than the
# path used by save() above.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS patient_bronze
    USING DELTA
    LOCATION '{bronze_patient_path}'
""")

print("Patient Bronze Table Created Successfully.")

In [0]:
# -----------------------------
# 4. Read Raw Insurance File
# -----------------------------
# Load the insurance CSV with header-derived names and inferred types, then
# add the load timestamp and originating file name as audit columns.
df_insurance_raw = (
    spark.read
         .option("header", True)
         .option("inferSchema", True)
         .csv(insurance_src_path)
         .withColumn("ingest_time", current_timestamp())
         .withColumn("source_file", input_file_name())
)

# This read replaces the DataFrame whose columns were cleaned earlier, so the
# names must be sanitised again here; otherwise the insurance bronze table is
# written with the raw CSV headers while the patient table uses cleaned ones.
df_insurance_raw = df_insurance_raw.toDF(
    *[clean_col(c) for c in df_insurance_raw.columns]
)

print("Insurance_RAW Schema:")
df_insurance_raw.printSchema()

In [0]:
# -----------------------------
# 5. Write Insurance Data to Bronze (Delta)
# -----------------------------
# Save the insurance DataFrame as Delta files, replacing anything at the target.
df_insurance_raw.write.mode("overwrite").format("delta").save(bronze_insurance_path)

# Register the table over the directory just written. LOCATION is interpolated
# from bronze_insurance_path so the table cannot point somewhere other than
# the path used by save() above.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS insurance_bronze
    USING DELTA
    LOCATION '{bronze_insurance_path}'
""")

print("Insurance Bronze Table Created Successfully.")

In [0]:
# Query the registered patient bronze table and print a preview of its rows.
patient_bronze_preview = spark.sql("""
    select * from patient_bronze
""")
patient_bronze_preview.show()

In [0]:
# Query the registered insurance bronze table and print a preview of its rows.
insurance_bronze_preview = spark.sql("""
    select * from insurance_bronze
""")
insurance_bronze_preview.show()

In [0]:
# Print the schema of the in-memory insurance DataFrame (the object that was
# written to bronze), not of the registered insurance_bronze table.
df_insurance_raw.printSchema()
