In [0]:
%sql
-- Set hive_metastore as the current catalog; later cells reference tables without a catalog prefix.
USE CATALOG hive_metastore;


In [0]:
%run ./Logging_File


In [0]:
%sql
-- List the tables registered in the mini_proj_logs schema.
SHOW TABLES IN mini_proj_logs;

In [0]:
%sql
-- Show the table details (DESCRIBE DETAIL) of mini_proj_logs.etl_log.
DESCRIBE DETAIL mini_proj_logs.etl_log;


In [0]:

from pyspark.sql.functions import current_timestamp, input_file_name
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Storage locations of the two bronze-layer datasets written by this notebook.
_bronze_root = "s3a://mini-project-sathwik/bronze"
bronze_patient_path = f"{_bronze_root}/patient_bronze"
bronze_insurance_path = f"{_bronze_root}/insurance_bronze"


In [0]:
from pyspark.sql.types import *

# Explicit schema applied when reading the raw patient CSV.
# Every column is read as a string except bill_amount, which is read as a double.
patient_schema = (
    StructType()
    .add("patient_id", StringType(), False)
    .add("name", StringType(), True)
    .add("age", StringType(), True)
    .add("address", StringType(), True)
    .add("phone_number", StringType(), True)
    .add("bill_amount", DoubleType(), True)
    .add("insurance_provider", StringType(), True)
    .add("policy_id", StringType(), False)
    .add("Act_Ind", StringType(), False)
)


In [0]:
from pyspark.sql.functions import (
    current_timestamp,
    input_file_name,
    monotonically_increasing_id
)
from datetime import date

# Load the most recently modified raw patient file into df_patient_raw and
# record the outcome (SUCCESS or FAILED) through log_etl.
batch_id = date.today().strftime("%Y-%m-%d")

pipeline_name = "Patient Bronze Load"
layer = "bronze"

patient_src_path = "s3a://mini-project-sathwik/raw/patient/"

# Notebook variables outlive the cell that set them, so reset file_name on
# every run. Otherwise a failure before the file is resolved would be logged
# against whatever file name an earlier cell or run left behind.
file_name = "UNKNOWN"

try:
    # 1. Find latest file dynamically.
    #    Directory entries are skipped so that exactly one file is read, and an
    #    empty folder is reported explicitly instead of as an IndexError.
    files = [f for f in dbutils.fs.ls(patient_src_path) if not f.isDir()]
    if not files:
        raise Exception(f"No source files found in {patient_src_path}")

    latest_file = max(files, key=lambda f: f.modificationTime).path
    file_name = latest_file.split("/")[-1]

    # 2. Read ONLY the latest file and tag each row with load metadata
    #    (ingestion timestamp, originating file, generated row id).
    df_patient_raw = (
        spark.read
        .schema(patient_schema)
        .option("header", True)
        .csv(latest_file)
        .withColumn("ingest_time", current_timestamp())
        .withColumn("source_file", input_file_name())
        .withColumn("row_sequence", monotonically_increasing_id())
    )

    # 3. Safety check: an empty file is treated as a failed load.
    row_count = df_patient_raw.count()
    if row_count == 0:
        raise Exception("Latest source file contains no records")

    # 4. Log success.
    #    NOTE: this is logged once the read has been validated; the Delta
    #    write itself happens in a later cell.
    log_etl(
        pipeline_name=pipeline_name,
        batch_id=batch_id,
        file_name=file_name,
        layer=layer,
        status="SUCCESS",
        row_count=row_count,
        message=f"{pipeline_name} completed successfully for {file_name}"
    )

except Exception as e:
    # Record the failure, then re-raise so the notebook run stops at this cell.
    log_etl(
        pipeline_name=pipeline_name,
        batch_id=batch_id,
        file_name=file_name,
        layer=layer,
        status="FAILED",
        row_count=0,
        message=f"Error processing patient file: {str(e)}"
    )
    raise


In [0]:
# Print the column names and types of the raw patient DataFrame.
print("Patient_RAW Schema:")
df_patient_raw.printSchema()

In [0]:
# Inspect the distinct raw values of the age column (read as a string), up to 100 of them.
distinct_patient_ages = df_patient_raw.select("age").distinct()
distinct_patient_ages.show(100, truncate=False)


In [0]:
%sql
-- Remove any existing metastore entries for the bronze tables; they are re-created further down.
DROP TABLE IF EXISTS patient_bronze;
DROP TABLE IF EXISTS insurance_bronze;


In [0]:
# -----------------------------
# 3. Write Patient Data to Bronze (Delta)
# -----------------------------
# The target directory is deliberately NOT deleted first. Delta's overwrite
# mode replaces the table contents in a single commit, whereas delete-then-write
# leaves no bronze data at all if the write fails, and throws away the table
# history. overwriteSchema lets the overwrite also replace the stored schema,
# which the delete-then-write approach achieved implicitly.
(
    df_patient_raw.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(bronze_patient_path)
)


In [0]:

# Register the Delta files as the patient_bronze table.
# The LOCATION is taken from bronze_patient_path (the path the data is written
# to) rather than repeated as a literal, so the two cannot drift apart.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS patient_bronze
    USING DELTA
    LOCATION '{bronze_patient_path}'
""")

print("Patient Bronze Table Created Successfully.")

In [0]:
%sql
-- Add a NOT NULL constraint on patient_id in the bronze table.
ALTER TABLE patient_bronze
ALTER COLUMN patient_id SET NOT NULL

In [0]:
%sql
-- Add a NOT NULL constraint on source_file (the originating file recorded at load time).
ALTER TABLE patient_bronze
ALTER COLUMN source_file SET NOT NULL

In [0]:
from datetime import date
from pyspark.sql.types import (
    StructType, StructField,
    StringType, DoubleType
)
from pyspark.sql.functions import (
    current_timestamp,
    input_file_name,
    monotonically_increasing_id
)

# Load the most recently modified raw insurance file into df_insurance_raw and
# record the outcome (SUCCESS or FAILED) through log_etl.
batch_id = date.today().strftime("%Y-%m-%d")

pipeline_name = "Insurance Bronze Load"
layer = "bronze"

# S3 source path
insurance_src_path = "s3a://mini-project-sathwik/raw/insurance/"

# Notebook variables outlive the cell that set them, so reset file_name on
# every run. Otherwise a failure before the file is resolved would be logged
# against a file name left behind by an earlier cell (e.g. the patient load).
file_name = "UNKNOWN"

try:
    # 1. Find latest file dynamically.
    #    Directory entries are skipped so that exactly one file is read, and an
    #    empty folder is reported explicitly instead of as an IndexError.
    files = [f for f in dbutils.fs.ls(insurance_src_path) if not f.isDir()]
    if not files:
        raise Exception(f"No source files found in {insurance_src_path}")

    latest_file = max(files, key=lambda f: f.modificationTime).path
    file_name = latest_file.split("/")[-1]

    # 2. Define schema
    insurance_schema = StructType([
        StructField("policy_id", StringType(), True),
        StructField("insurance_provider", StringType(), True),
        StructField("amount_covered", DoubleType(), True),
        StructField("claim_status", StringType(), True),
        StructField("Act_Ind", StringType(), False)
    ])

    # 3. Read ONLY the latest file and tag each row with load metadata
    #    (ingestion timestamp, originating file, generated row id).
    df_insurance_raw = (
        spark.read
        .option("header", True)
        .schema(insurance_schema)
        .csv(latest_file)
        .withColumn("ingest_time", current_timestamp())
        .withColumn("source_file", input_file_name())
        .withColumn("row_sequence", monotonically_increasing_id())
    )

    # 4. Safety check: an empty file is treated as a failed load.
    row_count = df_insurance_raw.count()
    if row_count == 0:
        raise Exception("Latest insurance source file contains no records")

    # 5. Log success.
    #    NOTE: this is logged once the read has been validated; the Delta
    #    write itself happens in a later cell.
    log_etl(
        pipeline_name=pipeline_name,
        batch_id=batch_id,
        file_name=file_name,
        layer=layer,
        status="SUCCESS",
        row_count=row_count,
        message=f"{pipeline_name} completed successfully for {file_name}"
    )

except Exception as e:
    # Record the failure, then re-raise so the notebook run stops at this cell.
    log_etl(
        pipeline_name=pipeline_name,
        batch_id=batch_id,
        file_name=file_name,
        layer=layer,
        status="FAILED",
        row_count=0,
        message=f"Error processing insurance file: {str(e)}"
    )
    raise


In [0]:
# Print the column names and types of the raw insurance DataFrame.
print("Insurance_RAW Schema:")
df_insurance_raw.printSchema()

In [0]:
# -----------------------------
# 5. Write Insurance Data to Bronze (Delta)
# -----------------------------
# The target directory is deliberately NOT deleted first. Delta's overwrite
# mode replaces the table contents in a single commit, whereas delete-then-write
# leaves no bronze data at all if the write fails, and throws away the table
# history. overwriteSchema lets the overwrite also replace the stored schema,
# which the delete-then-write approach achieved implicitly.
(
    df_insurance_raw.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(bronze_insurance_path)
)


In [0]:
# Register the Delta files as the insurance_bronze table.
# The LOCATION is taken from bronze_insurance_path (the path the data is written
# to) rather than repeated as a literal, so the two cannot drift apart.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS insurance_bronze
    USING DELTA
    LOCATION '{bronze_insurance_path}'
""")

print("Insurance Bronze Table Created Successfully.")

In [0]:
%sql
-- Add a NOT NULL constraint on policy_id in the bronze table.
ALTER TABLE insurance_bronze
ALTER COLUMN policy_id SET NOT NULL;

In [0]:
%sql
-- Add a NOT NULL constraint on ingest_time (the load timestamp added when the file was read).
ALTER TABLE insurance_bronze
ALTER COLUMN ingest_time SET NOT NULL

In [0]:
%sql
-- Add a NOT NULL constraint on source_file (the originating file recorded at load time).
ALTER TABLE insurance_bronze
ALTER COLUMN source_file SET NOT NULL

In [0]:
# Preview the first rows of the registered patient_bronze table.
patient_bronze_rows = spark.sql("""
    select * from patient_bronze
""")
patient_bronze_rows.show()

In [0]:
# Preview the first rows of the registered insurance_bronze table.
insurance_bronze_rows = spark.sql("""
    select * from insurance_bronze
""")
insurance_bronze_rows.show()

In [0]:
# Print the schema of the raw insurance DataFrame again (same call as the earlier schema cell).
df_insurance_raw.printSchema()


In [0]:
%sql
-- Schema that holds the data-quality metrics tables created below.
CREATE DATABASE IF NOT EXISTS mini_proj_metrics;

In [0]:
%sql
-- Metrics table for insurance_bronze: a row count plus one NULL count per checked column.
-- The INSERT that fills this table supplies its values by position, so the column
-- order declared here must stay in step with that INSERT's SELECT list.
CREATE TABLE IF NOT EXISTS mini_proj_metrics.dq_insurance_bronze_metrics (
    batch_id STRING,
    layer STRING,
    table_name STRING,
    total_rows BIGINT,
    null_policy_id BIGINT,
    null_insurance_provider BIGINT,
    null_amount_covered BIGINT,
    null_claim_status BIGINT,
    load_timestamp TIMESTAMP
)
USING DELTA;


In [0]:
%sql
-- Append one data-quality snapshot for insurance_bronze: total row count plus the
-- number of NULLs in each checked column.
-- Values are inserted by position, so the SELECT order must match the column order
-- of mini_proj_metrics.dq_insurance_bronze_metrics.
INSERT INTO mini_proj_metrics.dq_insurance_bronze_metrics
SELECT
    -- Today's date as a 'yyyy-MM-dd' string, matching the batch_id format used by the
    -- load cells (date_add(current_date(), 1) stamped the row with tomorrow's date).
    date_format(current_date(), 'yyyy-MM-dd')                   AS batch_id,
    'bronze'                                                    AS layer,
    'insurance_bronze'                                          AS table_name,
    COUNT(*)                                                    AS total_rows,
    SUM(CASE WHEN policy_id IS NULL THEN 1 ELSE 0 END)          AS null_policy_id,
    SUM(CASE WHEN insurance_provider IS NULL THEN 1 ELSE 0 END) AS null_insurance_provider,
    SUM(CASE WHEN amount_covered IS NULL THEN 1 ELSE 0 END)     AS null_amount_covered,
    SUM(CASE WHEN claim_status IS NULL THEN 1 ELSE 0 END)       AS null_claim_status,
    current_timestamp()                                         AS load_timestamp
FROM insurance_bronze;


In [0]:
%sql
-- Metrics table for patient_bronze: a row count plus one NULL count per checked column.
-- The INSERT that fills this table supplies its values by position, so the column
-- order declared here must stay in step with that INSERT's SELECT list.
CREATE TABLE IF NOT EXISTS mini_proj_metrics.dq_patient_bronze_metrics (
    batch_id STRING,
    table_name STRING,
    total_rows BIGINT,
    null_patient_id BIGINT,
    null_name BIGINT,
    null_address BIGINT,
    null_phone_number BIGINT,
    null_policy_id BIGINT,
    null_age BIGINT,
    null_bill_amount BIGINT,
    load_time TIMESTAMP
)
USING DELTA;


In [0]:
%sql
-- Append one data-quality snapshot for patient_bronze: total row count plus the
-- number of NULLs in each checked column.
-- Values are inserted by position, so the SELECT order must match the column order
-- of mini_proj_metrics.dq_patient_bronze_metrics.
INSERT INTO mini_proj_metrics.dq_patient_bronze_metrics
SELECT
    -- Today's date as a 'yyyy-MM-dd' string, matching the batch_id format used by the
    -- load cells (date_add(current_date(), 1) stamped the row with tomorrow's date).
    date_format(current_date(), 'yyyy-MM-dd')             AS batch_id,
    'patient_bronze'                                      AS table_name,
    COUNT(*)                                              AS total_rows,
    SUM(CASE WHEN patient_id IS NULL THEN 1 ELSE 0 END)   AS null_patient_id,
    SUM(CASE WHEN name IS NULL THEN 1 ELSE 0 END)         AS null_name,
    SUM(CASE WHEN address IS NULL THEN 1 ELSE 0 END)      AS null_address,
    SUM(CASE WHEN phone_number IS NULL THEN 1 ELSE 0 END) AS null_phone_number,
    SUM(CASE WHEN policy_id IS NULL THEN 1 ELSE 0 END)    AS null_policy_id,
    SUM(CASE WHEN age IS NULL THEN 1 ELSE 0 END)          AS null_age,
    SUM(CASE WHEN bill_amount IS NULL THEN 1 ELSE 0 END)  AS null_bill_amount,
    current_timestamp()                                   AS load_time
FROM patient_bronze;


In [0]:
%sql
-- Review the patient data-quality snapshots recorded so far.
select * from mini_proj_metrics.dq_patient_bronze_metrics;

In [0]:
%sql
-- Show which catalog and database unqualified table names currently resolve against.
SELECT current_catalog(), current_database();


In [0]:
%sql
-- List the tables registered in hive_metastore.mini_proj_metrics.
SHOW TABLES IN hive_metastore.mini_proj_metrics;


In [0]:
%sql
-- Switch the current database to mini_proj_logs.
use mini_proj_logs

In [0]:
%python
df = spark.table("mini_proj_logs.etl_log")

df.write.format("delta").mode("overwrite").saveAsTable("mini_proj_logs.etl_log_accessible")


In [0]:
%sql
-- Select hive_metastore as the current catalog again before the bronze copies below.
use catalog hive_metastore

In [0]:
%sql
-- Schema that receives the copies of the bronze tables written below.
create database if not exists mini_proj_bronze;

In [0]:
%python
df=spark.table("patient_bronze")
df.write.format("delta").mode("overwrite").saveAsTable("mini_proj_bronze.patient_bronze")

In [0]:
%python
df=spark.table("insurance_bronze")
df.write.format("delta").mode("overwrite").saveAsTable("mini_proj_bronze.insurance_bronze")