In [0]:
# from pyspark.sql.functions import col
# from delta.tables import DeltaTable

# # # --- 1. Define Table Name ---
# # gold_audit_table_name = "ecommerce_audit.audit_schema.gold_price_audit_metrics"


# # --- 1. Define Table Name ---
# gold_audit_table_name = "ecommerce_audit.audit_schema.gold_price_audit_metrics"

# # --- 2. Read Data by Table Name (The reliable method) ---
# final_audit_df = spark.read.table(gold_audit_table_name)

# # --- 3. Inspect ALL Records (Ordered by the Bias Metric) ---
# print("Inspecting ALL Gold Records, Ordered by PVR:")
# (final_audit_df
#     .select(
#         "product_id",
#         "geo_cluster",
#         "avg_price",
#         "price_variance_ratio", # <-- THIS IS THE KEY COLUMN TO WATCH
#         "request_count"
#     )
#     .orderBy(col("price_variance_ratio").desc())
#     .display()
# )

# # --- 2. Force Cache Refresh (CRUCIAL FIX) ---
# # This forces the current session to update its metadata from the Unity Catalog.
# #spark.sql(f"REFRESH TABLE {gold_audit_table_name}") 
# # print(f"Refreshed table metadata for {gold_audit_table_name}.")

# # --- 3. Read Data by Table Name (Most Reliable) ---
# final_audit_df = spark.read.table(gold_audit_table_name)

# # --- 4. Display the highest-risk products ---
# print("Final Audit Metrics: Highest Risk & Variance")
# result_df = (
#     final_audit_df
#     .select(
#         "product_id",
#         "geo_cluster",
#         "avg_price",
#         "price_variance_ratio",
#         "audit_risk_flag"
#     )
#     .filter(col("audit_risk_flag") == "HIGH_RISK_AUDIT")
#     .orderBy(col("price_variance_ratio").desc())
# )

# # Check if records were found before displaying
# result_count = result_df.count()

# if result_count > 0:
#     print(f"✅ SUCCESS! {result_count} HIGH_RISK_AUDIT records found. The pipeline worked.")
#     result_df.display()
# else:
#     print("⚠️ No HIGH_RISK_AUDIT records found. Displaying ALL records to inspect PVR values.")
#     final_audit_df.display()

from pyspark.sql.functions import col

# --- Source: fully qualified name of the gold price-audit metrics table ---
gold_audit_table_name = "ecommerce_audit.audit_schema.gold_price_audit_metrics"

# --- Load the table through the catalog by name ---
final_audit_df = spark.read.table(gold_audit_table_name)

# --- Threshold: only rows with price_variance_ratio strictly above this are kept ---
PVR_THRESHOLD = 0.10

print(f"Final Audit Metrics: Filtering records with PVR > {PVR_THRESHOLD} to expose bias:")

# Columns shown in the report, in display order.
_report_columns = (
    "product_id",
    "geo_cluster",
    "avg_price",
    "price_variance_ratio",
    "request_count",
)
# One column expression reused for both the filter predicate and the sort key.
_pvr = col("price_variance_ratio")

# Project -> keep rows above the threshold -> largest ratio first.
result_df_filtered = (
    final_audit_df
    .select(*_report_columns)
    .where(_pvr > PVR_THRESHOLD)
    .orderBy(_pvr.desc())
)

# --- Verification: count the surviving rows, then report ---
result_count = result_df_filtered.count()

if result_count <= 0:
    # Nothing passed the threshold; point the reader back at the PVR calculation.
    print("❌ The PVR filter is still not capturing the bias. Please inspect the code for PVR calculation in Step 7.")
else:
    print(f"✅ SUCCESS! Found {result_count} records exposing the algorithmic bias.")
    print("The highest PVR (Price Variance Ratio) is now correctly identified:")
    result_df_filtered.display()

    print("\n--- Proceeding to Databricks SQL Dashboard ---")

Final Audit Metrics: Filtering records with PVR > 0.1 to expose bias:
✅ SUCCESS! Found 82 records exposing the algorithmic bias.
The highest PVR (Price Variance Ratio) is now correctly identified:


product_id,geo_cluster,avg_price,price_variance_ratio,request_count
PROD_0012,Standard_Region_B,195.76615384615383,0.6214592121468135,13
PROD_0013,Standard_Region_C,164.67,0.5290750761424431,9
PROD_0010,VIP_City_A,186.99428571428567,0.5016409420758869,7
PROD_0012,Standard_Region_C,277.5733333333333,0.4632737772965168,6
PROD_0011,Standard_Region_C,245.2628571428572,0.4528619072583021,7
PROD_0009,Standard_Region_B,198.7103571428572,0.4259930690468012,28
PROD_0001,Standard_Region_B,229.69,0.4094097440064179,6
PROD_0003,Standard_Region_C,223.92,0.4028895537671397,14
PROD_0009,Standard_Region_B,196.83555555555552,0.3920475292321801,9
PROD_0019,Standard_Region_C,225.0475,0.3866478248308868,4



--- Proceeding to Databricks SQL Dashboard ---
