### **Lakehouse Catalog and Schema Initialization**

In [0]:
%sql
-- Create the top-level catalog used by every fully qualified name in this notebook.
-- IF NOT EXISTS makes the statement a no-op when the catalog is already present,
-- so the cell can be re-run safely.
CREATE CATALOG IF NOT EXISTS knowledgehub_lakehouse;

-- Create the schemas inside the catalog (again idempotent via IF NOT EXISTS).
-- In the visible cells: `reference` receives the reference tables and reports,
-- `bronze` and `silver` are only queried, and `gold` / `quarantine` are created
-- here but not otherwise used.
-- NOTE(review): the load cells read CSV files from
-- /Volumes/knowledgehub_lakehouse/reference/raw/ ; this cell does not create
-- that volume, so it presumably has to exist beforehand -- confirm.
CREATE SCHEMA IF NOT EXISTS knowledgehub_lakehouse.reference;
CREATE SCHEMA IF NOT EXISTS knowledgehub_lakehouse.bronze;
CREATE SCHEMA IF NOT EXISTS knowledgehub_lakehouse.silver;
CREATE SCHEMA IF NOT EXISTS knowledgehub_lakehouse.gold;
CREATE SCHEMA IF NOT EXISTS knowledgehub_lakehouse.quarantine;


### **Loading reference CSV files into Delta tables**

In [0]:

from pyspark.sql import functions as F

# Catalog and schema that hold the reference tables written by this notebook.
CATALOG = "knowledgehub_lakehouse"
SCHEMA_REFERENCE = "reference"

# Folder holding the raw reference CSV files. It is derived from the constants
# above (instead of repeating the catalog/schema names) so the path cannot
# drift away from them; the resulting value is
# "/Volumes/knowledgehub_lakehouse/reference/raw/".
REF_PATH = f"/Volumes/{CATALOG}/{SCHEMA_REFERENCE}/raw/"

# Full paths of the individual reference files read by the load cells.
users_path = REF_PATH + "users.csv"
dept_path  = REF_PATH + "departments.csv"
rules_path = REF_PATH + "doc_type_rules.csv"


In [0]:
# Load the users reference CSV into a Delta table.
#   header=true       -> the first CSV row supplies the column names
#   inferSchema=true  -> column types are inferred from the file contents
#   load_ts           -> timestamp of this load
#   source_file       -> path of the file each row was read from
#                        (taken from the reader's _metadata.file_path field)
df_users = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(users_path)
    .withColumn("load_ts", F.current_timestamp())
    .withColumn("source_file", F.expr("_metadata.file_path"))
)

# mode("overwrite") replaces the table contents on every run. The table name is
# built from CATALOG / SCHEMA_REFERENCE so it stays in sync with the config
# cell; it resolves to "knowledgehub_lakehouse.reference.users".
df_users.write.mode("overwrite").format("delta").saveAsTable(
    f"{CATALOG}.{SCHEMA_REFERENCE}.users"
)


In [0]:
# Load the departments reference CSV into a Delta table.
#   header=true       -> the first CSV row supplies the column names
#   inferSchema=true  -> column types are inferred from the file contents
#   load_ts           -> timestamp of this load
#   source_file       -> path of the file each row was read from
#                        (taken from the reader's _metadata.file_path field)
df_dept = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(dept_path)
    .withColumn("load_ts", F.current_timestamp())
    .withColumn("source_file", F.expr("_metadata.file_path"))
)

# mode("overwrite") replaces the table contents on every run. The table name is
# built from CATALOG / SCHEMA_REFERENCE so it stays in sync with the config
# cell; it resolves to "knowledgehub_lakehouse.reference.departments".
df_dept.write.mode("overwrite").format("delta").saveAsTable(
    f"{CATALOG}.{SCHEMA_REFERENCE}.departments"
)




In [0]:
# Load the doc_type_rules reference CSV into a Delta table.
#   header=true       -> the first CSV row supplies the column names
#   inferSchema=true  -> column types are inferred from the file contents
#   load_ts           -> timestamp of this load
#   source_file       -> path of the file each row was read from
#                        (taken from the reader's _metadata.file_path field)
df_rules = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(rules_path)
    .withColumn("load_ts", F.current_timestamp())
    .withColumn("source_file", F.expr("_metadata.file_path"))
)

# mode("overwrite") replaces the table contents on every run. The table name is
# built from CATALOG / SCHEMA_REFERENCE so it stays in sync with the config
# cell; it resolves to "knowledgehub_lakehouse.reference.doc_type_rules".
df_rules.write.mode("overwrite").format("delta").saveAsTable(
    f"{CATALOG}.{SCHEMA_REFERENCE}.doc_type_rules"
)




In [0]:
# Fully qualified names of the reference tables checked below. They are built
# once from the config constants and reused by every check, so the checks
# always read the tables the load cells wrote.
users_tbl = f"{CATALOG}.{SCHEMA_REFERENCE}.users"
dept_tbl  = f"{CATALOG}.{SCHEMA_REFERENCE}.departments"
rules_tbl = f"{CATALOG}.{SCHEMA_REFERENCE}.doc_type_rules"

# Number of user_id values that occur on more than one row.
dup_users = (
    spark.table(users_tbl)
    .groupBy("user_id").count()
    .filter("count > 1")
    .count()
)

# Number of distinct `department` values in the departments table. This is a
# cardinality metric only; no comparison against other tables is done here.
dept_distinct = spark.table(dept_tbl).select("department").distinct().count()

# Number of doc_type_rules rows whose mandatory_fields value is NULL
# (empty strings are not counted).
rules_null_mandatory = (
    spark.table(rules_tbl)
    .filter(F.col("mandatory_fields").isNull())
    .count()
)

# One (dataset, check_name, metric_value) row per check.
quality_rows = [
    ("users", "duplicate_user_id_count", int(dup_users)),
    ("departments", "distinct_departments", int(dept_distinct)),
    ("doc_type_rules", "null_mandatory_fields_count", int(rules_null_mandatory))
]

# report_ts records when this report was produced.
df_quality = spark.createDataFrame(quality_rows, ["dataset", "check_name", "metric_value"]) \
                  .withColumn("report_ts", F.current_timestamp())

# mode("overwrite"): the report table only ever holds the latest run.
df_quality.write.mode("overwrite").format("delta") \
    .saveAsTable(f"{CATALOG}.{SCHEMA_REFERENCE}.reference_quality_report")

display(df_quality)


In [0]:
# Show each distinct `department` value from the reference table for a visual check.
display(
    spark.table("knowledgehub_lakehouse.reference.departments")
    .select("department")
    .distinct()
)

In [0]:
%sql
-- Total number of rows currently in the bronze documents table.
SELECT
  COUNT(*) AS docs_rows
FROM
  knowledgehub_lakehouse.bronze.docs_raw;


In [0]:
%sql
-- Total number of rows currently in the bronze access-events table.
SELECT
  COUNT(*) AS events_rows
FROM
  knowledgehub_lakehouse.bronze.access_events_raw;

In [0]:
%sql
-- List the columns and data types of the bronze documents table.
DESC TABLE knowledgehub_lakehouse.bronze.docs_raw;


In [0]:
%sql
-- Build a validation summary of the bronze layer: one row per source table.
--
-- Column names of the result come from the FIRST branch of the UNION ALL, so
-- for the 'access_events_raw' row:
--   null_doc_id     holds the count of rows with a NULL event_id
--   empty_doc_text  holds the count of rows with a NULL user_id
-- (the aliases in the second branch are kept for readability only).
--
-- COUNT_IF is used instead of SUM(CASE WHEN ... THEN 1 ELSE 0 END) so that an
-- empty source table reports 0 rather than NULL for every metric.
CREATE OR REPLACE TABLE knowledgehub_lakehouse.reference.day1_bronze_validation_report AS
SELECT
  'docs_raw' AS table_name,
  COUNT(*) AS total_rows,
  COUNT(DISTINCT input_batch) AS distinct_files,
  COUNT_IF(doc_id IS NULL) AS null_doc_id,
  -- NULL and whitespace-only text are both treated as empty.
  COUNT_IF(doc_text IS NULL OR TRIM(doc_text) = '') AS empty_doc_text,
  -- Rows in which these two columns are populated.
  COUNT_IF(review_due_date IS NOT NULL) AS drift_review_due_date_rows,
  COUNT_IF(policy_region IS NOT NULL) AS drift_policy_region_rows,
  current_timestamp() AS report_ts
FROM knowledgehub_lakehouse.bronze.docs_raw
UNION ALL
SELECT
  'access_events_raw' AS table_name,
  COUNT(*) AS total_rows,
  COUNT(DISTINCT input_batch) AS distinct_files,
  COUNT_IF(event_id IS NULL) AS null_event_id,
  COUNT_IF(user_id IS NULL) AS null_user_id,
  -- The two drift metrics are not computed for this table; fixed at 0.
  0 AS drift_review_due_date_rows,
  0 AS drift_policy_region_rows,
  current_timestamp() AS report_ts
FROM knowledgehub_lakehouse.bronze.access_events_raw;


In [0]:
# For every column of bronze.docs_raw, count how many distinct values occur on
# more than one row. Each column triggers its own Spark aggregation.
df_docs = spark.table("knowledgehub_lakehouse.bronze.docs_raw")

duplicate_counts = []
for column_name in df_docs.columns:
    repeated_values = df_docs.groupBy(column_name).count().where("count > 1")
    duplicate_counts.append((column_name, repeated_values.count()))

df_duplicates = spark.createDataFrame(
    duplicate_counts,
    ["column_name", "duplicate_count"],
)
display(df_duplicates)

In [0]:
%sql
-- doc_id values that occur on more than one row of bronze.docs_raw,
-- most frequent first.
SELECT
  per_doc.doc_id,
  per_doc.cnt
FROM (
  SELECT
    doc_id,
    COUNT(*) AS cnt
  FROM knowledgehub_lakehouse.bronze.docs_raw
  GROUP BY doc_id
) AS per_doc
WHERE per_doc.cnt > 1
ORDER BY per_doc.cnt DESC;


In [0]:
# Distinct doc_id values present in bronze.docs_raw.
df_distinct_doc_id = (
    spark.table("knowledgehub_lakehouse.bronze.docs_raw")
    .select("doc_id")
    .dropDuplicates()
)
display(df_distinct_doc_id)

In [0]:
%sql
-- Every doc_id in bronze.docs_raw (one output row per source row, duplicates included).
SELECT
  docs.doc_id
FROM
  knowledgehub_lakehouse.bronze.docs_raw AS docs

In [0]:
%sql
-- Inspect every row stored for one specific document id.
-- The literal uses single quotes: double quotes are parsed as an identifier
-- (a column name) when ANSI double-quoted identifiers are enabled.
SELECT *
FROM knowledgehub_lakehouse.bronze.docs_raw
WHERE doc_id = 'DOC00997'

In [0]:
# Count the non-NULL doc_id values for each doc_title in bronze.docs_raw.
df = spark.table("knowledgehub_lakehouse.bronze.docs_raw")

# The aggregate is aliased directly instead of renaming the auto-generated
# "count(doc_id)" column afterwards: withColumnRenamed silently does nothing if
# that generated name ever differs, which would leave the column misnamed.
df_docid_count_per_title = df.groupBy("doc_title").agg(
    F.count("doc_id").alias("doc_id_count")
)
display(df_docid_count_per_title)

In [0]:
%sql
-- (doc_id, doc_title) combinations that occur on more than one row of
-- silver.docs_clean, with the number of distinct departments seen for each.
WITH doc_groups AS (
  SELECT
    doc_id,
    doc_title,
    COUNT(*) AS cnt,
    COUNT(DISTINCT department) AS distinct_departments
  FROM knowledgehub_lakehouse.silver.docs_clean
  GROUP BY doc_id, doc_title
)
SELECT
  doc_id,
  doc_title,
  cnt,
  distinct_departments
FROM doc_groups
WHERE cnt > 1
ORDER BY cnt DESC, doc_id;

