In [0]:
from pyspark.sql.functions import (
    col, when, length, current_date, datediff, lit, coalesce, expr, sum)

# Source tables used by this notebook, bound in order to docs / rules / usage.
docs, rules, usage = (
    spark.table(table_name)
    for table_name in (
        "knowledgehub_lakehouse.silver.docs_clean",
        "knowledgehub_lakehouse.reference.doc_type_rules",
        "knowledgehub_lakehouse.gold.fact_doc_daily_usage",
    )
)


In [0]:
from pyspark.sql.functions import to_date

def _missing_flag(column_name):
    """Return a 0/1 column that is 1 when `column_name` is NULL or an empty string."""
    value = col(column_name)
    return when(value.isNull() | (value == ""), lit(1)).otherwise(lit(0))


# Per-document quality features, computed only for rows whose
# current_version_flag is true. Pass-through columns are kept so downstream
# tables can join back on doc_id / version_norm.
features = (
    docs
    .filter(col("current_version_flag") == True)
    .select(
        "doc_id",
        "version_norm",
        "doc_title",
        "doc_type",
        "department",
        "status",
        "confidentiality",
        "updated_ts",
        "review_due_date",
        "policy_region",
        "ingest_ts",
        "source_file",
        "input_batch",

        # feature: confidentiality missing
        _missing_flag("confidentiality").alias("confidentiality_missing_flag"),

        # feature: empty doc text
        _missing_flag("doc_text").alias("empty_text_flag"),

        # feature: text length
        # length() of a NULL doc_text is NULL, which turns into NaN once the
        # table is converted to pandas for training/scoring. A missing text
        # has no characters, so record it as 0 (empty_text_flag is already 1
        # for those rows, so the risk banding is unaffected).
        coalesce(length(col("doc_text")), lit(0)).alias("text_length"),

        # feature: stale doc flag (older than 180 days)
        # A NULL updated_ts makes the comparison NULL, which falls through to 0.
        when(datediff(current_date(), to_date(col("updated_ts"))) > 180, lit(1))
        .otherwise(lit(0))
        .alias("stale_doc_flag"),

        # feature: review due missing
        _missing_flag("review_due_date").alias("review_due_missing_flag"),
    )
)


In [0]:
# Persist the engineered features as a Delta table; "overwrite" replaces
# whatever the table held before.
features.write.format("delta").mode("overwrite").saveAsTable(
    "knowledgehub_lakehouse.silver.doc_features"
)


In [0]:
# Read the feature table back and render it to confirm the write.
doc_features_written = spark.table("knowledgehub_lakehouse.silver.doc_features")
doc_features_written.display()

In [0]:
from pyspark.sql.functions import col, when, lit

feat = spark.table("knowledgehub_lakehouse.silver.doc_features")

# Rule-based banding. Conditions are evaluated in order, so a row that meets
# both the HIGH and the MEDIUM rule is banded HIGH.
is_high_risk = (
    (col("confidentiality_missing_flag") == 1)
    | (col("empty_text_flag") == 1)
    | (col("stale_doc_flag") == 1)
)
is_medium_risk = (col("text_length") < 80) | (col("review_due_missing_flag") == 1)

risk_band_expr = (
    when(is_high_risk, lit("HIGH"))
    .when(is_medium_risk, lit("MEDIUM"))
    .otherwise(lit("LOW"))
)

# Integer encoding of the band, used as the training target: HIGH=2, MEDIUM=1, LOW=0.
label_expr = (
    when(col("risk_band") == "HIGH", lit(2))
    .when(col("risk_band") == "MEDIUM", lit(1))
    .otherwise(lit(0))
)

labeled = (
    feat
    .withColumn("risk_band", risk_band_expr)
    .withColumn("label", label_expr)
)

# Write back over the same table that was read above; mergeSchema lets the
# two new columns (risk_band, label) be added to the existing table schema.
(
    labeled.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("knowledgehub_lakehouse.silver.doc_features")
)

In [0]:
# Row count per risk band, largest band first — a quick check of class balance.
risk_band_counts_sql = """
SELECT risk_band, COUNT(*) cnt
FROM knowledgehub_lakehouse.silver.doc_features
GROUP BY risk_band
ORDER BY cnt DESC
"""
spark.sql(risk_band_counts_sql).display()


In [0]:
import mlflow
import mlflow.sklearn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Collect the labeled feature table to the driver as a pandas DataFrame for
# scikit-learn, then show the first rows.
labeled_features_sdf = spark.table("knowledgehub_lakehouse.silver.doc_features")
df = labeled_features_sdf.toPandas()
df.head()


In [0]:
# Model inputs, in the column order the classifier is trained on.
feature_cols = [
    "confidentiality_missing_flag",
    "empty_text_flag",
    "text_length",
    "stale_doc_flag",
    "review_due_missing_flag",
]

# Design matrix and target (integer-encoded risk band).
X = df.loc[:, feature_cols]
y = df.loc[:, "label"]


In [0]:
# 80/20 hold-out split with a fixed seed; stratify=y keeps the label
# proportions the same in both partitions.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [0]:
# Log runs under an experiment in the current user's workspace folder.
current_user = spark.sql("SELECT current_user()").collect()[0][0]
mlflow.set_experiment("/Users/" + current_user + "/capstone7_doc_quality")

# Single source of truth for the hyperparameters: the same dict builds the
# model and is logged, so the tracked params cannot drift from what was
# actually trained (random_state is now recorded as well).
rf_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "random_state": 42,
}

with mlflow.start_run() as run:
    rf = RandomForestClassifier(**rf_params)
    rf.fit(X_train, y_train)

    preds = rf.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="weighted")

    mlflow.log_params(rf_params)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_weighted", f1)

    print("Accuracy:", acc)
    print("F1:", f1)
    print(classification_report(y_test, preds))

    # Log the fitted model under the run's "model" artifact path; the input
    # example is a 5-row sample of the training features.
    mlflow.sklearn.log_model(
        rf,
        artifact_path="model",
        input_example=X_train.head(5),
    )

    # Kept as a notebook-level variable: the registration cell builds the
    # "runs:/<run_id>/model" URI from it.
    run_id = run.info.run_id
    print("Run ID:", run_id)


### Registering the "knowledgehub_doc_quality_model" model 

In [0]:
model_name = "knowledgehub_lakehouse.ml_models.knowledgehub_doc_quality_model"

# `run_id` is set by the training cell; "model" is the artifact path the
# model was logged under.
model_uri = f"runs:/{run_id}/model"

registered_model = mlflow.register_model(
    model_uri=model_uri,
    name=model_name
)

# The batch-scoring cell loads "models:/<model_name>@version1", i.e. it looks
# the model up by the alias "version1". Nothing else in this notebook creates
# that alias, so a first end-to-end run would fail at load time. Bootstrap the
# alias only when it does not exist yet; an alias that someone already pointed
# at a specific version is left untouched.
scoring_alias = "version1"
registry_client = mlflow.tracking.MlflowClient()
try:
    registry_client.get_model_version_by_alias(model_name, scoring_alias)
except mlflow.exceptions.MlflowException:
    registry_client.set_registered_model_alias(
        name=model_name,
        alias=scoring_alias,
        version=registered_model.version,
    )



### Batch Scoring

In [0]:
import mlflow
# List every registered version of the model; the result is the cell output.
model_name = "knowledgehub_lakehouse.ml_models.knowledgehub_doc_quality_model"
client = mlflow.tracking.MlflowClient()
version_filter = f"name='{model_name}'"
client.search_model_versions(version_filter)

In [0]:
import mlflow
import pandas as pd
from pyspark.sql.functions import col, current_timestamp, lit, when

# --- Load the registered model -------------------------------------------
model_name = "knowledgehub_lakehouse.ml_models.knowledgehub_doc_quality_model"
# "@version1" is MLflow alias syntax: it resolves to whichever model version
# currently carries the alias "version1".
model_uri = f"models:/{model_name}@version1"

model = mlflow.sklearn.load_model(model_uri)
print("Loaded model:", model_name)

# --- Read the feature table into pandas ----------------------------------
feat_spark = spark.table("knowledgehub_lakehouse.silver.doc_features")
df = feat_spark.toPandas()

# Same feature columns, in the same order, as used for training.
feature_cols = [
    "confidentiality_missing_flag",
    "empty_text_flag",
    "text_length",
    "stale_doc_flag",
    "review_due_missing_flag",
]

X = df[feature_cols]
pred_label = model.predict(X)

df["pred_label"] = pred_label


# --- Translate predicted labels into a score and a band ------------------
def label_to_score(x):
    """Map a predicted label to a score: 2 -> 20, 1 -> 60, anything else -> 90."""
    return 20 if x == 2 else 60 if x == 1 else 90


def label_to_band(x):
    """Map a predicted label to a band name: 2 -> HIGH, 1 -> MEDIUM, anything else -> LOW."""
    return "HIGH" if x == 2 else "MEDIUM" if x == 1 else "LOW"


df["score"] = df["pred_label"].map(label_to_score)
df["risk_band"] = df["pred_label"].map(label_to_band)

# --- Build the output Spark DataFrame ------------------------------------
out_cols = ["doc_id", "version_norm", "score", "risk_band"]
out_pdf = df.loc[:, out_cols]

# Stamp every scored row with the time of this scoring run.
out_sdf = spark.createDataFrame(out_pdf).withColumn("scored_ts", current_timestamp())


In [0]:
# Publish the scores as a Delta table; "overwrite" replaces the previous
# contents, so the table holds only the rows from this scoring run.
out_sdf.write.format("delta").mode("overwrite").saveAsTable(
    "knowledgehub_lakehouse.gold.doc_quality_scores"
)

In [0]:
# Compliance Risk Summary
# One row per doc_id: score and risk band come from the row with the greatest
# scored_ts, alongside that timestamp itself.
compliance_summary_sql = """
CREATE OR REPLACE TABLE knowledgehub_lakehouse.gold.compliance_risk_summary AS
SELECT
  doc_id,
  MAX_BY(score, scored_ts)      AS current_score,
  MAX_BY(risk_band, scored_ts)  AS current_risk_band,
  MAX(scored_ts)                AS last_scored_ts
FROM knowledgehub_lakehouse.gold.doc_quality_scores
GROUP BY doc_id
"""
spark.sql(compliance_summary_sql)


In [0]:
# Render the summary table that was just (re)created.
compliance_summary_sdf = spark.table("knowledgehub_lakehouse.gold.compliance_risk_summary")
compliance_summary_sdf.display()