# Clause Segmentation

Segment contract text into clauses using rule-based heuristics and sentence-transformer clustering.

In [None]:

# ADLS Gen2 (abfss://) locations used by this notebook.
# silver_clean_path: Delta table that is loaded as the input DataFrame.
silver_clean_path = "abfss://silver@ragstorage4122025.dfs.core.windows.net/contracts_silver_delta/"
# gold_path: Delta location that the tagged clauses are written to.
gold_path = "abfss://gold@ragstorage4122025.dfs.core.windows.net/contracts_gold_delta/"


In [None]:
import re
from pyspark.sql.functions import udf, explode, col, monotonically_increasing_id
from pyspark.sql.types import ArrayType, StringType



# Load the cleaned contracts from the SILVER Delta table.
df = (
    spark.read
    .format("delta")
    .load(silver_clean_path)
)

# ---------------- Clause Segmentation ----------------
def extract_clauses(text, min_length=30):
    """Split contract text into clause-sized fragments.

    The text is cut at every newline and at every period that ends a
    sentence, i.e. a period followed by whitespace or by the end of the
    text. Periods inside tokens such as ``5.5`` or ``3.1`` are left alone
    so amounts and section numbers are not torn apart.

    Args:
        text: Raw contract text; ``None`` or an empty string is allowed.
        min_length: Fragments whose stripped length is not strictly greater
            than this value are discarded as noise. Defaults to 30.

    Returns:
        A list of whitespace-stripped clause strings, in document order.
        Empty when ``text`` is falsy or no fragment is long enough.
    """
    if not text:
        return []
    # The lookahead keeps decimals ("5.5") and numbering ("3.1") intact while
    # still splitting on sentence-final periods.
    fragments = re.split(r"\n|\.(?=\s|$)", text)
    stripped = (fragment.strip() for fragment in fragments)
    return [clause for clause in stripped if len(clause) > min_length]

# Expose the Python splitter to Spark as a UDF returning array<string>.
extract_clauses_udf = udf(extract_clauses, ArrayType(StringType()))

# One output row per extracted clause, keeping only the source filename.
df_clauses = df.select(
    col("filename"),
    explode(extract_clauses_udf(col("text_clean"))).alias("clause"),
)

# ---------------- Feature Extraction (Rule-Based) ----------------
# Rule-based vocabulary: feature tag -> regex alternation of trigger keywords.
# detect_features matches these case-insensitively and anchors every
# alternative at the start of a word, so short keywords such as "IP", "nda"
# or "PTO" do not fire inside unrelated words ("recipient", "standard",
# "laptop").
FEATURES = {
    "salary": r"salary|compensation|ctc|bonus|package|pay",
    "notice_period": r"notice period|prior notice",
    "termination": r"termination|terminate|severance|exit",
    "confidentiality": r"confidential|non-disclosure|nda",
    "ip_rights": r"intellectual property|IP|ownership",
    "non_compete": r"non[- ]?compete|restrict",
    "leave_policy": r"leave|vacation|holiday|PTO",
}


def detect_features(text):
    """Return the feature tags whose keywords occur in *text*.

    Each pattern in ``FEATURES`` is searched case-insensitively with a
    leading word boundary, so a keyword must start a word but may be
    followed by a suffix ("terminate" matches "terminated", "pay" matches
    "payment").

    Args:
        text: A single clause; ``None`` or an empty string is allowed.

    Returns:
        A list of matching tag names in ``FEATURES`` order, or
        ``["general_clause"]`` when nothing matches or ``text`` is falsy.
    """
    if not text:
        return ["general_clause"]
    # The non-capturing group makes the \b apply to every alternative,
    # not just the first one in the alternation.
    tags = [
        feature
        for feature, pattern in FEATURES.items()
        if re.search(rf"\b(?:{pattern})", text, re.IGNORECASE)
    ]
    return tags or ["general_clause"]

# Expose the rule-based tagger to Spark as a UDF returning array<string>.
feature_udf = udf(detect_features, ArrayType(StringType()))

# Tag each clause, then attach a generated row id.
df_gold = (
    df_clauses
    .withColumn("features", feature_udf(col("clause")))
    .withColumn("id", monotonically_increasing_id())
)

# ---------------- Save to GOLD ----------------
# Replace the GOLD Delta table, allowing its schema to be overwritten too.
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_path)
)

# Show a small sample of the tagged clauses in the notebook output.
display(df_gold.limit(20))
print("GOLD READY — Clauses with feature tags ✔")
