# 🤖 Pattern Mining with Spark MLlib
**Discover misclassification patterns using distributed machine learning**

This notebook:
1. Loads data from Bronze Delta Lake
2. Extracts features from classifications
3. Clusters patterns with K-Means
4. Identifies common error types
5. Exports training examples

## Load Configuration & Data

In [None]:
import json
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline

# Load the shared pipeline configuration (a JSON file under /dbfs).
config_path = "/dbfs/tamu-datathon-config.json"
# Decode explicitly as UTF-8 so parsing does not depend on the driver's locale.
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Fail fast and name every required key that is absent, rather than stopping
# at the first failing subscript below with a bare KeyError.
missing_keys = [key for key in ("bronze_path", "silver_path") if key not in config]
if missing_keys:
    raise KeyError(
        f"{config_path} is missing required key(s): {', '.join(missing_keys)}"
    )

print("=" * 80)
print("üìã PATTERN MINING CONFIGURATION")
print("=" * 80)
# These two entries are only echoed for the reader; "Not set" is printed when
# a key is absent. The loads in later cells are built from bronze_path instead.
print(f"Bronze Classifications: {config.get('bronze_classifications', 'Not set')}")
print(f"Bronze Learning: {config.get('bronze_learning', 'Not set')}")

# Root locations that the remaining cells read from / write to.
bronze_path = config['bronze_path']
silver_path = config['silver_path']

In [None]:
# Read the two Bronze Delta tables this notebook works from.
print("\nüì• Loading Bronze Layer Data...")


def _read_bronze_table(table_name):
    """Return the Delta table stored at ``{bronze_path}/{table_name}``."""
    return spark.read.format("delta").load(f"{bronze_path}/{table_name}")


classifications_df = _read_bronze_table("classifications")
learning_df = _read_bronze_table("learning_database")

# Each count() is a Spark action, so these two lines trigger a read of the data.
print(f"‚úÖ Classifications: {classifications_df.count()} records")
print(f"‚úÖ Learning: {learning_df.count()} records")

## Step 1: Feature Engineering

In [None]:
print("\n" + "=" * 80)
print("üîß FEATURE ENGINEERING")
print("=" * 80)

# Build every derived column as a named expression, then select them together.

# A missing review flag is treated as False.
requires_review_expr = coalesce(col("requires_review"), lit(False)).alias("requires_review")

# Array lengths, with a null array counted as 0 elements.
segments_col = col("text_segments")
segment_count_expr = (
    when(segments_col.isNotNull(), size(segments_col))
    .otherwise(0)
    .alias("segment_count")
)

evidence_col = col("evidence")
evidence_count_expr = (
    when(evidence_col.isNotNull(), size(evidence_col))
    .otherwise(0)
    .alias("evidence_count")
)

# 1 when at least one additional label is present, else 0.
labels_col = col("additional_labels")
has_additional_labels_expr = (
    when(labels_col.isNotNull() & (size(labels_col) > 0), 1)
    .otherwise(0)
    .alias("has_additional_labels")
)

# 1 only when the nested safety_check struct exists and its is_safe field is
# False; a missing struct or a null field yields 0.
is_unsafe_expr = (
    when(col("safety_check").isNotNull() & (col("safety_check.is_safe") == False), 1)
    .otherwise(0)
    .alias("is_unsafe")
)

# na.fill(0) replaces any remaining nulls in numeric columns with 0.
features_df = classifications_df.select(
    col("document_id"),
    col("classification"),
    col("confidence"),
    requires_review_expr,
    segment_count_expr,
    evidence_count_expr,
    has_additional_labels_expr,
    is_unsafe_expr,
).na.fill(0)

print("‚úÖ Features extracted")
display(features_df.limit(5))

## Step 2: Join with Learning Data

In [None]:
print("\n" + "=" * 80)
print("üîó JOINING WITH CORRECTIONS")
print("=" * 80)

# Learning rows with approved == False are treated as corrected documents.
# A missing feedback note is replaced with the placeholder "No notes".
corrected_df = learning_df.filter(col("approved") == False).select(
    col("document_id"),
    col("original_classification"),
    col("corrected_classification"),
    coalesce(col("feedback_notes"), lit("No notes")).alias("feedback_notes")
)

correction_count = corrected_df.count()
print(f"‚úÖ Found {correction_count} corrected documents")

# Keep only classified documents that have a correction. The join can be empty
# even when corrections exist (no shared document_id); in that case fall back
# as well, so the clustering cell never receives an empty DataFrame.
matched_df = None
if correction_count > 0:
    candidate_df = features_df.join(corrected_df, on="document_id", how="inner")
    # limit(1) keeps the emptiness probe from counting the whole join.
    if candidate_df.limit(1).count() > 0:
        matched_df = candidate_df

if matched_df is not None:
    misclassified_df = matched_df
    print("\nüìä Sample Misclassifications:")
    display(misclassified_df.select(
        "document_id", "classification", "corrected_classification",
        "confidence", "segment_count"
    ).limit(5))
elif correction_count > 0:
    print("‚ö†Ô∏è  Corrections found, but none match a classified document. Using all classifications for analysis")
    misclassified_df = features_df
else:
    print("‚ö†Ô∏è  No corrections found. Using all classifications for analysis")
    misclassified_df = features_df

## Step 3: K-Means Clustering

In [None]:
print("\n" + "=" * 80)
print("ü§ñ CLUSTERING WITH K-MEANS (k=5)")
print("=" * 80)

# Expose the boolean review flag as a 0/1 column so it can sit in the feature
# vector next to the numeric columns.
review_as_int = when(col("requires_review") == True, 1).otherwise(0)
misclassified_df = misclassified_df.withColumn("requires_review_int", review_as_int)

# Input columns for the feature vector, in vector order.
feature_cols = [
    "confidence",
    "segment_count",
    "evidence_count",
    "has_additional_labels",
    "is_unsafe",
    "requires_review_int",
]

# Stage 1: pack the columns into one vector; handleInvalid="skip" drops rows
# with invalid (null/NaN) feature values instead of raising.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw", handleInvalid="skip")

# Stage 2: center and scale each feature so no single column dominates distances.
scaler = StandardScaler(inputCol="features_raw", outputCol="features", withMean=True, withStd=True)

# Stage 3: K-Means with a fixed seed; assignments land in the "cluster" column.
kmeans = KMeans(featuresCol="features", predictionCol="cluster", k=5, seed=42)

pipeline = Pipeline(stages=[assembler, scaler, kmeans])

# Fit all three stages on the documents selected by the previous cell.
print("üîÑ Training K-Means model...")
model = pipeline.fit(misclassified_df)
print("‚úÖ Model trained!")

# Apply the fitted pipeline to the same rows to attach a cluster id to each.
clustered_df = model.transform(misclassified_df)
print(f"‚úÖ Clustered {clustered_df.count()} documents into 5 groups")

## Step 4: Analyze Clusters

In [None]:
print("\n" + "=" * 80)
print("üìä CLUSTER ANALYSIS")
print("=" * 80)

# Per-cluster aggregates. count, avg, round and sum are the
# pyspark.sql.functions versions from the notebook's star import (they operate
# on Columns), not Python's built-ins.
summary_metrics = [
    count("*").alias("doc_count"),
    round(avg("confidence"), 4).alias("avg_confidence"),
    round(avg("segment_count"), 2).alias("avg_segments"),
    # Number of rows in the cluster flagged is_unsafe == 1.
    sum(when(col("is_unsafe") == 1, 1).otherwise(0)).alias("unsafe_count"),
]

cluster_summary = (
    clustered_df
    .groupBy("cluster")
    .agg(*summary_metrics)
    .orderBy("cluster")
)

print("üìà Cluster Summary:")
display(cluster_summary)

In [None]:
# Show up to three sample documents for every cluster that received documents.
# No ordering is applied before limit(3), so which rows appear is unspecified.
print("\nüîç Representative Documents per Cluster:")

# Take the cluster ids from the data instead of a hard-coded range(5): this
# stays correct if k changes in the training cell and replaces one count() job
# per cluster with a single distinct() job. Empty clusters are skipped, as before.
populated_clusters = sorted(
    row["cluster"] for row in clustered_df.select("cluster").distinct().collect()
)

for cluster_id in populated_clusters:
    cluster_docs = clustered_df.filter(col("cluster") == cluster_id) \
        .select("document_id", "classification", "confidence", "segment_count") \
        .limit(3)

    print(f"\n--- Cluster {cluster_id} ---")
    display(cluster_docs)

## Step 5: Pattern Insights

In [None]:
print("\n" + "=" * 80)
print("üí° PATTERN INSIGHTS")
print("=" * 80)

# Cross-tabulate predicted classification against cluster id: how many
# documents fall in each pair, and their mean confidence (4 decimals).
pattern_keys = ["classification", "cluster"]
pattern_metrics = [
    count("*").alias("count"),
    round(avg("confidence"), 4).alias("avg_confidence"),
]

classification_patterns = (
    clustered_df
    .groupBy(*pattern_keys)
    .agg(*pattern_metrics)
    .orderBy(*pattern_keys)
)

print("üìä Patterns by Classification Type:")
display(classification_patterns)

In [None]:
# Heuristic screen for likely misclassifications: confidence below 0.90
# combined with two or more text segments.
is_low_confidence = col("confidence") < 0.90
is_multi_segment = col("segment_count") >= 2

potential_errors = (
    clustered_df
    .filter(is_low_confidence & is_multi_segment)
    .select("document_id", "classification", "confidence", "segment_count", "cluster")
)

error_count = potential_errors.count()
print(f"\n‚ö†Ô∏è  Found {error_count} potential error patterns:")
print("   (Low confidence < 0.90 + Multiple segments >= 2)")

# Show at most ten matches, or say so when there are none.
if error_count == 0:
    print("   No error patterns detected - good accuracy!")
else:
    display(potential_errors.limit(10))

## Step 6: Export Training Examples

In [None]:
print("\n" + "=" * 80)
print("üíæ EXPORTING TRAINING EXAMPLES")
print("=" * 80)

# One output row per cluster: an "examples" array holding a struct for every
# document in that cluster (no ranking or cap is applied), plus the row count.
example_fields = ("document_id", "classification", "confidence", "segment_count")
example_struct = struct(*[col(field_name) for field_name in example_fields])

training_examples = clustered_df.groupBy("cluster").agg(
    collect_list(example_struct).alias("examples"),
    count("*").alias("count"),
)

# Persist to the Silver layer as a Delta table, replacing any previous export.
training_path = f"{silver_path}/training_examples"
training_writer = training_examples.write.format("delta").mode("overwrite")
training_writer.save(training_path)

print(f"‚úÖ Training examples exported: {training_path}")
display(training_examples)

In [None]:
# Persist the per-document cluster assignment to the Silver layer as a Delta
# table, replacing any previous run's output.
cluster_assignments_path = f"{silver_path}/cluster_assignments"

assignment_columns = [
    "document_id",
    "classification",
    "confidence",
    "segment_count",
    "cluster",
]
assignments_df = clustered_df.select(*assignment_columns)
assignments_df.write.format("delta").mode("overwrite").save(cluster_assignments_path)

print(f"‚úÖ Cluster assignments saved: {cluster_assignments_path}")

## ✅ Pattern Mining Complete!

Discovered:
- ✅ 5 distinct document pattern clusters
- ✅ Common misclassification signatures
- ✅ Low-confidence error patterns
- ✅ Training examples exported to Silver layer

**Next**: Run `05_analytics_dashboard.ipynb` for insights

In [None]:
# Record where this notebook wrote its Silver outputs in the shared config.
config['silver_training_examples'] = training_path
config['silver_cluster_assignments'] = cluster_assignments_path

# Serialize BEFORE opening the file: open(..., 'w') truncates immediately, so
# a serialization error inside json.dump would leave the shared config empty
# or partially written. With the text built first, such an error leaves the
# existing file untouched.
config_json = json.dumps(config, indent=2)
with open(config_path, 'w', encoding='utf-8') as f:
    f.write(config_json)

print("üíæ Configuration updated")