# Irregular Defect Refinement Study

This standalone notebook explores high-fidelity structural features to distinguish between **Good** pillars and **Irregular** defects (like internal splitting or deformation).

### Core Problem:
Simple features like Circularity can be fooled by background noise or slight blur. We need internal structural analysis to confirm a defect.

### New Structural Tactics:
1. **Internal Edge Density**: Highlights splits *inside* the pillar mask.
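2. **Inertia Ratio**: Measures elongation/deformation as the ratio of the smaller to the larger principal second moment of the pillar contour (1.0 = isotropic, values toward 0 = strongly elongated).
3. **Peak-Centroid Offset**: Measures symmetry drift as the pixel distance between the brightest point inside the pillar mask and the contour centroid.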

In [None]:
import sys
import os
from pathlib import Path

# Make the project's `src/` folder importable.
# When the kernel is started inside a folder named 'notebooks', the project
# root is taken to be its parent; otherwise the working directory itself.
current_dir = Path(os.getcwd())
root_dir = current_dir
if current_dir.name == 'notebooks':
    root_dir = current_dir.parent

src_dir = str(root_dir / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

# Ensure project root is in sys.path.
# The root is resolved the same way whether the kernel was started in the
# project root or in its 'notebooks/' sub-directory; using the raw working
# directory would make PROJECT_ROOT (and every path derived from it, such as
# DATA_DIR) point inside 'notebooks/' in the latter case.
_launch_dir = Path.cwd()
PROJECT_ROOT = _launch_dir.parent if _launch_dir.name == 'notebooks' else _launch_dir
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from defect_analysis.clustering.layered_clustering import load_tiles, run_layered_pipeline

# Notebook-wide matplotlib styling.
plt.rcParams.update({
    'font.size': 11,
    'figure.dpi': 120,
    'axes.titleweight': 'bold',
    'legend.frameon': True,
    'legend.facecolor': 'white',
})

# Load the tiles of the single array studied in this notebook.
DATA_DIR = PROJECT_ROOT.joinpath('data', 'Meta_Atoms')
tiles = load_tiles(DATA_DIR, ['Array_4'])
print(f"Loaded {len(tiles)} tiles from Array 4.")

## 1. Baseline Pipeline Execution
Running the current pipeline to get the initial 'Irregular' candidate pool.

In [None]:
# Reset every label to 'Unknown' before running the pipeline, so the cell
# always starts from the same labels when it is re-executed.
for t in tiles:
    t.defect_type = 'Unknown'

run_layered_pipeline(tiles, verbose=False)

# Tabulate how many tiles ended up with each label.
print("Initial Pipeline Counts:")
counts = pd.Series([t.defect_type for t in tiles]).value_counts()
display(counts.to_frame(name='Count'))

## 2. Structural Feature Extraction
Calculating internal metrics for all surviving pillars (Good + Irregular candidates).

In [None]:
def extract_structural_features(tile):
    """Compute structural shape metrics for a single pillar tile.

    The pillar is segmented with Otsu thresholding, the largest external
    contour is taken as the pillar outline, and four metrics are derived
    from that outline and the pixels it encloses.

    Args:
        tile: Object exposing an ``image`` array. Assumed to be a 2-D
            grayscale image, either float in [0, 1] or already on a 0-255
            scale -- TODO confirm against the tile loader.

    Returns:
        Tuple ``(edge_density, inertia_ratio, offset, circ)``:

        * ``edge_density`` -- sum of Canny edge values inside the filled
          contour divided by the number of mask pixels. Canny marks edge
          pixels with 255, so this is 255 x the edge-pixel fraction, not a
          0-1 fraction.
        * ``inertia_ratio`` -- smaller / larger principal second central
          moment of the contour (1.0 = isotropic, toward 0 = elongated).
        * ``offset`` -- Euclidean pixel distance between the (integer
          truncated) contour centroid and the brightest pixel inside the mask.
        * ``circ`` -- ``4 * pi * area / perimeter**2`` of the contour.

        ``(0.0, 1.0, 0.0, 0.0)`` is returned when no contour is found. When
        the contour has zero area, ``edge_density`` is still computed and the
        other three fall back to ``1.0, 0.0, 0.0``.
    """
    raw = tile.image
    img = (raw * 255).astype(np.uint8) if raw.max() <= 1.0 else raw.astype(np.uint8)

    # 1. Mask the pillar (Otsu thresholding)
    _, thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Polarity check on the central half of the tile: if the centre is mostly
    # background after thresholding, invert so the pillar is foreground.
    # The window is derived from the image shape (it equals the former fixed
    # 8:24 window on a 32x32 tile) so other tile sizes are sampled at their
    # centre instead of an off-centre or empty patch.
    height, width = thresh.shape[:2]
    centre = thresh[height // 4:height - height // 4, width // 4:width - width // 4]
    if np.mean(centre) < 127:
        thresh = 255 - thresh

    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return 0.0, 1.0, 0.0, 0.0

    # Largest blob is treated as the pillar; the mask is its filled outline
    # (1 inside, 0 outside).
    cnt = max(contours, key=cv2.contourArea)
    mask = np.zeros_like(img)
    cv2.drawContours(mask, [cnt], -1, 1, -1)

    # 2. Internal Edge Density (Targeting splitting)
    # NOTE(review): the filled mask includes the contour pixels themselves, so
    # edges lying on the pillar outline may be counted too -- confirm whether
    # the mask should be eroded first.
    edges = cv2.Canny(img, 30, 100)
    internal_edges = cv2.bitwise_and(edges, edges, mask=mask)
    mask_area = np.sum(mask)
    edge_density = np.sum(internal_edges) / mask_area if mask_area > 0 else 0

    # 3. Inertia Ratio (Elongation from image moments)
    mu = cv2.moments(cnt)
    if mu['m00'] > 0:
        m20, m02, m11 = mu['mu20'], mu['mu02'], mu['mu11']
        # (m20 + m02 -/+ denom) are twice the two principal second moments,
        # so their quotient is the min/max moment ratio.
        denom = np.sqrt((m20 - m02)**2 + 4 * m11**2)
        inertia_ratio = (m20 + m02 - denom) / (m20 + m02 + denom) if (m20 + m02 + denom) > 0 else 1.0

        # 4. Peak-Centroid Offset (Checking symmetry)
        # Centroid is truncated to whole pixels; max_loc is (x, y) of the
        # brightest masked pixel.
        cx, cy = int(mu['m10'] / mu['m00']), int(mu['m01'] / mu['m00'])
        _, _, _, max_loc = cv2.minMaxLoc(img, mask=mask)
        offset = np.sqrt((cx - max_loc[0])**2 + (cy - max_loc[1])**2)

        # 5. Circularity
        area = cv2.contourArea(cnt)
        perim = cv2.arcLength(cnt, True)
        circ = (4 * np.pi * area) / (perim**2) if perim > 0 else 0
    else:
        # Degenerate (zero-area) contour: fall back to neutral values.
        inertia_ratio, offset, circ = 1.0, 0.0, 0.0

    return edge_density, inertia_ratio, offset, circ

print("Extracting structural features...")

# Only tiles the baseline pipeline left as 'Good' or 'Irregular' are studied.
candidates = [t for t in tiles if t.defect_type in ['Good', 'Irregular']]

# Feature names in the order extract_structural_features returns its values.
structural_keys = ('int_edge_density', 'inertia_ratio', 'peak_offset', 'circularity_refined')
for t in candidates:
    for key, value in zip(structural_keys, extract_structural_features(t)):
        t.features[key] = value

# One row per candidate, in the same order as `candidates`.
df = pd.DataFrame([t.to_dict() for t in candidates])

## 3. Comparative Feature Analysis
Let's see if Internal Edge Density actually separates the 'Good' noisy pillars from the 'Irregular' splits.

In [None]:
# Scatter of refined circularity against internal edge density, coloured by
# the pipeline label, with a horizontal reference line at 1.5.
fig, ax = plt.subplots(figsize=(10, 6))
label_colours = {'Good': '#27ae60', 'Irregular': '#e74c3c'}
sns.scatterplot(
    data=df,
    x='circularity_refined',
    y='int_edge_density',
    hue='defect_type',
    alpha=0.6,
    palette=label_colours,
    ax=ax,
)
ax.axhline(y=1.5, ls='--', color='black', label='Split Threshold')
ax.set_title("Circularity vs. Internal Edge Density")
ax.legend()
plt.show()

## 4. Gated Refinement Study
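We re-classify tiles to see how many 'Good' meta-atoms we can recover with strict structural gating: an 'Irregular' candidate keeps its label only if its internal edge density or its peak-centroid offset exceeds the gate threshold; otherwise it is relabelled 'Good'.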

In [None]:
def apply_gate(row, edge_thresh=2.0, off_thresh=4.0):
    """Return the refined label for one tile record.

    Args:
        row: Mapping with a 'defect_type' entry and, for 'Irregular' records,
            'int_edge_density' and 'peak_offset' entries.
        edge_thresh: An 'Irregular' record stays 'Irregular' when its
            internal edge density is strictly above this value.
        off_thresh: An 'Irregular' record stays 'Irregular' when its peak
            offset is strictly above this value.

    Returns:
        'Irregular' if the record was 'Irregular' and exceeds either
        threshold, 'Good' if it was 'Irregular' and exceeds neither, and the
        original label unchanged for every other record.
    """
    label = row['defect_type']
    if label != 'Irregular':
        # Only 'Irregular' candidates are re-examined.
        return label

    fails_structure = row['int_edge_density'] > edge_thresh or row['peak_offset'] > off_thresh
    return 'Irregular' if fails_structure else 'Good'

def show_refined_gallery(df_refined, title, tile_pool=None, random_state=None):
    """Show a thumbnail gallery of randomly sampled tiles for each refined label.

    For each of 'Good' and 'Irregular', prints the number of matching rows
    and, if there are any, draws up to 40 sampled tiles in a grid of 10
    columns. Each thumbnail is titled with the tile's internal edge density.

    Args:
        df_refined: DataFrame with a 'refined_type' column. Its index labels
            are used as positions into ``tile_pool``, so the frame must still
            carry the index it had when it was built from that sequence
            (no filtering followed by ``reset_index``, no re-sorting).
        title: Text used as the prefix of each figure's super-title.
        tile_pool: Sequence of tile objects (each with ``image`` and
            ``features``) that the index of ``df_refined`` refers to.
            Defaults to the notebook-level ``candidates`` list.
        random_state: Forwarded to ``DataFrame.sample``; pass an int to make
            the displayed selection reproducible. ``None`` keeps the sampling
            unseeded.
    """
    pool = candidates if tile_pool is None else tile_pool

    for dtype in ['Good', 'Irregular']:
        sub_df = df_refined[df_refined['refined_type'] == dtype]
        n = len(sub_df)
        print(f"{dtype}: {n} found")
        if n == 0:
            continue

        # At most 4 rows of 10 thumbnails; fewer rows when there are fewer tiles.
        rows = (min(n, 40) + 9) // 10
        fig, axes = plt.subplots(rows, 10, figsize=(15, rows * 1.5))
        fig.suptitle(f"{title} - {dtype} (N={n})", fontsize=16, y=1.02)

        selection = sub_df.sample(min(n, rows * 10), random_state=random_state).index
        for i, ax in enumerate(axes.flatten()):
            # Every cell of the grid is blanked, including unused trailing ones.
            ax.axis('off')
            if i >= len(selection):
                continue
            # Index label doubles as the position of the tile in the pool.
            tile = pool[selection[i]]
            ax.imshow(tile.image, cmap='gray')
            ax.set_title(f"E:{tile.features['int_edge_density']:.1f}", fontsize=8)

        plt.tight_layout()
        plt.show()

# Gate thresholds for the "balanced" configuration. They are bound once so
# the banner text and the classification call cannot drift apart when tuned.
EDGE_THRESH = 2.0
OFFSET_THRESH = 4.0

print(f"\n--- CONFIG: BALANCED GATING (Edge > {EDGE_THRESH}, Offset > {OFFSET_THRESH}) ---")
# Adds/overwrites the 'refined_type' column on the shared `df` in place.
df['refined_type'] = df.apply(
    lambda r: apply_gate(r, edge_thresh=EDGE_THRESH, off_thresh=OFFSET_THRESH),
    axis=1,
)
show_refined_gallery(df, "Balanced Structural Gating")

print("\nUpdated Dataset Counts:")
display(df['refined_type'].value_counts().to_frame(name='Count'))