In [2]:
import os
import requests
import pandas as pd
from tqdm import tqdm

os.makedirs('data/raw', exist_ok=True)

ECB_API = "https://data.cityofnewyork.us/resource/6bgk-3dad.json"
SAFETY_API = "https://data.cityofnewyork.us/resource/855j-jady.json"
HPD_API = "https://data.cityofnewyork.us/resource/wvxf-dwi5.json"

def fetch_dataset(api_url, name, total=200000, batch=50000):
    records = []
    for offset in range(0, total, batch):
        print(f"[{name}] Fetching {offset:,}-{offset+batch:,}...")
        params = {"$limit": batch, "$offset": offset, "$order": ":id"}
        resp = requests.get(api_url, params=params)
        if resp.status_code != 200:
            print(f"  Error: {resp.status_code}")
            break
        data = resp.json()
        if not data:
            break
        records.extend(data)
    df = pd.DataFrame(records)
    print(f"[{name}] Total: {len(df):,} records\n")
    return df

print("=" * 60)
print("DOWNLOADING NYC BUILDING VIOLATION DATASETS")
print("=" * 60)

# 1. DOB ECB Violations (primary dataset)
ecb_path = "data/raw/ecb_violations.csv"
if not os.path.exists(ecb_path):
    ecb_df = fetch_dataset(ECB_API, "ECB Violations", total=300000)
    ecb_df.to_csv(ecb_path, index=False)
    print(f"Saved: {ecb_path}")
else:
    ecb_df = pd.read_csv(ecb_path)
    print(f"Cached: {ecb_path} ({len(ecb_df):,} rows)")

# 2. DOB Safety Violations (supplementary)
safety_path = "data/raw/safety_violations.csv"
if not os.path.exists(safety_path):
    safety_df = fetch_dataset(SAFETY_API, "Safety Violations", total=100000)
    safety_df.to_csv(safety_path, index=False)
    print(f"Saved: {safety_path}")
else:
    safety_df = pd.read_csv(safety_path)
    print(f"Cached: {safety_path} ({len(safety_df):,} rows)")

# 3. HPD Housing Violations (supplementary)
hpd_path = "data/raw/hpd_violations.csv"
if not os.path.exists(hpd_path):
    hpd_df = fetch_dataset(HPD_API, "HPD Violations", total=100000)
    hpd_df.to_csv(hpd_path, index=False)
    print(f"Saved: {hpd_path}")
else:
    hpd_df = pd.read_csv(hpd_path)
    print(f"Cached: {hpd_path} ({len(hpd_df):,} rows)")


for name, df in [("ECB Violations", ecb_df), ("Safety Violations", safety_df), ("HPD Violations", hpd_df)]:
    print(f"\n{name}:")
    print(f"  Rows: {len(df):,}")
    print(f"  Columns: {len(df.columns)}")
    print(f"  Columns: {list(df.columns)[:10]}...")
    print(f"  Missing: {df.isnull().sum().sum():,} total null values")

print("\n\nECB Violations - Key Columns Preview:")
preview_cols = [c for c in ['violation_type', 'violation_description', 'severity',
                            'section_of_law1_description', 'infraction_codes'] if c in ecb_df.columns]
print(ecb_df[preview_cols].head(10).to_string())

print("\n\nECB Violation Type Distribution:")
if 'violation_type' in ecb_df.columns:
    print(ecb_df['violation_type'].value_counts().head(15))

print("\nECB Severity Distribution:")
if 'severity' in ecb_df.columns:
    print(ecb_df['severity'].value_counts())

print("\n✓ All datasets downloaded. Ready for 01_dataset_preparation.py")


DOWNLOADING NYC BUILDING VIOLATION DATASETS
[ECB Violations] Fetching 0-50,000...
[ECB Violations] Fetching 50,000-100,000...
[ECB Violations] Fetching 100,000-150,000...
[ECB Violations] Fetching 150,000-200,000...
[ECB Violations] Fetching 200,000-250,000...
[ECB Violations] Fetching 250,000-300,000...
[ECB Violations] Total: 300,000 records

Saved: data/raw/ecb_violations.csv
[Safety Violations] Fetching 0-50,000...
[Safety Violations] Fetching 50,000-100,000...
[Safety Violations] Total: 100,000 records

Saved: data/raw/safety_violations.csv
[HPD Violations] Fetching 0-50,000...
[HPD Violations] Fetching 50,000-100,000...
[HPD Violations] Total: 100,000 records

Saved: data/raw/hpd_violations.csv

ECB Violations:
  Rows: 300,000
  Columns: 42
  Columns: ['isn_dob_bis_extract', 'ecb_violation_number', 'ecb_violation_status', 'dob_violation_number', 'bin', 'boro', 'block', 'lot', 'hearing_date', 'hearing_time']...
  Missing: 4,471,620 total null values

Safety Violations:
  Rows: 100

In [3]:


import os
import re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

for d in ['data/processed', 'data/splits', 'figures']:
    os.makedirs(d, exist_ok=True)

print("=" * 70)
print("DATASET PREPARATION FOR BUILDING CODE VIOLATION DETECTION")
print("=" * 70)


# ==============================================================================
# SECTION 1: DATASET SELECTION (3 Points)
# ==============================================================================

print("\n" + "=" * 70)
print("SECTION 1: DATASET SELECTION")
print("=" * 70)

raw_df = pd.read_csv("data/raw/ecb_violations.csv")
print(f"\n✓ Loaded raw dataset: {len(raw_df):,} records, {len(raw_df.columns)} columns")

print(f"\n--- Raw Dataset Overview ---")
print(f"Shape: {raw_df.shape}")
print(f"Date range: {raw_df['violation_date'].min()} to {raw_df['violation_date'].max()}"
      if 'violation_date' in raw_df.columns else "")

print(f"\nRaw Violation Type Distribution:")
print(raw_df['violation_type'].value_counts())

print(f"\nRaw Severity Distribution:")
print(raw_df['severity'].value_counts())

# Visualize raw distributions
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

raw_df['violation_type'].value_counts().plot(kind='barh', ax=axes[0], color='steelblue')
axes[0].set_title('Raw Violation Type Distribution')
axes[0].set_xlabel('Count')

raw_df['severity'].value_counts().plot(kind='barh', ax=axes[1], color='coral')
axes[1].set_title('Raw Severity Distribution')
axes[1].set_xlabel('Count')

plt.tight_layout()
plt.savefig('figures/01_raw_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved: figures/01_raw_distributions.png")


# ==============================================================================
# SECTION 2: PREPROCESSING AND DATA CLEANING (3 Points)
# ==============================================================================
"""
PREPROCESSING PIPELINE:
1. Drop records with missing violation_description or violation_type
2. Clean and normalize violation descriptions (free text)
3. Map 14 raw violation types → 8 standardized categories
4. Map 6 raw severity codes → 3 classes (Hazardous / Moderate / Minor)
5. Remove duplicates and validate data integrity
6. Analyze and document class imbalance

CLEANING OPERATIONS ON violation_description:
- Replace specific addresses with <ADDR> token
- Replace dollar amounts with <AMOUNT> token
- Replace dates with <DATE> token
- Replace permit/BIN numbers with <NUM> token
- Replace floor/apartment references with <FLOOR> token
- Normalize building code references
- Normalize whitespace and casing
- Remove non-printable characters
"""

print("\n" + "=" * 70)
print("SECTION 2: PREPROCESSING AND DATA CLEANING")
print("=" * 70)

# ---------------------------------------------------------------------------
# 2.1 HANDLE MISSING VALUES
# ---------------------------------------------------------------------------

print("\n--- 2.1 Handling Missing Values ---")
print(f"Before: {len(raw_df):,} records")

missing_before = raw_df[['violation_type', 'violation_description', 'severity']].isnull().sum()
print(f"\nMissing values in key columns:")
print(missing_before)

df = raw_df.dropna(subset=['violation_description', 'violation_type', 'severity']).copy()
df = df[df['violation_description'].str.strip().str.len() > 10]

print(f"After dropping nulls + short descriptions: {len(df):,} records")
print(f"Dropped: {len(raw_df) - len(df):,} records ({(len(raw_df)-len(df))/len(raw_df)*100:.1f}%)")


# ---------------------------------------------------------------------------
# 2.2 VIOLATION DESCRIPTION CLEANING
# ---------------------------------------------------------------------------

print("\n--- 2.2 Cleaning Violation Descriptions ---")

class ViolationPreprocessor:
    """
    Preprocessor for building code violation descriptions.

    Transforms raw inspector-written descriptions into clean,
    normalized text suitable for transformer-based models.
    """

    def __init__(self):
        self.cleaning_patterns = [
            (r'\b\d{1,5}\s+[\w\s]+(STREET|ST|AVENUE|AVE|ROAD|RD|BLVD|DRIVE|DR|PLACE|PL|LANE|LN)\b', '<ADDR>'),
            (r'\$[\d,]+\.?\d*', '<AMOUNT>'),
            (r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', '<DATE>'),
            (r'\b\d{4}-\d{2}-\d{2}\b', '<DATE>'),
            (r'\bBIN\s*#?\s*\d+\b', '<BIN_NUM>'),
            (r'\bJOB\s*#?\s*\d+\b', '<JOB_NUM>'),
            (r'\bAPPLICATION\s*#?\s*\d+\b', '<APP_NUM>'),
            (r'\bPERMIT\s*#?\s*\d+\b', '<PERMIT_NUM>'),
            (r'\b\d+(ST|ND|RD|TH)\s*(FLOOR|FLR|FL)\b', '<FLOOR>'),
            (r'\b(CELLAR|BASEMENT|ROOF|PENTHOUSE)\b', '<FLOOR>'),
            (r'\b(1ST|2ND|3RD|\d+TH)\s*(STORY|STORIES)\b', '<STORIES>'),
            (r'\bBC\s*\d+[\.\d]*\s*;\s*\d+-\d+', '<BLDG_CODE>'),
            (r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '<IP>'),
            (r'(?<=[^A-Z\d])\d{5,}(?=[^A-Z\d]|$)', '<NUM>'),
        ]

        self.stats = {'total': 0, 'cleaned': 0, 'empty_after': 0}

    def clean_description(self, text):
        self.stats['total'] += 1

        if pd.isna(text) or not isinstance(text, str):
            self.stats['empty_after'] += 1
            return ""

        cleaned = text.upper().strip()

        for pattern, replacement in self.cleaning_patterns:
            cleaned = re.sub(pattern, replacement, cleaned)

        cleaned = re.sub(r'[^\w\s<>.,;:\-/()&]', ' ', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        cleaned = ''.join(c for c in cleaned if c.isprintable() or c == ' ')

        if len(cleaned) < 10:
            self.stats['empty_after'] += 1
            return ""

        self.stats['cleaned'] += 1
        return cleaned

    def print_stats(self):
        print(f"\nCleaning Statistics:")
        print(f"  Total processed: {self.stats['total']:,}")
        print(f"  Successfully cleaned: {self.stats['cleaned']:,}")
        print(f"  Empty after cleaning: {self.stats['empty_after']:,}")


preprocessor = ViolationPreprocessor()

print("Cleaning violation descriptions...")
df['clean_description'] = df['violation_description'].apply(preprocessor.clean_description)
preprocessor.print_stats()

df = df[df['clean_description'].str.len() > 10].copy()
print(f"Records after cleaning: {len(df):,}")

print("\n--- Before vs After Cleaning Examples ---")
for i in range(min(5, len(df))):
    row = df.iloc[i]
    print(f"\nExample {i+1}:")
    print(f"  BEFORE: {str(row['violation_description'])[:120]}...")
    print(f"  AFTER:  {str(row['clean_description'])[:120]}...")


# ---------------------------------------------------------------------------
# 2.3 LABEL MAPPING - VIOLATION CATEGORY
# ---------------------------------------------------------------------------

print("\n--- 2.3 Label Mapping: Violation Categories ---")

CATEGORY_MAP = {
    'Construction':        'Construction',
    'Elevators':           'Elevators',
    'Boilers':             'Mechanical',
    'Local Law':           'Regulatory',
    'Signs':               'Regulatory',
    'Public Assembly':     'Regulatory',
    'Site Safety':         'Site Safety',
    'Zoning':              'Zoning',
    'Cranes and Derricks': 'Site Safety',
    'Plumbing':            'Plumbing',
    'Quality of Life':     'Quality of Life',
    'HPD':                 'Construction',
    'Administrative':      'Regulatory',
    'Unknown':             None,
}

df['category'] = df['violation_type'].map(CATEGORY_MAP)
df = df.dropna(subset=['category']).copy()
print(f"Records after removing 'Unknown' type: {len(df):,}")

print(f"\nMapped Category Distribution:")
print(df['category'].value_counts())
print(f"\nTotal categories: {df['category'].nunique()}")


# ---------------------------------------------------------------------------
# 2.4 LABEL MAPPING - SEVERITY
# ---------------------------------------------------------------------------

print("\n--- 2.4 Label Mapping: Severity ---")

SEVERITY_MAP = {
    'Hazardous':      'HIGH',
    'CLASS - 1':      'HIGH',
    'CLASS - 2':      'MEDIUM',
    'CLASS - 3':      'LOW',
    'Non-Hazardous':  'LOW',
    'Unknown':        None,
}

df['severity_label'] = df['severity'].map(SEVERITY_MAP)
df = df.dropna(subset=['severity_label']).copy()
print(f"Records after removing 'Unknown' severity: {len(df):,}")

print(f"\nMapped Severity Distribution:")
print(df['severity_label'].value_counts())


# ---------------------------------------------------------------------------
# 2.5 ENCODE LABELS
# ---------------------------------------------------------------------------

print("\n--- 2.5 Encoding Labels ---")

category_labels = sorted(df['category'].unique())
severity_labels = ['LOW', 'MEDIUM', 'HIGH']

cat2id = {c: i for i, c in enumerate(category_labels)}
id2cat = {i: c for c, i in cat2id.items()}
sev2id = {s: i for i, s in enumerate(severity_labels)}
id2sev = {i: s for s, i in sev2id.items()}

df['category_id'] = df['category'].map(cat2id)
df['severity_id'] = df['severity_label'].map(sev2id)

print(f"\nCategory Label Encoding:")
for cat, idx in cat2id.items():
    count = (df['category_id'] == idx).sum()
    print(f"  {idx}: {cat:<20s} ({count:,} samples)")

print(f"\nSeverity Label Encoding:")
for sev, idx in sev2id.items():
    count = (df['severity_id'] == idx).sum()
    print(f"  {idx}: {sev:<10s} ({count:,} samples)")

label_maps = {'cat2id': cat2id, 'id2cat': id2cat, 'sev2id': sev2id, 'id2sev': id2sev}
import json
with open('data/processed/label_maps.json', 'w') as f:
    json.dump(label_maps, f, indent=2)
print("✓ Saved: data/processed/label_maps.json")


# ---------------------------------------------------------------------------
# 2.6 REMOVE DUPLICATES + FINAL VALIDATION
# ---------------------------------------------------------------------------

print("\n--- 2.6 Deduplication & Validation ---")
print(f"Before dedup: {len(df):,}")

df = df.drop_duplicates(subset=['clean_description', 'category_id', 'severity_id']).copy()
print(f"After dedup: {len(df):,}")

assert df['clean_description'].isnull().sum() == 0, "Null descriptions found!"
assert df['category_id'].isnull().sum() == 0, "Null category labels found!"
assert df['severity_id'].isnull().sum() == 0, "Null severity labels found!"
assert df['clean_description'].str.len().min() > 10, "Short descriptions found!"
print("✓ All validation checks passed")


# ---------------------------------------------------------------------------
# 2.7 CLASS IMBALANCE ANALYSIS
# ---------------------------------------------------------------------------

print("\n--- 2.7 Class Imbalance Analysis ---")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

cat_counts = df['category'].value_counts()
cat_counts.plot(kind='barh', ax=axes[0], color='steelblue')
axes[0].set_title('Violation Category Distribution (Cleaned)')
axes[0].set_xlabel('Count')
for i, v in enumerate(cat_counts.values):
    axes[0].text(v + 500, i, f'{v:,}', va='center', fontsize=9)

sev_counts = df['severity_label'].value_counts()
sev_counts.plot(kind='barh', ax=axes[1], color='coral')
axes[1].set_title('Severity Distribution (Cleaned)')
axes[1].set_xlabel('Count')
for i, v in enumerate(sev_counts.values):
    axes[1].text(v + 500, i, f'{v:,}', va='center', fontsize=9)

plt.tight_layout()
plt.savefig('figures/02_cleaned_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved: figures/02_cleaned_distributions.png")

imbalance_ratio_cat = cat_counts.max() / cat_counts.min()
imbalance_ratio_sev = sev_counts.max() / sev_counts.min()
print(f"\nCategory imbalance ratio (max/min): {imbalance_ratio_cat:.1f}x")
print(f"Severity imbalance ratio (max/min): {imbalance_ratio_sev:.1f}x")
print("→ Will use class weights during training to handle imbalance")

law_col = [c for c in df.columns if 'section_of_law' in c.lower() or 'sectionoflaw' in c.lower() or 'infraction' in c.lower()]
keep_cols = ['clean_description', 'category', 'category_id',
             'severity_label', 'severity_id', 'violation_type', 'severity'] + law_col
keep_cols = [c for c in keep_cols if c in df.columns]
df_processed = df[keep_cols].copy()
df_processed = df_processed.reset_index(drop=True)
df_processed.to_csv('data/processed/violations_cleaned.csv', index=False)
print(f"\n✓ Saved: data/processed/violations_cleaned.csv ({len(df_processed):,} records)")


# ==============================================================================
# SECTION 3: TRAIN / VALIDATION / TEST SPLITTING (3 Points)
# ==============================================================================
"""
SPLITTING STRATEGY:
- 80% Train / 10% Validation / 10% Test
- Stratified by BOTH category_id AND severity_id to maintain distributions
- Combined stratification key ensures all label combinations are represented
- Reproducible via RANDOM_SEED = 42
"""

print("\n" + "=" * 70)
print("SECTION 3: TRAIN / VALIDATION / TEST SPLITTING")
print("=" * 70)

df_processed['strat_key'] = df_processed['category_id'].astype(str) + "_" + df_processed['severity_id'].astype(str)

min_class_count = df_processed['strat_key'].value_counts().min()
print(f"Smallest stratification group: {min_class_count} samples")

if min_class_count < 3:
    rare_keys = df_processed['strat_key'].value_counts()[df_processed['strat_key'].value_counts() < 3].index
    print(f"Removing {len(rare_keys)} rare combinations with < 3 samples")
    df_processed = df_processed[~df_processed['strat_key'].isin(rare_keys)].copy()
    print(f"Records after removing rare combos: {len(df_processed):,}")

train_df, temp_df = train_test_split(
    df_processed, test_size=0.2, random_state=RANDOM_SEED,
    stratify=df_processed['strat_key']
)

val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=RANDOM_SEED,
    stratify=temp_df['strat_key']
)

print(f"\n--- Split Sizes ---")
print(f"Train:      {len(train_df):>8,} ({len(train_df)/len(df_processed)*100:.1f}%)")
print(f"Validation: {len(val_df):>8,} ({len(val_df)/len(df_processed)*100:.1f}%)")
print(f"Test:       {len(test_df):>8,} ({len(test_df)/len(df_processed)*100:.1f}%)")
print(f"Total:      {len(train_df)+len(val_df)+len(test_df):>8,}")

print(f"\n--- Category Distribution Across Splits ---")
for split_name, split_df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
    dist = split_df['category'].value_counts(normalize=True)
    print(f"\n{split_name}:")
    for cat in category_labels:
        pct = dist.get(cat, 0) * 100
        print(f"  {cat:<20s}: {pct:5.1f}%")

print(f"\n--- Severity Distribution Across Splits ---")
for split_name, split_df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
    dist = split_df['severity_label'].value_counts(normalize=True)
    print(f"\n{split_name}:")
    for sev in severity_labels:
        pct = dist.get(sev, 0) * 100
        print(f"  {sev:<10s}: {pct:5.1f}%")

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
for i, (name, sdf) in enumerate([('Train', train_df), ('Val', val_df), ('Test', test_df)]):
    sdf['category'].value_counts().plot(kind='bar', ax=axes[0][i], color='steelblue')
    axes[0][i].set_title(f'{name} - Categories')
    axes[0][i].tick_params(axis='x', rotation=45)

    sdf['severity_label'].value_counts().plot(kind='bar', ax=axes[1][i], color='coral')
    axes[1][i].set_title(f'{name} - Severity')
    axes[1][i].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('figures/03_split_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved: figures/03_split_distributions.png")

train_df.drop(columns=['strat_key']).to_csv('data/splits/train.csv', index=False)
val_df.drop(columns=['strat_key']).to_csv('data/splits/val.csv', index=False)
test_df.drop(columns=['strat_key']).to_csv('data/splits/test.csv', index=False)
print("✓ Saved: data/splits/train.csv, val.csv, test.csv")


# ==============================================================================
# SECTION 4: FORMATTING FOR FINE-TUNING (3 Points)
# ==============================================================================
"""
FORMATTING FOR RoBERTa FINE-TUNING:
- Tokenize with RoBERTa tokenizer (max_length=256)
- Create PyTorch Dataset class with dual labels (category + severity)
- Build DataLoaders with appropriate batch sizes
- Compute class weights for weighted loss during training
- Save tokenized datasets for reproducible training
"""

print("\n" + "=" * 70)
print("SECTION 4: FORMATTING FOR FINE-TUNING")
print("=" * 70)

MODEL_NAME = "roberta-base"
MAX_LENGTH = 256
BATCH_SIZE = 16

print(f"\nModel: {MODEL_NAME}")
print(f"Max sequence length: {MAX_LENGTH}")
print(f"Batch size: {BATCH_SIZE}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"✓ Loaded tokenizer: vocab size = {tokenizer.vocab_size:,}")

# ---------------------------------------------------------------------------
# 4.1 TOKEN LENGTH ANALYSIS
# ---------------------------------------------------------------------------

print("\n--- 4.1 Token Length Analysis ---")

sample_texts = train_df['clean_description'].sample(min(5000, len(train_df)), random_state=RANDOM_SEED)
token_lengths = [len(tokenizer.encode(t, add_special_tokens=True)) for t in tqdm(sample_texts, desc="Analyzing token lengths")]

print(f"Token length statistics:")
print(f"  Mean:   {np.mean(token_lengths):.1f}")
print(f"  Median: {np.median(token_lengths):.1f}")
print(f"  Std:    {np.std(token_lengths):.1f}")
print(f"  Min:    {np.min(token_lengths)}")
print(f"  Max:    {np.max(token_lengths)}")
print(f"  95th percentile: {np.percentile(token_lengths, 95):.0f}")
print(f"  99th percentile: {np.percentile(token_lengths, 99):.0f}")

coverage = sum(1 for l in token_lengths if l <= MAX_LENGTH) / len(token_lengths) * 100
print(f"\n  Coverage at max_length={MAX_LENGTH}: {coverage:.1f}%")

plt.figure(figsize=(10, 5))
plt.hist(token_lengths, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
plt.axvline(x=MAX_LENGTH, color='red', linestyle='--', label=f'max_length={MAX_LENGTH}')
plt.axvline(x=np.percentile(token_lengths, 95), color='orange', linestyle='--', label='95th percentile')
plt.xlabel('Token Length')
plt.ylabel('Count')
plt.title('Distribution of Token Lengths (RoBERTa Tokenizer)')
plt.legend()
plt.tight_layout()
plt.savefig('figures/04_token_lengths.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved: figures/04_token_lengths.png")


# ---------------------------------------------------------------------------
# 4.2 PYTORCH DATASET CLASS
# ---------------------------------------------------------------------------

print("\n--- 4.2 Creating PyTorch Dataset ---")

class ViolationDataset(Dataset):
    def __init__(self, texts, category_ids, severity_ids, tokenizer, max_length=256):
        self.texts = texts.tolist()
        self.category_ids = category_ids.tolist()
        self.severity_ids = severity_ids.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'category_label': torch.tensor(self.category_ids[idx], dtype=torch.long),
            'severity_label': torch.tensor(self.severity_ids[idx], dtype=torch.long),
        }

train_dataset = ViolationDataset(
    train_df['clean_description'], train_df['category_id'],
    train_df['severity_id'], tokenizer, MAX_LENGTH
)
val_dataset = ViolationDataset(
    val_df['clean_description'], val_df['category_id'],
    val_df['severity_id'], tokenizer, MAX_LENGTH
)
test_dataset = ViolationDataset(
    test_df['clean_description'], test_df['category_id'],
    test_df['severity_id'], tokenizer, MAX_LENGTH
)

print(f"Train dataset: {len(train_dataset):,} samples")
print(f"Val dataset:   {len(val_dataset):,} samples")
print(f"Test dataset:  {len(test_dataset):,} samples")

sample = train_dataset[0]
print(f"\nSample item shapes:")
print(f"  input_ids:      {sample['input_ids'].shape}")
print(f"  attention_mask:  {sample['attention_mask'].shape}")
print(f"  category_label:  {sample['category_label']} ({id2cat[sample['category_label'].item()]})")
print(f"  severity_label:  {sample['severity_label']} ({id2sev[sample['severity_label'].item()]})")


# ---------------------------------------------------------------------------
# 4.3 DATALOADERS
# ---------------------------------------------------------------------------

print("\n--- 4.3 Creating DataLoaders ---")

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Train batches: {len(train_loader):,}")
print(f"Val batches:   {len(val_loader):,}")
print(f"Test batches:  {len(test_loader):,}")

batch = next(iter(train_loader))
print(f"\nSample batch shapes:")
for k, v in batch.items():
    print(f"  {k}: {v.shape}")


# ---------------------------------------------------------------------------
# 4.4 CLASS WEIGHTS FOR IMBALANCED DATA
# ---------------------------------------------------------------------------

print("\n--- 4.4 Computing Class Weights ---")

cat_counts = train_df['category_id'].value_counts().sort_index()
cat_weights = len(train_df) / (len(cat_counts) * cat_counts.values)
cat_weights_tensor = torch.FloatTensor(cat_weights)

sev_counts = train_df['severity_id'].value_counts().sort_index()
sev_weights = len(train_df) / (len(sev_counts) * sev_counts.values)
sev_weights_tensor = torch.FloatTensor(sev_weights)

print(f"\nCategory class weights:")
for i, w in enumerate(cat_weights_tensor):
    print(f"  {id2cat[i]:<20s}: {w:.4f}")

print(f"\nSeverity class weights:")
for i, w in enumerate(sev_weights_tensor):
    print(f"  {id2sev[i]:<10s}: {w:.4f}")

torch.save({
    'train_dataset': train_dataset,
    'val_dataset': val_dataset,
    'test_dataset': test_dataset,
    'cat_weights': cat_weights_tensor,
    'sev_weights': sev_weights_tensor,
    'label_maps': label_maps,
    'config': {
        'model_name': MODEL_NAME,
        'max_length': MAX_LENGTH,
        'batch_size': BATCH_SIZE,
        'num_categories': len(cat2id),
        'num_severities': len(sev2id),
        'random_seed': RANDOM_SEED,
    }
}, 'data/processed/tokenized_datasets.pt')
print("\n✓ Saved: data/processed/tokenized_datasets.pt")


# ==============================================================================
# FINAL SUMMARY
# ==============================================================================

print("\n" + "=" * 70)
print("DATASET PREPARATION COMPLETE — SUMMARY")
print("=" * 70)

print(f"""
┌────────────────────────────────────────────────────────────────┐
│                    PREPARATION SUMMARY                         │
├────────────────────────────────────────────────────────────────┤
│ Raw records loaded:          {len(raw_df):>8,}                         │
│ After cleaning & mapping:    {len(df_processed):>8,}                         │
│ After deduplication:         {len(df_processed):>8,}                         │
│                                                                │
│ Train set:                   {len(train_df):>8,}  (80%)                    │
│ Validation set:              {len(val_df):>8,}  (10%)                    │
│ Test set:                    {len(test_df):>8,}  (10%)                    │
│                                                                │
│ Violation categories:        {len(cat2id):>8}                            │
│ Severity levels:             {len(sev2id):>8}                            │
│ Max token length:            {MAX_LENGTH:>8}                            │
│ Tokenizer:                   {MODEL_NAME:>15}                     │
│                                                                │
│ Files saved:                                                   │
│   data/processed/violations_cleaned.csv                        │
│   data/processed/label_maps.json                               │
│   data/processed/tokenized_datasets.pt                         │
│   data/splits/train.csv                                        │
│   data/splits/val.csv                                          │
│   data/splits/test.csv                                         │
│   figures/01_raw_distributions.png                              │
│   figures/02_cleaned_distributions.png                          │
│   figures/03_split_distributions.png                            │
│   figures/04_token_lengths.png                                  │
└────────────────────────────────────────────────────────────────┘
""")

DATASET PREPARATION FOR BUILDING CODE VIOLATION DETECTION

SECTION 1: DATASET SELECTION


  raw_df = pd.read_csv("data/raw/ecb_violations.csv")



✓ Loaded raw dataset: 300,000 records, 42 columns

--- Raw Dataset Overview ---
Shape: (300000, 42)


Raw Violation Type Distribution:
violation_type
Construction           156931
Elevators               52893
Unknown                 40800
Boilers                 14402
Local Law                7960
Signs                    6699
Public Assembly          5783
Site Safety              3713
Zoning                   3169
Cranes and Derricks      3024
Plumbing                 2839
Quality of Life          1494
HPD                       278
Administrative             15
Name: count, dtype: int64

Raw Severity Distribution:
severity
CLASS - 2        100380
Non-Hazardous     93596
CLASS - 1         71057
Hazardous         17151
CLASS - 3         14983
Unknown            2833
Name: count, dtype: int64
✓ Saved: figures/01_raw_distributions.png

SECTION 2: PREPROCESSING AND DATA CLEANING

--- 2.1 Handling Missing Values ---
Before: 300,000 records

Missing values in key columns:
violation_type   

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

✓ Loaded tokenizer: vocab size = 50,265

--- 4.1 Token Length Analysis ---


Analyzing token lengths: 100%|██████████| 5000/5000 [00:00<00:00, 7112.57it/s]


Token length statistics:
  Mean:   63.4
  Median: 70.0
  Std:    21.1
  Min:    7
  Max:    134
  95th percentile: 88
  99th percentile: 102

  Coverage at max_length=256: 100.0%
✓ Saved: figures/04_token_lengths.png

--- 4.2 Creating PyTorch Dataset ---
Train dataset: 189,156 samples
Val dataset:   23,644 samples
Test dataset:  23,645 samples

Sample item shapes:
  input_ids:      torch.Size([256])
  attention_mask:  torch.Size([256])
  category_label:  0 (Construction)
  severity_label:  1 (MEDIUM)

--- 4.3 Creating DataLoaders ---
Train batches: 11,823
Val batches:   1,478
Test batches:  1,478

Sample batch shapes:
  input_ids: torch.Size([16, 256])
  attention_mask: torch.Size([16, 256])
  category_label: torch.Size([16])
  severity_label: torch.Size([16])

--- 4.4 Computing Class Weights ---

Category class weights:
  Construction        : 0.1964
  Elevators           : 0.6327
  Mechanical          : 2.7366
  Plumbing            : 10.7037
  Quality of Life     : 20.5069
  Regulato

In [7]:
import os
import shutil

os.makedirs('checkpoints', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

# Kaggle uploads go to /kaggle/input/ — find your files
# List what's there:
for root, dirs, files in os.walk('/kaggle/input'):
    for f in files:
        print(os.path.join(root, f))
        
shutil.copy('/kaggle/input/checkpoints/final_model.pt', 'checkpoints/final_model.pt')
shutil.copy('/kaggle/input/checkpoints/tokenized_datasets.pt', 'data/processed/tokenized_datasets.pt')

print("✓ Files moved")

/kaggle/input/checkpoints/final_model.pt
/kaggle/input/checkpoints/tokenized_datasets.pt
✓ Files moved


In [8]:
import os, json, time, torch, torch.nn as nn, numpy as np, pandas as pd
from torch.utils.data import DataLoader, Dataset as TorchDataset
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

for d in ['results', 'figures']:
    os.makedirs(d, exist_ok=True)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {DEVICE}")

class ViolationDataset(TorchDataset):
    def __init__(self, texts, category_ids, severity_ids, tokenizer, max_length=256):
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else texts
        self.category_ids = category_ids.tolist() if hasattr(category_ids, 'tolist') else category_ids
        self.severity_ids = severity_ids.tolist() if hasattr(severity_ids, 'tolist') else severity_ids
        self.tokenizer = tokenizer; self.max_length = max_length
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        enc = self.tokenizer(self.texts[idx], max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        return {'input_ids': enc['input_ids'].squeeze(), 'attention_mask': enc['attention_mask'].squeeze(),
                'category_label': torch.tensor(self.category_ids[idx], dtype=torch.long),
                'severity_label': torch.tensor(self.severity_ids[idx], dtype=torch.long)}

class ViolationClassifier(nn.Module):
    def __init__(self, model_name, num_categories, num_severities, dropout=0.3):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        h = self.encoder.config.hidden_size; self.dropout = nn.Dropout(dropout)
        self.category_head = nn.Sequential(nn.Linear(h,256), nn.ReLU(), nn.Dropout(dropout), nn.Linear(256,num_categories))
        self.severity_head = nn.Sequential(nn.Linear(h,128), nn.ReLU(), nn.Dropout(dropout), nn.Linear(128,num_severities))
    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls = self.dropout(out.last_hidden_state[:, 0, :])
        return self.category_head(cls), self.severity_head(cls)

print("Loading model...")
ckpt = torch.load('checkpoints/final_model.pt', map_location='cpu', weights_only=False)
model_cfg = ckpt['model_config']; label_maps = ckpt['label_maps']
id2cat = {int(k):v for k,v in label_maps['id2cat'].items()}
id2sev = {int(k):v for k,v in label_maps['id2sev'].items()}
cat_names = [id2cat[i] for i in range(len(id2cat))]
sev_names = [id2sev[i] for i in range(len(id2sev))]

model = ViolationClassifier(model_cfg['model_name'], model_cfg['num_categories'], model_cfg['num_severities'], model_cfg['dropout'])
model.load_state_dict(ckpt['model_state_dict']); model.to(DEVICE); model.eval()
print(f"✓ Model loaded")

data = torch.load('data/processed/tokenized_datasets.pt', map_location='cpu', weights_only=False)
test_dataset = data['test_dataset']
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2, pin_memory=True)
test_df = pd.read_csv('data/splits/test.csv')
print(f"✓ Test: {len(test_dataset):,} samples")

@torch.no_grad()
def predict_all(mdl, loader):
    cp,cl,sp,sl,cpr,spr = [],[],[],[],[],[]
    for b in tqdm(loader, desc="Predicting"):
        ids = b['input_ids'].to(DEVICE); mask = b['attention_mask'].to(DEVICE)
        c_log, s_log = mdl(ids, mask)
        cpr.append(torch.softmax(c_log,1).detach().cpu().numpy())
        spr.append(torch.softmax(s_log,1).detach().cpu().numpy())
        cp.extend(c_log.argmax(1).detach().cpu().numpy())
        cl.extend(b['category_label'].detach().cpu().numpy())
        sp.extend(s_log.argmax(1).detach().cpu().numpy())
        sl.extend(b['severity_label'].detach().cpu().numpy())
    return np.array(cp),np.array(cl),np.array(sp),np.array(sl),np.vstack(cpr),np.vstack(spr)

# ==============================================================================
# SECTION 1: TEST SET EVALUATION
# ==============================================================================
print("\n" + "="*70 + "\nSECTION 1: TEST SET EVALUATION\n" + "="*70)

cat_preds, cat_true, sev_preds, sev_true, cat_probs, sev_probs = predict_all(model, test_loader)

print("\n--- Category Report ---")
cat_report = classification_report(cat_true, cat_preds, target_names=cat_names, digits=4, output_dict=True)
print(classification_report(cat_true, cat_preds, target_names=cat_names, digits=4))
cat_macro_f1 = f1_score(cat_true, cat_preds, average='macro')
cat_weighted_f1 = f1_score(cat_true, cat_preds, average='weighted')
cat_acc = accuracy_score(cat_true, cat_preds)

print("\n--- Severity Report ---")
sev_report = classification_report(sev_true, sev_preds, target_names=sev_names, digits=4, output_dict=True)
print(classification_report(sev_true, sev_preds, target_names=sev_names, digits=4))
sev_macro_f1 = f1_score(sev_true, sev_preds, average='macro')
sev_weighted_f1 = f1_score(sev_true, sev_preds, average='weighted')
sev_acc = accuracy_score(sev_true, sev_preds)

print(f"Category: F1={cat_macro_f1:.4f}, Acc={cat_acc:.4f}")
print(f"Severity: F1={sev_macro_f1:.4f}, Acc={sev_acc:.4f}")

# ==============================================================================
# SECTION 2: BASELINE COMPARISON (hardcoded from training output)
# ==============================================================================
print("\n" + "="*70 + "\nSECTION 2: BASELINE vs FINE-TUNED\n" + "="*70)

b_cat_f1, b_sev_f1, b_cat_acc, b_sev_acc = 0.0046, 0.1530, 0.0161, 0.2978

comparison = {
    'Metric': ['Category F1','Category Acc','Severity F1','Severity Acc','Combined F1'],
    'Baseline': [b_cat_f1, b_cat_acc, b_sev_f1, b_sev_acc, (b_cat_f1+b_sev_f1)/2],
    'Fine-Tuned': [cat_macro_f1, cat_acc, sev_macro_f1, sev_acc, (cat_macro_f1+sev_macro_f1)/2],
}
comparison['Improvement'] = [ft-bl for ft,bl in zip(comparison['Fine-Tuned'], comparison['Baseline'])]
print("\n" + pd.DataFrame(comparison).to_string(index=False, float_format='%.4f'))

fig, ax = plt.subplots(figsize=(10,6))
x = np.arange(5); w = 0.35
ax.bar(x-w/2, comparison['Baseline'], w, label='Baseline', color='#d9534f')
ax.bar(x+w/2, comparison['Fine-Tuned'], w, label='Fine-Tuned', color='#5cb85c')
ax.set_xticks(x); ax.set_xticklabels(comparison['Metric'], rotation=20, ha='right')
ax.legend(); ax.set_ylim(0,1.05); ax.set_title('Baseline vs Fine-Tuned')
plt.tight_layout(); plt.savefig('figures/06_baseline_vs_finetuned.png', dpi=150); plt.show()
print("✓ Saved: figures/06_baseline_vs_finetuned.png")

# ==============================================================================
# SECTION 3: CONFUSION MATRICES
# ==============================================================================
print("\n" + "="*70 + "\nSECTION 3: CONFUSION MATRICES\n" + "="*70)

fig, axes = plt.subplots(1, 2, figsize=(20, 8))
cat_cm = confusion_matrix(cat_true, cat_preds)
cat_cm_n = cat_cm.astype('float') / cat_cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(cat_cm_n, annot=True, fmt='.2f', cmap='Blues', xticklabels=cat_names, yticklabels=cat_names, ax=axes[0])
axes[0].set_title('Category Confusion Matrix'); axes[0].set_ylabel('True'); axes[0].set_xlabel('Predicted')

sev_cm = confusion_matrix(sev_true, sev_preds)
sev_cm_n = sev_cm.astype('float') / sev_cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(sev_cm_n, annot=True, fmt='.2f', cmap='Oranges', xticklabels=sev_names, yticklabels=sev_names, ax=axes[1])
axes[1].set_title('Severity Confusion Matrix'); axes[1].set_ylabel('True'); axes[1].set_xlabel('Predicted')
plt.tight_layout(); plt.savefig('figures/07_confusion_matrices.png', dpi=150); plt.show()
print("✓ Saved: figures/07_confusion_matrices.png")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
cat_f1s = [cat_report[c]['f1-score'] for c in cat_names]
axes[0].barh(cat_names, cat_f1s, color=['#d9534f' if f<0.7 else '#f0ad4e' if f<0.85 else '#5cb85c' for f in cat_f1s])
axes[0].set_xlim(0,1); axes[0].set_title('Per-Class F1 — Category')
for i,v in enumerate(cat_f1s): axes[0].text(v+0.01, i, f'{v:.3f}', va='center')

sev_f1s = [sev_report[s]['f1-score'] for s in sev_names]
axes[1].barh(sev_names, sev_f1s, color=['#d9534f' if f<0.7 else '#f0ad4e' if f<0.85 else '#5cb85c' for f in sev_f1s])
axes[1].set_xlim(0,1); axes[1].set_title('Per-Class F1 — Severity')
for i,v in enumerate(sev_f1s): axes[1].text(v+0.01, i, f'{v:.3f}', va='center')
plt.tight_layout(); plt.savefig('figures/08_per_class_f1.png', dpi=150); plt.show()
print("✓ Saved: figures/08_per_class_f1.png")

# ==============================================================================
# SECTION 4: ERROR ANALYSIS
# ==============================================================================
print("\n" + "="*70 + "\nSECTION 4: ERROR ANALYSIS\n" + "="*70)

test_df = test_df.head(len(cat_preds)).copy()
test_df['cat_pred'] = [id2cat[p] for p in cat_preds]
test_df['cat_true'] = [id2cat[t] for t in cat_true]
test_df['sev_pred'] = [id2sev[p] for p in sev_preds]
test_df['sev_true'] = [id2sev[t] for t in sev_true]
test_df['cat_correct'] = test_df['cat_pred'] == test_df['cat_true']
test_df['sev_correct'] = test_df['sev_pred'] == test_df['sev_true']
test_df['cat_confidence'] = [cat_probs[i, cat_preds[i]] for i in range(len(cat_preds))]
test_df['sev_confidence'] = [sev_probs[i, sev_preds[i]] for i in range(len(sev_preds))]

cat_errors = test_df[~test_df['cat_correct']]
sev_errors = test_df[~test_df['sev_correct']]

print(f"Category errors: {len(cat_errors):,}/{len(test_df):,} ({len(cat_errors)/len(test_df)*100:.1f}%)")
print(f"Severity errors: {len(sev_errors):,}/{len(test_df):,} ({len(sev_errors)/len(test_df)*100:.1f}%)")

print("\n--- Top Category Confusions ---")
cat_conf_pairs = cat_errors.groupby(['cat_true','cat_pred']).size().sort_values(ascending=False).head(10)
print(cat_conf_pairs.to_string())

print("\n--- Top Severity Confusions ---")
sev_conf_pairs = sev_errors.groupby(['sev_true','sev_pred']).size().sort_values(ascending=False).head(10)
print(sev_conf_pairs.to_string())

print("\n--- Silent Failures (High Confidence + Wrong) ---")
hc_cat = cat_errors[cat_errors['cat_confidence']>0.9].sort_values('cat_confidence', ascending=False)
print(f"High-conf category errors (>90%): {len(hc_cat)}")
for _, r in hc_cat.head(3).iterrows():
    print(f"  {str(r['clean_description'])[:80]}... TRUE:{r['cat_true']} → PRED:{r['cat_pred']} ({r['cat_confidence']:.3f})")

critical = sev_errors[(sev_errors['sev_true']=='HIGH')&(sev_errors['sev_pred']=='LOW')]
false_alarms = sev_errors[(sev_errors['sev_true']=='LOW')&(sev_errors['sev_pred']=='HIGH')]
print(f"\n⚠️ CRITICAL (HIGH→LOW): {len(critical)}")
for _, r in critical.head(3).iterrows():
    print(f"  {str(r['clean_description'])[:80]}... conf:{r['sev_confidence']:.3f}")
print(f"⚠️ FALSE ALARMS (LOW→HIGH): {len(false_alarms)}")

print("\n--- Error Rate by Text Length ---")
test_df['text_length'] = test_df['clean_description'].str.len()
test_df['length_bin'] = pd.cut(test_df['text_length'], bins=[0,100,200,300,500,10000], labels=['<100','100-200','200-300','300-500','500+'])
print(test_df.groupby('length_bin').agg(
    total=('cat_correct','count'),
    cat_err=('cat_correct', lambda x: 1-x.mean()),
    sev_err=('sev_correct', lambda x: 1-x.mean())
).round(4).to_string())

weakest_cat = min(cat_names, key=lambda c: cat_report[c]['f1-score'])
weakest_sev = min(sev_names, key=lambda s: sev_report[s]['f1-score'])
print(f"\n--- Suggested Improvements ---")
print(f"1. Weakest category: '{weakest_cat}' (F1:{cat_report[weakest_cat]['f1-score']:.4f}) → more data / augmentation")
print(f"2. Weakest severity: '{weakest_sev}' (F1:{sev_report[weakest_sev]['f1-score']:.4f}) → re-examine label mapping")
print(f"3. {len(critical)} critical safety misses → asymmetric loss penalizing HIGH→LOW")
print(f"4. {len(hc_cat)} high-confidence errors → temperature scaling for calibration")

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
cat_conf_pairs.head(8).plot(kind='barh', ax=axes[0], color='#d9534f')
axes[0].set_title('Top Confusions'); axes[0].set_xlabel('Count')

conf_bins = [0, 0.3, 0.5, 0.7, 0.9, 1.0]
test_df['conf_bin'] = pd.cut(test_df['cat_confidence'], bins=conf_bins)
ce = test_df.groupby('conf_bin')['cat_correct'].agg(['mean','count'])
axes[1].bar(range(len(ce)), 1-ce['mean'].values, color='#f0ad4e')
axes[1].set_xticks(range(len(ce))); axes[1].set_xticklabels(['0-0.3','0.3-0.5','0.5-0.7','0.7-0.9','0.9-1.0'])
axes[1].set_title('Error Rate by Confidence'); axes[1].set_ylabel('Error Rate')
plt.tight_layout(); plt.savefig('figures/09_error_analysis.png', dpi=150); plt.show()
print("✓ Saved: figures/09_error_analysis.png")

# ==============================================================================
# SECTION 5: INFERENCE PIPELINE
# ==============================================================================
print("\n" + "="*70 + "\nSECTION 5: INFERENCE PIPELINE\n" + "="*70)

class ViolationInferencePipeline:
    def __init__(self, model, tokenizer, id2cat, id2sev, max_length=256, device='cpu'):
        self.model = model.to(device); self.model.eval()
        self.tokenizer = tokenizer; self.id2cat = id2cat; self.id2sev = id2sev
        self.max_length = max_length; self.device = device

    @torch.no_grad()
    def predict(self, text):
        enc = self.tokenizer(text.upper(), max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        ids = enc['input_ids'].to(self.device); mask = enc['attention_mask'].to(self.device)
        c_log, s_log = self.model(ids, mask)
        cp = torch.softmax(c_log,1).detach().cpu().numpy()[0]
        sp = torch.softmax(s_log,1).detach().cpu().numpy()[0]
        return {'category': self.id2cat[cp.argmax()], 'cat_conf': float(cp.max()),
                'severity': self.id2sev[sp.argmax()], 'sev_conf': float(sp.max())}

    @torch.no_grad()
    def predict_batch(self, texts, batch_size=32):
        results = []
        for i in range(0, len(texts), batch_size):
            bt = texts[i:i+batch_size]
            enc = self.tokenizer([t.upper() for t in bt], max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
            ids = enc['input_ids'].to(self.device); mask = enc['attention_mask'].to(self.device)
            c_log, s_log = self.model(ids, mask)
            cp = torch.softmax(c_log,1).detach().cpu().numpy()
            sp = torch.softmax(s_log,1).detach().cpu().numpy()
            for j in range(len(bt)):
                results.append({'text': bt[j][:80], 'category': self.id2cat[cp[j].argmax()],
                                'cat_conf': float(cp[j].max()), 'severity': self.id2sev[sp[j].argmax()],
                                'sev_conf': float(sp[j].max())})
        return results

tokenizer = AutoTokenizer.from_pretrained(model_cfg['model_name'])
pipeline = ViolationInferencePipeline(model, tokenizer, id2cat, id2sev, device=DEVICE)

demo_texts = [
    "FAILURE TO MAINTAIN BUILDING WALL NOTED BRICKS FALLING FROM FACADE POSING DANGER TO PEDESTRIANS",
    "WORK WITHOUT A PERMIT CONTRACTOR PERFORMING ELECTRICAL WORK ON 3RD FLOOR WITHOUT DOB APPROVAL",
    "ELEVATOR INSPECTION OVERDUE CERTIFICATE EXPIRED LAST YEAR BUILDING HAS 6 PASSENGER ELEVATORS",
    "FENCE EXCEEDS PERMITTED HEIGHT IN FRONT YARD SETBACK AREA ZONING VIOLATION",
    "FAILURE TO PROVIDE SITE SAFETY MANAGER DURING ACTIVE DEMOLITION OF 5 STORY BUILDING",
    "BOILER FAILED ANNUAL INSPECTION DUE TO CRACKED HEAT EXCHANGER AND GAS LEAK DETECTED",
    "ILLEGAL CONVERSION OF COMMERCIAL SPACE TO RESIDENTIAL USE WITHOUT CERTIFICATE OF OCCUPANCY",
    "EXIT DOOR NOT SELF CLOSING ON 2ND FLOOR OF PUBLIC ASSEMBLY SPACE CAPACITY 300 PERSONS",
]

print("\n--- Inference Demo ---")
for t in demo_texts:
    r = pipeline.predict(t)
    print(f"\nInput:    {t[:80]}...")
    print(f"Category: {r['category']} ({r['cat_conf']:.3f}) | Severity: {r['severity']} ({r['sev_conf']:.3f})")

print("\n--- Batch Performance ---")
sample = test_df['clean_description'].head(500).tolist()
start = time.time()
_ = pipeline.predict_batch(sample, batch_size=64)
elapsed = time.time() - start
print(f"{len(sample)} samples in {elapsed:.2f}s | {len(sample)/elapsed:.0f} samples/sec | {elapsed/len(sample)*1000:.1f}ms/sample")

# ==============================================================================
# SAVE RESULTS
# ==============================================================================
results = {
    'test_metrics': {'cat_f1': cat_macro_f1, 'cat_weighted_f1': cat_weighted_f1, 'cat_acc': cat_acc,
                     'sev_f1': sev_macro_f1, 'sev_weighted_f1': sev_weighted_f1, 'sev_acc': sev_acc,
                     'combined_f1': (cat_macro_f1+sev_macro_f1)/2},
    'baseline': {'cat_f1': b_cat_f1, 'sev_f1': b_sev_f1},
    'errors': {'cat_error_rate': len(cat_errors)/len(test_df), 'sev_error_rate': len(sev_errors)/len(test_df),
               'critical_high_as_low': int(len(critical)), 'false_alarms': int(len(false_alarms)),
               'high_conf_errors': int(len(hc_cat))},
    'inference': {'throughput': len(sample)/elapsed, 'latency_ms': elapsed/len(sample)*1000}
}
with open('results/evaluation_results.json', 'w') as f:
    json.dump(results, f, indent=2)
print("\n✓ Saved: results/evaluation_results.json")

# ==============================================================================
# FINAL SUMMARY
# ==============================================================================
print(f"""
{'='*70}
EVALUATION COMPLETE
{'='*70}

  TEST SET:  Cat F1={cat_macro_f1:.4f}  Sev F1={sev_macro_f1:.4f}  Combined={((cat_macro_f1+sev_macro_f1)/2):.4f}
  BASELINE:  Cat F1={b_cat_f1:.4f}  Sev F1={b_sev_f1:.4f}
  IMPROVEMENT: Cat +{cat_macro_f1-b_cat_f1:.4f}  Sev +{sev_macro_f1-b_sev_f1:.4f}
  ERRORS: {len(critical)} critical (HIGH→LOW) | {len(false_alarms)} false alarms | {len(hc_cat)} high-conf errors
  INFERENCE: {len(sample)/elapsed:.0f} samples/sec | {elapsed/len(sample)*1000:.1f}ms/sample

  Files: figures/06-09, results/evaluation_results.json
""")

Device: cuda
Loading model...


2026-02-07 15:49:12.633712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770479352.790930      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770479352.837467      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770479353.220450      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770479353.220498      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770479353.220501      55 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded
✓ Test: 23,645 samples

SECTION 1: TEST SET EVALUATION


Predicting:   0%|          | 0/370 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting: 100%|██████████| 370/370 [06:07<00:00,  1.01it/s]



--- Category Report ---
                 precision    recall  f1-score   support

   Construction     0.9955    0.9664    0.9807     15048
      Elevators     0.9443    0.9876    0.9655      4673
     Mechanical     0.9339    0.7593    0.8376      1080
       Plumbing     0.8713    0.9600    0.9135       275
Quality of Life     0.7150    0.9583    0.8190       144
     Regulatory     0.9228    0.9876    0.9541      1453
    Site Safety     0.7416    0.9682    0.8399       661
         Zoning     0.8151    0.9357    0.8713       311

       accuracy                         0.9620     23645
      macro avg     0.8675    0.9404    0.8977     23645
   weighted avg     0.9655    0.9620    0.9624     23645


--- Severity Report ---
              precision    recall  f1-score   support

         LOW     0.9230    0.8979    0.9103      9643
      MEDIUM     0.8277    0.8600    0.8436      6959
        HIGH     0.8378    0.8367    0.8373      7043

    accuracy                         0.8685  

  print(test_df.groupby('length_bin').agg(
  ce = test_df.groupby('conf_bin')['cat_correct'].agg(['mean','count'])


✓ Saved: figures/09_error_analysis.png

SECTION 5: INFERENCE PIPELINE

--- Inference Demo ---

Input:    FAILURE TO MAINTAIN BUILDING WALL NOTED BRICKS FALLING FROM FACADE POSING DANGER...
Category: Construction (0.978) | Severity: HIGH (0.731)

Input:    WORK WITHOUT A PERMIT CONTRACTOR PERFORMING ELECTRICAL WORK ON 3RD FLOOR WITHOUT...
Category: Construction (0.990) | Severity: LOW (0.909)

Input:    ELEVATOR INSPECTION OVERDUE CERTIFICATE EXPIRED LAST YEAR BUILDING HAS 6 PASSENG...
Category: Elevators (0.428) | Severity: LOW (0.813)

Input:    FENCE EXCEEDS PERMITTED HEIGHT IN FRONT YARD SETBACK AREA ZONING VIOLATION...
Category: Construction (0.919) | Severity: LOW (0.996)

Input:    FAILURE TO PROVIDE SITE SAFETY MANAGER DURING ACTIVE DEMOLITION OF 5 STORY BUILD...
Category: Site Safety (0.966) | Severity: HIGH (0.992)

Input:    BOILER FAILED ANNUAL INSPECTION DUE TO CRACKED HEAT EXCHANGER AND GAS LEAK DETEC...
Category: Mechanical (0.999) | Severity: LOW (0.533)

Input:    ILLEG