In [4]:
# --- Campaign QC summary + low-confidence review (Windows-safe paths) ---

import pandas as pd
from pathlib import Path

# === CONFIG ===
# Campaign folder to audit; it is joined onto BASE to locate the classifier output.
CAMP = "211 LA"

# Processed-data root (absolute). Keep this a raw string, or switch to forward
# slashes, so the Windows backslashes are not treated as escape sequences.
BASE = Path(r"C:\Users\Anova\Downloads\Rally\rally-kb-starter-v2-llm\data\processed")

# Alternative: path relative to the repo root (uncomment when the notebook runs from there).
# BASE = Path("data/processed")

CSV = BASE.joinpath(CAMP, "classified.csv")

# Show the resolved locations before validating them.
print(f"BASE: {BASE}")
print(f"CSV:  {CSV}")

# Fail fast, in order, on the first required location that does not exist.
for required_path, missing_message in (
    (
        BASE,
        f"BASE directory not found: {BASE}\n"
        f"Tip: use a raw string r'...' or forward slashes in your absolute path.",
    ),
    (
        CSV,
        f"classified.csv not found at: {CSV}\n"
        f"Check that the campaign folder name is correct (CAMP='{CAMP}').",
    ),
):
    if not required_path.exists():
        raise FileNotFoundError(missing_message)

# Doc types permitted for each extract_type (keep in sync with your classifier).
# A value of None means that extract_type accepts any doc_type.
ALLOWED_BY_EXTRACT = {
    # Spreadsheet-schema extracts
    "xlsx_schema": {
        "competitive_analysis",
        "coverage_tracker",
        "editorial_calendar",
        "media_list",
        "speaking_tracker",
        "story_bank",
    },
    # Slide-deck extracts
    "slides": {
        "deck|analysis_report",
        "deck|training_materials",
    },
    # Plain-text extracts: no restriction
    "text": None,
}

# === LOAD ===
# Read the campaign's classifier output into a DataFrame.
df = pd.read_csv(CSV)
total_rows = len(df)
print(f"\nLoaded: {CSV}  ({total_rows} rows)\n")

# === HEADLINE STATS ===
print("=== HEADLINE STATS ===")
print(f"Total rows: {total_rows}")
# nunique() counts distinct non-null labels.
print(f"Unique doc_types: {df['doc_type'].nunique()}")
print(f"Unique extract_types: {df['extract_type'].nunique()}")

# Descriptive statistics (count/mean/std/min/quartiles/max) of the classifier confidence.
conf_desc = df['clf_confidence'].describe()
print("\nConfidence summary:", conf_desc.to_string(), sep="\n")

# Most frequent doc types, capped at 20 rows to keep the output short.
doc_type_counts = df['doc_type'].value_counts()
print("\nDoc type distribution (top 20):", doc_type_counts.head(20).to_string(), sep="\n")

extract_type_counts = df['extract_type'].value_counts()
print("\nExtract type distribution:", extract_type_counts.to_string(), sep="\n")

# Cross-tab doc_type by extract_type (counts), columns in sorted label order.
# The columns are ordered with sort_index(axis=1) instead of being selected via
# sorted(df['extract_type'].unique()):
#   * unique() keeps NaN, and sorted() cannot compare NaN with strings (TypeError);
#   * crosstab leaves out rows with a missing doc_type or extract_type, so a label
#     that only occurs on such rows has no column and the selection would KeyError.
# For fully populated data the resulting table is the same.
print("\nDoc type by extract type (counts):")
ct = pd.crosstab(df['doc_type'], df['extract_type']).sort_index(axis=1)
display(ct)

# === LOW CONFIDENCE REVIEW ===
# Rows at or below this classifier confidence are pulled out for manual review.
LOW_CONF = 0.65
# Note: a missing clf_confidence compares False against the threshold, so such
# rows are not part of this subset.
low = df[df['clf_confidence'] <= LOW_CONF].copy()
print(f"\n=== LOW CONFIDENCE (<= {LOW_CONF}) — {len(low)} rows ===")
if len(low):
    cols = ["source_path", "doc_type", "doc_subtype", "extract_type", "clf_confidence"]
    # Lowest confidence first; ties are ordered by doc_type.
    display(low[cols].sort_values(by=["clf_confidence", "doc_type"]).reset_index(drop=True))
else:
    print("None 🎉")

# Optional: save low-confidence subset for manual review.
# The threshold is turned into a whole-percent file-name suffix with round(), not
# int(): int() truncates, and float products such as 0.29 * 100 evaluate to
# 28.999999999999996, which would mislabel the file as "..._le_28.csv".
OUT_LOW = BASE / CAMP / f"low_confidence_le_{round(LOW_CONF * 100)}.csv"
low.to_csv(OUT_LOW, index=False)
print(f"\nSaved low-confidence rows to: {OUT_LOW}")

# === EXTRACT-TYPE vs DOC-TYPE MISMATCHES ===
def violates_extract_rule(row) -> bool:
    """Tell whether a row's doc_type is disallowed for its extract_type.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            'extract_type' and 'doc_type' keys.

    Returns:
        True when ALLOWED_BY_EXTRACT holds a set of doc types for the row's
        extract_type and the row's doc_type is not in that set. False when the
        doc_type is in the set, or when the lookup yields None (the extract_type
        is mapped to None or is absent from the table).
    """
    permitted = ALLOWED_BY_EXTRACT.get(row['extract_type'])
    return permitted is not None and row['doc_type'] not in permitted

# Evaluate the extract-type rule row by row, then keep only the offending rows.
is_violation = df.apply(violates_extract_rule, axis=1)
mismatch = df[is_violation].copy()
print(f"\n=== EXTRACT-TYPE MISMATCHES — {len(mismatch)} rows ===")
if len(mismatch) == 0:
    print("None 🎉")
else:
    cols = ["source_path", "extract_type", "doc_type", "doc_subtype", "clf_confidence"]
    # Group the offenders by extract_type, least confident first within each group.
    mismatch_view = mismatch[cols].sort_values(by=["extract_type", "clf_confidence"])
    display(mismatch_view.reset_index(drop=True))

# --- Targeted quick checks ---
def grep(name_substr):
    """Return the review columns of rows whose source_path contains a substring.

    Matching is case-insensitive and literal: `name_substr` is not interpreted
    as a regular expression, so characters that are common in file names
    ('.', '(', '+', '[') match themselves instead of acting as regex syntax
    or raising on an invalid pattern.

    Args:
        name_substr: Text to look for inside the 'source_path' column of `df`.

    Returns:
        DataFrame with the columns source_path, doc_type, doc_subtype,
        extract_type and clf_confidence for the matching rows, sorted by
        source_path and re-indexed from 0. Rows with a missing source_path
        never match.
    """
    review_cols = ["source_path", "doc_type", "doc_subtype", "extract_type", "clf_confidence"]
    # regex=False -> plain substring search; na=False -> missing paths count as "no match".
    hits = df["source_path"].str.contains(name_substr, case=False, na=False, regex=False)
    return df.loc[hits, review_cols].sort_values(by="source_path").reset_index(drop=True)

print("\n--- Targeted checks ---")

# Each entry pairs the heading to print with the file-name fragment handed to grep().
targeted_checks = (
    ("\n1) 'Interview Guide' files:", "Interview Guide"),
    ("\n2) 'Media List' spreadsheets:", "Media List"),
    ("\n3) 'Timeline' docs:", "timeline"),
    ("\n4) 'Release' by filename:", "Release_"),
    ("\n5) 'Pitch' by filename:", "Pitch"),
)
for heading, fragment in targeted_checks:
    print(heading)
    display(grep(fragment))


BASE: C:\Users\Anova\Downloads\Rally\rally-kb-starter-v2-llm\data\processed
CSV:  C:\Users\Anova\Downloads\Rally\rally-kb-starter-v2-llm\data\processed\211 LA\classified.csv

Loaded: C:\Users\Anova\Downloads\Rally\rally-kb-starter-v2-llm\data\processed\211 LA\classified.csv  (81 rows)

=== HEADLINE STATS ===
Total rows: 81
Unique doc_types: 30
Unique extract_types: 3

Confidence summary:
count    81.000000
mean      0.892099
std       0.039991
min       0.600000
25%       0.900000
50%       0.900000
75%       0.900000
max       0.920000

Doc type distribution (top 20):
doc_type
interview_guide            10
pitch_email                 8
talking_points              8
press_release               7
bio_profile                 6
work_plan|strategy_memo     5
conference_proposal         4
platform_copy_edits         3
competitive_analysis        3
op_ed_draft                 3
media_response              2
workback_plan               2
media_list                  2
proclamation_language    

extract_type,slides,text,xlsx_schema
doc_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
annual_report,0,1,0
bio_profile,0,6,0
competitive_analysis,0,2,1
conference_proposal,0,4,0
coverage_tracker,0,0,1
deck|analysis_report,1,0,0
deck|training_materials,1,0,0
editorial_calendar,0,0,1
event_qna,0,1,0
fact_sheet,0,1,0



=== LOW CONFIDENCE (<= 0.65) — 1 rows ===


Unnamed: 0,source_path,doc_type,doc_subtype,extract_type,clf_confidence
0,data\raw\211 LA\Earned Media\1. January 2025\1...,pitch_email,media pitch for nonprofit support,text,0.6



Saved low-confidence rows to: C:\Users\Anova\Downloads\Rally\rally-kb-starter-v2-llm\data\processed\211 LA\low_confidence_le_65.csv

=== EXTRACT-TYPE MISMATCHES — 0 rows ===
None 🎉

--- Targeted checks ---

1) 'Interview Guide' files:


Unnamed: 0,source_path,doc_type,doc_subtype,extract_type,clf_confidence
0,data\raw\211 LA\Insights and Analysis\211 LA_S...,interview_guide,stakeholder interview guide,text,0.9
1,data\raw\211 LA\Insights and Analysis\211 LA_S...,interview_guide,stakeholder interview guide,text,0.9
2,data\raw\211 LA\Insights and Analysis\211 LA_S...,interview_guide,stakeholder interview guide,text,0.9



2) 'Media List' spreadsheets:


Unnamed: 0,source_path,doc_type,doc_subtype,extract_type,clf_confidence
0,data\raw\211 LA\Earned Media\211 LA_Media List...,media_list,reporters and outlets contact list,xlsx_schema,0.9
1,data\raw\211 LA\Earned Media\211_Media List_1....,media_list,media contacts and outlets,xlsx_schema,0.85



3) 'Timeline' docs:


Unnamed: 0,source_path,doc_type,doc_subtype,extract_type,clf_confidence
0,data\raw\211 LA\Earned Media\_Special Projects...,timeline,timeline for launch activities,text,0.9



4) 'Release' by filename:


Unnamed: 0,source_path,doc_type,doc_subtype,extract_type,clf_confidence
0,data\raw\211 LA\Earned Media\1. January 2025\1...,press_release,announcement of funding for wildfire support,text,0.9
1,data\raw\211 LA\Earned Media\4. June 2025\211 ...,press_release,announcement of award,text,0.9
2,data\raw\211 LA\Earned Media\5. July 2025\211 ...,press_release,announcement of partnership,text,0.9
3,data\raw\211 LA\Earned Media\_Special Projects...,press_release,announcement of new data tool for wildfire rec...,text,0.9



5) 'Pitch' by filename:


Unnamed: 0,source_path,doc_type,doc_subtype,extract_type,clf_confidence
0,data\raw\211 LA\Earned Media\1. January 2025\1...,pitch_email,media pitch for nonprofit support,text,0.6
1,data\raw\211 LA\Earned Media\1. January 2025\1...,pitch_email,initial outreach to media,text,0.92
2,data\raw\211 LA\Earned Media\1. January 2025\4...,pitch_email,media pitch for disaster relief,text,0.92
3,data\raw\211 LA\Earned Media\2. February 2025\...,pitch_email,story pitch for media coverage,text,0.92
4,data\raw\211 LA\Earned Media\211 LA_Long-term ...,pitch_email,media pitch for nonprofit support,text,0.92
5,data\raw\211 LA\Earned Media\4. June 2025\211 ...,pitch_email,media pitch for wildfire recovery,text,0.92
6,data\raw\211 LA\Earned Media\_Special Projects...,pitch_email,follow-up pitch for media coverage,text,0.92
7,data\raw\211 LA\Earned Media\_Special Projects...,pitch_email,media pitch,text,0.92
