# Populate ATI Tab from ATI-AI_refactored.csv

This notebook fills the ATI/LAI sheet in the 2024-2025 Statistical Reports template from the consolidated ATI dataset.

- Select institution, reporting period, and language.
- Generates a new .xlsx file with the ATI/LAI tab populated.
- Set `CSV_URL` when the consolidated CSV is hosted online.


In [7]:
# %pip install ipywidgets
import re
import unicodedata
from pathlib import Path

import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import column_index_from_string, get_column_letter
from openpyxl.worksheet.formula import ArrayFormula
from IPython.display import display


In [8]:
# Optional remote location of the consolidated ATI CSV. load_ati_data tries it
# first when it is set and falls back to CSV_FALLBACK_PATH otherwise.
CSV_URL = None  # TODO: set to hosted URL (e.g., https://.../ATI-AI_refactored.csv)
# Local copy of the consolidated CSV, read when CSV_URL is unset or unreadable.
CSV_FALLBACK_PATH = "ATI-AI_refactored.csv"

# Workbook templates used for English and French output respectively.
TEMPLATE_EN = "2024-2025 Statistical Reports.xlsx"
TEMPLATE_FR = "2024-2025 Rapports statistiques.xlsx"
# Worksheet whose column B formulas are parsed to find the report cells to fill.
MAPPING_SHEET = "ATI_ForConsumption"


In [9]:
# Formula that starts with a single-cell reference to another sheet, e.g.
# ='ATI'!$D$6. Groups: sheet name, column letters, row number.
DIRECT_REF_RE = re.compile(r"=\s*'?([^'!]+)'?!\$?([A-Z]+)\$?(\d+)")
# =TRANSPOSE(<sheet>!<start>:<end>) array formula. Groups: sheet name, start
# column, start row, end column, end row.
ARRAY_REF_RE = re.compile(r"=TRANSPOSE\('?([^'!]+)'?!\$?([A-Z]+)\$?(\d+):\$?([A-Z]+)\$?(\d+)\)")
# Trailing "Row<n><letter?>Cell<n><letter?>" identifier, matched case-insensitively.
# Groups: row number, optional row letter, cell number, optional cell letter.
ID_RE = re.compile(r"Row(\d+)([a-z]?)Cell(\d+)([a-z]?)$", re.IGNORECASE)


def normalize_sub_key(value):
    """Return a canonical text form of a (sub)section number, or None if blank.

    None, NaN, whitespace-only text and the literal "nan" (any case) all map
    to None. Otherwise the stripped text is returned with one trailing ".0"
    removed and every comma replaced by a dot.
    """
    if value is None:
        return None
    if pd.isna(value):
        return None

    label = str(value).strip()
    if not label or label.lower() == "nan":
        return None

    # The ".0" trim runs before the comma replacement, so "1,0" stays "1.0".
    trimmed = label[:-2] if label.endswith(".0") else label
    return trimmed.replace(",", ".")


def suffix_rank(value):
    """Return the 1-based alphabet position of a suffix letter, or 0 if falsy."""
    return ord(value.lower()) - ord("a") + 1 if value else 0


def parse_id_sort_key(id_value):
    """Build a sortable tuple from a "Row..Cell.." identifier.

    The tuple is (row number, row letter rank, cell number, cell letter rank,
    original text). Identifiers that do not match ID_RE get 10**9 in the row
    and cell slots, which sorts them after every parsed identifier.
    """
    text = id_value if isinstance(id_value, str) else str(id_value)
    found = ID_RE.search(text)
    if found is None:
        return (10**9, 0, 10**9, 0, text)

    row_digits, row_letter, cell_digits, cell_letter = found.groups()
    return (
        int(row_digits),
        suffix_rank(row_letter),
        int(cell_digits),
        suffix_rank(cell_letter),
        text,
    )


def sanitize_filename(text):
    """Reduce arbitrary text to an ASCII slug that is safe in a file name.

    Accented characters are decomposed and their non-ASCII parts dropped.
    Every run of characters outside A-Z, a-z, 0-9, ".", "_" and "-" becomes a
    single underscore, and leading/trailing underscores are removed. An empty
    result is replaced by "institution".
    """
    decomposed = unicodedata.normalize("NFKD", str(text))
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    slug = re.sub(r"[^A-Za-z0-9._-]+", "_", ascii_only).strip("_")
    if slug:
        return slug
    return "institution"


def expand_range(start_col, start_row, end_col, end_row):
    """List every cell address in a rectangular range, row by row.

    Corners may be given in either order; they are normalised so that the
    result always runs from the top-left to the bottom-right cell.
    """
    first_col, last_col = sorted(
        (column_index_from_string(start_col), column_index_from_string(end_col))
    )
    first_row, last_row = sorted((int(start_row), int(end_row)))

    return [
        f"{get_column_letter(col_idx)}{row_idx}"
        for row_idx in range(first_row, last_row + 1)
        for col_idx in range(first_col, last_col + 1)
    ]


def parse_direct_ref(formula):
    """Split a formula that starts with '=<sheet>!<cell>' into (sheet, cell).

    Returns (None, None) when the text does not start with such a reference.
    Any "$" anchors are dropped from the returned cell address.
    """
    found = DIRECT_REF_RE.match(str(formula).strip())
    if found is None:
        return None, None

    sheet_name, col_letters, row_digits = found.groups()
    return sheet_name.strip("'"), f"{col_letters}{row_digits}"


def parse_array_formula(text):
    """Parse '=TRANSPOSE(<sheet>!<start>:<end>)' into (sheet, list of cells).

    Returns (None, []) when the text does not match ARRAY_REF_RE. The cells
    are listed in the order produced by expand_range.
    """
    found = ARRAY_REF_RE.match(str(text).strip())
    if found is None:
        return None, []

    sheet_name, *corners = found.groups()
    return sheet_name.strip("'"), expand_range(*corners)


def build_mapping_from_sheet(ws):
    """Read the mapping worksheet and resolve which report cell each row feeds.

    Column A holds a key, column B a formula pointing at the report cell and
    column C a sub-section label.

    Returns:
        (inst_target, sub_map). inst_target is the (sheet, cell) of the row
        keyed "Inst", or None. sub_map maps each normalised sub-section key
        to its (sheet, cell) targets in worksheet row order.
    """
    targets_by_row = {}
    handled_refs = set()

    # Pass 1: TRANSPOSE array formulas, each covering a B<start>:B<end> span.
    for row_idx in range(1, ws.max_row + 1):
        formula = ws.cell(row=row_idx, column=2).value
        if not isinstance(formula, ArrayFormula):
            continue
        if formula.ref in handled_refs:
            continue
        handled_refs.add(formula.ref)

        span = re.match(r"B(\d+):B(\d+)", formula.ref)
        if span is None:
            continue
        first = int(span.group(1))
        last = int(span.group(2))

        sheet_name, source_cells = parse_array_formula(formula.text)
        if not sheet_name or not source_cells:
            continue

        # zip stops at the shorter side, so mapping rows beyond the source
        # range stay unmapped.
        for mapped_row, source_cell in zip(range(first, last + 1), source_cells):
            targets_by_row[mapped_row] = (sheet_name, source_cell)

    # Pass 2: plain "=Sheet!Cell" formulas on rows pass 1 did not claim.
    for row_idx in range(1, ws.max_row + 1):
        if row_idx in targets_by_row:
            continue
        formula = ws.cell(row=row_idx, column=2).value
        if not (isinstance(formula, str) and formula.startswith("=")):
            continue
        sheet_name, cell_ref = parse_direct_ref(formula)
        if sheet_name and cell_ref:
            targets_by_row[row_idx] = (sheet_name, cell_ref)

    # Pass 3: group the resolved targets by sub-section key.
    institution_target = None
    targets_by_sub = {}
    for row_idx in range(1, ws.max_row + 1):
        label = ws.cell(row=row_idx, column=1).value
        raw_sub = ws.cell(row=row_idx, column=3).value

        if label == "Inst":
            institution_target = targets_by_row.get(row_idx)
            continue
        if not label or raw_sub is None:
            continue

        sub_key = normalize_sub_key(raw_sub)
        target = targets_by_row.get(row_idx)
        if not sub_key or not target:
            continue
        targets_by_sub.setdefault(sub_key, []).append(target)

    return institution_target, targets_by_sub


def load_ati_data(csv_url=None, fallback_path=CSV_FALLBACK_PATH):
    """Load the consolidated ATI CSV and add the columns used for lookups.

    Args:
        csv_url: Optional URL tried first. A failed read is reported with
            print and the local file is used instead.
        fallback_path: Local CSV path read when no URL is given or it fails.

    Returns:
        The DataFrame with the reporting period columns parsed as datetimes,
        ``report_start`` / ``report_end`` date columns, ``gc_orgID`` as
        nullable Int64, and ``sub_key`` (the normalised subsection number, or
        the section number where the subsection is blank).
    """
    read_options = {
        "dtype": {"section_number": "string", "subsection_number": "string"},
        "low_memory": False,
    }

    frame = None
    if csv_url:
        try:
            frame = pd.read_csv(csv_url, **read_options)
        except Exception as exc:
            print(f"Failed to read CSV from URL ({exc}); falling back to {fallback_path}.")
    if frame is None:
        frame = pd.read_csv(fallback_path, **read_options)

    for column in ("ReportingPeriodStart", "ReportingPeriodEnd"):
        frame[column] = pd.to_datetime(frame[column], errors="coerce")
    frame["report_start"] = frame["ReportingPeriodStart"].dt.date
    frame["report_end"] = frame["ReportingPeriodEnd"].dt.date

    frame["gc_orgID"] = pd.to_numeric(frame["gc_orgID"], errors="coerce").astype("Int64")

    def blank_out_missing(column):
        # Missing values and the literal text "nan" both become "".
        cleaned = frame[column].fillna("").astype("string")
        return cleaned.where(~cleaned.str.lower().eq("nan"), "")

    subsection = blank_out_missing("subsection_number")
    section = blank_out_missing("section_number")

    # Use the section number wherever the subsection number is blank.
    combined = subsection.mask(subsection == "", section)
    frame["sub_key"] = combined.map(normalize_sub_key)

    return frame


def build_output_filename(inst_name, start_date, lang):
    """Compose 'ATI_Report_<year>_<EN|FR>_<institution slug>.xlsx'.

    The year comes from start_date, or "unknown" when it cannot be parsed.
    The language label is EN when lang starts with "en" (case-insensitive)
    and FR otherwise. The institution slug is capped at 60 characters.
    """
    start_year = pd.to_datetime(start_date, errors="coerce").year
    if pd.notna(start_year):
        year_part = f"{start_year}"
    else:
        year_part = "unknown"

    is_english = str(lang).lower().startswith("en")
    lang_part = "EN" if is_english else "FR"
    slug = sanitize_filename(inst_name)[:60]
    return f"ATI_Report_{year_part}_{lang_part}_{slug}.xlsx"



def is_formula_cell(ws, cell):
    """Return True when ws[cell] holds a formula.

    A cell counts as a formula when its data_type is "f" or when its value is
    a string starting with "=".
    """
    target = ws[cell]
    if target.data_type == "f":
        return True

    content = target.value
    if not isinstance(content, str):
        return False
    return content.startswith("=")

def populate_workbook(wb, subset, inst_name, start_date, end_date, lang):
    """Write one institution's values into the report workbook in place.

    The institution name goes to the target of the mapping sheet's "Inst"
    row, or to D6 on sheet "ATI" (English) / "LAI" (otherwise) when that row
    is missing. The period dates go to D8 and H8 on the same sheet. Data
    values are then written sub-section by sub-section, pairing rows sorted
    by their identifier with the mapped cells in mapping order.

    Returns:
        A list of warning strings: sub-sections with no data, count
        mismatches, and a summary of formula cells that were left alone.
    """
    institution_target, targets_by_sub = build_mapping_from_sheet(wb[MAPPING_SHEET])

    if not institution_target:
        default_sheet = "ATI" if str(lang).lower().startswith("en") else "LAI"
        institution_target = (default_sheet, "D6")
    inst_sheet, inst_cell = institution_target

    wb[inst_sheet][inst_cell] = inst_name
    for period_cell, period_value in (("D8", start_date), ("H8", end_date)):
        if pd.notna(period_value):
            wb[inst_sheet][period_cell] = pd.to_datetime(period_value).date()

    messages = []
    skipped_refs = []

    for sub_key, targets in targets_by_sub.items():
        matching = subset[subset["sub_key"] == sub_key]
        if matching.empty:
            messages.append(f"Subsection {sub_key}: no matching rows in data.")
            continue

        ordered = matching.assign(
            sort_key=matching["id"].map(parse_id_sort_key)
        ).sort_values("sort_key")
        values = ordered["value"].tolist()

        if len(values) != len(targets):
            messages.append(
                f"Subsection {sub_key}: template has {len(targets)} cells, data has {len(values)}; filled {min(len(values), len(targets))}."
            )

        # zip pairs values with cells positionally and stops at the shorter list.
        for value, (sheet_name, cell) in zip(values, targets):
            if pd.isna(value):
                continue
            if is_formula_cell(wb[sheet_name], cell):
                # Leave the template's own formulas intact.
                skipped_refs.append(f"{sheet_name}!{cell}")
                continue
            wb[sheet_name][cell] = value

    skipped_formula = len(skipped_refs)
    if skipped_formula:
        sample = ", ".join(skipped_refs[:10])
        messages.append(
            f"Skipped {skipped_formula} formula cells (auto-calculated), e.g. {sample}."
        )

    return messages


def generate_ati_report(df, gc_org_id, report_start, lang="En", output_path=None):
    """Fill the report template for one institution and period, then save it.

    Args:
        df: Frame produced by load_ati_data.
        gc_org_id: Institution identifier matched against ``gc_orgID``.
        report_start: Start of the reporting period (anything
            pd.to_datetime accepts); matched against ``report_start``.
        lang: Language selector. Values starting with "en" (case-insensitive)
            select English; anything else selects French.
        output_path: Destination file. Defaults to the name built by
            build_output_filename.

    Returns:
        (output_path, warnings) where warnings is the list returned by
        populate_workbook.

    Raises:
        FileNotFoundError: The template for the chosen language is missing.
        ValueError: No rows match the institution and period.
    """
    report_start = pd.to_datetime(report_start, errors="coerce").date()

    # A single language decision drives both the template and the institution
    # name. Previously the name used "starts with fr, else English" while the
    # template used "starts with en, else French", so an unrecognised lang
    # produced a French workbook carrying the English name.
    is_english = str(lang).lower().startswith("en")

    template_path = TEMPLATE_EN if is_english else TEMPLATE_FR
    if not Path(template_path).exists():
        raise FileNotFoundError(f"Template not found: {template_path}")

    subset = df[(df["gc_orgID"] == gc_org_id) & (df["report_start"] == report_start)]
    if subset.empty:
        raise ValueError("No rows found for the selected institution and period.")

    # Name and period dates are taken from the first matching row.
    first_row = subset.iloc[0]
    inst_name = first_row["institution_en"] if is_english else first_row["institution_fr"]
    start_date = first_row["ReportingPeriodStart"]
    end_date = first_row["ReportingPeriodEnd"]

    # data_only=False keeps formulas as formulas so they can be detected and preserved.
    wb = load_workbook(template_path, data_only=False)
    warnings = populate_workbook(wb, subset, inst_name, start_date, end_date, lang)

    if output_path is None:
        output_path = build_output_filename(inst_name, start_date, lang)

    wb.save(output_path)
    return output_path, warnings


In [10]:
# Load the consolidated dataset once; the cells below reuse `df`.
df = load_ati_data(CSV_URL, CSV_FALLBACK_PATH)
# NOTE(review): the message names the local file even when the data came from CSV_URL.
print(f"Loaded {len(df):,} rows from ATI-AI_refactored.csv")


Loaded 697,866 rows from ATI-AI_refactored.csv


In [11]:
# Example (manual usage without widgets):
# output_path, warnings = generate_ati_report(
#     df,
#     gc_org_id=2297,
#     report_start="2024-04-01",
#     lang="En",
# )
# print("Saved:", output_path)
# if warnings:
#     print("Warnings:")
#     for item in warnings[:20]:
#         print("-", item)


In [None]:
# Interactive front end: dropdowns for institution, period and language plus a
# button that calls generate_ati_report. Prints a hint when ipywidgets is missing.
try:
    import ipywidgets as widgets

    # One entry per distinct (gc_orgID, English name, French name) combination.
    inst_df = (
        df[["gc_orgID", "institution_en", "institution_fr"]]
        .dropna(subset=["gc_orgID"])
        .drop_duplicates()
        .sort_values("gc_orgID")
    )
    # Plain int so the dropdown value is not a nullable Int64 scalar.
    inst_df["gc_orgID"] = inst_df["gc_orgID"].astype(int)

    # (label, value) pairs: the label shows id and English name, the value is the id.
    inst_options = [
        (f"{row.gc_orgID} | {row.institution_en}", row.gc_orgID)
        for row in inst_df.itertuples(index=False)
    ]

    # Periods are labelled "<start year>-<start year + 1>"; the value is the start date.
    period_dates = sorted(df["report_start"].dropna().unique())
    period_options = [(f"{d.year}-{d.year + 1}", d) for d in period_dates]

    inst_dropdown = widgets.Dropdown(options=inst_options, description="Institution", layout=widgets.Layout(width="80%"))
    period_dropdown = widgets.Dropdown(options=period_options, description="Period")
    lang_dropdown = widgets.Dropdown(options=["En", "Fr"], description="Language")
    generate_btn = widgets.Button(description="Generate XLSX", button_style="primary")
    output_area = widgets.Output()

    def on_click(_):
        """Generate the report for the current selections and show the result."""
        output_area.clear_output()
        with output_area:
            try:
                output_path, warnings = generate_ati_report(
                    df,
                    gc_org_id=inst_dropdown.value,
                    report_start=period_dropdown.value,
                    lang=lang_dropdown.value,
                )
                print(f"Saved: {output_path}")
                if warnings:
                    print("Warnings:")
                    # Only the first 20 warnings are shown.
                    for item in warnings[:20]:
                        print("-", item)
            except Exception as exc:
                # UI boundary: report any failure in the output area instead of raising.
                print("Error:", exc)

    generate_btn.on_click(on_click)
    display(widgets.VBox([inst_dropdown, period_dropdown, lang_dropdown, generate_btn, output_area]))
except ImportError:
    print("ipywidgets not installed. Run %pip install ipywidgets, or call generate_ati_report(...) manually.")


VBox(children=(Dropdown(description='Institution', layout=Layout(width='80%'), options=(('2222 | Agriculture a…

In [None]:
import zipfile
import re


def read_docx_text(path):
    """Extract the text of a .docx file as a list of non-empty, stripped lines.

    Reads ``word/document.xml`` from the archive, turns ``<w:tab .../>``
    elements into tab characters and paragraph ends (``</w:p>``) into line
    breaks, then removes every remaining XML tag.

    Args:
        path: Path to the .docx file.

    Returns:
        One stripped string per non-empty line, in document order.
    """
    with zipfile.ZipFile(path) as zf:
        xml = zf.read("word/document.xml").decode("utf-8", errors="ignore")

    # Control characters are written as escape sequences. Raw tab and newline
    # characters inside the quotes broke these string literals.
    text = re.sub(r"<w:tab[^/]*/>", "\t", xml)
    text = re.sub(r"</w:p>", "\n", text)
    text = re.sub(r"<[^>]+>", "", text)
    # NOTE(review): the search string here was empty in the exported source,
    # which made the call a no-op. A carriage return is assumed; confirm
    # against the original notebook.
    text = text.replace("\r", "")

    lines = [line.strip() for line in text.split("\n")]
    return [line for line in lines if line]


# Read both reference documents; only their line counts are printed here.
validation_text = read_docx_text("2024-2025 Validation Checklist.docx")
guide_text = read_docx_text("2024-25 Guide ATIA Statistical Report.docx")

print(f"Validation checklist lines: {len(validation_text)}")
print(f"Guide lines: {len(guide_text)}")
# The list below is fixed text summarising the checks coded in the audit cell;
# it is not derived from the documents read above.
print("Rules checked in audit:")
print("- Validation checklist: page/minute >0 requires request count >0 (sections 2.4, 2.5, 4.5.2, 4.5.4, 4.5.6, 8.1, 8.2).")
print("- Validation checklist: Section 5.1 dispositions must appear in Section 4.1.")
print("- Guide: Section 1.1 totals and carryover relationships.")
print("- Guide: Section 1.2 and 1.3 totals equal sum of rows 1-6.")
print("- Guide: numeric-only values.")


In [None]:
# Matches keys of the form Q<n>R<n>C<n>. Not referenced elsewhere in the
# visible part of this notebook.
Q_KEY_RE = re.compile(r"^Q(\d+)R(\d+)C(\d+)$")


def parse_id_components(id_value):
    """Split a "Row..Cell.." identifier into its four parts.

    Returns:
        (row number, row letter, cell number, cell letter) with the letters
        lower-cased and "" when absent, or None if ID_RE does not match.
    """
    found = ID_RE.search(str(id_value))
    if found is None:
        return None

    row_digits, row_letter, cell_digits, cell_letter = found.groups()
    return (
        int(row_digits),
        (row_letter or "").lower(),
        int(cell_digits),
        (cell_letter or "").lower(),
    )


def build_row_to_target(ws):
    """Map each mapping-sheet row number to the (sheet, cell) its column B points at.

    TRANSPOSE array formulas are resolved first, one source cell per row of
    their B<start>:B<end> span. Rows still unmapped are then resolved from
    plain "=Sheet!Cell" formulas.
    """
    targets_by_row = {}
    handled_refs = set()

    # Pass 1: array formulas.
    for row_idx in range(1, ws.max_row + 1):
        formula = ws.cell(row=row_idx, column=2).value
        if not isinstance(formula, ArrayFormula):
            continue
        if formula.ref in handled_refs:
            continue
        handled_refs.add(formula.ref)

        span = re.match(r"B(\d+):B(\d+)", formula.ref)
        if span is None:
            continue
        first = int(span.group(1))
        last = int(span.group(2))

        sheet_name, source_cells = parse_array_formula(formula.text)
        if not sheet_name or not source_cells:
            continue

        # zip stops at the shorter side; surplus mapping rows stay unmapped.
        for mapped_row, source_cell in zip(range(first, last + 1), source_cells):
            targets_by_row[mapped_row] = (sheet_name, source_cell)

    # Pass 2: direct single-cell references on rows not yet mapped.
    for row_idx in range(1, ws.max_row + 1):
        if row_idx in targets_by_row:
            continue
        formula = ws.cell(row=row_idx, column=2).value
        if not (isinstance(formula, str) and formula.startswith("=")):
            continue
        sheet_name, cell_ref = parse_direct_ref(formula)
        if sheet_name and cell_ref:
            targets_by_row[row_idx] = (sheet_name, cell_ref)

    return targets_by_row


def build_mapping_rows(ws):
    """List the mapping-sheet rows that resolve to a report cell.

    Rows are skipped when the key (column A) is empty, "Inst" or "Report",
    when the sub-section (column C) is missing or normalises to nothing, or
    when no target could be resolved for the row.

    Returns:
        A list of dicts with mapping_row, mapping_key, sub_key, target_sheet
        and target_cell, in worksheet row order.
    """
    targets_by_row = build_row_to_target(ws)
    records = []

    for row_idx in range(1, ws.max_row + 1):
        label = ws.cell(row=row_idx, column=1).value
        raw_sub = ws.cell(row=row_idx, column=3).value

        usable = bool(label) and label not in ("Inst", "Report") and raw_sub is not None
        if not usable:
            continue

        sub_key = normalize_sub_key(raw_sub)
        target = targets_by_row.get(row_idx)
        if sub_key and target:
            sheet_name, cell_ref = target
            records.append({
                "mapping_row": row_idx,
                "mapping_key": label,
                "sub_key": sub_key,
                "target_sheet": sheet_name,
                "target_cell": cell_ref,
            })

    return records


def build_data_rows(subset):
    """Group the CSV rows by sub-section, ordered by their parsed identifier.

    Rows whose identifier does not match the Row/Cell pattern are left out.

    Returns:
        A dict from sub_key to a list of dicts holding id, value, row_num,
        row_suffix, cell_num and cell_suffix.
    """
    part_names = ("row_num", "row_suffix", "cell_num", "cell_suffix")

    ordered = subset.assign(
        id_sort=subset["id"].map(parse_id_sort_key)
    ).sort_values(["sub_key", "id_sort"])

    grouped = {}
    for sub_key, group in ordered.groupby("sub_key", sort=False):
        entries = []
        for record in group.itertuples(index=False):
            parts = parse_id_components(record.id)
            if not parts:
                continue
            entry = {"id": record.id, "value": record.value}
            entry.update(zip(part_names, parts))
            entries.append(entry)
        grouped[sub_key] = entries

    return grouped


def audit_mapping(df, gc_org_id, report_start, lang="En", output_prefix="ati_audit"):
    """Audit how the template's mapping rows line up with one institution's data.

    Each mapping row is paired positionally with the CSV rows of the same
    sub-section (ordered by identifier). Three frames are produced: the
    per-cell audit, a per-sub-section summary, and the CSV rows left over
    once every mapped cell has been paired.

    Args:
        df: Frame produced by load_ati_data.
        gc_org_id: Institution identifier matched against ``gc_orgID``.
        report_start: Start of the reporting period.
        lang: Values starting with "en" select the English template, anything
            else the French one.
        output_prefix: Prefix of the CSV files written
            (``_mapping``, ``_summary`` and, when non-empty, ``_unmapped``).

    Returns:
        (audit_df, summary_df, unmapped_df).

    Raises:
        ValueError: No rows match the institution and period.
    """
    # Explicit column lists keep the frames well-formed when there are no
    # rows. Without them an empty mapping raised KeyError on "sub_key".
    audit_columns = [
        "mapping_row",
        "mapping_key",
        "sub_key",
        "target_sheet",
        "target_cell",
        "target_is_formula",
        "csv_id",
        "csv_value",
        "csv_row_num",
        "csv_row_suffix",
        "csv_cell_num",
        "csv_cell_suffix",
        "mapping_status",
    ]
    unmapped_columns = ["sub_key", "csv_id", "csv_value", "reason"]
    summary_columns = [
        "sub_key",
        "mapping_cells",
        "data_rows",
        "formula_cells",
        "non_formula_cells",
        "status",
    ]

    report_start = pd.to_datetime(report_start, errors="coerce").date()
    subset = df[(df["gc_orgID"] == gc_org_id) & (df["report_start"] == report_start)]
    if subset.empty:
        raise ValueError("No rows found for the selected institution and period.")

    template_path = TEMPLATE_EN if str(lang).lower().startswith("en") else TEMPLATE_FR
    wb = load_workbook(template_path, data_only=False)
    mapping_ws = wb[MAPPING_SHEET]

    mapping_rows = build_mapping_rows(mapping_ws)
    data_rows = build_data_rows(subset)

    # Per-sub-section counters, filled while walking the mapping once. They
    # replace the repeated scans of audit_rows / audit_df.
    mapped_counts = {}
    formula_counts = {}
    audit_rows = []

    for mapping in mapping_rows:
        sub_key = mapping["sub_key"]
        # Position of this mapping row within its sub-section.
        idx = mapped_counts.get(sub_key, 0)
        mapped_counts[sub_key] = idx + 1

        rows = data_rows.get(sub_key, [])
        data_row = rows[idx] if idx < len(rows) else None

        target_sheet = mapping["target_sheet"]
        target_cell = mapping["target_cell"]
        is_formula = is_formula_cell(wb[target_sheet], target_cell)
        if is_formula:
            formula_counts[sub_key] = formula_counts.get(sub_key, 0) + 1

        audit_rows.append({
            "mapping_row": mapping["mapping_row"],
            "mapping_key": mapping["mapping_key"],
            "sub_key": sub_key,
            "target_sheet": target_sheet,
            "target_cell": target_cell,
            "target_is_formula": is_formula,
            "csv_id": data_row["id"] if data_row else None,
            "csv_value": data_row["value"] if data_row else None,
            "csv_row_num": data_row["row_num"] if data_row else None,
            "csv_row_suffix": data_row["row_suffix"] if data_row else None,
            "csv_cell_num": data_row["cell_num"] if data_row else None,
            "csv_cell_suffix": data_row["cell_suffix"] if data_row else None,
            "mapping_status": "formula_cell" if is_formula else ("ok" if data_row else "missing_data"),
        })

    audit_df = pd.DataFrame(audit_rows, columns=audit_columns)

    # CSV rows beyond the number of mapped cells of their sub-section. This
    # includes every row of sub-sections the mapping does not mention.
    unmapped = [
        {
            "sub_key": sub_key,
            "csv_id": row["id"],
            "csv_value": row["value"],
            "reason": "extra_data_not_mapped",
        }
        for sub_key, rows in data_rows.items()
        for row in rows[mapped_counts.get(sub_key, 0):]
    ]
    unmapped_df = pd.DataFrame(unmapped, columns=unmapped_columns)

    summary_rows = []
    for sub_key in sorted(mapped_counts):
        map_count = mapped_counts[sub_key]
        data_count = len(data_rows.get(sub_key, []))
        formula_count = formula_counts.get(sub_key, 0)
        summary_rows.append({
            "sub_key": sub_key,
            "mapping_cells": map_count,
            "data_rows": data_count,
            "formula_cells": int(formula_count),
            "non_formula_cells": int(map_count - formula_count),
            "status": "ok" if map_count == data_count else "count_mismatch",
        })

    summary_df = pd.DataFrame(summary_rows, columns=summary_columns)

    audit_path = f"{output_prefix}_mapping.csv"
    summary_path = f"{output_prefix}_summary.csv"
    unmapped_path = f"{output_prefix}_unmapped.csv"

    audit_df.to_csv(audit_path, index=False)
    summary_df.to_csv(summary_path, index=False)
    # The unmapped file is only written when there is something to report.
    if not unmapped_df.empty:
        unmapped_df.to_csv(unmapped_path, index=False)

    return audit_df, summary_df, unmapped_df


def rule_check_numeric_only(subset):
    """Report every non-missing ``value`` that cannot be parsed as a number.

    Returns:
        A list of issue dicts (rule_id "numeric_only"), one per offending row.
    """
    as_numbers = pd.to_numeric(subset["value"], errors="coerce")
    present_but_unparsed = subset["value"].notna() & as_numbers.isna()

    return [
        {
            "rule_id": "numeric_only",
            "sub_key": record.sub_key,
            "id": record.id,
            "value": record.value,
            "issue": "non_numeric_value",
            "source": "data",
        }
        for record in subset[present_but_unparsed].itertuples(index=False)
    ]


def rule_check_pages_require_requests(subset):
    """Flag rows that report pages/minutes without any requests.

    Applies to sub-sections 2.4, 2.5, 4.5.2, 4.5.4, 4.5.6, 8.1 and 8.2.
    Within each row (row number plus letter), cell 1 is treated as the
    request count and the cells numbered above 1 as pages or minutes.

    Returns:
        A list of issue dicts (rule_id "pages_require_requests").
    """
    issues = []
    # A tuple instead of a set: issues come out in the same order on every run.
    target_subs = ("2.4", "2.5", "4.5.2", "4.5.4", "4.5.6", "8.1", "8.2")

    for sub_key in target_subs:
        group = subset[subset["sub_key"] == sub_key].copy()
        if group.empty:
            continue
        comp = group["id"].map(parse_id_components)
        group = group[comp.notna()].copy()
        if group.empty:
            # No identifier parsed; assigning [] to four columns would raise.
            continue
        comps = comp.dropna()
        group[["row_num", "row_suffix", "cell_num", "cell_suffix"]] = list(comps)
        group["row_key"] = group["row_num"].astype(str) + group["row_suffix"]
        # Unparseable values count as 0 here.
        group["value_num"] = pd.to_numeric(group["value"], errors="coerce").fillna(0)

        for row_key, rows in group.groupby("row_key"):
            request_val = rows[rows["cell_num"] == 1]["value_num"].sum()
            pages_val = rows[rows["cell_num"] > 1]["value_num"].sum()
            if pages_val > 0 and request_val <= 0:
                issues.append({
                    "rule_id": "pages_require_requests",
                    "sub_key": sub_key,
                    "row_key": row_key,
                    "issue": "pages_minutes_without_requests",
                    "detail": f"pages_or_minutes={pages_val}, requests={request_val}",
                    "source": "data",
                })

    return issues


def rule_check_5_1_vs_4_1(subset):
    """Flag section 5.1 rows that have no counterpart in section 4.1.

    Values are summed per row key (row number plus letter) in both
    sub-sections. A 5.1 row with a positive total is reported when the 4.1
    total for the same row key is missing or not positive.

    Returns:
        A list of issue dicts (rule_id "5_1_requires_4_1"). The list is empty
        when either sub-section has no rows.
    """
    issues = []
    sub_41 = subset[subset["sub_key"] == "4.1"].copy()
    sub_51 = subset[subset["sub_key"] == "5.1"].copy()

    if sub_41.empty or sub_51.empty:
        return issues

    def build_row_totals(frame):
        """Sum the numeric values of one sub-section per row key."""
        comp = frame["id"].map(parse_id_components)
        frame = frame[comp.notna()].copy()
        if frame.empty:
            # No identifier parsed; assigning [] to four columns would raise.
            return {}
        comps = comp.dropna()
        frame[["row_num", "row_suffix", "cell_num", "cell_suffix"]] = list(comps)
        frame["row_key"] = frame["row_num"].astype(str) + frame["row_suffix"]
        # Unparseable values count as 0 here.
        frame["value_num"] = pd.to_numeric(frame["value"], errors="coerce").fillna(0)
        return frame.groupby("row_key")["value_num"].sum().to_dict()

    totals_41 = build_row_totals(sub_41)
    totals_51 = build_row_totals(sub_51)

    for row_key, total in totals_51.items():
        if total > 0 and totals_41.get(row_key, 0) <= 0:
            issues.append({
                "rule_id": "5_1_requires_4_1",
                "sub_key": "5.1",
                "row_key": row_key,
                "issue": "section_5_1_disposition_without_4_1",
                "detail": f"section_5_1_total={total}, section_4_1_total={totals_41.get(row_key, 0)}",
                "source": "data",
            })

    return issues


def rule_check_section_1_totals(subset):
    """Check the arithmetic relationships between the rows of section 1.1.

    Only cell 1 of each row is used. The relationships checked are:
    row 2 = row 2a + row 2b, row 5 = row 1 + row 2, row 7 = row 8 + row 9 and
    row 7 = row 5 - row 6. Missing rows count as 0.

    Returns:
        A list of issue dicts (rule_id "section_1_1_total"), one per
        relationship that does not hold.
    """
    issues = []
    sub = subset[subset["sub_key"] == "1.1"].copy()
    if sub.empty:
        return issues

    comp = sub["id"].map(parse_id_components)
    sub = sub[comp.notna()].copy()
    if sub.empty:
        # No identifier parsed; assigning [] to four columns would raise.
        return issues
    comps = comp.dropna()
    sub[["row_num", "row_suffix", "cell_num", "cell_suffix"]] = list(comps)
    sub = sub[sub["cell_num"] == 1].copy()
    sub["row_key"] = sub["row_num"].astype(str) + sub["row_suffix"]
    # Unparseable values count as 0 here.
    sub["value_num"] = pd.to_numeric(sub["value"], errors="coerce").fillna(0)
    values = sub.set_index("row_key")["value_num"].to_dict()

    def check_eq(row_key, expected, label):
        """Record an issue when the row's value differs from `expected`."""
        actual = values.get(row_key, 0)
        if abs(actual - expected) > 1e-6:
            issues.append({
                "rule_id": "section_1_1_total",
                "sub_key": "1.1",
                "row_key": row_key,
                "issue": label,
                "detail": f"expected={expected}, actual={actual}",
                "source": "data",
            })

    row2a = values.get("2a", 0)
    row2b = values.get("2b", 0)
    row2 = values.get("2", 0)
    row1 = values.get("1", 0)
    row5 = values.get("5", 0)
    row6 = values.get("6", 0)
    row8 = values.get("8", 0)
    row9 = values.get("9", 0)

    check_eq("2", row2a + row2b, "row2_equals_row2a_plus_row2b")
    check_eq("5", row1 + row2, "row5_equals_row1_plus_row2")
    check_eq("7", row8 + row9, "row7_equals_row8_plus_row9")
    check_eq("7", row5 - row6, "row7_equals_row5_minus_row6")

    return issues


def rule_check_section_1_totals_generic(subset, sub_key):
    """Check that row 7 of a sub-section equals the sum of rows 1 to 6.

    Only cell 1 of each row is used, and rows are keyed by row number alone.
    Missing rows count as 0.

    Args:
        subset: Data rows for one institution and period.
        sub_key: Sub-section to check (run_business_rules passes "1.2" and "1.3").

    Returns:
        A list with at most one issue dict (rule_id "section_total").
    """
    issues = []
    sub = subset[subset["sub_key"] == sub_key].copy()
    if sub.empty:
        return issues

    comp = sub["id"].map(parse_id_components)
    sub = sub[comp.notna()].copy()
    if sub.empty:
        # No identifier parsed; assigning [] to four columns would raise.
        return issues
    comps = comp.dropna()
    sub[["row_num", "row_suffix", "cell_num", "cell_suffix"]] = list(comps)
    sub = sub[sub["cell_num"] == 1].copy()
    # Unparseable values count as 0 here.
    sub["value_num"] = pd.to_numeric(sub["value"], errors="coerce").fillna(0)

    # NOTE(review): row letters are ignored here, so rows such as "2a" and "2"
    # would share a key; presumably these sub-sections have no lettered rows.
    totals = sub.set_index("row_num")["value_num"].to_dict()
    expected = sum(totals.get(i, 0) for i in range(1, 7))
    actual = totals.get(7, 0)
    if abs(actual - expected) > 1e-6:
        issues.append({
            "rule_id": "section_total",
            "sub_key": sub_key,
            "row_key": "7",
            "issue": "row7_total_mismatch",
            "detail": f"expected={expected}, actual={actual}",
            "source": "data",
        })

    return issues


def run_business_rules(subset):
    """Run every rule check on the subset and return the issues as one DataFrame.

    Checks run in a fixed order: numeric-only values, pages requiring
    requests, section 5.1 against 4.1, section 1.1 totals, then the row-7
    totals of sections 1.2 and 1.3.
    """
    checks = (
        rule_check_numeric_only,
        rule_check_pages_require_requests,
        rule_check_5_1_vs_4_1,
        rule_check_section_1_totals,
        lambda frame: rule_check_section_1_totals_generic(frame, "1.2"),
        lambda frame: rule_check_section_1_totals_generic(frame, "1.3"),
    )

    collected = []
    for check in checks:
        collected.extend(check(subset))
    return pd.DataFrame(collected)


def audit_all(df, gc_org_id, report_start, lang="En", output_prefix="ati_audit"):
    """Run the mapping audit and the business rules for one institution and period.

    Writes ``<output_prefix>_rules.csv`` in addition to the files written by
    audit_mapping.

    Returns:
        (mapping_df, summary_df, unmapped_df, rules_df).

    Raises:
        ValueError: No rows match the institution and period.
    """
    report_start = pd.to_datetime(report_start, errors="coerce").date()

    in_scope = (df["gc_orgID"] == gc_org_id) & (df["report_start"] == report_start)
    subset = df[in_scope]
    if subset.empty:
        raise ValueError("No rows found for the selected institution and period.")

    mapping_options = {
        "gc_org_id": gc_org_id,
        "report_start": report_start,
        "lang": lang,
        "output_prefix": output_prefix,
    }
    mapping_df, summary_df, unmapped_df = audit_mapping(df, **mapping_options)

    rules_df = run_business_rules(subset)
    rules_df.to_csv(f"{output_prefix}_rules.csv", index=False)

    return mapping_df, summary_df, unmapped_df, rules_df
