# Parse Privacy Stats Forms


In [1]:
%pip install requests pandas bs4 openpyxl

Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from typing import Dict, Iterable, List

EN_URL = "https://www.canada.ca/en/treasury-board-secretariat/corporate/forms/350-63.html"
FR_URL = "https://www.canada.ca/fr/secretariat-conseil-tresor/organisation/formulaires/350-63.html"

GENERAL_FIELDS = {
    'en': {
        'general_section': 'General Information',
        'identification': 'Identification',
        'reporting_fallback': 'Reporting period:',
        'start_label': 'Start Date',
        'end_label': 'End Date',
    },
    'fr': {
        'general_section': 'Informations générales',
        'identification': 'Identification',
        'reporting_fallback': 'Période d\'établissement de rapport',
        'start_label': 'Date de début',
        'end_label': 'Date de fin',
    },
}

ID_CORRECTIONS = {
    'en': {
        'Q3_28Row8Cell1': 'Q3_2Row28Cell1',
        'Q8_2Row3Cel10': 'Q8_2Row3Cell10',
    },
    'fr': {
        'Q8_2Row7Cell85': 'Q8_2Row7Cell8',
    },
}

DROP_IDS = {
    'en': set(),
    'fr': set(),
}



In [3]:
def fetch_soup(url: str) -> BeautifulSoup:
    """Fetch and parse HTML, raising on HTTP errors."""
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def extract_language_data(soup: BeautifulSoup, locale: str) -> List[Dict[str, str]]:
    """Pull section/sub-section/input metadata for a language version."""
    records: List[Dict[str, str]] = []

    # Structured sections
    for section_tag in soup.select('section.panel-info'):
        section_heading = section_tag.select_one('header.panel-heading h2')
        section_label = section_heading.get_text(strip=True) if section_heading else 'N/A'

        for table in section_tag.find_all('table'):
            caption = table.find('caption')
            subsection_label = caption.get_text(strip=True) if caption else 'N/A'

            for input_tag in table.find_all('input'):
                records.append({
                    'Section': section_label,
                    'Sub-section': subsection_label,
                    'id': input_tag.get('id', 'N/A'),
                    'title': input_tag.get('title', 'N/A'),
                })

    # General information outside section panels
    initial_div = soup.find('div', class_='mwsgeneric-base-html parbase section')
    if initial_div:
        general_cfg = GENERAL_FIELDS[locale]
        name_input = initial_div.find('input', id='NameOfInstitution')
        if name_input:
            records.append({
                'Section': general_cfg['general_section'],
                'Sub-section': general_cfg['identification'],
                'id': name_input.get('id', 'N/A'),
                'title': name_input.get('title', 'N/A'),
            })

        reporting_fieldset = initial_div.find('fieldset', class_='brdr-bttm')
        if reporting_fieldset:
            legend = reporting_fieldset.find('legend')
            reporting_label = legend.get_text(strip=True) if legend else general_cfg['reporting_fallback']

            start_input = reporting_fieldset.find('input', id='ReportingPeriodStart')
            end_input = reporting_fieldset.find('input', id='ReportingPeriodEnd')

            if start_input:
                records.append({
                    'Section': reporting_label,
                    'Sub-section': general_cfg['start_label'],
                    'id': start_input.get('id', 'N/A'),
                    'title': start_input.get('title', 'N/A'),
                })
            if end_input:
                records.append({
                    'Section': reporting_label,
                    'Sub-section': general_cfg['end_label'],
                    'id': end_input.get('id', 'N/A'),
                    'title': end_input.get('title', 'N/A'),
                })

    return records


def clean_ids(records: Iterable[Dict[str, str]], locale: str) -> List[Dict[str, str]]:
    """Apply ID fixes and drop known bad rows."""
    corrections = ID_CORRECTIONS.get(locale, {})
    drop_ids = DROP_IDS.get(locale, set())

    cleaned: List[Dict[str, str]] = []
    for record in records:
        if record['id'] in drop_ids:
            continue

        fixed = dict(record)
        fixed['id'] = corrections.get(fixed['id'], fixed['id'])
        cleaned.append(fixed)

    return cleaned

def add_missing_privacy_fields(records: List[Dict[str, str]], locale: str) -> List[Dict[str, str]]:
    # Insert missing fields for Privacy form where HTML inputs are absent.
    missing_ids = [f'Q6_2Row3Cell{cell}' for cell in range(2, 9)]
    existing = {record['id'] for record in records}
    if all(mid in existing for mid in missing_ids):
        return records

    def _replace_duration(title: str) -> str:
        if not isinstance(title, str):
            return title
        parts = title.rsplit(':', 1)
        if len(parts) != 2:
            return title
        suffix = 'Plus de 31 jours' if locale == 'fr' else '31 days or greater'
        return f"{parts[0]}: {suffix}"

    updated = list(records)
    for cell in range(2, 9):
        missing_id = f'Q6_2Row3Cell{cell}'
        if missing_id in existing:
            continue
        template_id = f'Q6_2Row1Cell{cell}'
        template = next((r for r in records if r['id'] == template_id), None)
        if not template:
            continue
        updated.append({
            'Section': template['Section'],
            'Sub-section': template['Sub-section'],
            'id': missing_id,
            'title': _replace_duration(template['title']),
        })
    return updated


def process_language(records: List[Dict[str, str]], locale: str) -> pd.DataFrame:
    """Normalize section/subsection fields and rename title column per locale."""
    df_lang = pd.DataFrame(records)

    df_lang['section_number'] = df_lang['Section'].astype(str).str.extract(r'Section[\s\xa0]*(\d+)', flags=re.IGNORECASE)
    section_name_col = f'section_name_{locale}'
    df_lang[section_name_col] = df_lang['Section'].astype(str).str.replace(
        r'Section[\s\xa0]*\d+[\s\xa0]*:[\s\xa0]*', '', flags=re.IGNORECASE
    ).str.strip()
    df_lang.loc[df_lang['section_number'].isna(), section_name_col] = df_lang.loc[
        df_lang['section_number'].isna(), 'Section'
    ].astype(str).str.strip()

    df_lang['subsection_number'] = df_lang['Sub-section'].astype(str).str.extract(
        r'^(\d+(?:\.\d+){0,2})', flags=re.IGNORECASE
    )
    subsection_name_col = f'subsection_name_{locale}'
    df_lang[subsection_name_col] = df_lang['Sub-section'].astype(str).str.replace(
        r'^\d+(?:\.\d+){0,2}\s*', '', flags=re.IGNORECASE
    ).str.strip()
    df_lang.loc[df_lang['subsection_number'].isna(), subsection_name_col] = df_lang.loc[
        df_lang['subsection_number'].isna(), 'Sub-section'
    ].astype(str).str.strip()

    df_lang = df_lang.drop(columns=['Section', 'Sub-section'])
    df_lang = df_lang.rename(columns={'title': f'title_{locale}'})
    return df_lang


def merge_languages(df_en: pd.DataFrame, df_fr: pd.DataFrame) -> pd.DataFrame:
    """Combine English and French data, preferring English section/subsection numbers when available."""
    combined = pd.merge(df_en, df_fr, on='id', how='outer', suffixes=('_en', '_fr'))
    combined['section_number'] = combined['section_number_en'].fillna(combined['section_number_fr'])
    combined['subsection_number'] = combined['subsection_number_en'].fillna(combined['subsection_number_fr'])

    final_cols = [
        'id',
        'section_number',
        'section_name_en',
        'section_name_fr',
        'subsection_number',
        'subsection_name_en',
        'subsection_name_fr',
        'title_en',
        'title_fr',
    ]

    return combined[final_cols]


def custom_id_sort_key(id_str: str):
    # Prioritize header fields
    if id_str == 'NameOfInstitution':
        return (-3, (), -3, '', -3)
    if id_str == 'ReportingPeriodStart':
        return (-2, (), -2, '', -2)
    if id_str == 'ReportingPeriodEnd':
        return (-1, (), -1, '', -1)

    match = re.match(r'Q(\d+)(?:_(\d+(?:_\d+)*))?Row(\d+)([a-zA-Z]*)Cell(\d+)', id_str or '')
    if match:
        section = int(match.group(1))
        subsection_str = match.group(2)
        subsection_parts = tuple(int(p) for p in subsection_str.split('_')) if subsection_str else ()
        padded_subsection = subsection_parts + (0,) * (3 - len(subsection_parts))
        row_num = int(match.group(3))
        row_suffix = match.group(4).lower()
        cell_num = int(match.group(5))
        if section == 3 and subsection_parts in ((1,), (5, 2), (5, 4), (5, 6), (5, 7), (7, 2), (8,)):
            return (section, padded_subsection, cell_num, row_num, row_suffix)
        if section == 6 and subsection_parts in ((2,),):
            return (section, padded_subsection, cell_num, row_num, row_suffix)
        if section == 7 and subsection_parts in ((1,), (2,), (3,)):
            return (section, padded_subsection, cell_num, row_num, row_suffix)
        if section == 8 and subsection_parts in ((1,), (2,)):
            return (section, padded_subsection, cell_num, row_num, row_suffix)
        if section == 10 and subsection_parts in ((2,),):
            return (section, padded_subsection, cell_num, row_num, row_suffix)
        return (section, padded_subsection, row_num, row_suffix, cell_num)

    return (float('inf'), (), float('inf'), '', (id_str or '').lower())


def sort_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    sorted_df = df.copy()
    sorted_df['sort_key'] = sorted_df['id'].apply(custom_id_sort_key)
    sorted_df = sorted_df.sort_values(by='sort_key').drop(columns='sort_key').reset_index(drop=True)
    return sorted_df


def export_to_excel(df: pd.DataFrame, filename: str = 'combined_privacy_form_data.xlsx') -> None:
    df.to_excel(filename, index=False)
    print(f"DataFrame successfully exported to '{filename}'")


In [4]:
# Fetch, parse, normalize, and export combined form metadata
soup_en = fetch_soup(EN_URL)
soup_fr = fetch_soup(FR_URL)

df_en = process_language(add_missing_privacy_fields(clean_ids(extract_language_data(soup_en, 'en'), 'en'), 'en'), 'en')
df_fr = process_language(add_missing_privacy_fields(clean_ids(extract_language_data(soup_fr, 'fr'), 'fr'), 'fr'), 'fr')

df_final = sort_dataframe(merge_languages(df_en, df_fr))
display(df_final.head(10))

export_to_excel(df_final)


Unnamed: 0,id,section_number,section_name_en,section_name_fr,subsection_number,subsection_name_en,subsection_name_fr,title_en,title_fr
0,NameOfInstitution,,General Information,Informations générales,,Identification,Identification,,
1,ReportingPeriodStart,,Reporting period:,Période d’établissement de rapport :,,Start Date,Date de début,,
2,ReportingPeriodEnd,,Reporting period:,Période d’établissement de rapport :,,End Date,Date de fin,,
3,Q1_1Row1Cell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...
4,Q1_1Row2Cell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...
5,Q1_1Row2aCell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...
6,Q1_1Row2bCell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...
7,Q1_1Row3Cell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...
8,Q1_1Row4Cell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...
9,Q1_1Row5Cell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...


DataFrame successfully exported to 'combined_privacy_form_data.xlsx'


## Convert published dataset URL to long CSV
Map a wide Privacy dataset URL onto the form schema, enrich with GC org IDs, and write a long CSV.


In [5]:
import json
import pandas as pd
from pathlib import Path
import unicodedata
import re
import requests
from difflib import SequenceMatcher

def _normalize_name(name: str) -> str:
    if not isinstance(name, str):
        return ''
    text = unicodedata.normalize('NFKD', name).casefold()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[‐‑‒–—―]', '-', text)  # normalize dashes
    return text.strip()

def _similarity(a: str, b: str) -> float:
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a, b).ratio()

def load_gc_org_lookup(org_json_url: str) -> pd.DataFrame:
    """Load GC org master data JSON and return DataFrame with name variants for matching."""
    if org_json_url.startswith('http'):
        resp = requests.get(org_json_url)
        resp.raise_for_status()
        data = resp.json()
    else:
        data = json.loads(Path(org_json_url).read_text(encoding='utf-8'))

    records = data.get('records') or data.get('result', {}).get('records')
    fields = data.get('fields') or data.get('result', {}).get('fields')
    if not records or not fields:
        raise ValueError('Unexpected org JSON structure; missing fields/records')

    cols = [f['id'] for f in fields]
    df = pd.DataFrame(records, columns=cols)

    keep_cols = [
        'gc_orgID',
        'harmonized_name',
        'nom_harmonise',
        'legal_title',
        'appellation_legale',
        'preferred_name',
        'nom_prefere',
    ]
    for c in keep_cols:
        if c not in df.columns:
            df[c] = None
    df_lookup = df[keep_cols].copy()

    def collect_variants(row):
        variants = [
            row['harmonized_name'],
            row['nom_harmonise'],
            row['legal_title'],
            row['appellation_legale'],
            row['preferred_name'],
            row['nom_prefere'],
        ]
        norms = {_normalize_name(v) for v in variants if _normalize_name(v)}
        return list(norms)

    df_lookup['norm_variants'] = df_lookup.apply(collect_variants, axis=1)
    return df_lookup

def match_institution(name: str, lookup: pd.DataFrame, similarity_threshold: float = 0.88):
    norm = _normalize_name(name)
    if not norm:
        return pd.Series({'gc_orgID': None, 'institution_en': None, 'institution_fr': None})

    exact = lookup[lookup['norm_variants'].apply(lambda vs: norm in vs)]
    if not exact.empty:
        r = exact.iloc[0]
        return pd.Series({
            'gc_orgID': r['gc_orgID'],
            'institution_en': r['harmonized_name'],
            'institution_fr': r['nom_harmonise'],
        })

    best_row = None
    best_ratio = 0.0
    for _, cand in lookup.iterrows():
        for variant in cand['norm_variants']:
            ratio = _similarity(norm, variant)
            if ratio > best_ratio:
                best_ratio = ratio
                best_row = cand
    if best_row is not None and best_ratio >= similarity_threshold:
        return pd.Series({
            'gc_orgID': best_row['gc_orgID'],
            'institution_en': best_row['harmonized_name'],
            'institution_fr': best_row['nom_harmonise'],
        })

    return pd.Series({'gc_orgID': None, 'institution_en': name, 'institution_fr': None})

def dataset_url_to_long_csv(dataset_url: str,
                            reporting_start: str,
                            reporting_end: str,
                            output_csv: str = 'long_from_url.csv',
                            wide_path: str = 'combined_privacy_form_data.xlsx',
                            org_json_url: str = 'https://open.canada.ca/data/en/datastore/dump/cb5b5566-f599-4d12-abae-8279a0230928?format=json') -> pd.DataFrame:
    """Download a wide Privacy dataset and convert to long CSV aligned to the form schema with GC org IDs."""
    wide = pd.read_excel(wide_path)

    value_rows = wide[~wide['id'].isin(['NameOfInstitution', 'ReportingPeriodStart', 'ReportingPeriodEnd'])].reset_index(drop=True)

    raw = pd.read_excel(dataset_url, header=None)
    expected_cols = len(value_rows) + 1  # institution + values
    if raw.shape[1] != expected_cols:
        raise ValueError(f"Column mismatch: raw has {raw.shape[1]} columns, expected {expected_cols} (institution + form fields)")

    data_rows = raw.iloc[3:].reset_index(drop=True)

    lookup = load_gc_org_lookup(org_json_url)

    start_date = pd.to_datetime(reporting_start)
    end_date = pd.to_datetime(reporting_end)

    frames = []
    for _, row in data_rows.iterrows():
        institution_raw = str(row.iloc[0]).strip()
        if not institution_raw:
            continue

        values = row.iloc[1:]
        if len(values) != len(value_rows):
            raise ValueError(f"Value length mismatch for {institution_raw}: {len(values)} vs {len(value_rows)}")

        match = match_institution(institution_raw, lookup)

        frame = value_rows.copy()
        frame['gc_orgID'] = match['gc_orgID']
        frame['institution_en'] = match['institution_en']
        frame['institution_fr'] = match['institution_fr']
        frame['ReportingPeriodStart'] = start_date
        frame['ReportingPeriodEnd'] = end_date
        frame['value'] = values.values
        frames.append(frame)

    if not frames:
        raise ValueError("No institution rows found in dataset.")

    long_df = pd.concat(frames, ignore_index=True)

    col_order = [
        'gc_orgID',
        'institution_en',
        'institution_fr',
        'ReportingPeriodStart',
        'ReportingPeriodEnd',
        'id',
        'section_number',
        'section_name_en',
        'section_name_fr',
        'subsection_number',
        'subsection_name_en',
        'subsection_name_fr',
        'title_en',
        'title_fr',
        'value',
    ]

    long_df = long_df[col_order]
    long_df.to_csv(output_csv, index=False)
    print(f"Wrote {len(long_df)} rows for {data_rows.shape[0]} institutions to {output_csv}")
    return long_df


In [6]:
DATASETS = [
    {
        'url': 'https://open.canada.ca/data/dataset/236294e1-bc74-486f-ab97-422227bc8832/resource/b1b2b12e-3589-4d64-b571-0e6502d117ee/download/2021-22-privacy-dataset.xlsx',
        'start': '2021-04-01',
        'end': '2022-03-31',
        'output': 'combined_privacy_form_data_long_2021_22.csv',
    },
    {
        'url': 'https://open.canada.ca/data/dataset/236294e1-bc74-486f-ab97-422227bc8832/resource/37bc4113-aa31-4c1f-87fe-749d361cf7e8/download/2022-23-privacy-dataset.xlsx',
        'start': '2022-04-01',
        'end': '2023-03-31',
        'output': 'combined_privacy_form_data_long_2022_23.csv',
    },
    {
        'url': 'https://open.canada.ca/data/dataset/236294e1-bc74-486f-ab97-422227bc8832/resource/7b019922-e25c-454e-addf-12c8d72952c0/download/privacy-dataset-2023-24-1.xlsx',
        'start': '2023-04-01',
        'end': '2024-03-31',
        'output': 'combined_privacy_form_data_long_2023_24.csv',
    },

]

for cfg in DATASETS:
    dataset_url_to_long_csv(
        cfg['url'],
        reporting_start=cfg['start'],
        reporting_end=cfg['end'],
        output_csv=cfg['output'],
    )


Wrote 147643 rows for 191 institutions to combined_privacy_form_data_long_2021_22.csv
Wrote 149962 rows for 194 institutions to combined_privacy_form_data_long_2022_23.csv
Wrote 150735 rows for 195 institutions to combined_privacy_form_data_long_2023_24.csv


## Build consolidated Privacy-AI CSV
Concatenate yearly long CSVs into a single file.


In [7]:
import pandas as pd
from pathlib import Path

def build_consolidated_csv(files=None, output='Privacy-AI_refactored.csv'):
    """Concat multiple long CSVs into one file.
    Args:
        files: list of CSV paths; defaults to known yearly outputs if None.
        output: destination CSV filename.
    """
    if files is None:
        files = [
            'combined_privacy_form_data_long_2021_22.csv',
            'combined_privacy_form_data_long_2022_23.csv',
            'combined_privacy_form_data_long_2023_24.csv'
        ]

    frames = []
    missing = []
    for f in files:
        p = Path(f)
        if p.exists():
            frames.append(pd.read_csv(p))
        else:
            missing.append(f)

    if missing:
        raise FileNotFoundError(f"Missing input CSVs: {missing}")
    if not frames:
        raise ValueError('No input files provided.')

    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(output, index=False)
    print(f"Wrote {output} with {len(combined)} rows from {len(frames)} files.")
    return combined



In [8]:
build_consolidated_csv()

Wrote Privacy-AI_refactored.csv with 448340 rows from 3 files.


Unnamed: 0,gc_orgID,institution_en,institution_fr,ReportingPeriodStart,ReportingPeriodEnd,id,section_number,section_name_en,section_name_fr,subsection_number,subsection_name_en,subsection_name_fr,title_en,title_fr,value
0,2297.0,Administrative Tribunals Support Service of Ca...,Service canadien d'appui aux tribunaux adminis...,2021-04-01,2022-03-31,Q1_1Row1Cell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...,165.000
1,2297.0,Administrative Tribunals Support Service of Ca...,Service canadien d'appui aux tribunaux adminis...,2021-04-01,2022-03-31,Q1_1Row2Cell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...,3.000
2,2297.0,Administrative Tribunals Support Service of Ca...,Service canadien d'appui aux tribunaux adminis...,2021-04-01,2022-03-31,Q1_1Row2aCell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...,3.000
3,2297.0,Administrative Tribunals Support Service of Ca...,Service canadien d'appui aux tribunaux adminis...,2021-04-01,2022-03-31,Q1_1Row2bCell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...,0.000
4,2297.0,Administrative Tribunals Support Service of Ca...,Service canadien d'appui aux tribunaux adminis...,2021-04-01,2022-03-31,Q1_1Row3Cell1,1.0,Section 1: Requests under thePrivacy Act,Section 1 : Demandes en vertu de laLoi sur la ...,1.1,1.1 Number of requests received,1.1 Nombre de demandes,Requests under the Access to Information Act: ...,Demandes en vertu de la Loi sur l’accès à l’in...,168.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448335,,Yukon Surface Rights Board,,2023-04-01,2024-03-31,Q12_2Row2Cell1,12.0,Section 12: Resources related to thePrivacy Act,Section 12 : Ressources liées à laLoi sur la p...,12.2,12.2 Human resources,12.2 Ressources humaines,Person Years Dedicated to Access to Informatio...,Années-personnes consacrées aux activités liée...,0.001
448336,,Yukon Surface Rights Board,,2023-04-01,2024-03-31,Q12_2Row3Cell1,12.0,Section 12: Resources related to thePrivacy Act,Section 12 : Ressources liées à laLoi sur la p...,12.2,12.2 Human resources,12.2 Ressources humaines,Person Years Dedicated to Access to Informatio...,Années-personnes consacrées aux activités liée...,0.000
448337,,Yukon Surface Rights Board,,2023-04-01,2024-03-31,Q12_2Row4Cell1,12.0,Section 12: Resources related to thePrivacy Act,Section 12 : Ressources liées à laLoi sur la p...,12.2,12.2 Human resources,12.2 Ressources humaines,Person Years Dedicated to Access to Informatio...,Années-personnes consacrées aux activités liée...,0.000
448338,,Yukon Surface Rights Board,,2023-04-01,2024-03-31,Q12_2Row5Cell1,12.0,Section 12: Resources related to thePrivacy Act,Section 12 : Ressources liées à laLoi sur la p...,12.2,12.2 Human resources,12.2 Ressources humaines,Person Years Dedicated to Access to Informatio...,Années-personnes consacrées aux activités liée...,0.000


## Generate populated Privacy form as HTML
Select an institution, period, and language to fill the Privacy form HTML using `Privacy-AI_refactored.csv`.


In [None]:
# %pip install ipywidgets
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from http.server import ThreadingHTTPServer, SimpleHTTPRequestHandler
import functools
import socket
import threading
import webbrowser
import re

EN_URL = "https://www.canada.ca/en/treasury-board-secretariat/corporate/forms/350-63.html"
FR_URL = "https://www.canada.ca/fr/secretariat-conseil-tresor/organisation/formulaires/350-63.html"

ID_CORRECTIONS_HTML = {
    'Q3_28Row8Cell1': 'Q3_2Row28Cell1',
    'Q8_2Row3Cel10': 'Q8_2Row3Cell10',
    'Q8_2Row7Cell85': 'Q8_2Row7Cell8',
}

_SERVER = {'thread': None, 'server': None, 'port': None}

def load_data(csv_path='Privacy-AI_refactored.csv'):
    df = pd.read_csv(csv_path)
    df['ReportingPeriodStart'] = pd.to_datetime(df['ReportingPeriodStart'])
    return df

def fetch_form(lang='En'):
    url = EN_URL if lang.lower().startswith('en') else FR_URL
    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'html.parser')
    for bad, good in ID_CORRECTIONS_HTML.items():
        tag = soup.find(id=bad)
        if tag:
            tag['id'] = good
    return soup

def populate_form(soup, values):
    for input_tag in soup.find_all('input'):
        input_id = input_tag.get('id')
        if not input_id:
            continue
        if input_id in values:
            val = values[input_id]
            if pd.isna(val):
                continue
            input_tag['value'] = str(val)
    return soup

def clean_html_assets(html_text: str) -> str:
    patterns = [
        r'<script[^>]*assets\.adobedtm\.com[^>]*></script>\s*',
        r'<script[^>]*>.*?go-mpulse\.net/boomerang.*?</script>\s*',
        r'<script[^>]*>.*?_satellite\.pageBottom\(\);.*?</script>\s*',
    ]
    cleaned = html_text
    for pat in patterns:
        cleaned = re.sub(pat, '', cleaned, flags=re.DOTALL)

    def _abs(match):
        attr = match.group(1)
        path_part = match.group(2)
        return f'{attr}="https://www.canada.ca/{path_part}"'

    cleaned = re.sub(r'(href|src|data-ajax-replace)="/(?!/)([^"#]*)"', _abs, cleaned)
    return cleaned

def extract_main_only(soup):
    main = soup.find('main')
    if main is None:
        main = (soup.body or soup)
    new_doc = BeautifulSoup('<!doctype html><html><head></head><body></body></html>', 'html.parser')
    if soup.html and soup.html.get('lang'):
        new_doc.html['lang'] = soup.html['lang']
    if soup.head:
        for tag in soup.head.find_all(['title', 'meta', 'link', 'style']):
            new_doc.head.append(tag)
    new_doc.body.append(main)
    return new_doc

WET_HEAD_HTML = """
<link href=\"https://wet-boew.github.io/themes-dist/GCWeb/GCWeb/assets/favicon.ico\" rel=\"icon\" type=\"image/x-icon\" />
<link rel=\"stylesheet\" href=\"https://wet-boew.github.io/themes-dist/GCWeb/GCWeb/css/theme.min.css\" />
<noscript><link rel=\"stylesheet\" href=\"https://wet-boew.github.io/themes-dist/GCWeb/wet-boew/css/noscript.min.css\" /></noscript>
<script src=\"https://ajax.googleapis.com/ajax/libs/jquery/2.2.4/jquery.js\"></script>
<script src=\"https://wet-boew.github.io/themes-dist/GCWeb/wet-boew/js/wet-boew.min.js\"></script>
<script src=\"https://wet-boew.github.io/themes-dist/GCWeb/GCWeb/js/theme.min.js\"></script>
"""

def apply_wet_assets(doc):
    head = doc.head or doc
    for tag in BeautifulSoup(WET_HEAD_HTML, 'html.parser').find_all(['link', 'script', 'noscript']):
        head.append(tag)
    return doc

LICENSE_HTML_EN = (
    "<p>licenced under the "
    "<a rel='license' href='https://open.canada.ca/en/open-government-licence-canada'>Open Government Licence – Canada 2.0</a>"
    "</p>"
)

LICENSE_HTML_FR = (
    "<p>information visée par la "
    "<a rel='license' href='https://ouvert.canada.ca/fr/licence-du-gouvernement-ouvert-canada'>Licence du gouvernement ouvert – Canada 2.0</a>"
    "</p>"
)

def add_license_footer(doc, lang):
    html = LICENSE_HTML_FR if str(lang).lower().startswith('fr') else LICENSE_HTML_EN
    block = BeautifulSoup(html, 'html.parser')
    pagedetails = doc.find(id='pagedetails')
    if pagedetails:
        date_mod = pagedetails.find('gcds-date-modified')
        if date_mod:
            date_mod.insert_after(block)
        else:
            pagedetails.append(block)
    else:
        main = doc.find('main')
        if main and main.parent:
            main.append(block)
        else:
            doc.body.append(block)
    return doc

def generate_populated_html(df, gc_org_id, start_date, lang='En', output_html='populated_privacy_form_example.html'):
    subset = df[(df['gc_orgID'] == gc_org_id) & (df['ReportingPeriodStart'] == pd.to_datetime(start_date))]
    if subset.empty:
        raise ValueError('No rows found for selection')
    values = dict(zip(subset['id'], subset['value']))

    first_row = subset.iloc[0]
    inst_name = first_row['institution_fr'] if str(lang).lower().startswith('fr') else first_row['institution_en']
    values['NameOfInstitution'] = inst_name
    start_dt = pd.to_datetime(first_row['ReportingPeriodStart']).date()
    end_dt = pd.to_datetime(first_row['ReportingPeriodEnd']).date() if pd.notna(first_row['ReportingPeriodEnd']) else None
    values['ReportingPeriodStart'] = start_dt.isoformat()
    if end_dt:
        values['ReportingPeriodEnd'] = end_dt.isoformat()

    soup = fetch_form(lang=lang)
    soup = populate_form(soup, values)
    main_doc = extract_main_only(soup)
    main_doc = apply_wet_assets(main_doc)
    main_doc = add_license_footer(main_doc, lang)
    html_text = main_doc.encode(formatter=None).decode("utf-8")
    html_text = clean_html_assets(html_text)
    output_path = Path(output_html)
    output_path.write_text(html_text, encoding='utf-8')
    return output_path

def _find_port(preferred=8000):
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(('localhost', preferred))
            return preferred
    except OSError:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(('localhost', 0))
            return s.getsockname()[1]

def ensure_server(root='.', port=8000):
    if _SERVER['thread'] and _SERVER['thread'].is_alive():
        return _SERVER['port']
    chosen_port = _find_port(port)
    handler = functools.partial(SimpleHTTPRequestHandler, directory=str(Path(root).resolve()))
    server = ThreadingHTTPServer(('localhost', chosen_port), handler)
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    _SERVER.update({'thread': thread, 'server': server, 'port': chosen_port})
    return chosen_port

def open_local_page(html_path, port=8000):
    port = ensure_server(root=Path(html_path).parent, port=port)
    url = f"http://localhost:{port}/{Path(html_path).name}"
    webbrowser.open(url)
    return url

# Load dataset and prepare widgets
data_df = load_data()
institution_options = sorted(
    data_df[['gc_orgID','institution_en','institution_fr']].drop_duplicates().itertuples(index=False),
    key=lambda r: (r.gc_orgID, r.institution_en)
)
inst_dropdown = widgets.Dropdown(
    options=[(f"{r.gc_orgID} | {r.institution_en}", r.gc_orgID) for r in institution_options],
    description='Institution',
    layout=widgets.Layout(width='80%')
)
period_options = sorted(data_df['ReportingPeriodStart'].dropna().dt.date.unique())
period_dropdown = widgets.Dropdown(options=period_options, description='Period')
lang_dropdown = widgets.Dropdown(options=['En','Fr'], description='Language')
generate_btn = widgets.Button(description='Generate HTML', button_style='primary', disabled=True)
status_out = widgets.Output()

def on_change(_):
    generate_btn.disabled = not (inst_dropdown.value and period_dropdown.value and lang_dropdown.value)

inst_dropdown.observe(on_change, names='value')
period_dropdown.observe(on_change, names='value')
lang_dropdown.observe(on_change, names='value')

def on_click(_):
    status_out.clear_output()
    with status_out:
        try:
            output_path = generate_populated_html(
                data_df,
                inst_dropdown.value,
                period_dropdown.value,
                lang_dropdown.value,
                output_html='populated_privacy_form_example.html',
            )
            url = open_local_page(output_path)
            display(HTML(f"Saved to <code>{output_path}</code> — Serving locally: <a href='{url}' target='_blank'>{url}</a>"))
            display(HTML(f"<iframe src='{url}' width='100%' height='800px'></iframe>"))
        except Exception as e:
            print('Error:', e)

generate_btn.on_click(on_click)
on_change(None)  # set initial button state

ui = widgets.VBox([inst_dropdown, period_dropdown, lang_dropdown, generate_btn, status_out])
display(ui)


VBox(children=(Dropdown(description='Institution', layout=Layout(width='80%'), options=(('2222.0 | Agriculture…