In [None]:
import os
import re
from datetime import datetime
from itertools import product

import pandas as pd

def px_escape(text):
    """Double every double-quote character in a string.

    Non-string inputs (numbers, None, ...) are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    # Splitting on the quote and re-joining with two quotes doubles each one.
    return '""'.join(text.split('"'))

def tidy_to_pxstat(
    input_file,
    output_file=None,
    stub_cols=None,
    heading_cols=None,
    title=None,
    subject_area=None,
    matrix_code=None,
    decimals=None,
    value_col="Value",
    source="Scottish Government",
    agg_method="mean"
):
    """
    Convert Tidy format CSV to monolingual PxStat format.

    The CSV is reduced to the dimension columns plus the value column,
    aggregated so each dimension combination occurs once, expanded to the
    full cartesian product of dimension values (cells with no data are
    written as ".."), and saved as a UTF-16 encoded text file.

    Parameters:
    -----------
    input_file : str
        Path to input CSV file.
    output_file : str, optional
        Path to output PX file (default: input path with a ".px" extension).
    stub_cols : list or str, optional
        Columns to use as stub dimensions (rows). When omitted they are
        derived from the CSV columns.
    heading_cols : list or str, optional
        Columns to use as heading dimensions (columns). When omitted they
        are derived from the CSV columns.
    title : str, optional
        Title for the PX dataset (default: "Data from <file name>").
    subject_area : str, optional
        Subject area for the dataset (default: "Statistics").
    matrix_code : str, optional
        Matrix code for the dataset (default: the upper-cased alphanumeric
        characters of the input file name without its extension, or
        "DATA1" if none remain).
    decimals : int, optional
        Number of decimals written for every data cell and in the DECIMALS
        keyword. When omitted it is inferred from the aggregated values and
        capped at 6.
    value_col : str, optional
        Column name containing the values (default: "Value").
    source : str, optional
        Data source (default: "Scottish Government").
    agg_method : str, optional
        Aggregation method for duplicates ("sum" or "mean", default: "mean").

    Returns:
    --------
    str
        Path of the PX file that was written.

    Raises:
    -------
    ValueError
        If `value_col` is missing from the CSV or no dimension columns are
        specified or detected. Any other error raised while reading,
        transforming or writing is printed and re-raised unchanged.
    """
    print(f"Loading data from {input_file}...")
    try:
        # Load the CSV
        df = pd.read_csv(input_file)

        # Validate required columns
        if value_col not in df.columns:
            raise ValueError(f"Missing required column: {value_col}")

        def as_list(cols):
            # A bare string is one column name, not a sequence of characters.
            return [cols] if isinstance(cols, str) else list(cols)

        def quote_list(items):
            # Comma-separated list of individually quoted, escaped PX strings.
            return ",".join(f'"{px_escape(str(item))}"' for item in items)

        # Columns that are never auto-selected as dimensions.
        non_dims = [value_col, "Measurement", "Units", "FeatureName", "FeatureType"]

        # Auto-detect dimensions if not specified
        if stub_cols is None and heading_cols is None:
            candidates = [col for col in df.columns if col not in non_dims]
            stub_cols = ["DateCode"] if "DateCode" in candidates else candidates[:1]
            # Columns holding a single distinct value add nothing as a dimension.
            heading_cols = [
                col for col in candidates
                if col not in stub_cols and df[col].nunique() > 1
            ]
        elif stub_cols is None:
            heading_cols = as_list(heading_cols)
            stub_cols = [col for col in df.columns if col not in non_dims + heading_cols]
        elif heading_cols is None:
            stub_cols = as_list(stub_cols)
            heading_cols = [col for col in df.columns if col not in non_dims + stub_cols]
        else:
            stub_cols = as_list(stub_cols)
            heading_cols = as_list(heading_cols)

        print(f"Stub dimensions: {stub_cols}")
        print(f"Heading dimensions: {heading_cols}")

        # Simplify to needed columns
        group_cols = stub_cols + heading_cols
        if not group_cols:
            raise ValueError("No valid dimension columns specified or detected.")
        df_simple = df[group_cols + [value_col]].copy()

        # Make values numeric; anything unparseable becomes NaN.
        df_simple[value_col] = pd.to_numeric(df_simple[value_col], errors="coerce")

        # Fix duplicates by grouping
        df_simple = df_simple.groupby(group_cols, as_index=False).agg({value_col: agg_method})

        # Create all combinations of dimensions (stub first, heading last, so
        # the heading dimensions vary fastest in the flattened data).
        dim_vals = {col: sorted(df_simple[col].dropna().unique()) for col in group_cols}
        if len(group_cols) == 1:
            # set_index() with one column yields a flat Index, so the reindex
            # target must be flat as well rather than a one-level MultiIndex.
            only_col = group_cols[0]
            full_index = pd.Index(dim_vals[only_col], name=only_col)
        else:
            full_index = pd.MultiIndex.from_product(
                [dim_vals[col] for col in group_cols],
                names=group_cols
            )

        # Reindex to ensure all combinations; absent cells become NaN.
        indexed_df = df_simple.set_index(group_cols).reindex(full_index)

        # Resolve the number of decimals BEFORE formatting any value so the
        # data cells and the DECIMALS keyword always agree.
        if decimals is None:
            observed = df_simple[value_col].dropna()
            max_decimals = max(
                (len(str(x).split('.')[-1]) if '.' in str(x) else 0 for x in observed),
                default=0,
            )
            decimals = min(max_decimals, 6)

        # Generate data values: fixed-point text (never scientific notation),
        # with ".." marking cells that have no data.
        data_values = [
            '".."' if pd.isna(v) else f"{v:.{decimals}f}"
            for v in indexed_df[value_col]
        ]

        # Metadata
        creation_date = datetime.today().strftime("%Y%m%d %H:%M")
        if title is None:
            title = f"Data from {os.path.basename(input_file)}"
        if subject_area is None:
            subject_area = "Statistics"
        if matrix_code is None:
            # Use the file stem so the extension does not leak into the code.
            stem = os.path.splitext(os.path.basename(input_file))[0]
            matrix_code = re.sub(r'[^A-Z0-9]', '', stem.upper()) or "DATA1"

        # Take the first non-missing unit label; fall back to 'Count'.
        units = 'Count'
        if 'Units' in df.columns:
            known_units = df['Units'].dropna()
            if not known_units.empty:
                units = px_escape(str(known_units.iloc[0]))

        dims_text = ', '.join(px_escape(str(col)) for col in group_cols)
        # NOTE(review): the CHARSET value is kept as in the original code;
        # confirm that the consuming PX reader accepts it.
        header_lines = [
            'CHARSET="UTF-16";',
            'AXIS-VERSION="2013";',
            f'CREATION-DATE="{creation_date}";',
            f'MATRIX="{px_escape(matrix_code)}";',
            f'DECIMALS={decimals};',
            f'SUBJECT-AREA="{px_escape(subject_area)}";',
            f'SUBJECT-CODE="{px_escape(matrix_code[:4])}";',
            f'CONTENTS="{px_escape(title)}";',
            f'TITLE="{px_escape(title)} - by {dims_text}";',
            f'UNITS="{units}";',
        ]
        # Each name is quoted on its own; an empty list is omitted entirely
        # instead of emitting a keyword with no value.
        if stub_cols:
            header_lines.append(f'STUB={quote_list(stub_cols)};')
        if heading_cols:
            header_lines.append(f'HEADING={quote_list(heading_cols)};')
        header_lines.append(f'SOURCE="{px_escape(source)}";')
        header = "\n".join(header_lines) + "\n"

        # VALUES and CODES blocks
        def px_values_and_codes(name, values):
            safe_name = px_escape(str(name))
            quoted_vals = quote_list(values)
            # Codes are sequential, 1-based and zero-padded to two digits.
            quoted_codes = ",".join(f'"{i:02d}"' for i, _ in enumerate(values, start=1))
            return f'VALUES("{safe_name}")={quoted_vals};\nCODES("{safe_name}")={quoted_codes};\n'

        meta_parts = "".join(px_values_and_codes(col, dim_vals[col]) for col in group_cols)

        # Make output filename
        if output_file is None:
            output_file = os.path.splitext(input_file)[0] + ".px"

        # Write to file
        print(f"Writing {len(data_values)} data points to {output_file}...")
        with open(output_file, "w", encoding="utf-16") as f:
            f.write(header)
            f.write(meta_parts)
            f.write("DATA=\n")
            f.write(" ".join(data_values) + "\n")
            f.write(";")

        print(f"✅ PX file saved as: {output_file}")
        return output_file

    except Exception as e:
        # Report the failure for notebook users, then let the caller see it.
        print(f"❌ Error processing file: {str(e)}")
        raise

In [None]:

# Cell 2: convert the quarterly GDP-by-industry CSV into a PX file.
# NOTE(review): the title, subject area and matrix code below refer to police
# strength while the input file is a GDP dataset -- confirm which is intended.
tidy_to_pxstat(
    "gross-domestic-product-quarterly-output-by-industry.csv",
    # Dataset metadata written into the PX header.
    title="Police Officer Quarterly Strength",
    subject_area="Public Safety",
    matrix_code="POLICE01",
    # Table layout: stub dimensions form rows, heading dimensions form columns.
    stub_cols=["DateCode", "Industry Sector (SIC 07)", "Measurement"],
    heading_cols=["FeatureCode"],
    # Value handling.
    agg_method="mean",
    decimals=3,
    output_file="gross-domestic-product-quarterly-output-by-industry.px"
)